In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import join
from parse_midi import Midi
import json

import matplotlib.pyplot as plt
from matplotlib.pyplot import cm, figure
from IPython.display import set_matplotlib_formats
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
set_matplotlib_formats('svg')
plt.rcParams['figure.dpi'] = 1200

import seaborn as sns
sns.set_style("darkgrid")


def filter_timerange(timed_notes, start, end):
    # returns all notes that occurr during start-end
    return timed_notes[(timed_notes.end > start) & (timed_notes.start < end)]


def get_onset_deltas(timed_notes):
    unique_onsets = pd.Series(timed_notes['start'].unique()).sort_values()
    onset_deltas = (unique_onsets - unique_onsets.shift())[1:]
    return onset_deltas[onset_deltas > 0.000001]


def calc_slice_duration(timed_notes, pgoal=0.05):
    onset_deltas = get_onset_deltas(timed_notes).sort_values().unique()
    onsets = timed_notes['start']
    slice_duration = 0
    p = 0
    while p <= pgoal:
        count = 0
        for t in onset_deltas:
            if slice_duration >= t:
                count += 1
        p = count / len(onset_deltas)
        slice_duration += 0.00001
    print('p:', p)
    return slice_duration




In [2]:
    
midi_dir = './midi_files'
file_paths = [join(midi_dir, x) for x in listdir(midi_dir)]
print('Reading %s midi files...' % len(file_paths))


midi_objects = []
all_data = []


for i, fp in enumerate(file_paths):
    print('>>>', fp)
    mid = Midi.from_file(fp)
    midi_objects.append(mid)
    all_data += mid.note_times()

all_data = pd.DataFrame(all_data)
all_data

Reading 21 midi files...
>>> ./midi_files/mz_570_1.mid
>>> ./midi_files/mz_570_2.mid
>>> ./midi_files/mz_570_3.mid
>>> ./midi_files/mz_545_1.mid
>>> ./midi_files/mz_332_3.mid
>>> ./midi_files/mz_330_1.mid
>>> ./midi_files/mz_332_2.mid
>>> ./midi_files/mz_545_2.mid
>>> ./midi_files/mz_330_2.mid
>>> ./midi_files/mz_330_3.mid
>>> ./midi_files/mz_332_1.mid
>>> ./midi_files/mz_545_3.mid
>>> ./midi_files/mz_331_2.mid
>>> ./midi_files/mz_333_1.mid
>>> ./midi_files/mz_331_3.mid
>>> ./midi_files/mz_331_1.mid
>>> ./midi_files/mz_333_3.mid
>>> ./midi_files/mz_333_2.mid
>>> ./midi_files/mz_311_1.mid
>>> ./midi_files/mz_311_2.mid
>>> ./midi_files/mz_311_3.mid


Unnamed: 0,start,end,midi,vel
0,0.0000,0.92160,70,50
1,0.0000,0.92160,58,40
2,0.9216,1.38240,65,45
3,0.9216,1.38240,53,36
4,1.3824,2.30400,62,50
5,1.3824,2.30400,50,40
6,2.3040,2.76480,65,48
7,2.3040,2.76480,53,38
8,2.7648,3.68640,70,52
9,2.7648,3.68640,58,41


In [15]:
def hash_timed_notes(timed_notes):
    return frozenset(timed_notes['midi'])

# def slice_notes(timed_notes, slice_duration, k={}):
#     notes_duration = max(timed_notes['end']) - min(timed_notes['start'])
#     num_slices = int(notes_duration // slice_duration) + 1
#     slices = {}
#     tns = timed_notes
#     for t in range(num_slices):
#         start = slice_duration * t
#         end = slice_duration * (t + 1)
# #         tns = tns[tns['start'] >= start]
#         tr = filter_timerange(tns, start, end)
#         if len(tr) > 0:
#             tr = tr.drop(columns=['start', 'end', 'vel']).reset_index()
#             tr_hash = hash_timed_notes(tr)
#             _id = len(k.keys())
#             if tr_hash in k:
#                 _id = k[tr_hash]
#             k[tr_hash] = _id
#             tr['slice_id'] = _id
#             slices[_id] = tr

#     slice_df = pd.concat(slices)
#     return slice_df, k
import math
def slice_notes(timed_notes, slice_duration, k={}):
    notes_duration = max(timed_notes['end']) - min(timed_notes['start'])
    num_slices = int(notes_duration // slice_duration) + 1
    slices = {}
    
    for _, note in timed_notes.iterrows():
        start = note['start']
        sindex = start / slice_duration
        eindex = note['end'] / slice_duration
        for i in range(math.floor(sindex), math.ceil(eindex)):
            if i in slices:
                slices[i].append(note)
            else:
                slices[i] = note
#         print(sindex, eindex)
    print(len(slices), notes_duration, num_slices)
#     s = {}
#     print(len(slices))
#     for sid in slices:
#         s[sid] = pd.DataFrame(slices[sid])
# #     print(slices[0])
#     print('concatting...')
#     return pd.concat(s)
    
    s = []
    for sid in slices:
        s.append(slices[sid])
    print('to list...')
    return pd.DataFrame(s)
        
    

word_duration = calc_slice_duration(pd.DataFrame(midi_objects[0].note_times()), 0.05)
sentences = {}
k = {}
for i, mid in enumerate(midi_objects[:1]):
    print('Reading Midi Object #', i)
    w = slice_notes(pd.DataFrame(mid.note_times()), word_duration, k=k)
    sentences[i] = w
    break

sents = pd.concat(sentences)

p: 0.08935361216730038
Reading Midi Object # 0
9502 577.3823999999979 10023
to list...


In [16]:
sents

Unnamed: 0,Unnamed: 1,start,end,midi,vel
0,0,0.0000,0.92160,70.0,50.0
0,0,0.0000,0.92160,70.0,50.0
0,0,0.0000,0.92160,70.0,50.0
0,0,0.0000,0.92160,70.0,50.0
0,0,0.0000,0.92160,70.0,50.0
0,0,0.0000,0.92160,70.0,50.0
0,0,0.0000,0.92160,70.0,50.0
0,0,0.0000,0.92160,70.0,50.0
0,0,0.0000,0.92160,70.0,50.0
0,0,0.0000,0.92160,70.0,50.0


In [None]:
sents.groupby('slice_id').get_group(1)


In [None]:
# def plot_piano_roll(timed_notes, axis, color=None):
#     velocity_map = cm.magma(np.linspace(0,1,90))
#     min_width = 0.25

#     for _, note in timed_notes.iterrows():
#         width = max(min_width, note['end'] - note['start'])
#         tr = (note['start'], note['start'] + width)
#         midi_note = note['midi']
#         vel = int(note['vel'])
#         c = color if color is not None else velocity_map[vel]
#         axis.plot(tr, [midi_note, midi_note], lw=5, solid_capstyle='butt', color=c)

        
# _, ax = plt.subplots(figsize=(8,4))
# plot_piano_roll(sents[:500], ax)

In [None]:

def plot_piano_word(words, axis, color=None):
    velocity_map = cm.magma(np.linspace(0,1,8))
    slice_duration = 0.05
    for i, sid in enumerate(words['slice_id'].unique()):
        word = sents[sents['slice_id'] == sid]
        c = color if color is not None else velocity_map[i % 8]
        for midi_note in word['midi']:
            axis.plot([
                i * slice_duration,
                (i + 1) * slice_duration
            ], [midi_note, midi_note], lw=5, solid_capstyle='butt', color=c)


_, ax = plt.subplots(figsize=(8,4))

for i in range(1):
    plot_piano_word(
        sents.loc[0][0:200],
        ax) 



In [None]:
all_slices = []
for s in sents['slice_id']:
    immut_slice = hash_timed_notes(
        sents.groupby('slice_id').get_group(s))
    all_slices.append(immut_slice)

all_slices[:10]

In [None]:
print(len(all_slices))
print(len(set(all_slices)))

In [None]:

compressed = sents['slice_id'].values
