In [None]:
%load_ext autoreload
%autoreload 2
import pickle
import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from utils.data import *

In [None]:
# Dataset sub-folder name; interpolated into the ./data/dataframes/{folder}/ paths used by the cells below.
folder = 'theorytab'

In [None]:
# Load the pickled song records for the selected dataset folder.
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
with open(f'./data/dataframes/{folder}/all_melodies.pkl', 'rb') as f:
    songs = pickle.load(f)
# Field 0 of every record is the melody DataFrame; the remaining fields are metadata.
dfs = [melody for melody, *_ in songs]
len(dfs)

In [None]:
# path = f'./data/{folder}/'
# songs = get_dfs_from_midi(path, min_notes=30, min_gap=0., melody_only=True)
# dfs = [item[0] for item in songs]
# print(len(dfs), len(songs))

# with open(f'./data/dataframes/{folder}/all_melodies.pkl', 'wb') as f:
#     pickle.dump(songs, f)

In [None]:
# Pick a random song, print its metadata fields and render the melody as audio.
sample_idx = np.random.randint(len(dfs))
picked = songs[sample_idx]
print(f'Artist, title, segment of song: {picked[1:4]}')
print(f'Key of song: {picked[4]}')
print(f'Time signature: {picked[5]}')
print(f'Beats in song: \n {picked[6]}')
# The same rate is handed to the synthesizer and to the audio widget.
fs = 44100
midi_data = df_to_midi(picked[0])
audio_data = midi_data.fluidsynth(fs=fs)
ipd.Audio(audio_data, rate=fs)

# Keys

In [None]:
# Histogram of the number of key entries (field 4) stored for each song.
n_keys_per_song = [len(record[4]) for record in songs]
plt.hist(n_keys_per_song)

# Percentage of notes within Major scale

In [None]:
# Lookup from key index (0-11) to note name.
key_to_note = dict(enumerate(['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']))
# key_notes[k] holds whatever get_notes_from_major_scale returns for root k.
key_notes = [get_notes_from_major_scale(root) for root in range(12)]

In [None]:
# For the songs whose first key entry (item[4][0][0]) equals `key`, measure the
# fraction of notes that fall inside each of the major scales in `key_notes`.
key = 5

dfs_key = [record[0] for record in songs if record[4][0][0] == key]
# One list of per-song fractions for every scale index 0..11.
key_percentages = {scale_idx: [] for scale_idx in range(12)}
for df in dfs_key:
    pitches = df['Pitch'].values
    for scale_idx, scale_notes in enumerate(key_notes):
        n_in_scale = sum(1 for pitch in pitches if pitch in scale_notes)
        # Stored as a fraction in [0, 1], not as a value out of 100.
        key_percentages[scale_idx].append(n_in_scale / len(pitches))

In [None]:
# One histogram per scale on a 4x3 grid, filled row by row, plus the mean of
# each distribution printed as text.
fig, ax = plt.subplots(4, 3, figsize=(15, 5))
print(f'Key: {key_to_note[key]}')
for scale_idx in range(len(key_notes)):
    row, col = divmod(scale_idx, 3)
    panel = ax[row, col]
    print(f'Mean percentage of notes in key {key_to_note[scale_idx]}: {np.mean(key_percentages[scale_idx])}')
    panel.hist(key_percentages[scale_idx], bins=100)
    panel.set_title(f'{key_to_note[scale_idx]} major scale')
plt.tight_layout()

In [None]:
# Filter out songs which have accidentals, i.e. keep a song only when every
# pitch belongs to the major scale of its first key entry (song[4][0][0]).
# Loop-local names (`song_key`, `scale_notes`) are used deliberately so this
# cell does not overwrite the `key` selection or the `key_notes` list of scales
# defined in earlier cells; those cells stay re-runnable afterwards.
songs_within_key = []
for song in songs:
    song_key = song[4][0][0]
    scale_notes = get_notes_from_major_scale(song_key)
    # Generator form stops at the first out-of-scale pitch.
    if all(pitch in scale_notes for pitch in song[0]['Pitch'].values):
        songs_within_key.append(song)
len(songs_within_key)

In [None]:
# Persist the accidental-free subset next to the other dataframes of this dataset folder.
with open(f'./data/dataframes/{folder}/all_melodies_notes_within_key.pkl', 'wb') as f:
    pickle.dump(songs_within_key, f)

In [None]:
# Continue with the filtered subset. This rebinds `songs` only; `dfs` still
# refers to the melodies of the unfiltered load.
songs = songs_within_key

# Time Signatures

In [None]:
# Histogram of the number of time-signature entries (field 5) per song.
n_signatures_per_song = [len(record[5]) for record in songs]
plt.hist(n_signatures_per_song, bins=20)

In [None]:
# First element of each song's first time-signature entry.
beats_per_bar = [record[5][0][0] for record in songs]
plt.hist(beats_per_bar, bins=20)

In [None]:
# Second element of each song's first time-signature entry.
note_value = [record[5][0][1] for record in songs]
plt.hist(note_value, bins=20)

In [None]:
# Check that all beats are integer multiples of the reference beat.
# Field 6 holds a song's beat times; element 1 is used as the reference beat
# (presumably element 0 is time zero, making element 1 the beat length — TODO confirm).
beats = [item[6] / item[6][1] for item in songs]
beats = np.concatenate(beats)
# `x % x` is zero for every non-zero x, so a test of that form can never fail.
# Compare each ratio with its nearest integer instead; np.isclose absorbs the
# floating-point rounding introduced by the division.
beats = np.isclose(beats, np.round(beats))
beats.all()

In [None]:
# Disabled export: rescales Start/End by each song's seconds-per-beat and then by 0.5.
# NOTE(review): the dump at the bottom writes `songs`, not `songs_beats_aligned`,
# so the rescaled records would not be what is saved — confirm before re-enabling.
# NOTE(review): `df.loc[...] /=` modifies the frames held in `songs` in place.
# songs_beats_aligned = []
# for song in songs:
#     seconds_per_beat = song[6][1]
#     df = song[0]
#     df.loc[:, ['Start', 'End']] /= seconds_per_beat
#     df.loc[:, ['Start', 'End']] *= 0.5
#     songs_beats_aligned.append((df, song[1], song[2], song[3][:-4], song[4][0][0])) # df, artist, song-title, segment, key
#     # print(seconds_per_beat)
#     # print(songs_beats_aligned)
# with open(f'./data/dataframes/{folder}/all_melodies_within_key_beats_aligned.pkl', 'wb') as f:
#     pickle.dump(songs, f)

In [None]:
# Load previously saved records from disk; this rebinds `songs`, replacing the
# list filtered in memory above.
# NOTE(review): the commented-out export cell writes
# 'all_melodies_within_key_beats_aligned.pkl', a different file name from the
# one read here — confirm which file is intended.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(f'./data/dataframes/{folder}/all_melodies_beats_aligned.pkl', 'rb') as f:
    songs = pickle.load(f)
len(songs)

# Number of notes

In [None]:
# Keep songs with strictly more than `min_length` notes and strictly more than
# `min_unique_notes` distinct pitches, then summarise the remaining lengths.
min_length = 30
min_unique_notes = 5
songs = [
    record for record in songs
    if len(record[0]) > min_length and record[0]['Pitch'].nunique() > min_unique_notes
]
lens = [len(record[0]) for record in songs]
print('Number of songs:', len(songs))
print('Max length:', max(lens))
print('Min length:', min(lens))
print('Mean length:', np.mean(lens))
# Lengths in ascending order, drawn as touching bars.
sorted_lens = sorted(lens)
plt.bar(range(len(sorted_lens)), sorted_lens, width=1.0);

In [None]:
# Indices of the four longest melodies. argpartition only guarantees that the
# last four positions hold the four largest values, not that they are sorted.
largest_idx = np.argpartition(lens, -4)[-4:]
print(largest_idx)

In [None]:
# Listen to one specific song. The index is hard-coded and refers to the
# current contents of `songs`.
sample_idx = 756
picked = songs[sample_idx]
print(picked[1:4])
# The same rate is handed to the synthesizer and to the audio widget.
fs = 44100
midi_data = df_to_midi(picked[0])
audio_data = midi_data.fluidsynth(fs=fs)
ipd.Audio(audio_data, rate=fs)

In [None]:
# with open(f'./data/dataframes/{folder}/all_melodies_within_key_beats_aligned_min_5_unique.pkl', 'wb') as f:
#     pickle.dump(songs, f)

In [None]:
# Load previously saved records from disk; this rebinds `songs` again.
# NOTE(review): the commented-out export cell above uses a different file name
# ('all_melodies_within_key_beats_aligned_min_5_unique.pkl') — confirm which file is intended.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(f'./data/dataframes/{folder}/melodies_beats_min_5_unique.pkl', 'rb') as f:
    songs = pickle.load(f)
len(songs)

# Simultaneous notes

In [None]:
# Print the index of every song in which at least two notes share a start time.
for song_idx, record in enumerate(songs):
    start_times = record[0]['Start']
    if start_times.duplicated().any():
        print(song_idx)

# Duration

In [None]:
# Duration of each melody, taken as the largest 'End' value in its frame.
durations = [record[0]['End'].max() for record in songs]
plt.hist(durations, bins=100);
min(durations), max(durations)

# Pitch range

In [None]:
# Pitch-range summary for the current melodies (pitch_range is not defined in
# this notebook — presumably from utils.data; see there for what it reports).
pitch_range([item[0] for item in songs])

In [None]:
# Presumably keeps only songs whose pitch range lies between 5 and 21 — confirm
# the exact bounds and units in trim_by_range (not defined in this notebook).
songs2 = trim_by_range(songs, min_range=5, max_range=21)

In [None]:
# Same summary as before, now for the range-trimmed songs, for comparison.
pitch_range([item[0] for item in songs2])

In [None]:
# with open(f'./data/dataframes/{folder}/all_melodies_within_key_beats_aligned_min_5_unique_max_range_21.pkl', 'wb') as f:
#     pickle.dump(songs2, f)

# Gaps

In [None]:
# Measure the gap between the start of each note and the end of the previous
# one, and collect the melodies that contain overlaps (negative gap) or rests
# (positive gap). A melody can end up in both lists.
# NOTE(review): this iterates `dfs`, which was built from the first load and is
# not updated when `songs` is filtered or reloaded — confirm that is intended.
neg_gaps, pos_gaps = [], []
for melody in dfs:
    gap = melody['Start'].values[1:] - melody['End'].values[:-1]
    if gap.min() < 0:
        neg_gaps.append(melody)
    if gap.max() > 0:
        pos_gaps.append(melody)
print(len(neg_gaps), len(pos_gaps))

# Duration + Pitch format

In [None]:
# Presumably converts the melody frames to a note/duration representation
# (note_duration_transform is not defined in this notebook).
# NOTE(review): `dfs3` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. It presumably should be the melody
# frames of `songs2` — confirm and define it explicitly.
dfs4 = note_duration_transform(dfs3)

In [None]:
# pitch_translation returns the transformed frames together with a `max_pitch`
# value, which is later used in the output file name (see utils.data for its
# exact meaning — not visible here).
dfs5, max_pitch = pitch_translation(dfs4)
max_pitch

In [None]:
# Pitch-range summary of the translated frames.
pitch_range(dfs5)

In [None]:
with open(f'./data/dataframes/{key}/dfs_note_dur_offset_{int(max_pitch)}.pkl', 'wb') as f:
    pickle.dump(dfs5, f)

In [None]:
with open(f'./data/dataframes/{key}/dfs_note_dur_offset_{int(max_pitch)}.pkl', 'rb') as f:
    dfs5 = pickle.load(f)

In [None]:
# Display the first transformed frame as a sanity check.
dfs5[0]

In [None]:
# Wrap the frames in NoteDurationDataset (not defined in this notebook).
# sample_len=20 and stride=10 presumably mean 20-step windows taken every 10
# steps — confirm in the dataset class.
dataset = NoteDurationDataset(dfs5, sample_len=20, scale=1., stride=10)
# Shuffled batches of 64, loaded in the main process (num_workers=0);
# drop_last=True discards a final batch that has fewer than 64 samples.
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=0, drop_last=True)

In [None]:
# Number of batches the loader yields per pass over the dataset.
len(dataloader)