In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
%load_ext autoreload
%autoreload 2

## Read data

In [2]:
from sqlalchemy import create_engine
import pandas as pd

path = "../Data/wjazzd.db"
engine = create_engine(f"sqlite:///{path}")

melody = pd.read_sql("melody", engine)
beats = pd.read_sql("beats", engine)

df_melody = pd.DataFrame(melody)
df_beats = pd.DataFrame(beats)

## Fill the chords column with the last known chord

In [107]:
#df_chords = df_beats[df_beats['chord'] != '']
df_chords = df_beats.replace({'chord': {'': np.nan}}).ffill()
#df_chords.to_csv('df_chords.csv')
df_chords = df_beats.copy()
df_chords.info()
df_chords.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132329 entries, 0 to 132328
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   beatid      132329 non-null  int64  
 1   melid       132329 non-null  int64  
 2   onset       132329 non-null  float64
 3   bar         132329 non-null  int64  
 4   beat        132329 non-null  int64  
 5   signature   132329 non-null  object 
 6   chord       132325 non-null  object 
 7   form        132329 non-null  object 
 8   bass_pitch  122540 non-null  float64
 9   chorus_id   132329 non-null  int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 10.1+ MB


Unnamed: 0,beatid,melid,onset,bar,beat,signature,chord,form,bass_pitch,chorus_id
0,1,1,9.171882,-1,1,,,I1,42.0,0
1,2,1,9.488254,-1,2,,,,42.0,0
2,3,1,9.779955,-1,3,,,,40.0,0
3,4,1,10.052608,-1,4,,,,40.0,0
4,5,1,10.339796,0,1,,Bb6,,50.0,0
5,6,1,10.631542,0,2,,Bb6,,50.0,0
6,7,1,10.918163,0,3,,Bb6,,50.0,0
7,8,1,11.217007,0,4,,Bb6,,42.0,0
8,9,1,11.514127,1,1,4/4,Bb6,A1,42.0,1
9,10,1,11.82415,1,2,,Bb6,,42.0,1


## Extract corresponding row in beats using the (melid, bar, beat) features

In [151]:
# Define new index with the key (melid, bar, beat)
df_chords_new = df_chords.set_index(['melid', 'bar', 'beat'], drop=False)
df_melody_new = df_melody.set_index(['melid', 'bar', 'beat'], drop=False)

# Concatenate the dataframes using the new index and then reset the index again
df_mel_beats = pd.concat([df_melody_new, df_chords_new.reindex(df_melody_new.index)], axis=1)
df_mel_beats = df_mel_beats.reset_index(drop=True)

# Remove duplicate columns
# Duplicated columns are:
# - melid
# - onset
# - bar
# - beat
df_mel_beats = df_mel_beats.loc[:,~df_mel_beats.columns.duplicated()]

df_mel_beats = df_mel_beats[['eventid', 'melid', 'pitch', 'bar', 'beat', 'beatid', 'chord', 'bass_pitch']]

df_mel_beats['pitch_encoded'] = np.mod(df_mel_beats['pitch'], 12)

df_mel_beats['chord_changed'] = df_mel_beats['chord'].shift() != df_mel_beats["chord"]

df_mel_beats['chord_changed'] = df_mel_beats["chord_changed"].cumsum()

pitch_sequences = [g['pitch_encoded'].tolist() for k, g in df_mel_beats.groupby('chord_changed')]
print(pitch_sequences[:7])

# Identify last row of current chord
df_mel_beats['pitch_sequence'] = (df_mel_beats['chord'].shift(-1) != df_mel_beats["chord"])

# Change type to type object to add list to cell
df_mel_beats['pitch_sequence'] = df_mel_beats['pitch_sequence'].astype(object)

df_mel_beats.drop('chord_changed', axis=1, inplace=True)

for idx, _ in df_mel_beats.iterrows():
    if df_mel_beats.at[idx, 'pitch_sequence']:
       df_mel_beats.at[idx, 'pitch_sequence'] = pitch_sequences.pop(0)

#df_mel_beats['pitch_sequence'].loc[df_mel_beats['pitch_sequence'] == True] = pitch_sequences.pop(0)
df_mel_beats.drop(df_mel_beats[df_mel_beats['pitch_sequence'] == False].index, inplace=True)

df_mel_beats.to_csv('../Data/df_mel_beats.csv')

df_mel_beats.info()
df_mel_beats.head(20)


[[5.0, 3.0, 10.0, 1.0, 3.0, 10.0, 10.0], [2.0, 9.0, 0.0], [10.0, 7.0, 10.0], [1.0, 0.0, 10.0], [0.0, 10.0], [8.0, 0.0, 11.0, 9.0], [10.0, 6.0, 7.0, 10.0]]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 26883 entries, 6 to 200808
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   eventid         26883 non-null  int64  
 1   melid           26883 non-null  int64  
 2   pitch           26883 non-null  float64
 3   bar             26883 non-null  int64  
 4   beat            26883 non-null  int64  
 5   beatid          26883 non-null  int64  
 6   chord           26883 non-null  object 
 7   bass_pitch      25837 non-null  float64
 8   pitch_encoded   26883 non-null  float64
 9   pitch_sequence  26883 non-null  object 
dtypes: float64(3), int64(5), object(2)
memory usage: 2.3+ MB


Unnamed: 0,eventid,melid,pitch,bar,beat,beatid,chord,bass_pitch,pitch_encoded,pitch_sequence
6,7,1,58.0,1,2,10,Bb6,42.0,10.0,"[5.0, 3.0, 10.0, 1.0, 3.0, 10.0, 10.0]"
9,10,1,60.0,3,4,20,G-7,42.0,0.0,"[2.0, 9.0, 0.0]"
12,13,1,58.0,4,2,22,C-7,38.0,10.0,"[10.0, 7.0, 10.0]"
15,16,1,58.0,4,4,24,F7,33.0,10.0,"[1.0, 0.0, 10.0]"
17,18,1,58.0,5,2,26,F-7,32.0,10.0,"[0.0, 10.0]"
21,22,1,57.0,5,4,28,Bb7,32.0,9.0,"[8.0, 0.0, 11.0, 9.0]"
25,26,1,58.0,6,2,30,Eb7,31.0,10.0,"[10.0, 6.0, 7.0, 10.0]"
29,30,1,62.0,6,4,32,Ab7,28.0,2.0,"[1.0, 8.0, 1.0, 2.0]"
33,34,1,58.0,7,2,34,D-7,42.0,10.0,"[10.0, 0.0, 7.0, 10.0]"
34,35,1,58.0,7,3,35,G7,31.0,10.0,[10.0]


In [5]:
import sys
sys.path.append('../')

from combine_melody_beats import combine_melody_beats

df_mel_beats.equals(combine_melody_beats(df_melody, df_beats))

True

## Note sequence encoding