# Digital Musicology (DH-401)
## Assignment 3: Similarity
Group 6
- Mickaël Achkar
- Yichen Wang
- Yinghui Jiang

In [69]:
import ms3
import pandas as pd
import numpy as np
import math
import fractions
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)

## 0. Dataset preparation

In [188]:
%%time

# Suppress SettingWithCopy warnings
pd.options.mode.chained_assignment = None

# Import CSV
chorales_dataset = pd.read_csv('DM 2022 - Assignment 3 [Dataset].csv', converters = {'mn_onset': fractions.Fraction, 'act_dur': fractions.Fraction, 'duration': fractions.Fraction, 'nominal_duration': fractions.Fraction, 'scalar': fractions.Fraction}, index_col = 0, low_memory = False)

## Add global onset column for easy time series manipulations (in units of whole notes)
# Sort accordingly
chorales_dataset = chorales_dataset.sort_values(by=['piece', 'staff', 'mn', 'mn_onset'])
# Shift duration column by 1
chorales_dataset['_shift_dur'] = np.roll(chorales_dataset['duration'], 1)
# Get invalid first shifts that will be removed
chorales_dataset['_first_shift'] = chorales_dataset.groupby(['piece','staff'])['_shift_dur'].transform('first')
# Cumulate shifted durations
chorales_dataset['_cum_dur_uncleaned'] = chorales_dataset.groupby(['piece','staff'])['_shift_dur'].transform(pd.Series.cumsum)
# Subtract invalid first shifts
chorales_dataset['global_onset'] = (chorales_dataset['_cum_dur_uncleaned'] - chorales_dataset['_first_shift'])
# Remove temp coliumns
chorales_dataset = chorales_dataset.drop(columns = ['_shift_dur','_first_shift','_cum_dur_uncleaned'])

# Ignore gracenotes (there is only one)
chorales_dataset = chorales_dataset[chorales_dataset['gracenote'].isna()].drop(columns = ['gracenote'])

# Remove Chorale043 because it is corrupted (it does not open in MuseScore and it has only 1 staff)
chorales_dataset = chorales_dataset[chorales_dataset['piece'] != 'BachChorales/Chorale043']

# Get unique piece/staff pairs
piece_staffs = chorales_dataset[['piece', 'staff']].drop_duplicates().to_numpy()

# Consider only until the fermata in or after the 4th bar
fermata_data = []
for piece, staff in piece_staffs:
    for idx, row in chorales_dataset[(chorales_dataset['piece'] == piece) & (chorales_dataset['staff'] == staff)].iterrows():
        fermata_data.append(row)
        if row['fermata'] and row['mn'] >= 4:
            break
fermata_df = pd.DataFrame(fermata_data)

# Sort by onset and staff for consistent alignment
fermata_df = fermata_df.sort_values(by=['piece', 'mn', 'mn_onset', 'staff']).reset_index(drop=True)

CPU times: user 16.1 s, sys: 293 ms, total: 16.4 s
Wall time: 17.1 s


In [202]:
random_piece = fermata_df['piece'].sample(n=1).item()
random_piece_df = fermata_df[(fermata_df['piece'] == random_piece)]
random_piece_df

Unnamed: 0,piece,mn,mn_onset,timesig,act_dur,staff,voice,duration,nominal_duration,scalar,tied,tpc,midi,fermata,global_onset
1076,BachChorales/Chorale016,0,3/4,4/4,1/4,1,1,1/4,1/4,1,,7,73,False,0
1077,BachChorales/Chorale016,0,3/4,4/4,1/4,2,1,1/4,1/4,1,,6,66,False,0
1078,BachChorales/Chorale016,0,3/4,4/4,1/4,3,1,1/4,1/4,1,,10,58,False,0
1079,BachChorales/Chorale016,0,3/4,4/4,1/4,4,1,1/4,1/4,1,,6,54,False,0
1080,BachChorales/Chorale016,1,0,4/4,1,1,1,1/4,1/4,1,,2,74,False,1/4
1081,BachChorales/Chorale016,1,0,4/4,1,2,1,1/4,1/4,1,,6,66,False,1/4
1082,BachChorales/Chorale016,1,0,4/4,1,3,1,1/4,1/4,1,,5,59,False,1/4
1083,BachChorales/Chorale016,1,0,4/4,1,4,1,1/4,1/4,1,,5,59,False,1/4
1084,BachChorales/Chorale016,1,1/4,4/4,1,1,1,1/4,1/4,1,,7,73,False,1/2
1085,BachChorales/Chorale016,1,1/4,4/4,1,2,1,1/4,1/4,1,,6,66,False,1/2


In [34]:
#average of pitches over time
#discretize pitch into 100 steps
# fermata_df.groupby(['piece','mn','mn_onset',

In [200]:
# Create dictionaries of total piece durations and smallest notes in each piece
total_piece_durations_df = fermata_df.groupby('piece')['global_onset'].max().apply(float)
minimum_note_durations_df = fermata_df.groupby('piece')['duration'].min().apply(float)
total_piece_durations = total_piece_durations_df.to_dict()
minimum_note_durations = minimum_note_durations_df.to_dict()

# Sum durations of notes in each sampling bin
summed_durations_dict = {}
# Average pitches of notes in each sampling bin
averaged_pitches_dict = {}
# Loop
for piece,total_dur in total_piece_durations.items():
    piece_df = fermata_df[fermata_df['piece'] == piece]

    sampling_period = minimum_note_durations[piece]

    bins = np.arange(0, (total_dur+sampling_period+1e-10), sampling_period)
    binned_onsets = pd.cut(piece_df['global_onset'], bins, right = False)
    summed_durations_dict[piece] = piece_df.groupby(binned_onsets)['duration'].sum().apply(float)
    averaged_pitches_dict[piece] = piece_df.groupby(binned_onsets)['midi'].mean()

In [203]:
# print(summed_durations_dict.items())
# print(averaged_pitches_dict.items())


## I. Similarity parameters
### a. Melodic contour