## Imports and File Reads

In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set_theme(style='whitegrid', palette='deep')

import pretty_midi as pm

import tensorflow as tf
import torch as pt

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)

In [38]:
import os

def get_size(start_path):
    total_size = 0
    file_count = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            if f.endswith('.mid'):
                fp = os.path.join(dirpath, f)
                total_size += os.path.getsize(fp)
                file_count += 1
    return total_size, file_count

def human_readable_size(size, decimal_places=2):
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size < 1024.0:
            break
        size /= 1024.0
    return f"{size:.{decimal_places}f} {unit}"

root_dir = '../aai-511_group1/midiclassics'

total_size, file_count = get_size(root_dir)

print(f"Total number of MIDI files: {file_count}")
print(f"Total size of MIDI files: {human_readable_size(total_size)}")

# Print breakdown by composer
print("\nBreakdown by composer:")
for composer in os.listdir(root_dir):
    composer_dir = os.path.join(root_dir, composer)
    if os.path.isdir(composer_dir):
        composer_size, composer_file_count = get_size(composer_dir)
        print(f"  {composer}: {human_readable_size(composer_size)} ({composer_file_count} files)")

Total number of MIDI files: 1530
Total size of MIDI files: 38.56 MB

Breakdown by composer:
  Mozart: 10.58 MB (257 files)
  Chopin: 2.83 MB (136 files)
  Beethoven: 13.25 MB (212 files)
  Bach: 11.89 MB (925 files)


In [39]:
composers = ['bach', 'beethoven', 'chopin', 'mozart']

def midi_to_dataframe(midi_path):
    try:
        # Read the MIDI file
        midi = pm.PrettyMIDI(midi_path)
        data = []
        
        # Collect notes, control changes, and pitch bends
        for i, instrument in enumerate(midi.instruments):
            for note in instrument.notes:
                data.append([i, 'note', note.start, note.end, note.pitch, note.velocity])
            for control_change in instrument.control_changes:
                data.append([i, 'control_change', control_change.time, control_change.number, control_change.value])
            for pitch_bend in instrument.pitch_bends:
                data.append([i, 'pitch_bend', pitch_bend.time, pitch_bend.pitch])
        
        # Collect tempo changes from the first track only
        if i == 0:
            for tempo_change in midi.get_tempo_changes():
                data.append([0, 'tempo_change', tempo_change[0], tempo_change[1]])
        
        # Create a dataframe from the data
        df = pd.DataFrame(data, columns=['track', 'type', 'start', 'end', 'pitch_or_number', 'velocity_or_value'])
        return df
    except Exception as e:
        print(f"Error processing {midi_path}: {e}")
        return pd.DataFrame()

composer_dataframes = {}

for composer in composers:
    folder_path = os.path.join(root_dir, composer)
    if os.path.exists(folder_path):
        all_midi_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.mid')]
        composer_dfs = [midi_to_dataframe(midi_file) for midi_file in all_midi_files]
        # Filter out empty dataframes resulting from errors
        composer_dfs = [df for df in composer_dfs if not df.empty]
        if composer_dfs:
            composer_dataframes[composer] = pd.concat(composer_dfs, ignore_index=True)

# getting each composer's dataframe
bach_df = composer_dataframes.get('bach', pd.DataFrame())
beethoven_df = composer_dataframes.get('beethoven', pd.DataFrame())
chopin_df = composer_dataframes.get('chopin', pd.DataFrame())
mozart_df = composer_dataframes.get('mozart', pd.DataFrame())


Error processing ../aai-511_group1/midiclassics/beethoven/Sonatina op33 4mov.mid: index 1 is out of bounds for axis 0 with size 1
Error processing ../aai-511_group1/midiclassics/beethoven/Sonata Presto.mid: index 1 is out of bounds for axis 0 with size 1
Error processing ../aai-511_group1/midiclassics/beethoven/Piano Sonatina No.2 Op 49.mid: index 1 is out of bounds for axis 0 with size 1
Error processing ../aai-511_group1/midiclassics/beethoven/Op33 No.4.mid: index 1 is out of bounds for axis 0 with size 1
Error processing ../aai-511_group1/midiclassics/beethoven/Piano Sonata No.27.mid: index 1 is out of bounds for axis 0 with size 1
Error processing ../aai-511_group1/midiclassics/beethoven/Op.51.mid: index 1 is out of bounds for axis 0 with size 1
Error processing ../aai-511_group1/midiclassics/beethoven/Rage over a lost pennny.mid: index 1 is out of bounds for axis 0 with size 1
Error processing ../aai-511_group1/midiclassics/beethoven/Sonatina In C.mid: index 1 is out of bounds for

In [40]:
bach_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350047 entries, 0 to 350046
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   track              350047 non-null  int64  
 1   type               350047 non-null  object 
 2   start              350047 non-null  float64
 3   end                350047 non-null  float64
 4   pitch_or_number    349905 non-null  float64
 5   velocity_or_value  339640 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 16.0+ MB


In [41]:
beethoven_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 939153 entries, 0 to 939152
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   track              939153 non-null  int64  
 1   type               939153 non-null  object 
 2   start              939153 non-null  float64
 3   end                939153 non-null  float64
 4   pitch_or_number    938908 non-null  float64
 5   velocity_or_value  653575 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 43.0+ MB


In [42]:
chopin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384568 entries, 0 to 384567
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   track              384568 non-null  int64  
 1   type               384568 non-null  object 
 2   start              384568 non-null  float64
 3   end                384568 non-null  float64
 4   pitch_or_number    384455 non-null  float64
 5   velocity_or_value  300162 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 17.6+ MB


In [43]:
mozart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386935 entries, 0 to 386934
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   track              386935 non-null  int64  
 1   type               386935 non-null  object 
 2   start              386935 non-null  float64
 3   end                386935 non-null  float64
 4   pitch_or_number    386885 non-null  float64
 5   velocity_or_value  343864 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 17.7+ MB


#### References
1. https://machinelearningmastery.com/cnn-long-short-term-memory-networks/
2. https://colinraffel.com/publications/ismir2014intuitive.pdf
3. 
4. 
5. 
6. 
7. https://mido.readthedocs.io/en/latest/resources.html
8. https://www.youtube.com/playlist?list=PLTb0GHZilEMirahR6_o7ZWzO27NclwMqK