# Composer Classification EDA

In [None]:
import pandas as pd
from mido import MidiFile
import numpy as np
import pretty_midi
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ProcessPoolExecutor, as_completed
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from mido import KeySignatureError
import logging

# nn libraries
import torch
import torch.nn as nn
import torch.optim as optim

# New extraction method that should get more data
Hopefully.

In [None]:
# Root directory of the MIDI dataset; composer folders (e.g. 'Chopin') are looked up inside it.
main_dir = 'midiclassics'

In [None]:
# extract the midi file features
def extract_midi_features(file_path, max_sequence_length=300):
    """Load one MIDI file and extract summary features plus a notes matrix.

    Args:
        file_path: Path of the MIDI file, handed to ``pretty_midi.PrettyMIDI``.
        max_sequence_length: Number of time slices (rows) of the ``notes``
            matrix. The whole piece is rescaled to this many slices.

    Returns:
        A dict with the keys ``'tempo'``, ``'key_signatures'``,
        ``'time_signatures'``, ``'instrument_types'``, ``'notes_histogram'``
        (a list) and ``'notes'`` (a ``(max_sequence_length, 128)`` array
        holding ``velocity / 127``), or ``None`` when the file could not be
        processed (the failure is printed, not raised).
    """
    # Deliberately broad: one unreadable file must not abort a whole batch run.
    try:
        start_time = time.perf_counter()
        midi_data = pretty_midi.PrettyMIDI(file_path)
        tempo = midi_data.estimate_tempo()
        key_signatures = [key.key_number for key in midi_data.key_signature_changes]
        # Loop variable is not called ``time`` so it cannot be confused with the module.
        time_signatures = [
            (signature.numerator, signature.denominator)
            for signature in midi_data.time_signature_changes
        ]
        instrument_types = [instr.program for instr in midi_data.instruments]
        notes_histogram = midi_data.get_pitch_class_histogram()

        notes = np.zeros((max_sequence_length, 128))
        # The end time is loop-invariant; look it up once instead of twice per note.
        total_time = midi_data.get_end_time()
        if total_time > 0 and max_sequence_length > 0:
            last_slice = max_sequence_length - 1
            for instrument in midi_data.instruments:
                for note in instrument.notes:
                    # Clamp so a note starting exactly at the end still lands in the matrix.
                    start = min(int(note.start * max_sequence_length / total_time), last_slice)
                    # Guarantee at least one slice so very short notes are not dropped.
                    end = max(int(note.end * max_sequence_length / total_time), start + 1)
                    notes[start:end, note.pitch] = note.velocity / 127

        end_time = time.perf_counter()
        print(f"Processed {file_path} in {end_time - start_time:.2f} seconds")
        return {
            'tempo': tempo,
            'key_signatures': key_signatures,
            'time_signatures': time_signatures,
            'instrument_types': instrument_types,
            'notes_histogram': notes_histogram.tolist(),
            'notes': notes
        }
    except Exception as e:
        print(f"Failed to process {file_path}: {e}")
        return None

In [None]:
# Function to get MIDI data from the Chopin folder
def get_chopin_midi_data(main_dir, max_sequence_length=300):
    """Extract features for every MIDI file in the ``Chopin`` sub-folder.

    Args:
        main_dir: Directory that contains the ``Chopin`` folder.
        max_sequence_length: Forwarded to ``extract_midi_features``.

    Returns:
        ``{'Chopin': DataFrame}`` with one row per successfully processed
        file (the feature columns plus a ``'file'`` column), or an empty dict
        when the folder does not exist.
    """
    chopin_folder = 'Chopin'
    folder_path = os.path.join(main_dir, chopin_folder)

    if not os.path.isdir(folder_path):
        print(f"The folder {chopin_folder} does not exist in the directory {main_dir}.")
        return {}

    # Compared against the lower-cased name so '.MID', '.Mid', '.MIDI', ... all match.
    midi_extensions = ('.mid', '.midi')
    data = []
    # os.listdir order is arbitrary; sorting keeps row positions reproducible
    # for code that later indexes pieces by position.
    for file in sorted(os.listdir(folder_path)):
        if not file.lower().endswith(midi_extensions):
            continue
        file_path = os.path.join(folder_path, file)
        features = extract_midi_features(file_path, max_sequence_length)
        if features is not None:
            features['file'] = file
            data.append(features)

    return {chopin_folder: pd.DataFrame(data)}

In [None]:
# Extract the features of every Chopin MIDI file and report how long it took.
# perf_counter() is monotonic, so the elapsed time cannot be distorted by
# system clock adjustments during a long run.
start_time = time.perf_counter()
chopin_data = get_chopin_midi_data(main_dir)
end_time = time.perf_counter()
print(f"Total time to process all files: {end_time - start_time:.2f} seconds")

In [None]:
# Shape of the Chopin feature table: (rows = processed files, columns = features)
chopin_data['Chopin'].shape

In [None]:
# Display the whole result: a dict mapping the composer folder name to its feature DataFrame
chopin_data 

In [None]:
# Display the Chopin feature table; chopin_data is a dictionary and its 'Chopin' entry is a DataFrame
chopin_data['Chopin']

# ChatGPT explanation of the columns


___

### `Tempo`

Description: This represents the estimated tempo (beats per minute) of the MIDI file. Tempo is a crucial aspect of a musical piece as it dictates the speed at which the music is played.

___

### `Key Signatures`

Description: These are the musical keys in which sections of the MIDI file are written. A key signature indicates the set of notes that are generally used in the piece, which provides a sense of tonality.


___

### `Time Signatures`

Description: This indicates the time signature changes in the MIDI file. A time signature defines the number of beats in each measure and the note value that represents one beat.


___

### `Instrument Types`

Description: This represents the different types of instruments used in the MIDI file. Each instrument is identified by a program number according to the General MIDI specification.


___

### `Notes Histogram`

Description: This is a histogram of the pitch classes (notes) used in the MIDI file. It provides a frequency distribution of each pitch class (C, C#, D, etc.) over the entire piece.


___

### `Notes`

Description: This is a matrix representing the notes played in the MIDI file over time. Each row corresponds to a time slice (the whole piece is rescaled to a fixed number of slices, 300 by default), and each column corresponds to a MIDI pitch (from 0 to 127). The value is the velocity (intensity) of the note divided by 127, so it lies between 0 and 1.


In [None]:
# Printing the notes histogram stored under index label 0
chopin_data['Chopin']['notes_histogram'][0]

In [None]:
# Notes matrix (time slices x 128 pitches) stored under index label 0
chopin_data['Chopin']['notes'][0]

In [None]:
# Instrument program numbers stored under index label 1
chopin_data['Chopin']['instrument_types'][1]

In [None]:
# Summary statistics of the estimated tempo across all processed pieces
tempo_data = chopin_data['Chopin']['tempo']
tempo_data.describe() 

In [None]:
# Smallest and largest "width" of the notes matrices, where the width is the
# distance (in time slices) between the first and the last slice that
# contains at least one sounding note.
notes_data = chopin_data['Chopin']['notes']


def _active_row_span(matrix):
    """Return last-minus-first active row index of *matrix*, or 0 if it is silent."""
    # flatnonzero returns the active row indices in ascending order, so the
    # first and last entries are the min and max; computed once per matrix.
    active_rows = np.flatnonzero(matrix.any(axis=1))
    if active_rows.size == 0:
        # An all-zero matrix has no active rows; report 0 instead of failing.
        return 0
    return active_rows[-1] - active_rows[0]


notes_widths = np.array([_active_row_span(notes) for notes in notes_data])
notes_widths.min(), notes_widths.max()

## Running some basic EDA to see how the data looks and what we should do with it

In [None]:
# Histogram of the estimated tempo of every piece (missing values removed).
tempos = chopin_data['Chopin']['tempo'].dropna()
ax = plt.gca()
ax.hist(tempos, bins=20, edgecolor='black')
ax.set(title='Tempo Distribution', xlabel='Tempo (BPM)', ylabel='Frequency')
plt.show()

In [None]:
# Distribution of the key signatures found across all pieces.
key_signatures = chopin_data['Chopin']['key_signatures'].explode().dropna()
# pretty_midi key numbers run from 0 to 23 (0-11 major, 12-23 minor), so 25
# bin edges are needed to give each of the 24 keys its own bin.
plt.hist(key_signatures, bins=range(25), edgecolor='black')
plt.title('Key Signature Distribution')
plt.xlabel('Key Signature (MIDI Number)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Bar chart of how often each time signature occurs, labelled as "numerator/denominator".
signature_pairs = chopin_data['Chopin']['time_signatures'].explode().dropna()
time_signatures = signature_pairs.apply(lambda pair: f"{pair[0]}/{pair[1]}")
ax = time_signatures.value_counts().plot.bar()
ax.set(title='Time Signature Distribution', xlabel='Time Signature', ylabel='Frequency')
plt.show()

In [None]:
# Histogram of instrument program numbers, one bin per program (edges 0..128).
instrument_types = chopin_data['Chopin']['instrument_types'].explode().dropna()
ax = plt.gca()
ax.hist(instrument_types, bins=range(129), edgecolor='black')
ax.set(
    title='Instrument Types Distribution',
    xlabel='Instrument Program Number',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Average pitch-class histogram over all pieces.
# Each cell of the column holds a 12-element list, so the lists are stacked
# into a (pieces, 12) array; exploding the column would flatten them into
# scalars and collapse the average into a single number.
notes_histogram = chopin_data['Chopin']['notes_histogram'].dropna()
pitch_class_counts = np.array(notes_histogram.tolist(), dtype=float).reshape(-1, 12)
# Normalize every piece to sum to 1 so long pieces do not dominate the
# average and the y axis really shows a normalized frequency. Pieces with an
# all-zero histogram are left at zero instead of dividing by zero.
piece_totals = pitch_class_counts.sum(axis=1, keepdims=True)
normalized_counts = np.divide(
    pitch_class_counts,
    piece_totals,
    out=np.zeros_like(pitch_class_counts),
    where=piece_totals > 0,
)
avg_notes_histogram = normalized_counts.mean(axis=0)
plt.bar(range(12), avg_notes_histogram)
plt.title('Average Pitch Class Histogram')
plt.xlabel('Pitch Class')
plt.ylabel('Normalized Frequency')
plt.xticks(range(12), ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'])
plt.show()

In [None]:
# Heat map of the notes matrix of the piece at position 0 (time on x, pitch on y).
specific_piece_notes = chopin_data['Chopin']['notes'].iloc[0]

image = plt.imshow(specific_piece_notes.T, aspect='auto', origin='lower', cmap='viridis')
image.axes.set(title='Notes Matrix', xlabel='Time (normalized)', ylabel='MIDI Pitch')
plt.colorbar(image, label='Velocity')
plt.show()

In [None]:
# Heat map of the notes matrix of the piece at position 3 (time on x, pitch on y).
specific_piece_notes = chopin_data['Chopin']['notes'].iloc[3]

image = plt.imshow(specific_piece_notes.T, aspect='auto', origin='lower', cmap='viridis')
image.axes.set(title='Notes Matrix', xlabel='Time (normalized)', ylabel='MIDI Pitch')
plt.colorbar(image, label='Velocity')
plt.show()

In [None]:
from collections import defaultdict


def pitch_class_histogram_over_time(midi_data, max_sequence_length=300):
    """Accumulate note velocities per pitch class for each time slice.

    Args:
        midi_data: Object exposing ``instruments`` (each with ``notes`` that
            have ``start``, ``pitch`` and ``velocity``) and ``get_end_time()``,
            e.g. a ``pretty_midi.PrettyMIDI`` instance.
        max_sequence_length: Number of time slices the piece is rescaled to.

    Returns:
        A ``(max_sequence_length, 12)`` array; entry ``[t, c]`` is the sum of
        the velocities of all notes of pitch class ``c`` that start in slice
        ``t``. All zeros when the piece has no duration.
    """
    histogram_matrix = np.zeros((max_sequence_length, 12))

    # The end time is loop-invariant; look it up once instead of once per note.
    total_time = midi_data.get_end_time()
    if total_time <= 0 or max_sequence_length <= 0:
        # Nothing to place on a time axis (and avoids dividing by zero).
        return histogram_matrix

    last_slice = max_sequence_length - 1
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            # Clamp so a note starting exactly at the end time stays in range.
            time_slice = min(int(note.start * max_sequence_length / total_time), last_slice)
            histogram_matrix[time_slice, note.pitch % 12] += note.velocity

    return histogram_matrix

In [None]:
# Load one Chopin prelude and compute its pitch-class histogram over time.
# The path is assembled with os.path.join so it works on every platform and
# contains no backslash sequences that Python would treat as string escapes.
midi_data = pretty_midi.PrettyMIDI(
    os.path.join(main_dir, 'Chopin', '(2542)Prelude opus.28, No.16 in B flat minor.mid')
)
histogram_matrix = pitch_class_histogram_over_time(midi_data)

In [None]:
# Heat map of the pitch-class histogram over time. The matrix is transposed
# for display, so time runs along the x axis and the 12 pitch classes along
# the y axis.
plt.figure(figsize=(12, 8))
plt.imshow(histogram_matrix.T, aspect='auto', origin='lower', cmap='viridis')
plt.title('Pitch Class Histogram Over Time')
plt.xlabel('Time (normalized)')
plt.ylabel('Pitch Class')
plt.colorbar(label='Velocity')
# The pitch-class names label the y axis, which is the pitch-class axis here.
plt.yticks(range(12), ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'])
plt.show()

In [None]:
from collections import Counter


def extract_chords(midi_data, time_step=0.5):
    """Sample the pitches sounding at regular instants of a piece.

    Args:
        midi_data: Object exposing ``instruments`` (each with ``notes`` that
            have ``start``, ``end`` and ``pitch``) and ``get_end_time()``.
        time_step: Spacing, in seconds, between two sampling instants.

    Returns:
        A list with one ``Counter`` per sampling instant ``t`` in
        ``[0, end_time)``; it counts every pitch whose note satisfies
        ``start <= t < end``, across all instruments.
    """
    sample_times = np.arange(0, midi_data.get_end_time(), time_step)
    return [
        Counter(
            note.pitch
            for instrument in midi_data.instruments
            for note in instrument.notes
            if note.start <= instant < note.end
        )
        for instant in sample_times
    ]

# One sorted list of (pitch, count) pairs per sampling step.
chords = [sorted(chord.items()) for chord in extract_chords(midi_data)]

# Plot chord progression: every sounding pitch becomes a horizontal segment
# one step wide, drawn thicker when several notes share that pitch.
plt.figure(figsize=(12, 8))
for step, chord in enumerate(chords):
    segment = [step, step + 1]
    for pitch, multiplicity in chord:
        plt.plot(segment, [pitch, pitch], color='black', linewidth=multiplicity)
plt.title('Chord Progression')
plt.xlabel('Time (steps)')
plt.ylabel('MIDI Pitch')
plt.show()

### What I think we should try feeding into our model...

`Tempo:` The overall tempo of the piece provides information about the speed at which the piece is played.

`Key Signatures:` The key signatures used throughout the piece can give insight into the tonality and harmonic structure.

`Time Signatures:` The time signatures indicate the rhythmic structure of the piece.

`Instrument Types:` The types of instruments used can be characteristic of a composer's style.

`Notes Histogram:` A histogram of the pitch classes (notes) used in the piece can provide information about the melodic and harmonic content.

`Notes Matrix:` The detailed matrix representing which notes are played over time and their velocities.

`Rhythmic Features:` Extract features such as note density, average note duration, and rhythmic patterns.

`Melodic Intervals:` The distribution of melodic intervals (differences in pitch between consecutive notes) can be indicative of a composer's style.

`Chord Progressions:` The sequence of chords used throughout the piece.