# Machine Learning Project: Key Detection
Authors: Zach Hayes, Matt Gaetano, Max Ivry

Course: CS3540


In [1]:
import pandas as pd
import numpy as np
from scipy.io import wavfile
from scipy.fft import rfft, rfftfreq


### Data Cleaning and Feature Engineering Pipeline
Data cleaning occurred earlier in the process; see clean.ipynb.

In [37]:
def features_from_audio(df) -> pd.DataFrame:
    """
    'Join' the existing dataframe with the audio files via
    mapping to frequency features.

    Each row of ``df`` names a WAV file (column "location", resolved
    relative to "dataset/") and its label (column "key"). Every file is
    zero-padded to the length of the longest one, transformed with a real
    FFT, and the magnitude spectrum becomes that row's feature vector.

    Args:
        df: Metadata frame with a "location" and a "key" column.

    Returns:
        A dataframe with one row per row of ``df`` (same order): one
        column per frequency bin, followed by a "target" column holding
        the key. Feature columns stay float64 and the target keeps the
        dtype of ``df["key"]``. An empty ``df`` yields an empty frame that
        only has the "target" column, since no bins can be derived.
    """
    locations = df["location"].to_numpy()
    # Take keys positionally instead of re-filtering df for every file:
    # that lookup was O(n) per row and returned several keys whenever a
    # location appeared more than once, producing ragged rows.
    keys = df["key"].to_numpy()

    if len(locations) == 0:
        # Nothing to read: avoid dividing by the zero placeholder rate.
        return pd.DataFrame(columns=["target"])

    # Determine the size for all instances (largest number of samples)
    longest = 0
    rate = 0
    for loc in locations:
        rate, data = wavfile.read("dataset/" + loc)
        longest = max(longest, len(data))

    # NOTE: the bins are labelled with the sample rate of the last file
    # read; this assumes every file shares one sample rate.
    freq_bins = rfftfreq(longest, 1 / rate)

    # Fill a preallocated matrix row by row rather than growing arrays
    # with np.append, which copies the accumulated data on every call.
    features = np.empty((len(locations), len(freq_bins)))
    for row, loc in enumerate(locations):
        _, data = wavfile.read("dataset/" + loc)
        # Trailing 0s to achieve the same length as the longest instance
        padded = np.concatenate((data, np.zeros(longest - len(data))))
        features[row] = np.abs(rfft(padded))

    # Same labels as before: appending "target" makes NumPy coerce the
    # bin frequencies to a common dtype with the string label.
    cols = np.append(freq_bins, "target")
    result = pd.DataFrame(data=features, columns=cols[:-1])
    # Attach the labels separately so a string key does not force the
    # numeric feature columns to object dtype.
    result["target"] = keys
    return result

In [38]:
def df_pipeline() -> pd.DataFrame:
    """
    Build the working dataset from the audio files and their metadata.

    Reads the line-delimited JSON metadata at "dataset/metadata.json" and
    hands it to features_from_audio, returning whatever that produces.
    """
    metadata = pd.read_json("dataset/metadata.json", lines=True)
    return features_from_audio(metadata)

# Run the pipeline once at notebook scope and keep the result in `df`.
df = df_pipeline()
# Show a preview of the first rows followed by the (rows, columns) shape.
print(df.head(), df.shape, sep="\n")
# Persist the feature table as line-delimited JSON, one record per row.
df.to_json(path_or_buf="data.json", orient="records", lines=True)

: 

### Support Vector Machine Approach