# Dataset Creation Notebook (Skip if using pre-loaded dataset)

In [None]:
import os
import pandas as pd

import numpy as np
from scipy import interpolate

import torch

# library for extracting short-term features from audio
from pyAudioAnalysis import ShortTermFeatures as aF
from pyAudioAnalysis import audioBasicIO as aIO 

# to save dataset and weights
import pickle

## Relevant Files

In [2]:
def _list_data_files(folder):
    """Return the sorted paths of the data files found directly in `folder`.

    Entries whose name contains ".ipynb" or ".DS_Store" are skipped; each kept
    entry is returned joined with `folder`.
    """
    kept = []
    for entry in os.listdir(folder):
        if ".ipynb" in entry or ".DS_Store" in entry:
            continue
        kept.append(os.path.join(folder, entry))
    return sorted(kept)


# Both lists are sorted by path; later cells walk them by position.
lms_files = _list_data_files("sequences")
mp3_files = _list_data_files("audio")

## Create Target Data

In [3]:
CHANNEL_COLUMNS = ["unit", "circuit", "name", "color", "length"]
EFFECT_COLUMNS = ["unit", "circuit", "type", "start", "end", "start_intensity", "end_intensity"]
BLOCK_CLOSERS = ("</channel>", "</timingGrids>")


def _read_blocks(path):
    """Collect the channel and timing-grid blocks of one sequence file.

    A block starts on a line that contains ``<channel name=`` (or equals
    ``<timingGrids>``) and runs through the next ``</channel>`` or
    ``</timingGrids>`` line, both ends included. Every line is stripped of
    surrounding whitespace.

    Args:
        path: path of the sequence file to read.

    Returns:
        A list of blocks in file order; each block is a list of stripped lines.

    Raises:
        ValueError: if a block is opened but no closing line follows it.
    """
    # The context manager closes the handle even when parsing fails later on.
    with open(path) as handle:
        stripped = [raw.strip() for raw in handle]

    blocks = []
    for first, opener in enumerate(stripped):
        if "<channel name=" not in opener and opener != "<timingGrids>":
            continue
        block = []
        for pos in range(first, len(stripped)):
            block.append(stripped[pos])
            if stripped[pos] in BLOCK_CLOSERS:
                break
        else:
            # Reaching the end of the file without a closer means a malformed file.
            raise ValueError("{}: block opened on line {} is never closed".format(path, first + 1))
        blocks.append(block)
    return blocks


def _parse_channels(channel_blocks):
    """Build the channel table and the effect table from channel blocks.

    Each line is split on double quotes, so attribute values sit at the odd
    positions of the resulting list. The first line of a block is the channel
    header, the last line is its closing tag, and every line in between is one
    effect.

    Args:
        channel_blocks: blocks as returned by `_read_blocks`, without the
            timing-grid block.

    Returns:
        A ``(channel_df, sequence_df)`` pair with the columns listed in
        CHANNEL_COLUMNS and EFFECT_COLUMNS.
    """
    channel_rows, effect_rows = [], []
    for block in channel_blocks:
        header = block[0].split('"')
        unit, circuit = int(header[9]), int(header[11])
        channel_rows.append([unit, circuit, header[1], int(header[3]), int(header[5])])

        for effect_line in block[1:-1]:
            fields = effect_line.split('"')
            effect_type = 0 if fields[1] == "intensity" else 1
            start_intensity = int(fields[7])
            # Effects with a single intensity value hold it for their whole span.
            end_intensity = int(fields[9]) if len(fields) > 9 else start_intensity
            effect_rows.append([unit, circuit, effect_type, int(fields[3]), int(fields[5]),
                                start_intensity, end_intensity])

    # Build each frame once instead of growing it one row at a time.
    return (pd.DataFrame(channel_rows, columns=CHANNEL_COLUMNS),
            pd.DataFrame(effect_rows, columns=EFFECT_COLUMNS))


def _render_channel(effects, length):
    """Expand the sparse effects of one channel into a dense intensity array.

    Intensity is interpolated linearly between the start and end of each
    effect. Time ``t`` is stored at index ``t - 1``; samples that would fall
    outside ``[0, length)`` are dropped instead of raising a shape error.
    Later effects overwrite earlier ones where they overlap.

    Args:
        effects: rows of the effect table belonging to a single channel.
        length: number of samples in the returned array.

    Returns:
        A float array of shape ``(length,)``, zero wherever no effect is active.
    """
    seq = np.zeros(length)
    spans = effects[["start", "end", "start_intensity", "end_intensity"]]
    for start, end, start_intensity, end_intensity in spans.itertuples(index=False, name=None):
        start, end = int(start), int(end)
        ramp = interpolate.interp1d([start, end], [start_intensity, end_intensity])
        values = ramp(np.arange(start, end + 1, 1))

        offset = start - 1
        lo, hi = max(offset, 0), min(end, length)
        if lo < hi:
            seq[lo:hi] = values[lo - offset:hi - offset]
    return seq


dense_seqs = {}
file_lengths = []
for lms_path in lms_files:
    sequences = _read_blocks(lms_path)

    # The last block is taken to be the timing grid; its first two and last two
    # lines are skipped and the rest are read as grid entries.
    timing_grid = sequences[-1][2:-2]
    trim_sequences = sequences[:-1]
    timing_df = pd.DataFrame([entry.split('"')[1] for entry in timing_grid], columns=["time"])

    channel_df, sequence_df = _parse_channels(trim_sequences)

    # The first channel's length is used as the length of the whole file.
    length = int(channel_df.iloc[0]["length"])
    elapsed = sum(file_lengths)  # samples contributed by the files processed so far
    file_lengths.append(length)

    rendered = set()
    for unit in np.unique(sequence_df["unit"]):
        unit_effects = sequence_df[sequence_df["unit"] == unit]
        unit_seqs = dense_seqs.setdefault(int(unit), {})

        for circuit in np.unique(unit_effects["circuit"]):
            seq = _render_channel(unit_effects[unit_effects["circuit"] == circuit], length)

            circuit_key = int(circuit)
            previous = unit_seqs.get(circuit_key)
            if previous is None:
                # First appearance of this channel: back-fill the earlier files
                # with zeros so every channel stays aligned file by file.
                previous = np.zeros(elapsed)
            unit_seqs[circuit_key] = np.append(previous, seq)
            rendered.add((int(unit), circuit_key))

    # Channels known from earlier files but without effects in this one are
    # extended with zeros so that all dense sequences keep the same length.
    for unit_key, unit_seqs in dense_seqs.items():
        for circuit_key in list(unit_seqs):
            if (unit_key, circuit_key) not in rendered:
                unit_seqs[circuit_key] = np.append(unit_seqs[circuit_key], np.zeros(length))

## Create Full Input Data

In [5]:
# Frame length and hop size in seconds (10 ms frames, no overlap).
# NOTE(review): presumably chosen so that one feature frame lines up with one
# step of the target sequences — confirm.
window, step = 0.01, 0.01

# Number of feature rows feature_extraction is expected to return per frame.
# NOTE(review): presumably the short-term features plus their deltas — confirm
# against the installed pyAudioAnalysis version.
n_features = 68

# Audio files and sequence lengths are paired by position, so they must match.
if len(mp3_files) != len(file_lengths):
    raise ValueError("found {} audio files but {} sequence lengths".format(len(mp3_files), len(file_lengths)))

# Zero-initialised so that any frame not covered by audio is silent, not garbage.
full_signal = np.zeros([int(np.sum(file_lengths)), n_features])
arr_idx = 0
for mp3_path, n_frames in zip(mp3_files, file_lengths):
    frame_rate, signal = aIO.read_audio_file(mp3_path)
    signal = aIO.stereo_to_mono(signal)

    features, feature_names = aF.feature_extraction(signal, frame_rate,
                                                    int(frame_rate * window),
                                                    int(frame_rate * step))

    # Keep at most n_frames frames and put frames on the row axis.
    features = features[:, :n_frames].transpose()

    # If the audio is shorter than the sequence, only the available frames are
    # written; the rest of this file's slot stays zero.
    full_signal[arr_idx:arr_idx + features.shape[0]] = features
    arr_idx += n_frames

In [6]:
# Row count of the stacked audio-feature matrix (one row per feature frame).
full_signal.shape[0]

220247

In [7]:
# Sample count of the dense target for unit 1, circuit 1; it should equal the
# row count of the input feature matrix.
dense_seqs[1][1].shape[0]

220247

## Create Torch Dataset

In [2]:
from LOR_dataset import LOR_Dataset

In [10]:
# Bundle the audio features, the dense targets, the channel table and the
# per-file lengths into a single dataset object.
# NOTE(review): channel_df is rebuilt for every sequence file in the target-data
# cell, so only the last file's channels are passed here — confirm this is intended.
dataset = LOR_Dataset(
    full_signal,
    dense_seqs,
    channel_df,
    file_lengths,
)

## Save Dataset for Later


In [12]:
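# Disabled by default so that re-running the notebook does not overwrite a saved dataset.
# Uncomment to persist the dataset (the `datasets/` directory must already exist).
# with open('datasets/dataset_1.pickle', 'wb') as file:
#     pickle.dump(dataset, file)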