## Initial data transformation and processing

In [19]:
import scipy.io as sio
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.metrics import mean_absolute_error
from math import ceil

import utils as ut
# Re-import the local `utils` module so that edits made to it are picked up
# without restarting the kernel (`reload` is the Python 2 builtin).
reload(ut)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Load the MATLAB file and keep the matrix stored under its 'DATASET' key.
# NOTE(review): the path is relative to the notebook's working directory.
data_array = sio.loadmat('data/DATASET.mat')['DATASET']

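We'll keep only the real-valued HRIR and HRTF series (the first 596 columns) together with the trailing 43 parameter/target columns; the 400 columns in between are dropped.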

In [5]:
# Keep only the real part of every entry and force a float dtype.
# `astype` already returns a fresh array, so the source matrix does not need
# to be duplicated with an explicit `.copy()` first.
data_float = data_array.real.astype(float)
# Array with all variables as float: keep the first 596 columns
# (2 x 200 HRIR + 2 x 98 HRTF samples, as laid out in define_column_names)
# and everything from column 996 onwards; columns 596..995 are discarded.
all_float_array = np.hstack((data_float[:, :596], data_float[:, 996:]))

In [6]:
# Report the shape before and after taking the real part and dropping columns
print('{} {}'.format('original dataset shape:', data_array.shape))
print('{} {}'.format('all floats dataset shape:', all_float_array.shape))

original dataset shape: (65600, 1039)
all floats dataset shape: (65600, 639)


Now we will include feature names:

In [7]:
def define_column_names():
    """Return the ordered list of the 639 column names of the float dataset.

    Layout, in order:
      * 200 left then 200 right HRIR samples  -> 'L_HRIR_<i>', 'R_HRIR_<i>'
      * 98 left then 98 right HRTF samples    -> 'L_HRTF_<i>', 'R_HRTF_<i>'
      * 3 interaction parameters ('ITD', 'IID', 'DS')
      * 17 head / torso parameters
      * 8 ear parameters for the left ear, then the same 8 for the right ear
      * 2 pinna rotation angles per ear
      * the targets 'azimuth' and 'elevation', then 'subject_id'

    Left and right names differ only in their 'L_' / 'R_' prefix, and no name
    carries surrounding whitespace, so columns can be addressed by building
    the name from a side prefix.

    NOTE(review): the 'heigth' spelling used by a few names is left untouched;
    renaming those columns is a wider change that should be coordinated with
    whatever reads the exported data.

    Returns:
        list of str: the column names, in dataset column order.
    """
    column_names = []

    # Time series (HRIR) first, then frequency series (HRTF);
    # within each series the left ear comes before the right ear.
    for series, length in (('HRIR', 200), ('HRTF', 98)):
        for side in ('L', 'R'):
            column_names.extend(
                '{}_{}_{:d}'.format(side, series, i) for i in range(length))

    # Interaction parameters: time-arrival, intensity and spectral difference
    column_names.extend(['ITD', 'IID', 'DS'])

    # Head parameters
    column_names.extend([
        'head_width',
        'head_height',
        'head_depth',
        'pinna_offset_down',
        'pinna_offset_back',
        'neck_width',
        'neck_height',
        'neck_depth',
        'torso_top_width',
        'torso_top_heigth',
        'torso_top_depth',
        'shoulder_width',
        'head_offset_forward',
        'height',
        'seated_height',
        'head_circumference',
        'shoulder_circumference',
    ])

    # Ear parameters: the same measurements for each ear
    ear_parameters = [
        'cavum_concha_heigth',
        'cymba_concha_heigth',
        'cavum_concha_width',
        'fossa_height',
        'pinna_height',
        'pinna_width',
        'intertragal_incisure_width',
        'cavum_concha_depth',
    ]
    for side in ('L', 'R'):
        column_names.extend(side + '_' + name for name in ear_parameters)

    # Pinnae rotation parameters
    for side in ('L', 'R'):
        column_names.extend([side + '_pinna_rotation_angle',
                             side + '_pinna_flare_angle'])

    # Target
    column_names.extend(['azimuth', 'elevation'])

    # Subject Id
    column_names.append('subject_id')
    return column_names

In [8]:
# Build the ordered list of column labels for the float array
column_names = define_column_names()

In [9]:
# Wrap the float matrix in a labelled DataFrame, one column per feature name
df = pd.DataFrame(data=all_float_array, columns=column_names)

In [10]:
# col_ix maps every column label of `df` to its positional index
col_ix = {label: position for position, label in enumerate(df.columns)}

In [11]:
# Sanity check: should match the number of columns in the dataset (639)
len(column_names)

639

The full dataset combines data from two original datasets.

For the moment we will work with the first dataset only.

In [12]:
# Keep rows 0..56249 (the first dataset). `.loc` slices by label and includes
# the end label, so this yields 56250 rows -- the same selection the
# deprecated `.ix` indexer made on the default integer index.
ds_1 = df.loc[:56249].copy()
ds_1.shape

(56250, 639)

In [25]:
# TOTAL Unique Subjects for dataset 1 = 45
# Replace the whole column instead of writing through `.loc[:, ...]`: newer
# pandas treats the `.loc` form as an in-place write that keeps the existing
# float dtype, which would silently undo the integer cast.
ds_1['subject_id'] = ds_1['subject_id'].astype(int)
ds_1.subject_id.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45])

## Include all original features

In [28]:
# Keep every original feature of the first dataset
n = 45

# Per-column means, used to fill in missing entries.
# NOTE(review): the means are taken over all rows and all columns (targets and
# subject_id included) before the train/val/test split -- confirm intended.
means = ds_1.mean(axis=0)
ds_2 = ds_1.fillna(means)

In [29]:
# Filling missing values should leave the dimensions unchanged
ds_2.shape

(56250, 639)

## Include some additional features
We decided to incorporate some additional summary statistics for the series data (min, max and mean values); these could help in cases where subject variation disturbs the series.

In [35]:
# Include all features
ds = ds_2.copy()
n = 45

sides = ['L', 'R']
features = ['HRIR', 'HRTF']
features_length = [200, 98]

# Set the targets and the subject id aside; they are re-attached after the
# summary columns so that they remain the last columns of the frame.
# `drop` returns a new frame, so ds_2 itself is left untouched.
ds_3 = ds_2.drop(['elevation', 'azimuth', 'subject_id'], axis=1)

for feature, length in zip(features, features_length):
    for side in sides:
        feature_name = side + '_' + feature
        # Position of the first sample of this series. The positions stored in
        # col_ix are still valid for ds_3: only trailing columns were dropped
        # and the new summary columns are appended at the end.
        first = col_ix[feature_name + '_0']
        # Purely positional slice; replaces the deprecated `.ix` indexer.
        series = ds_3.iloc[:, first:first + length]
        # Compute all three row-wise summaries before adding any column
        min_summ = series.min(axis=1)
        max_summ = series.max(axis=1)
        mean_summ = series.mean(axis=1)
        ds_3[feature_name + '_min'] = min_summ
        ds_3[feature_name + '_max'] = max_summ
        ds_3[feature_name + '_mean'] = mean_summ

ds_3['elevation'] = ds_2['elevation']
ds_3['azimuth'] = ds_2['azimuth']
ds_3['subject_id'] = ds_2['subject_id']

# Correlations among the 12 summary columns, the two targets and subject_id
ds_3.iloc[:, -15:].corr()

Unnamed: 0,L_HRIR_min,L_HRIR_max,L_HRIR_mean,R_HRIR_min,R_HRIR_max,R_HRIR_mean,L_HRTF_min,L_HRTF_max,L_HRTF_mean,R_HRTF_min,R_HRTF_max,R_HRTF_mean,elevation,azimuth,subject_id
L_HRIR_min,1.0,-0.807259,-0.013108,-0.577965,0.64288,0.303125,-0.671333,-0.815561,-0.824445,0.559807,0.634257,0.676561,-0.108868,0.593286,0.040338
L_HRIR_max,-0.807259,1.0,-0.109835,0.647182,-0.596078,-0.451983,0.609885,0.71448,0.777328,-0.662731,-0.787563,-0.799674,-0.095106,-0.674137,-0.014076
L_HRIR_mean,-0.013108,-0.109835,1.0,0.201306,-0.337961,0.828242,0.220822,0.310865,0.268016,0.003543,0.103978,0.026793,0.18355,-0.073317,-0.02162
R_HRIR_min,-0.577965,0.647182,0.201306,1.0,-0.802116,-0.131083,0.544337,0.633188,0.673452,-0.685272,-0.812686,-0.822949,-0.115765,-0.557336,-0.005584
R_HRIR_max,0.64288,-0.596078,-0.337961,-0.802116,1.0,0.028877,-0.642198,-0.780938,-0.786043,0.627887,0.715531,0.779828,-0.097351,0.617118,0.006112
R_HRIR_mean,0.303125,-0.451983,0.828242,-0.131083,0.028877,1.0,-0.064818,-0.033838,-0.088306,0.289363,0.420687,0.366289,0.172434,0.263761,-0.050304
L_HRTF_min,-0.671333,0.609885,0.220822,0.544337,-0.642198,-0.064818,1.0,0.740019,0.851818,-0.411041,-0.510029,-0.530488,0.045687,-0.500875,-0.028935
L_HRTF_max,-0.815561,0.71448,0.310865,0.633188,-0.780938,-0.033838,0.740019,1.0,0.9291,-0.533124,-0.551752,-0.638753,0.087543,-0.575831,-0.050527
L_HRTF_mean,-0.824445,0.777328,0.268016,0.673452,-0.786043,-0.088306,0.851818,0.9291,1.0,-0.546399,-0.63395,-0.674138,0.044415,-0.609915,-0.018735
R_HRTF_min,0.559807,-0.662731,0.003543,-0.685272,0.627887,0.289363,-0.411041,-0.533124,-0.546399,1.0,0.746285,0.859477,0.052693,0.518223,0.029157


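Most of the new summary features correlate noticeably with azimuth (|r| of roughly 0.5–0.67 in the rows shown above, the HRIR means being the exception), while their correlation with elevation is weak (|r| < 0.2), so they should help us improve our baseline model in this prediction task.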

In [31]:
# Split the enriched dataset into train / validation / test sets with the
# project helper.
# NOTE(review): the split criterion lives in utils.get_splitted_dataset and is
# not visible here -- presumably it splits by subject; confirm.
train, val, test = ut.get_splitted_dataset(ds_3)

In [33]:
# Write .csv for each set
# NOTE(review): `to_csv` is called with its defaults, so the row index is
# written as an extra, unnamed first column of every file.
train.to_csv('data/full_train.csv')
val.to_csv('data/full_val.csv')
test.to_csv('data/full_test.csv')

In [32]:
# Subject ids that ended up in the training split
train['subject_id'].unique()

array([ 2,  3,  4,  5,  6,  8,  9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 22,
       23, 25, 26, 27, 28, 31, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44])