In [60]:
import pandas as pd
import numpy as np

In [61]:
# Read the development and evaluation CSV files, using the `Id` column as index
df, ev = (
    pd.read_csv(csv_path).set_index('Id')
    for csv_path in ('data/development.csv', 'data/evaluation.csv')
)
df.head()

Unnamed: 0_level_0,sampling_rate,age,gender,ethnicity,mean_pitch,max_pitch,min_pitch,jitter,shimmer,energy,zcr_mean,spectral_centroid_mean,tempo,hnr,num_words,num_characters,num_pauses,silence_duration,path
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,22050,24.0,female,arabic,1821.6906,3999.717,145.43066,0.013795,0.082725,0.002254,0.210093,3112.257251,[151.99908088],-123.999726,69,281,39,23.846893,audios_development/1.wav
1,22050,22.5,female,hungarian,1297.8187,3998.859,145.37268,0.025349,0.096242,0.007819,0.078849,1688.016389,[129.19921875],-86.928478,69,281,21,19.388662,audios_development/2.wav
2,22050,22.0,female,portuguese,1332.8524,3998.8025,145.42395,0.019067,0.119456,0.002974,0.105365,2576.901706,[117.45383523],-98.45067,69,281,1,21.640998,audios_development/3.wav
3,22050,22.0,female,english,1430.3499,3998.451,147.98083,0.017004,0.102389,0.022371,0.173701,3269.751413,[117.45383523],-56.459762,69,281,9,19.644127,audios_development/4.wav
4,22050,22.0,male,dutch,1688.7234,3998.6113,145.44772,0.028027,0.124831,0.005369,0.107279,1930.897375,[112.34714674],-80.349204,69,281,11,18.041905,audios_development/5.wav


In [62]:
# `sampling_rate` has the same value for all samples and `path` is not useful
# for the regression task, so both columns are removed from the two datasets
df.drop(columns=['sampling_rate', 'path'], inplace=True)
ev.drop(columns=['sampling_rate', 'path'], inplace=True)

In [63]:
# Tempo is stored as a string in the format '[float]': remove the two square
# brackets from every value, then cast the column to the float data type
df['tempo'] = (
    df['tempo']
    .map(lambda raw: raw.replace('[', '').replace(']', ''))
    .astype('float')
)
ev['tempo'] = (
    ev['tempo']
    .map(lambda raw: raw.replace('[', '').replace(']', ''))
    .astype('float')
)
df['tempo'].dtype, ev['tempo'].dtype

(dtype('float64'), dtype('float64'))

In [64]:
# We keep only the `igbo` ethnicity, since it is the only ethnicity which is
# significantly present in both the development and the evaluation dataset.
# It becomes a 0/1 indicator column and the original `ethnicity` column is dropped.

df['igbo'] = df['ethnicity'].eq('igbo').astype('int64')
df.drop(columns='ethnicity', inplace=True)

ev['igbo'] = ev['ethnicity'].eq('igbo').astype('int64')
ev.drop(columns='ethnicity', inplace=True)

In [65]:
# Count how many times each gender label occurs in the evaluation dataset
ev.loc[:, 'gender'].value_counts()

gender
male      393
female    297
famale      1
Name: count, dtype: int64

In [66]:
# Numeric code assigned to each gender label
gender = {'male': 1, 'female': 0}

# We correct the misspelled 'famale' label.
# `Series.replace` rewrites only the listed value and leaves all the other
# labels untouched; `Series.map` with a dict would instead turn every label
# that is not a key of the dict ('male' and 'female') into NaN.
ev['gender'] = ev['gender'].replace({'famale': 'female'})

# We encode the gender as a binary variable (male -> 1, female -> 0)
df['gender'] = df['gender'].map(gender)
ev['gender'] = ev['gender'].map(gender)

In [67]:
# Most of the speakers pronounce the same sentence, so we define a binary
# feature which indicates whether the number of words is 69
df['standard_sentence'] = df['num_words'].eq(69).astype('int')
ev['standard_sentence'] = ev['num_words'].eq(69).astype('int')

In [68]:
# min_pitch and max_pitch are dropped since their values are heavily
# centered around the same value
df.drop(columns=['min_pitch', 'max_pitch'], inplace=True)
ev.drop(columns=['min_pitch', 'max_pitch'], inplace=True)

In [69]:
# The energy is replaced by its natural logarithm to obtain a greater spread of values
df['energy'] = df['energy'].transform(np.log)
ev['energy'] = ev['energy'].transform(np.log)

In [70]:
# Write the processed development and evaluation datasets to the `proc` folder
df.to_csv(path_or_buf='proc/development.csv')
ev.to_csv(path_or_buf='proc/evaluation.csv')