In [24]:
import os

import numpy as np
import pandas as pd

In [None]:
# Load the development and evaluation tables, using the 'Id' column as the
# row index of each frame.
df, ev = (
    pd.read_csv(csv_path).set_index('Id')
    for csv_path in ('data/development.csv', 'data/evaluation.csv')
)
# Preview the first rows of the development set.
df.head()

In [26]:
# Remove two columns from both datasets:
#   - 'sampling_rate' has the same value for every sample;
#   - 'path' is not useful for the regression task.
for frame in (df, ev):
    frame.drop(columns=['sampling_rate', 'path'], inplace=True)

In [None]:
# The raw tempo values are strings in the format '[float]'. Remove the square
# brackets with the vectorised .str accessor (missing values pass through as
# NaN instead of raising inside a Python lambda), then cast to float.
for frame in (df, ev):
    frame['tempo'] = (
        frame['tempo']
        .str.replace(r'[\[\]]', '', regex=True)
        .astype('float')
    )
# Display the resulting dtypes of both columns as a quick check of the cast.
df['tempo'].dtype, ev['tempo'].dtype

In [28]:
# We keep only the `igbo` ethnicity since it is the only ethnicity which is
# significantly present in both the development and the evaluation dataset.
# It is encoded as a binary indicator column (1 = igbo, 0 = anything else,
# including missing values), after which the original column is removed.
for frame in (df, ev):
    # Vectorised comparison instead of a row-wise Python lambda; the cast
    # mirrors how the `standard_sentence` indicator is built.
    frame['igbo'] = (frame['ethnicity'] == 'igbo').astype('int')
    frame.drop(columns='ethnicity', inplace=True)

In [None]:
# Count how often each distinct gender label occurs in the evaluation set.
gender_label_counts = ev['gender'].value_counts()
gender_label_counts

In [None]:
# Binary encoding of the gender column: male -> 1, female -> 0.
# The misspelled label "famale" is mapped to the same code as 'female'.
# NOTE: Series.map leaves any label that is not a key of this dict as NaN.
gender = {'male': 1, 'female': 0, "famale": 0}

for frame in (df, ev):
    frame['gender'] = frame['gender'].map(gender)

# Preview the development set after encoding.
df.head()

In [31]:
# Most of the speakers pronounce the same sentence, so we add a binary
# feature indicating whether a sample is that sentence; it is recognised
# by its word count (num_words == 69).
for frame in (df, ev):
    is_standard = frame['num_words'] == 69
    frame['standard_sentence'] = is_standard.astype('int')

In [32]:
# min_pitch and max_pitch are removed from both datasets because their
# values are heavily centered around the same value.
for frame in (df, ev):
    frame.drop(columns=['min_pitch', 'max_pitch'], inplace=True)

In [33]:
# Replace 'energy' with its natural logarithm to obtain a greater spread of
# values. The column is overwritten in place, so re-running this cell would
# apply the logarithm a second time.
for frame in (df, ev):
    frame['energy'] = frame['energy'].apply(np.log)

In [34]:
# Write the processed datasets to disk (the 'Id' index is written as the
# first column). DataFrame.to_csv does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('proc', exist_ok=True)
df.to_csv('proc/development.csv')
ev.to_csv('proc/evaluation.csv')