# Preprocessing

In [None]:
import pandas as pd
import numpy as np

# Load the development and evaluation splits, keyed by the `Id` column.
df_dev = pd.read_csv('data/development.csv').set_index('Id')
df_eval = pd.read_csv('data/evaluation.csv').set_index('Id')

for frame in (df_dev, df_eval):
    # These two columns are not used by the rest of the notebook.
    frame.drop(columns=['sampling_rate', 'path'], inplace=True)
    # `tempo` holds strings; remove any '[' / ']' characters, then cast to float.
    frame['tempo'] = frame['tempo'].apply(
        lambda raw: raw.replace('[', '').replace(']', '')
    ).astype('float')

df_dev['tempo'].dtype, df_eval['tempo'].dtype

# Preview the first 50 rows of the development split (cell output).
df_dev.head(50)

In [None]:
from collections import Counter
# Tally the raw gender labels of the development and evaluation splits.
for split in (df_dev, df_eval):
    print(Counter(split["gender"]))

In [473]:
# Keep only the development rows whose age is at most 70.
df_dev = df_dev.loc[df_dev["age"].le(70)]

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Number of development samples per distinct age value.
temp = Counter(df_dev["age"])
print(temp)

# Bar chart of the same counts: one bar per age.
plt.figure()
plt.bar(list(temp.keys()), list(temp.values()))

Gender

In [None]:
# Frequency of each gender label in the development and evaluation splits.
for split in (df_dev, df_eval):
    print(split['gender'].value_counts())

In [None]:
# Binary encoding of gender: 1 = male, 0 = female. The extra "famale" key maps
# to the same code as "female" (presumably a misspelling present in the data --
# confirm). Labels missing from this dict become NaN under `Series.map`.
gender = {'male': 1, 'female': 0, "famale": 0}

for frame in (df_dev, df_eval):
    frame['gender'] = frame['gender'].map(gender)

# Show the label frequencies after encoding.
for frame in (df_dev, df_eval):
    print(frame['gender'].value_counts())

Ethnicity

In [None]:
# Distinct ethnicity labels found in each split.
set1, set2 = (set(frame["ethnicity"]) for frame in (df_dev, df_eval))

# Labels that occur in both splits (shown as the cell output).
set1 & set2

In [478]:
from collections import defaultdict
# For every ethnicity that occurs in both splits, count how many evaluation
# rows carry it. The intersection is loop-invariant, so it is built once here
# instead of being recomputed for every row.
shared_ethnicities = set1 & set2

temp = defaultdict(lambda: 0)
for el in df_eval["ethnicity"]:
    if el in shared_ethnicities:
        temp[el] += 1

In [480]:
# Only the `igbo` ethnicity is kept, as a 0/1 indicator column, because it is
# the only ethnicity significantly present in both the development and the
# evaluation dataset. The original `ethnicity` column is then removed.
for frame in (df_dev, df_eval):
    frame['igbo'] = frame['ethnicity'].eq('igbo').astype('int64')
    frame.drop(columns='ethnicity', inplace=True)

Energy

In [481]:
# Natural log of `energy` in both splits, to obtain a greater spread of values.
for frame in (df_dev, df_eval):
    frame['energy'] = np.log(frame['energy'])

Pitch

In [482]:
# Remove the max/mean pitch columns from both splits.
for frame in (df_dev, df_eval):
    frame.drop(columns=["max_pitch", "mean_pitch"], inplace=True)

In [486]:
# Columns replaced by their natural logarithm in both splits.
# NOTE(review): np.log returns -inf for 0 and NaN for negative input; confirm
# that columns such as `num_pauses` and `silence_duration` are strictly positive.
log_columns = [
    'jitter',
    'shimmer',
    'min_pitch',
    'zcr_mean',
    'spectral_centroid_mean',
    'tempo',
    'num_pauses',
    'silence_duration',
]

for col in log_columns:
    df_dev[col] = np.log(df_dev[col])
    df_eval[col] = np.log(df_eval[col])

In [488]:
# Discard features that are not passed to the models. `min_pitch`, `shimmer`
# and `energy` were log-transformed in earlier cells before being dropped here.
for frame in (df_dev, df_eval):
    frame.drop(columns=["num_words", "min_pitch", "shimmer", "energy"], inplace=True)

# Model Selection

In [None]:
# Preview the first five rows of the processed development frame.
df_dev.head(n=5)

In [490]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Separate the regression target (`age`) from the predictors.
y_dev = df_dev["age"]
X_dev = df_dev.drop(columns=["age"])

# Hold out 20% of the development rows for validation; the seed is fixed so
# the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=0.2, random_state=0
)
X_test = df_eval

# Fit the scaler on the training split only and reuse its statistics for the
# validation split, so no validation information reaches the scaler.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_val = ss.transform(X_val)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between the predictors, drawn as an annotated heatmap.
corr_matrix = X_dev.corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_matrix, cmap="crest", annot=True, ax=ax)

In [None]:
from sklearn.metrics import root_mean_squared_error

# Random-forest baseline. The split criterion is squared error, which is in
# line with the RMSE metric used to score the validation predictions below.
reg = RandomForestRegressor(
    n_estimators=200,
    criterion="squared_error",
    random_state=0,
    n_jobs=-1,
).fit(X_train, y_train)

y_pred = reg.predict(X_val)

# Validation RMSE (cell output).
root_mean_squared_error(y_val, y_pred)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR 
from sklearn.tree import DecisionTreeRegressor  
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Fit each candidate regressor with its default hyper-parameters on the scaled
# training split and report its RMSE on the validation split.
for model in [LinearRegression, Lasso, Ridge, RandomForestRegressor, SVR, DecisionTreeRegressor, KNeighborsRegressor, MLPRegressor]:
    reg = model()
    # Several candidates (random forest, decision tree, MLP) are stochastic.
    # Seed every estimator that exposes `random_state` so the printed scores
    # are reproducible from one run to the next.
    if "random_state" in reg.get_params():
        reg.set_params(random_state=0)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    print(f'{model.__name__} RMSE: {rmse}')

Evaluate the chosen model: Ridge regression on degree-2 polynomial features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the scaled features. The expanded matrices
# get their own names so that `X_train` / `X_val` are left untouched: this
# keeps the cell idempotent (re-running it does not expand the features a
# second time) and keeps earlier cells that pair `X_train` with `X_val` usable.
polynomial = PolynomialFeatures(2)
X_train_poly = polynomial.fit_transform(X_train)
X_val_poly = polynomial.transform(X_val)

# Ridge regression on the expanded features, scored by RMSE on the validation split.
reg = Ridge(alpha=35)
reg.fit(X_train_poly, y_train)
y_pred = reg.predict(X_val_poly)
rmse = root_mean_squared_error(y_val, y_pred)
print(f'RMSE: {rmse}')

# Train on all development data

In [None]:
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import QuantileRegressor

# Predictors and target from the whole development set; the evaluation set is
# the matrix we finally predict on.
X_dev = df_dev.drop(columns=["age"])
y_dev = df_dev["age"]
X_test = df_eval

# Standardise, then expand to degree-2 polynomial features. Both transformers
# are fitted on the development data and only applied to the evaluation data.
# NOTE(review): the scaler is fitted before the cross-validation below, so each
# CV fold sees scaling statistics computed on all development rows -- consider
# moving the preprocessing into a Pipeline if unbiased CV scores matter.
ss = StandardScaler()
polynomial = PolynomialFeatures(2)
X_dev = polynomial.fit_transform(ss.fit_transform(X_dev))
X_test = polynomial.transform(ss.transform(X_test))

# Candidate regularisation strengths: 1, 5, 9, ..., 197.
grid = {"alpha": list(range(1, 200, 4))}

# 20-fold cross-validated search over `alpha`, scored by (negated) RMSE.
gridSearch = GridSearchCV(
    Ridge(),
    grid,
    scoring="neg_root_mean_squared_error",
    cv=20,
    n_jobs=-1,
    verbose=6,
)
gridSearch.fit(X_dev, y_dev)

# Best model found by the search, used for the final predictions.
reg = gridSearch.best_estimator_
print(reg)
print(gridSearch.best_params_)
print(gridSearch.best_score_)

y_pred = reg.predict(X_test)

In [None]:
# Number of predictions below zero (the target is an age, so these are implausible).
np.sum(y_pred < 0)

In [500]:
import pandas as pd

# Build the submission table. `y_pred` follows the row order of `df_eval`, so
# the predictions are keyed by the real evaluation Ids rather than by a fresh
# 0..n-1 range; the two coincide only when the Ids are already 0..n-1 in file order.
# `Index.rename` returns a new index, so `df_eval` itself is not modified.
dataFrameOutput = pd.DataFrame(
    {"Predicted": np.round(y_pred, 2)},
    index=df_eval.index.rename("Id"),
)

dataFrameOutput.to_csv("./data/output.csv")