In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import root_mean_squared_error
import numpy as np
import pandas as pd

In [2]:
df_dev = pd.read_csv('proc/development_processed.csv').set_index('Id')
df_eval = pd.read_csv('proc/evaluation_processed.csv').set_index('Id')

In [3]:
# Feature selection: we remove features which have been used to extract other features
# these features are all dependent on the duration of the speech, which
# is irrelevant for the age estimation task
df_dev.drop(columns=['duration', 'num_characters', 'num_words', 'num_pauses'], inplace=True)
df_eval.drop(columns=['duration', 'num_characters', 'num_words', 'num_pauses'], inplace=True)

In [4]:
df_dev = df_dev.sample(frac=1).reset_index(drop=True)
X = df_dev.drop(columns=['age'])
y = df_dev['age']

In [5]:
results = []
mean_age = y.mean()
median_age = y.median()

for reg in [LinearRegression(), Lasso(), Ridge(), RandomForestRegressor(), DecisionTreeRegressor(), SVR(), KNeighborsRegressor(), MLPRegressor()]:
    score = -cross_val_score(make_pipeline(StandardScaler(), reg), X, y, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
    results.append({'Regressor': reg.__class__.__name__, 'Score': score.mean()})

results.append({'Regressor': 'Mean', 'Score': root_mean_squared_error(y, np.full(y.shape, mean_age))})
results.append({'Regressor': 'Median', 'Score': root_mean_squared_error(y, np.full(y.shape, median_age))})

df_results = pd.DataFrame(results).set_index('Regressor')

display(df_results)

Unnamed: 0_level_0,Score
Regressor,Unnamed: 1_level_1
LinearRegression,10.485246
Lasso,10.819245
Ridge,10.483861
RandomForestRegressor,10.365455
DecisionTreeRegressor,14.983089
SVR,11.161276
KNeighborsRegressor,10.999232
MLPRegressor,10.340415
Mean,13.091083
Median,13.97733


In [6]:
for reg in [LinearRegression(), Lasso(), Ridge(), RandomForestRegressor(), DecisionTreeRegressor(), SVR(), KNeighborsRegressor(), MLPRegressor()]:
    reg.fit(X, y)
    y_pred = reg.predict(df_eval)
    
    df_out = pd.DataFrame(y_pred, index=df_eval.index, columns=['Predicted'])
    model_name = reg.__class__.__name__.lower()
    df_out.to_csv(f'out/baseline/{model_name}.csv', index_label='Id')


df_out_mean = pd.DataFrame([mean_age] * len(df_eval), index=df_eval.index, columns=['Predicted'])
df_out_mean.to_csv('out/baseline/mean_age.csv', index_label='Id')

df_out_median = pd.DataFrame([median_age] * len(df_eval), index=df_eval.index, columns=['Predicted'])
df_out_median.to_csv('out/baseline/median_age.csv', index_label='Id')

In [7]:
df_scores = pd.read_csv('proc/baseline-leaderboard.csv').set_index('Regressor')
df_results = df_results.join(df_scores)
display(df_results)
df_results.to_csv('res/baseline-results.csv')

Unnamed: 0_level_0,Score,Leaderboard
Regressor,Unnamed: 1_level_1,Unnamed: 2_level_1
LinearRegression,10.485246,9.728
Lasso,10.819245,10.053
Ridge,10.483861,9.88
RandomForestRegressor,10.365455,9.82
DecisionTreeRegressor,14.983089,14.647
SVR,11.161276,11.931
KNeighborsRegressor,10.999232,12.125
MLPRegressor,10.340415,10.819
Mean,13.091083,11.96
Median,13.97733,12.866


# HyperParameter Tuning

In [8]:
res = []

### SVR

In [9]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reg', SVR())
])

param_grid = {
    'reg__kernel': ['rbf'],
    'reg__epsilon': [0.1, 0.5, 1, 2, 5, 10],
    'reg__C': [5, 10, 20, 50, 100, 200, 1000],
    'reg__gamma': ['scale', 'auto']
}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X, y)  

print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

res.append({'Regressor': 'SVR', 'Score': -grid.best_score_, 'Params': grid.best_params_})

{'reg__C': 20, 'reg__epsilon': 2, 'reg__gamma': 'auto', 'reg__kernel': 'rbf'}
-10.141876795188715
Pipeline(steps=[('scaler', StandardScaler()),
                ('reg', SVR(C=20, epsilon=2, gamma='auto'))])


In [10]:
reg = grid.best_estimator_
reg.fit(X, y)
y_pred = reg.predict(df_eval)

df_out = pd.DataFrame(y_pred, index=df_eval.index, columns=['Predicted'])
df_out.to_csv('out/svr.csv')

**Score**: 9.311 using {'reg__C': 20, 'reg__epsilon': 5, 'reg__gamma': 'scale', 'reg__kernel': 'rbf'}

### Ridge

In [11]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('rfe', RFE(Ridge())),
    ('poly', PolynomialFeatures()),
    ('reg', Ridge())
])

param_grid = {
    'rfe__n_features_to_select': np.linspace(0.1, 1, 6),
    'poly__degree': [2, 3],
    'reg__alpha': [1, 10, 100, 1000]
}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid.fit(X, y)  

print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

res.append({'Regressor': 'Ridge', 'Score': -grid.best_score_, 'Params': grid.best_params_})

{'poly__degree': 2, 'reg__alpha': 1000, 'rfe__n_features_to_select': np.float64(0.45999999999999996)}
-10.259264923881535
Pipeline(steps=[('scaler', StandardScaler()),
                ('rfe',
                 RFE(estimator=Ridge(),
                     n_features_to_select=np.float64(0.45999999999999996))),
                ('poly', PolynomialFeatures()), ('reg', Ridge(alpha=1000))])


In [12]:
rfe = grid.best_estimator_.named_steps['rfe']
selected_features_indices = rfe.get_support(indices=True)
selected_features = X.columns[selected_features_indices]
print("Selected features:", selected_features)

Selected features: Index(['gender', 'mean_pitch', 'jitter', 'energy', 'hnr', 'silence_duration',
       'spectral_bandwidth', 'MFCC-2-5', 'MFCC-3-5', 'MFCC-5-5', 'MFCC-6-5',
       'MFCC-7-5', 'MFCC-8-5', 'MFCC-9-5', 'MFCC-11-5', 'MFCC-12-5',
       'MFCC-1-50', 'MFCC-2-50', 'MFCC-5-50', 'MFCC-6-50', 'MFCC-7-50',
       'MFCC-11-50', 'MFCC-2-95', 'MFCC-3-95', 'MFCC-4-95', 'MFCC-5-95',
       'MFCC-7-95', 'MFCC-8-95', 'MFCC-11-95', 'MFCC-13-95', 'MFCCD-1-5',
       'MFCCD-3-5', 'MFCCD-5-5', 'MFCCD-6-5', 'MFCCD-2-50', 'MFCCD-3-95',
       'MFCCD-4-95', 'MFCCD-5-95', 'MFCCD-6-95', 'MFCCD-9-95', 'MFCCD-12-95',
       'silence_ratio', 'wps'],
      dtype='object')


In [13]:
reg = grid.best_estimator_
reg.fit(X, y)
y_pred = reg.predict(df_eval)

df_out = pd.DataFrame(y_pred, index=df_eval.index, columns=['Predicted'])
df_out.to_csv('out/ridge.csv')

**Score**: 9.578 using {'poly__degree': 2, 'reg__alpha': 1000, 'rfe__n_features_to_select': np.float64(0.28)}

### Linear Regression

In [14]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('rfe', RFE(LinearRegression())),
    ('poly', PolynomialFeatures()),
    ('reg', LinearRegression())
])

param_grid = {
    'rfe__n_features_to_select': np.linspace(0.1, 1, 12),
    'poly__degree': [2, 3],
}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid.fit(X, y)  

print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

res.append({'Regressor': 'LinearRegression', 'Score': -grid.best_score_, 'Params': grid.best_params_})

{'poly__degree': 2, 'rfe__n_features_to_select': np.float64(0.1)}
-10.415697216750718
Pipeline(steps=[('scaler', StandardScaler()),
                ('rfe',
                 RFE(estimator=LinearRegression(),
                     n_features_to_select=np.float64(0.1))),
                ('poly', PolynomialFeatures()), ('reg', LinearRegression())])


In [15]:
rfe = grid.best_estimator_.named_steps['rfe']
selected_features_indices = rfe.get_support(indices=True)
selected_features = X.columns[selected_features_indices]
print("Selected features:", selected_features)

Selected features: Index(['jitter', 'silence_duration', 'MFCC-6-5', 'MFCC-8-5', 'MFCC-7-50',
       'MFCC-8-95', 'MFCCD-6-5', 'MFCCD-5-95', 'silence_ratio'],
      dtype='object')


In [16]:
reg = grid.best_estimator_
reg.fit(X, y)
y_pred = reg.predict(df_eval)

df_out = pd.DataFrame(y_pred, index=df_eval.index, columns=['Predicted'])
df_out.to_csv('out/lr.csv')

**Score**: 9.689 using {'poly__degree': 2, 'rfe__n_features_to_select': np.float64(0.1)}

### Random Forest

In [17]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reg', RandomForestRegressor())
])

param_grid = {
    'reg__max_depth': [15, 30, 50, None],
    'reg__min_samples_split': [2, 5],
    'reg__min_samples_leaf': [1, 4, 16],
    'reg__max_features': ['sqrt', 'log2']
}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X, y)  

print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

res.append({'Regressor': 'RandomForestRegressor', 'Score': -grid.best_score_, 'Params': grid.best_params_})

{'reg__max_depth': None, 'reg__max_features': 'sqrt', 'reg__min_samples_leaf': 4, 'reg__min_samples_split': 2}
-10.320533002586064
Pipeline(steps=[('scaler', StandardScaler()),
                ('reg',
                 RandomForestRegressor(max_features='sqrt',
                                       min_samples_leaf=4))])


In [18]:
reg = grid.best_estimator_
y_pred = reg.predict(df_eval)

df_out = pd.DataFrame(y_pred, index=df_eval.index, columns=['Predicted'])
df_out.to_csv('out/rf.csv')

**Score**: 9.647 using {'reg__max_depth': 30, 'reg__max_features': 'sqrt', 'reg__min_samples_leaf': 4, 'reg__min_samples_split': 2}

### Final Results

In [19]:
res_df = pd.DataFrame(res).set_index('Regressor')
public_scores = pd.read_csv('proc/tuned-leaderboard.csv').set_index('Regressor')
res_df = res_df.join(public_scores)
display(res_df)
res_df.to_csv('res/tuned-results.csv')

Unnamed: 0_level_0,Score,Params,Leaderboard
Regressor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVR,10.141877,"{'reg__C': 20, 'reg__epsilon': 2, 'reg__gamma'...",9.311
Ridge,10.259265,"{'poly__degree': 2, 'reg__alpha': 1000, 'rfe__...",9.578
LinearRegression,10.415697,"{'poly__degree': 2, 'rfe__n_features_to_select...",9.689
RandomForestRegressor,10.320533,"{'reg__max_depth': None, 'reg__max_features': ...",9.647
