In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd

In [2]:
def _load_split(csv_path):
    """Read one pre-processed split from CSV and index it by its `Id` column."""
    return pd.read_csv(csv_path).set_index('Id')


# Development set (contains the `age` target) and evaluation set (to be predicted).
df_dev = _load_split('proc/development_processed.csv')
df_eval = _load_split('proc/evaluation_processed.csv')

In [3]:
# Feature selection: drop the raw measurements that were only used to derive
# other features. They all scale with the duration of the recording, which is
# irrelevant for the age estimation task.
duration_dependent_cols = ['duration', 'num_characters', 'num_words', 'num_pauses']
for frame in (df_dev, df_eval):
    # In-place so that the existing `df_dev` / `df_eval` objects are modified.
    frame.drop(columns=duration_dependent_cols, inplace=True)

In [4]:
# Regression target and feature matrix.
y = df_dev['age']
X = df_dev.drop(columns=['age'])

# Integer code of the 5-year, left-closed age bin each sample falls in
# ([0, 5) -> 0, [5, 10) -> 1, ..., [95, 100) -> 19).
qt = pd.cut(y, bins=range(0, 105, 5), labels=False, right=False)

In [8]:
# Compare the two tuned pipelines with stratified K-fold cross-validation,
# stratifying on the 5-year age-bin codes in `qt`. For every model we report
# the mean overall RMSE and the mean RMSE inside each age bin.
N_SPLITS = 5
AGE_BIN_WIDTH = 5   # width in years of the bins used to build `qt`
CV_SEED = 42

# A fixed seed makes the shuffled split reproducible, and materialising the
# folds once guarantees that every model is scored on exactly the same splits.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CV_SEED)
folds = list(skf.split(X, qt))

# Every bin that occurs in the data, not only those present in one fold.
age_bins = np.sort(qt.unique())

pipes = [
    # SVR with the hyperparameters printed by the SVR grid search in this notebook.
    Pipeline([
        ('scaler', StandardScaler()),
        ('reg', SVR(C=20, epsilon=5, gamma='scale', kernel='rbf'))
    ]),
    # Ridge on degree-2 polynomial features of an RFE-selected feature subset.
    Pipeline([
        ('scaler', StandardScaler()),
        ('rfe', RFE(Ridge(), n_features_to_select=0.28)),
        ('poly', PolynomialFeatures(2)),
        ('reg', Ridge(alpha=1000))
    ]),
]

rl = []

for pipe, name in zip(pipes, ['SVR', 'Ridge']):
    rmses_per_class = []   # one {bin code: RMSE} dict per fold
    rmses = []             # overall RMSE of each fold

    for train_index, test_index in folds:
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        z_test = qt.iloc[test_index]

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        # Work on plain arrays so the boolean masks are positional and do not
        # depend on pandas index alignment.
        y_true = y_test.to_numpy()
        z_codes = z_test.to_numpy()

        # RMSE restricted to each age bin present in this test fold.
        rmses_per_class.append({
            cls: root_mean_squared_error(y_true[z_codes == cls], y_pred[z_codes == cls])
            for cls in np.unique(z_codes)
        })

        # Overall RMSE of this fold.
        rmses.append(root_mean_squared_error(y_true, y_pred))

    # Average each bin only over the folds in which it actually appeared, so a
    # rare bin missing from some test fold is neither dropped nor counted as 0.
    rmse_per_class = {}
    for cls in age_bins:
        fold_scores = [fold[cls] for fold in rmses_per_class if cls in fold]
        rmse_per_class[cls] = np.mean(fold_scores)

    # Named `row` (not `res`) so it does not shadow the `res` list that the
    # hyperparameter-tuning cells append their results to.
    row = {'Model': name, 'RMSE': np.mean(rmses)} | {
        f'[{cls*AGE_BIN_WIDTH}-{AGE_BIN_WIDTH*(cls+1)}]': rmse_per_class[cls]
        for cls in age_bins
    }
    rl.append(row)

df_res = pd.DataFrame(rl)
    
    



In [10]:
def _bin_lower_edge(label):
    """Return the integer lower edge of a '[lo-hi]' age-bin column label."""
    return int(label.strip('[]').split('-')[0])


# Put the age-bin columns in ascending age order after the summary columns,
# then index the table by model name.
bin_cols = sorted((col for col in df_res.columns if col.startswith('[')), key=_bin_lower_edge)
cols = ['Model', 'RMSE', *bin_cols]
df_res = df_res[cols].set_index('Model')
df_res

Unnamed: 0_level_0,RMSE,[5-10],[15-20],[20-25],[25-30],[30-35],[35-40],[40-45],[45-50],[50-55],[55-60],[60-65],[65-70],[70-75],[75-80],[80-85],[85-90]
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
SVR,10.161987,10.644255,6.607078,6.724343,6.224813,6.569557,7.655529,10.140768,13.361843,16.628728,21.110863,24.645968,28.5999,31.528475,37.8145,42.938424,33.912072
Ridge,10.256088,12.300593,6.760375,7.074073,6.479067,6.248012,8.028855,10.057154,13.390327,16.545282,21.32162,24.399376,27.737752,32.294782,37.50958,42.305518,31.680814


In [12]:
# Render the results table as LaTeX source with three decimals per value.
latex_table = df_res.to_latex(float_format="%.3f")
print(latex_table)

\begin{tabular}{lrrrrrrrrrrrrrrrrr}
\toprule
 & RMSE & [5-10] & [15-20] & [20-25] & [25-30] & [30-35] & [35-40] & [40-45] & [45-50] & [50-55] & [55-60] & [60-65] & [65-70] & [70-75] & [75-80] & [80-85] & [85-90] \\
Model &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
SVR & 10.162 & 10.644 & 6.607 & 6.724 & 6.225 & 6.570 & 7.656 & 10.141 & 13.362 & 16.629 & 21.111 & 24.646 & 28.600 & 31.528 & 37.815 & 42.938 & 33.912 \\
Ridge & 10.256 & 12.301 & 6.760 & 7.074 & 6.479 & 6.248 & 8.029 & 10.057 & 13.390 & 16.545 & 21.322 & 24.399 & 27.738 & 32.295 & 37.510 & 42.306 & 31.681 \\
\bottomrule
\end{tabular}



: 

\begin{table}[h]
\centering
\caption{RMSE per 5-year age bin for the best SVR and Ridge configurations}
\label{tab:configuration}
\begin{tabular}{lrrrrrrr}
Model& [15-20] & [20-25] & [25-30] & [30-35] & [35-40] & [40-45] & [45-50]  \\ \hline
SVR & 6.607 & 6.724 & 6.225 & 6.570 & 7.656 & 10.141 & 13.362  \\
Ridge & 6.760 & 7.074 & 6.479 & 6.248 & 8.029 & 10.057 & 13.390 \\ \hline
Model & [50-55] & [55-60] & [60-65] & [65-70] & [70-75] & [75-80] & [80-85] \\ \hline
SVR & 16.629 & 21.111 & 24.646 & 28.600 & 31.528 & 37.815 & 42.938 \\
Ridge & 16.545 & 21.322 & 24.399 & 27.738 & 32.295 & 37.510 & 42.306 
\end{tabular}
\end{table}

### SVR

In [9]:
# Grid-search an RBF-kernel SVR on standardised features, using 5-fold CV and
# (negated) RMSE as the selection score.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reg', SVR())
])

param_grid = {
    'reg__kernel': ['rbf'],
    'reg__epsilon': [0.1, 0.5, 1, 2, 5, 10],
    'reg__C': [5, 10, 20, 50, 100, 200, 1000],
    'reg__gamma': ['scale', 'auto']
}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

# `res` collects one summary record per tuned regressor. No cell in this
# notebook is guaranteed to have created it as a list (on a fresh top-to-bottom
# run the name is either undefined or bound to a dict by the CV comparison
# cell), so create it here when needed instead of relying on hidden state.
if not isinstance(globals().get('res'), list):
    res = []

# The scorer returns the negated RMSE (higher is better), so flip the sign
# back to store a plain RMSE.
res.append({'Regressor': 'SVR', 'Score': -grid.best_score_, 'Params': grid.best_params_})

{'reg__C': 20, 'reg__epsilon': 5, 'reg__gamma': 'scale', 'reg__kernel': 'rbf'}
-10.103415022016922
Pipeline(steps=[('scaler', StandardScaler()), ('reg', SVR(C=20, epsilon=5))])


In [10]:
# Fit the tuned pipeline on the whole development set and predict the
# evaluation set.
# NOTE(review): GridSearchCV refits the best estimator on all of (X, y) by
# default, so this explicit fit looks redundant - confirm before removing it.
reg = grid.best_estimator_.fit(X, y)
y_pred = reg.predict(df_eval)

# One `Predicted` column, keyed by the evaluation `Id` index.
df_out = pd.Series(y_pred, index=df_eval.index, name='Predicted').to_frame()
df_out.to_csv('out/svr.csv')

**Score**: 9.311 using {'reg__C': 20, 'reg__epsilon': 5, 'reg__gamma': 'scale', 'reg__kernel': 'rbf'}