In [1]:
import pandas as pd

# Read the pre-processed development and evaluation tables and key each one by
# its 'Id' column.
df_dev, df_eval = (
    pd.read_csv(csv_path).set_index('Id')
    for csv_path in ('data/development_processed.csv', 'data/evaluation_processed.csv')
)

In [2]:
# Columns removed from both frames before modelling.
# NOTE(review): the reason for excluding them is not recorded in this notebook.
dropped_columns = ['duration', 'num_characters', 'num_words', 'num_pauses']

# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise a KeyError.
df_dev = df_dev.drop(columns=dropped_columns, errors='ignore')
df_eval = df_eval.drop(columns=dropped_columns, errors='ignore')

In [3]:
from sklearn.model_selection import train_test_split
# Separate the regression target ('age') from the remaining columns, which are
# used as model inputs.
y = df_dev.loc[:, 'age']
X = df_dev.drop('age', axis=1)


In [4]:
from sklearn.preprocessing import StandardScaler

# Disabled: global scaling of a train/test split. X_train and X_test are not
# defined anywhere in the visible notebook, so these lines cannot be re-enabled
# as they stand. The later cells scale inside each pipeline instead.
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor 
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Fixed seed so the stochastic models (random forest, MLP) produce the same
# cross-validation scores on every run.
RANDOM_STATE = 42

baseline_regressors = [
    LinearRegression(),
    Lasso(),
    Ridge(),
    RandomForestRegressor(random_state=RANDOM_STATE),
    SVR(),
    KNeighborsRegressor(),
    MLPRegressor(random_state=RANDOM_STATE),
]

# Baseline comparison: each regressor is wrapped in a pipeline with its own
# StandardScaler and evaluated with 5-fold cross-validation. The scorer returns
# the negated RMSE, so values closer to zero are better.
for reg in baseline_regressors:
    score = cross_val_score(
        make_pipeline(StandardScaler(), reg),
        X,
        y,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )
    # Print the estimator itself (its repr shows non-default settings) instead of
    # constructing a throwaway instance via reg.__class__() just for the label.
    print(reg, score.mean())


LinearRegression() -10.51172672393724
Lasso() -10.824896386499677
Ridge() -10.51025773201371
RandomForestRegressor() -10.491159870186582
SVR() -11.16818426440568
KNeighborsRegressor() -11.424799249839335




MLPRegressor() -10.484024697649463




In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Standardise the features, then fit an SVR; both steps are tuned as one estimator.
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('reg', SVR())])

# Search space for the SVR step (the 'reg__' prefix routes each entry to it).
param_grid = dict(
    reg__kernel=['rbf'],
    reg__epsilon=[0.1, 0.5, 1, 2, 5, 10],
    reg__C=[5, 10, 20, 50, 100, 200, 1000],
    reg__gamma=['scale', 'auto'],
)

# Exhaustive 5-fold search scored by negated RMSE, run on all available cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(X, y)

# Report the winning parameters, their mean CV score and the best pipeline.
print(grid.best_params_, grid.best_score_, grid.best_estimator_, sep='\n')

Fitting 5 folds for each of 84 candidates, totalling 420 fits
{'reg__C': 20, 'reg__epsilon': 5, 'reg__gamma': 'scale', 'reg__kernel': 'rbf'}
-10.181128783132111
Pipeline(steps=[('scaler', StandardScaler()), ('reg', SVR(C=20, epsilon=5))])


In [6]:
import os

# NOTE: `grid` is rebound by the Ridge search in a later cell, so this cell must
# run right after the SVR search for 'out/svr.csv' to contain SVR predictions.
# GridSearchCV refits the best pipeline on the full (X, y) by default
# (refit=True), so best_estimator_ is already trained; fitting it again would
# only repeat that work.
reg = grid.best_estimator_
y_pred = reg.predict(df_eval)

# One prediction per evaluation row, indexed like df_eval.
df_out = pd.DataFrame(y_pred, index=df_eval.index, columns=['Predicted'])

# Create the output directory first so to_csv does not fail when it is missing.
os.makedirs('out', exist_ok=True)
df_out.to_csv('out/svr.csv')

**Score**: 9.311 with `{'reg__C': 20, 'reg__epsilon': 5, 'reg__gamma': 'scale', 'reg__kernel': 'rbf'}`, without the MFCC-$\Delta^2$ features

In [7]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# Scale -> recursive feature elimination driven by a Ridge model -> polynomial
# feature expansion -> final Ridge regressor.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('rfe', RFE(estimator=Ridge())),
    ('poly', PolynomialFeatures()),
    ('reg', Ridge()),
]
pipe = Pipeline(steps=pipeline_steps)

# Search space: six evenly spaced values from 0.1 to 1 for the RFE selection
# size, the polynomial degree, and the regularisation strength of the final Ridge.
param_grid = dict(
    rfe__n_features_to_select=np.linspace(0.1, 1, num=6),
    poly__degree=[2, 3],
    reg__alpha=[1, 10, 100, 1000],
)

# Exhaustive 5-fold search scored by negated RMSE (fits run sequentially).
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=10,
)
grid.fit(X, y)

# Report the winning parameters, their mean CV score and the best pipeline.
print(grid.best_params_, grid.best_score_, grid.best_estimator_, sep='\n')

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5; 1/48] START poly__degree=2, reg__alpha=1, rfe__n_features_to_select=0.1
[CV 1/5; 1/48] END poly__degree=2, reg__alpha=1, rfe__n_features_to_select=0.1;, score=-10.111 total time=   0.1s
[CV 2/5; 1/48] START poly__degree=2, reg__alpha=1, rfe__n_features_to_select=0.1
[CV 2/5; 1/48] END poly__degree=2, reg__alpha=1, rfe__n_features_to_select=0.1;, score=-10.225 total time=   0.1s
[CV 3/5; 1/48] START poly__degree=2, reg__alpha=1, rfe__n_features_to_select=0.1
[CV 3/5; 1/48] END poly__degree=2, reg__alpha=1, rfe__n_features_to_select=0.1;, score=-10.286 total time=   0.1s
[CV 4/5; 1/48] START poly__degree=2, reg__alpha=1, rfe__n_features_to_select=0.1
[CV 4/5; 1/48] END poly__degree=2, reg__alpha=1, rfe__n_features_to_select=0.1;, score=-11.038 total time=   0.1s
[CV 5/5; 1/48] START poly__degree=2, reg__alpha=1, rfe__n_features_to_select=0.1
[CV 5/5; 1/48] END poly__degree=2, reg__alpha=1, rfe__n_features_to_select=0.

In [8]:
# Pull the fitted RFE step out of the best pipeline and map the positions of the
# features it kept back to the column names of X.
rfe = grid.best_estimator_['rfe']
selected_features_indices = rfe.get_support(indices=True)
selected_features = X.columns.take(selected_features_indices)
print("Selected features:", selected_features)

Selected features: Index(['gender', 'mean_pitch', 'jitter', 'hnr', 'silence_duration',
       'MFCC-2-95', 'MFCC-3-95', 'MFCC-4-95', 'MFCC-8-95', 'MFCC-11-95',
       'MFCC-3-5', 'MFCC-6-5', 'MFCC-7-5', 'MFCC-8-5', 'MFCC-9-5', 'MFCC-11-5',
       'MFCC-2-50', 'MFCC-6-50', 'MFCC-7-50', 'MFCC-11-50', 'MFCCD-4-95',
       'MFCCD-5-95', 'MFCCD-6-95', 'MFCCD-5-5', 'MFCCD-6-5', 'silence_ratio'],
      dtype='object')


In [9]:
import os

# `grid` here is expected to be the Ridge/RFE/polynomial search fitted above.
# GridSearchCV refits the best pipeline on the full (X, y) by default
# (refit=True), so best_estimator_ is already trained; fitting it again would
# only repeat that work.
reg = grid.best_estimator_
y_pred = reg.predict(df_eval)

# One prediction per evaluation row, indexed like df_eval.
df_out = pd.DataFrame(y_pred, index=df_eval.index, columns=['Predicted'])

# Create the output directory first so to_csv does not fail when it is missing.
os.makedirs('out', exist_ok=True)
df_out.to_csv('out/ridge.csv')

**Score**: 9.578 {'poly__degree': 2, 'reg__alpha': 1000, 'rfe__n_features_to_select': np.float64(0.28)}