In [2]:
import pandas as pd

# Read the preprocessed development and evaluation tables; each one is
# re-indexed by its 'Id' column right after loading.
dev_csv = 'data/development_processed.csv'
eval_csv = 'data/evaluation_processed.csv'

df_dev = pd.read_csv(dev_csv).set_index('Id')
df_eval = pd.read_csv(eval_csv).set_index('Id')

In [3]:
# Columns removed from both frames before modelling. Defined once so the
# development and evaluation sets always lose exactly the same columns.
DROPPED_COLUMNS = ['duration', 'num_characters', 'num_words', 'num_pauses']

# Rebind instead of mutating in place, and tolerate already-missing columns,
# so re-running this cell does not raise KeyError after its first execution.
df_dev = df_dev.drop(columns=DROPPED_COLUMNS, errors='ignore')
df_eval = df_eval.drop(columns=DROPPED_COLUMNS, errors='ignore')

In [4]:
from sklearn.model_selection import train_test_split
# 'age' is the regression target; every remaining column is a predictor.
target_column = 'age'
y = df_dev[target_column]
X = df_dev.drop(columns=[target_column])


In [5]:
from sklearn.preprocessing import StandardScaler

# No standalone scaling step here: StandardScaler is applied inside each
# model pipeline below, so it is re-fit on the training part of every CV fold.

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor 
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Fixed seed for the stochastic estimators so the comparison is reproducible
# across kernel restarts.
SEED = 42

candidate_regressors = [
    LinearRegression(),
    Lasso(),
    Ridge(),
    RandomForestRegressor(random_state=SEED),
    SVR(),
    KNeighborsRegressor(),
    MLPRegressor(random_state=SEED),
]

# Baseline comparison: 5-fold cross-validated RMSE for each model with
# otherwise default hyper-parameters. The scaler lives inside the pipeline so
# it is fit on the training folds only.
for reg in candidate_regressors:
    score = cross_val_score(
        make_pipeline(StandardScaler(), reg),
        X, y, cv=5, scoring='neg_root_mean_squared_error',
    )
    # Scores are negated RMSE (closer to zero is better). Print the estimator
    # itself instead of constructing a throwaway instance just for its repr.
    print(reg, score.mean())


LinearRegression() -10.648888765414885
Lasso() -10.95829204177837
Ridge() -10.648058340186129
RandomForestRegressor() -10.518235089701104
SVR() -11.2761395711642
KNeighborsRegressor() -11.502456689730352




MLPRegressor() -10.434214535967605




In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Scale features, then fit a support-vector regressor.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reg', SVR())
])

# Hyper-parameters searched for both kernels.
common_grid = {
    'reg__epsilon': [0.1, 0.5, 1, 5],
    'reg__C': [0.1, 1, 10, 100],
}

# One sub-grid per kernel: SVR ignores `gamma` for the linear kernel, so
# crossing it with every gamma value would only repeat identical fits.
param_grid = [
    {**common_grid, 'reg__kernel': ['linear']},
    {**common_grid, 'reg__kernel': ['rbf'],
     'reg__gamma': ['scale', 'auto', 1e-3, 1e-2, 0.1, 1]},
]

# 5-fold CV on negated RMSE, using all available cores.
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_root_mean_squared_error', verbose=10, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)  # negated RMSE of the best candidate
print(grid.best_estimator_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
{'reg__C': 10, 'reg__degree': 2, 'reg__gamma': 'auto', 'reg__kernel': 'rbf'}
-10.419464049521922
Pipeline(steps=[('scaler', StandardScaler()),
                ('reg', SVR(C=10, degree=2, gamma='auto'))])


In [14]:
import os

# GridSearchCV (refit=True by default) has already re-fit the best pipeline on
# all of X, y, so it can predict directly without being fit a second time.
reg = grid.best_estimator_

# Select the training columns explicitly so the evaluation frame matches the
# feature set and order the model was fit on.
y_pred = reg.predict(df_eval[X.columns])

df_out = pd.DataFrame(y_pred, index=df_eval.index, columns=['Predicted'])
os.makedirs('out', exist_ok=True)  # to_csv does not create missing directories
df_out.to_csv('out/svr.csv')

**Score**: 9.526

In [11]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

# Scale -> recursive feature elimination (ranked by a Ridge model) ->
# polynomial expansion of the surviving features -> Ridge regression.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('rfe', RFE(Ridge())),
    ('poly', PolynomialFeatures()),
    ('reg', Ridge())
])

# Candidate feature counts: ten evenly spaced values starting at 10. The
# original upper bound was X.shape[1] + 1, i.e. one more feature than exists,
# so values are clipped to the real column count; np.unique removes any
# duplicates produced by integer truncation or clipping.
n_features = X.shape[1]
feature_counts = np.unique(
    np.linspace(10, n_features + 1, 10).astype(int).clip(max=n_features)
)

param_grid = {
    'rfe__n_features_to_select': feature_counts,
    'poly__degree': [2, 3],
    'reg__alpha': [1, 10, 100, 1000]
}

# 5-fold CV on negated RMSE, using all available cores.
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_root_mean_squared_error', verbose=10, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)  # negated RMSE of the best candidate
print(grid.best_estimator_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _data = np.array(data, dtype=dtype, copy=copy,


{'poly__degree': 2, 'reg__alpha': 1000, 'rfe__n_features_to_select': 40}
-10.280590746192692
Pipeline(steps=[('scaler', StandardScaler()),
                ('rfe', RFE(estimator=Ridge(), n_features_to_select=40)),
                ('poly', PolynomialFeatures()), ('reg', Ridge(alpha=1000))])


In [12]:
# Look up the fitted RFE step of the best pipeline and translate the positions
# it kept back into column names of X.
best_pipeline = grid.best_estimator_
rfe = best_pipeline.named_steps['rfe']

selected_features_indices = rfe.get_support(indices=True)
selected_features = X.columns[selected_features_indices]

print("Selected features:", selected_features)

Selected features: Index(['gender', 'mean_pitch', 'jitter', 'shimmer', 'energy', 'zcr_mean',
       'hnr', 'silence_duration', 'MFCC-1-95', 'MFCC-2-95', 'MFCC-4-95',
       'MFCC-9-95', 'MFCC-11-95', 'MFCC-1-5', 'MFCC-2-5', 'MFCC-4-5',
       'MFCC-5-5', 'MFCC-6-5', 'MFCC-7-5', 'MFCC-8-5', 'MFCC-12-5',
       'MFCC-1-50', 'MFCC-2-50', 'MFCC-4-50', 'MFCC-5-50', 'MFCC-6-50',
       'MFCC-12-50', 'MFDD-1-95', 'MFDD-2-95', 'MFDD-3-95', 'MFDD-4-95',
       'MFDD-5-95', 'MFDD-10-95', 'MFDD-1-5', 'MFDD-2-5', 'MFDD-4-5',
       'MFDD-5-5', 'mean_silence', 'silence_ratio', 'wps'],
      dtype='object')


In [13]:
import os

# GridSearchCV (refit=True by default) has already re-fit the best pipeline on
# all of X, y, so it can predict directly without being fit a second time.
reg = grid.best_estimator_

# Select the training columns explicitly so the evaluation frame matches the
# feature set and order the model was fit on.
y_pred = reg.predict(df_eval[X.columns])

df_out = pd.DataFrame(y_pred, index=df_eval.index, columns=['Predicted'])
os.makedirs('out', exist_ok=True)  # to_csv does not create missing directories
df_out.to_csv('out/ridge.csv')

**Score**: 9.561