# Data Science Lab: Process and methods - Winter Call 2024/25

Author: Massimiliano Carli

Project: Age estimation from speech

## Dependencies

In [1]:
# Fixed seed so that the train/test split and the XGBoost models (which all
# receive `random_state=seed`) give the same results on every Restart & Run All.
# Set to None to get a different random split/model on each run.
seed = 42

In [2]:
import json
import joblib

import numpy as np
import pandas as pd 
from IPython.display import display  # to display variables in a "nice" way

# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import root_mean_squared_error

# My models and features
from utils.filenames_generators import *
from utils.plotters import *
from utils.feature_engineering.wave_spectrogram_extractors import *
from utils.feature_engineering.feature_engineering import *

In [8]:
pd.options.mode.copy_on_write = True # best practice for avoiding making deep copies rather than views of DataFrames and Series 
pd.options.display.max_rows = 10
%matplotlib inline

For the copy-on-write (CoW) best practice, see the [related pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html); it will become the default behaviour from pandas 3.0.

## Load data and configurations

In [9]:
# Read the feature-extraction settings used throughout the notebook.
with open('config.json', 'r') as f:
    config = json.load(f)

n_mfcc = config["n_mfcc"]
n_mels = config["n_mels"]
sampling_rate = config["sampling_rate"]
num_silence_frames = config["num_silence_frames"]

# Echo the loaded values so the run records which configuration was used.
print(f"sampling_rate: {sampling_rate}")
print(f"n_mfcc: {n_mfcc}")
print(f"n_mels: {n_mels}")  # fixed: this line used to print n_mfcc under the n_mels label
print(f"num_silence_frames: {num_silence_frames}")

sampling_rate: 22050
n_mfcc: 20
n_mels: 20
num_silence_frames: 20


In [10]:
# Load the development set from CSV, using the 'Id' column as the DataFrame index
data = pd.read_csv('data/development.csv', index_col='Id')

## Feature Engineering

In [None]:
# Presumably loads the audio signal of every row at the configured sampling rate
# (helper from utils, implementation not visible here — TODO confirm).
data = compute_waves(data, sr=sampling_rate, filter=None) # Takes about 2 minutes to run

In [7]:
# Presumably derives the spectrogram columns from the loaded signals
# (helper from utils, implementation not visible here — TODO confirm).
data = compute_spectrograms(data, sr=sampling_rate) # Takes about 3 minutes to run

---

In [8]:
# Remove the columns the helper considers unused (which ones is decided inside utils — not visible here).
data = drop_unused_columns(data)

In [9]:
# Presumably produces the 'gender_female' / 'gender_male' columns selected later as features — TODO confirm.
data = encode_gender(data)

In [10]:
# Presumably produces the 'ethnicity_english' / 'ethnicity_igbo' / 'ethnicity_others' columns selected later — TODO confirm.
data = encode_ethnicity(data)

In [11]:
# Presumably converts the 'tempo' column to a float dtype — TODO confirm against the helper.
data = floatize_tempo(data)

In [12]:
# Presumably adds features combined from the pre-computed columns
# (e.g. the ratios listed in `comb_pre_comp_features`) — TODO confirm.
data = comb_precomp(data)

In [13]:
# Log-mel-spectrogram features with `n_mels` bands; presumably the source of the
# 'log_melspec_*' columns selected later — TODO confirm.
data = log_mel_spec(data, S=True, sr=sampling_rate, n_mels=n_mels)

In [14]:
# MFCC features with `n_mfcc` coefficients; presumably the source of the
# 'mfcc_*' columns selected later — TODO confirm.
data = mfcc(data, S=True, sr=sampling_rate, n_mfcc=n_mfcc)

In [15]:
# Silence features computed over `num_silence_frames` frames; presumably the source of the
# 'silence_duration_frame*' columns selected later — TODO confirm.
data = silence_duration_contour(data, sr=sampling_rate, num_silence_frames=num_silence_frames) # Takes about 3 minutes to run

***Save the features to avoid recomputing them later***

In [None]:
# Write the engineered features to a CSV whose name comes from the utils helper.
filename = generate_dataset_filename()
# The wave/spectrogram columns are dropped from the saved copy; `data` itself is not modified.
data.drop(columns=['wave', 'spectrogram', 'melspectrogram', 'log_melspectrogram']).to_csv(filename)
print(f"Data saved as: {filename}")

## Load previously computed features

In [6]:
# Reload a previously saved feature file instead of recomputing the features.
# A forward slash is used as separator: the former 'data\d...' literal contained an
# invalid escape sequence and only resolved to the intended path on Windows.
data = pd.read_csv('data/development-20250128-114352.csv', index_col='Id')

## Feature selection

In [7]:
# Column names the author groups as "pre-computed"
# (presumably already present in the development CSV — TODO confirm).
pre_computed_features = [
    'mean_pitch',
    'max_pitch',
    'min_pitch',
    'jitter',
    'shimmer',
    'energy',
    'zcr_mean',
    'spectral_centroid_mean',
    'tempo',
    'hnr',
    'num_words',
    'num_characters',
    'num_pauses',
    'silence_duration',
]

In [8]:
# Column names for the combined / encoded features
# (presumably created by comb_precomp, encode_gender and encode_ethnicity — TODO confirm).
comb_pre_comp_features = [
    'duration',
    'intensity',
    'characters_per_word',
    'words_per_second',
    'pitch_range',
    'mean_to_max_pitch_ratio',
    'energy_to_duration_ratio',
    'energy_to_silence_ratio',
    'num_words_per_silence',
    'silence_ratio',
    'gender_female',
    'gender_male',
    'ethnicity_english',
    'ethnicity_igbo',
    'ethnicity_others',
]

In [9]:
# Per-band log-mel-spectrogram column names: one entry per mel band.
log_melspec_mean_features = [f'log_melspec_mean_{i}' for i in range(n_mels)]
log_melspec_median_features = [f'log_melspec_median_{i}' for i in range(n_mels)]
log_melspec_std_features = [f'log_melspec_std_{i}' for i in range(n_mels)]
# Statistics over the whole log-mel spectrogram.
log_melspec_overall_features = [
    'log_melspec_mean', 
    'log_melspec_median', 
    'log_melspec_std', 
    'log_melspec_skewness', 
    'log_melspec_kurtosis'
]
# Delta column names. Fixed: these are indexed by mel band (n_mels, not n_mfcc), and the
# std list no longer repeats the mean names, which made the combined list contain duplicates.
# NOTE(review): the 'delta_log_melspec_std_{i}' naming is assumed from the pattern of the
# other lists — confirm against the columns produced by log_mel_spec before using it.
log_melspec_mean_delta_features = [f'delta_log_melspec_mean_{i}' for i in range(n_mels)]
log_melspec_std_delta_features = [f'delta_log_melspec_std_{i}' for i in range(n_mels)]
log_melspec_delta_features = log_melspec_mean_delta_features + log_melspec_std_delta_features

# Subset actually used for modelling: per-band mean and std only.
log_melspec_features = log_melspec_mean_features + log_melspec_std_features

In [10]:
# Per-coefficient MFCC column names: one entry per coefficient.
mfcc_mean_features = [f'mfcc_mean_{i}' for i in range(n_mfcc)]
mfcc_median_features = [f'mfcc_median_{i}' for i in range(n_mfcc)]
mfcc_std_features = [f'mfcc_std_{i}' for i in range(n_mfcc)]
# Statistics over the whole MFCC matrix.
mfcc_overall_features = [
    'mfcc_mean',
    'mfcc_median',
    'mfcc_std',
    'mfcc_skewness',
    'mfcc_kurtosis',
]
# Delta column names. Fixed: the std list no longer repeats the mean names, which made
# the combined list contain every name twice.
# NOTE(review): the 'delta_mfcc_std_{i}' naming is assumed from the pattern of the other
# lists — confirm against the columns produced by mfcc before using it.
mfcc_mean_delta_features = [f'delta_mfcc_mean_{i}' for i in range(n_mfcc)]
mfcc_std_delta_features = [f'delta_mfcc_std_{i}' for i in range(n_mfcc)]
mfcc_delta_features = mfcc_mean_delta_features + mfcc_std_delta_features

# Subset actually used for modelling: per-coefficient mean and std only.
mfcc_features = mfcc_mean_features + mfcc_std_features

In [11]:
# Per-frame silence column names: one entry per silence frame.
silence_duration_frames_features = [f'silence_duration_frame_{i}' for i in range(num_silence_frames)]
silence_ratio_on_frames_features = [f'silence_ratio_on_frame_{i}' for i in range(num_silence_frames)]
# Summary statistics over the per-frame silence durations.
silence_duration_frames_overall_features = [
    'silence_duration_frames_mean',
    'silence_duration_frames_median',
    'silence_duration_frames_std',
]

# Only the summary statistics are used for modelling; the per-frame lists above are not selected.
silence_duration_countour_features = silence_duration_frames_overall_features

In [12]:
# Final list of model input columns, concatenated group by group.
features = (
    pre_computed_features
    + comb_pre_comp_features
    + mfcc_features
    + log_melspec_features
    + silence_duration_countour_features
)

In [13]:
# Name of the column the models are trained to predict.
target = 'age'

## Train model

### Data selection

In [14]:
# Design matrix (selected feature columns) and target vector taken from the feature table.
X = data[features]
y = data[target]

### eXtreme Gradient Boosting Regressor

In [15]:
from xgboost import XGBRegressor

In [16]:
# Hold out 20% of the rows as a test set; the split is controlled by `seed`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [18]:
# Regressor built directly from hyperparameters recorded as "Best Parameters" of an
# earlier search, so the grid search does not have to be re-run:
# {'colsample_bytree': 0.4, 'gamma': 0.8, 'learning_rate': 0.05, 'max_depth': 5,
#  'n_estimators': 1000, 'reg_alpha': 0, 'reg_lambda': 100, 'subsample': 0.4}
best_xgb = XGBRegressor(
    colsample_bytree=0.4,
    gamma=0.8,
    learning_rate=0.05,
    max_depth=5,
    n_estimators=1000,
    reg_alpha=0,
    reg_lambda=100,
    subsample=0.4,
    random_state=seed,
)

In [None]:
# Exhaustive hyperparameter search for the XGBoost regressor.
# The grid below has 2 * 2 * 3 * 3 * 3 * 3 * 3 * 3 = 2916 combinations, each fitted on 3 folds.
param_grid_xgb = {
    # --- tree structure ---
    'max_depth': [3, 5],                  # maximum depth of a tree
    'n_estimators': [500, 1000],          # number of trees
    'gamma': [0.3, 0.5, 0.8],             # minimum loss reduction to split a node

    # --- learning rate ---
    'learning_rate': [0.01, 0.05, 0.1],

    # --- regularization ---
    'reg_alpha': [0, 10, 100],            # L1 regularization
    'reg_lambda': [0, 10, 100],           # L2 regularization

    # --- sampling ---
    'subsample': [0.4, 0.6, 0.8],         # fraction of samples used to fit each base learner
    'colsample_bytree': [0.4, 0.6, 0.8],  # fraction of features used to fit each base learner
}

xgb = XGBRegressor(random_state=seed)

# 3-fold cross-validated search scored by negative RMSE, using all available cores.
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
grid_search_xgb.fit(X_train, y_train)

# Keep the best estimator found by the search as the model to evaluate.
best_xgb = grid_search_xgb.best_estimator_

print(f"Best Parameters: {grid_search_xgb.best_params_}")

In [None]:
# Fit on the training split, then score the rounded predictions on the held-out split.
best_xgb.fit(X_train, y_train)

y_test_pred_xgb = np.round(best_xgb.predict(X_test))  # predictions rounded to whole numbers
test_rmse_xgb = root_mean_squared_error(y_true=y_test, y_pred=y_test_pred_xgb)

print(f"Testing RMSE: {test_rmse_xgb}")

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary linear regression fitted and scored on the same split as the XGBoost model.
olr = LinearRegression()
olr.fit(X_train, y_train)

y_test_pred_olr = np.round(olr.predict(X_test))  # predictions rounded to whole numbers
test_rmse_olr = root_mean_squared_error(y_true=y_test, y_pred=y_test_pred_olr)

print(f"Testing RMSE: {test_rmse_olr}")

## Eval

***Choose the model you prefer***

In [24]:
# Model used for the final fit, for saving and for the evaluation-set predictions below.
model = best_xgb

In [None]:
# Refit the chosen model on all development rows (training and held-out test rows together).
model.fit(X, y)

***Store the model for later use***

In [None]:
# Persist the fitted model with joblib under models/.
# A forward slash is used as separator: the former 'models\\' prefix only denoted a
# directory on Windows; elsewhere it became part of the file name.
filename = 'models/' + generate_model_filename(model) + '.joblib'
joblib.dump(model, filename)
print(f"Model saved as: {filename}")

***Run the model on the prediction set***

In [27]:
def feature_engineering_pipeline(
        df, 
        data_directory='data/audios_evaluation',
        sr=sampling_rate,
        n_mels=n_mels,
        n_mfcc=n_mfcc,
        num_silence_frames=num_silence_frames,
    ):
    """Run the feature-engineering steps of this notebook on a raw DataFrame.

    The input frame is copied first, so ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; must contain a 'gender' column.
    data_directory : str
        Directory passed to ``compute_waves`` as the location of the audio files.
    sr : int
        Sampling rate forwarded to every helper that accepts one.
    n_mels, n_mfcc, num_silence_frames : int
        Forwarded to ``log_mel_spec``, ``mfcc`` and ``silence_duration_contour``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered feature columns added.
    """
    data = df.copy()
    data['gender'] = data['gender'].replace({'famale': 'female'}) # Fix typo in the raw labels

    # Compute waves and spectrograms
    data = compute_waves(data, sr=sr, filter=None, directory=data_directory)
    data = compute_spectrograms(data, sr=sr)
    
    # Extract and clean features
    data = drop_unused_columns(data)
    data = encode_gender(data)
    data = encode_ethnicity(data)
    data = floatize_tempo(data)
    data = comb_precomp(data)
    # Fixed: same arguments as in the training cells (S=True and the sampling rate), so the
    # evaluation features are computed the same way as the development features.
    data = log_mel_spec(data, S=True, sr=sr, n_mels=n_mels)
    data = mfcc(data, S=True, sr=sr, n_mfcc=n_mfcc)
    data = silence_duration_contour(data, sr=sr, num_silence_frames=num_silence_frames)
    data = spectral(data, S=True, sr=sr)
    data = rms(data, S=True)
    # Fixed: use the `sr` argument instead of the global `sampling_rate`, so that a
    # caller-supplied sampling rate is honoured by every step.
    data = chroma(data, sr=sr)
    
    # Return a fresh copy (presumably to hand back a consolidated frame after the many
    # column insertions — TODO confirm).
    data = data.copy()
    return data

def target_engineering_pipeline(y):
    """Post-process raw model predictions by rounding them with ``np.round``.

    An inverse log transform (``np.expm1``) is present in the notebook only as a
    disabled, commented-out step and is not applied.
    """
    #y = np.expm1(y)
    return np.round(y)

In [28]:
def eval(model, features, eval_df=None, eval_path='data/evaluation.csv', index_col='Id', save_to_csv=True):
    """Predict the target on the evaluation set and optionally save the predictions.

    NOTE(review): the name shadows Python's built-in ``eval``; it is kept unchanged so
    existing calls keep working.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    features : list of str
        Columns of the evaluation data passed to the model.
    eval_df : pandas.DataFrame or None
        Evaluation data with engineered features. When it is None or has exactly 18
        columns (presumably the layout of the raw, un-engineered CSV — TODO confirm),
        the data is read from ``eval_path`` and passed through
        ``feature_engineering_pipeline`` instead.
    eval_path, index_col :
        Arguments for ``pd.read_csv`` when the data is (re)loaded.
    save_to_csv : bool
        When True, write the predictions to a CSV under ``predictions/``.

    Returns
    -------
    pandas.Series
        Rounded predictions, indexed like the evaluation data.
    """
    # Load evaluation data
    if eval_df is None or eval_df.columns.size == 18:
        eval_df = pd.read_csv(eval_path, index_col=index_col)
        eval_df = feature_engineering_pipeline(eval_df)
    
    eval_df = eval_df[features]

    # Run model and predict
    prediction = model.predict(eval_df)
    prediction = target_engineering_pipeline(prediction)
    prediction = pd.Series(prediction, index=eval_df.index)

    # Save results
    if save_to_csv:
        # Forward slash as separator: the former 'predictions\\' prefix only denoted a
        # directory on Windows; elsewhere it became part of the file name.
        filename = 'predictions/' + 'prediction-' + generate_model_filename(model) + '.csv'
        prediction.to_csv(filename, header=['Predicted'], index_label='Id')
        print(f"Prediction saved as: {filename}")
    
    return prediction

In [29]:
# Disabled one-off step: computes the evaluation features and saves them to
# 'data/evaluation-precomputed.csv'. Uncomment the lines below to regenerate that file.
# # Precompute eval features
# eval_df = pd.read_csv('data/evaluation.csv', index_col='Id')
# eval_df = feature_engineering_pipeline(eval_df)
# eval_df.drop(columns=['wave', 'spectrogram', 'melspectrogram', 'log_melspectrogram']).to_csv('data/evaluation-precomputed.csv')

In [30]:
# Load the evaluation features saved earlier, using the 'Id' column as the index.
eval_df = pd.read_csv('data/evaluation-precomputed.csv', index_col='Id')

In [None]:
# Predict on the evaluation set with the chosen model and write the predictions to CSV.
eval(model=model, features=features, eval_df=eval_df, save_to_csv=True)