In [2]:
import opensmile
import webrtcvad
import librosa
import librosa.display


import sys
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV

from xgboost import XGBClassifier
import xgboost as xgb

from pathlib import Path


# root_dir = Path().resolve().parent
# sys.path.append(str(root_dir))

In [3]:
# Root of the data directory (relative to the notebook) and its audio sub-folder.
DATA_PATH = Path("../data")
AUDIO_PATH = DATA_PATH.joinpath("audio")

In [4]:
# Read the training labels table from the data directory and report its shape.
labels = pd.read_csv(DATA_PATH.joinpath("train_labels.csv"))
print(f"Train labels shape: {labels.shape}")

Train labels shape: (38095, 2)


In [5]:
# Read the training metadata table from the data directory and report its shape.
metadata = pd.read_csv(DATA_PATH.joinpath("train_metadata.csv"))
print(f"Train metadata shape: {metadata.shape}")

Train metadata shape: (38095, 4)


In [6]:
# Join labels and metadata on the shared "filename" key. validate="1:1" makes
# pandas raise if the key is duplicated on either side.
df = pd.merge(labels, metadata, on="filename", validate="1:1")
print(f"df shape: {df.shape}")

df shape: (38095, 5)


In [9]:
# openSMILE extractor configured for the ComParE 2016 feature set at the
# low-level-descriptor feature level.
smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
    feature_set=opensmile.FeatureSet.ComParE_2016,
)

In [12]:
# Run the extractor over every audio file listed in df and reduce each result
# to a single vector by averaging over its rows (axis=0). The list is
# positional: entry i belongs to the i-th filename in df.
feature_list = [
    smile.process_file(AUDIO_PATH / filename, start=0, end=None).mean(axis=0)
    for filename in tqdm(df.filename, desc="Extracting OpenSMILE Features", unit="file")
]

Extracting OpenSMILE Features: 100%|██████████| 38095/38095 [47:37<00:00, 13.33file/s]  


In [16]:
# Assemble one row per audio file straight from the averaged feature Series.
#
# Building the frame from the Series keeps every value as a full-precision
# float. Rendering each Series with to_string() and parsing the text back
# would store display-rounded *strings* instead, which then need re-coercing.
#
# feature_list is positional, so it has to line up entry-for-entry with
# df.filename; fail loudly rather than silently pairing features with the
# wrong file (e.g. if df was filtered after the extraction cell ran).
if len(feature_list) != len(df):
    raise ValueError(
        f"feature_list has {len(feature_list)} entries but df has {len(df)} rows; "
        "re-run feature extraction before building the feature table"
    )

# Each Series becomes one row; its index (the feature names) becomes the columns.
opensmile_df = pd.DataFrame(feature_list).reset_index(drop=True)
# to_numpy() drops df's index so the filenames are assigned by position.
opensmile_df.insert(0, "filename", df.filename.to_numpy())

# Persist the table so the (slow) extraction does not have to be repeated.
opensmile_df.to_csv(DATA_PATH / "compare2016_opensmile_features.csv", index=False)

In [20]:
# Attach the labels to the extracted features, one row per filename.
# validate="1:1" makes pandas raise if "filename" is duplicated on either side.
df = pd.merge(opensmile_df, labels, on="filename", validate="1:1")

In [21]:
# Drop every file listed in the runtime's submission_format.csv from df
# (presumably the smoke-test files — confirm against the runtime repo), so
# those rows are excluded from the modelling data built from df afterwards.
sub_format_smoke = pd.read_csv(
    "../literacy-screening-runtime/data/submission_format.csv",
    index_col="filename",
)
# The index holds the filenames; no element-by-element copy loop is needed.
filenames_smoke = sub_format_smoke.index.tolist()
df = df[~df["filename"].isin(filenames_smoke)]

In [22]:
# Feature matrix: every column of df except the identifier and the target.
X = df.drop(columns=["filename", "score"])
# Target vector.
y = df["score"]

X.shape, y.shape

((38087, 65), (38087,))

In [23]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (30469, 65)
Test set size: (7618, 65)


In [36]:
# Coerce every column of each frame to a numeric dtype; values that cannot be
# parsed become NaN (errors="coerce").
X_train, X_test, X = (
    frame.apply(pd.to_numeric, errors="coerce")
    for frame in (X_train, X_test, X)
)

In [37]:
def clean_feature_names(feature_names):
    """Return the names as a list with every '[', ']' and '<' character removed.

    Args:
        feature_names: iterable of strings (e.g. a DataFrame's columns).

    Returns:
        A new list of strings, in the same order as the input.
    """
    # Translation table that deletes the three unwanted characters.
    strip_table = str.maketrans("", "", "[]<")
    cleaned = []
    for raw_name in feature_names:
        cleaned.append(raw_name.translate(strip_table))
    return cleaned

# Strip '[', ']' and '<' from the column labels of every frame (XGBoost does
# not accept feature names containing those characters).
# Only the labels are replaced: X must remain a DataFrame, so the cleaned
# names are assigned to X.columns rather than to X itself — rebinding X to the
# returned list would throw the feature data away.
X_train.columns = clean_feature_names(X_train.columns)
X_test.columns = clean_feature_names(X_test.columns)
X.columns = clean_feature_names(X.columns)

In [34]:
# Base learner: gradient-boosted trees scored with log loss.
xgb_model = XGBClassifier(
    n_estimators=100,
    random_state=42,
    eval_metric="logloss",
    verbosity=2,
    enable_categorical=True,
)

# Wrap the base learner in a 3-fold cross-validated probability calibrator and
# fit it on the training split (fit() returns the fitted estimator itself).
calibrated_model = CalibratedClassifierCV(xgb_model, cv=3).fit(X_train, y_train)

# Column 1 of predict_proba is taken as the positive-class probability
# (assumes a binary target — TODO confirm).
y_pred_proba = calibrated_model.predict_proba(X_test)[:, 1]
logloss = log_loss(y_test, y_pred_proba)
print(f"Log Loss on the test set: {logloss}")


Log Loss on the test set: 0.6295072935256824
