# Import

In [2]:
import opensmile
import webrtcvad
import librosa
import librosa.display


import sys
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV

from xgboost import XGBClassifier
import xgboost as xgb

from pathlib import Path


# root_dir = Path().resolve().parent
# sys.path.append(str(root_dir))

In [14]:
DATA_PATH = Path("../data")
AUDIO_PATH = DATA_PATH / "audio"
# AUDIO_PATH = DATA_PATH / "train_audio_1"
# AUDIO_PATH = DATA_PATH / "train_audio_2"
# AUDIO_PATH = DATA_PATH / "train_audio_3"

In [None]:
labels = pd.read_csv(DATA_PATH / "train_labels.csv")
print(f"Train labels shape: {labels.shape}")
labels.head()

In [None]:
metadata = pd.read_csv(DATA_PATH / "train_metadata.csv")
print(f"Train metadata shape: {metadata.shape}")
metadata.head()

In [None]:
df = labels.merge(metadata, on="filename", validate="1:1")
print(f"df shape: {df.shape}")
df.head()

### EDA

In [10]:
def plot_waveform(filepath):
    audio_data, sr = librosa.load(filepath, sr=None)

    plt.figure(figsize=(10, 4))
    librosa.display.waveshow(audio_data, sr=sr)
    plt.title("Waveform")
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.show()

    return audio_data, sr

In [11]:
def plot_spectrogram(audio_data, sr):
    S = librosa.stft(audio_data)
    S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(S_db, sr=sr, x_axis="time", y_axis="log")
    plt.colorbar(format="%+2.0f dB")
    plt.title("Spectrogram")
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (Hz)")
    plt.show()

In [12]:
def voice_activity_detection(filepath, aggressiveness=2):
    vad = webrtcvad.Vad(aggressiveness)
    audio_data, sr = librosa.load(filepath, sr=16000)
    audio_data = (audio_data * 32767).astype(np.int16)

    frame_duration = 30
    frame_length = int(sr * frame_duration / 1000)

    vad_results = []
    for start in range(0, len(audio_data), frame_length):
        frame = audio_data[start : start + frame_length].tobytes()
        vad_results.append(vad.is_speech(frame, sr))

    time_axis = np.linspace(0, len(audio_data) / sr, num=len(vad_results))
    plt.figure(figsize=(10, 2))
    plt.plot(time_axis, vad_results, label="VAD Output")
    plt.title("Voice Activity Detection (VAD) Output")
    plt.xlabel("Time (s)")
    plt.ylabel("Speech Detected")
    plt.ylim(-0.1, 1.1)
    plt.show()

In [13]:
def analyze_audio(filepath):
    print("Plotting waveform...")
    audio_data, sr = plot_waveform(filepath)

    print("Plotting spectrogram...")
    plot_spectrogram(audio_data, sr)

    print("Performing Voice Activity Detection...")
    voice_activity_detection(filepath)

### Feature engineering

In [7]:
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)

In [95]:
feature_list = []
for filename in tqdm(df.filename, desc="Extracting OpenSMILE Features", unit="file"):
    features = smile.process_file(AUDIO_PATH / filename)
    feature_list.append(features.mean(axis=0))

Extracting OpenSMILE Features: 100%|██████████| 38095/38095 [54:37<00:00, 11.62file/s]  


### Save the dataframe with the features

In [None]:
rows = []

for index, filename in enumerate(df.filename):
    features = feature_list[index]
    features_str = features.to_string(index=True)
    row_data = {"filename": filename}
    for feature in features_str.split("\n"):  # Split features into lines
        parts = feature.rsplit(maxsplit=1)
        if len(parts) < 2: continue
        feature_name = parts[0].strip()
        feature_value = parts[1].strip()
        row_data[feature_name] = feature_value
    rows.append(row_data)
opensmile_df = pd.DataFrame(rows)
opensmile_df.to_csv("../data/opensmile_features.csv", index=False)

### Read the dataframe with the features

In [175]:
opensmile_df = pd.read_csv("../data/opensmile_features.csv")

In [176]:
df = opensmile_df.merge(labels, on="filename", validate="1:1")

### Remove smoke dataset from training data

In [19]:
filenames_smoke = []
sub_format_smoke = pd.read_csv(f"../literacy-screening-runtime/data/submission_format.csv", index_col="filename")
for filename_smoke in sub_format_smoke.index: filenames_smoke.append(filename_smoke)
df = df[~df['filename'].isin(filenames_smoke)]

### Get the data for training


In [178]:
X = df.drop(columns = ["filename", "score"], axis = 0)
y = df.score

X.shape, y.shape

((38087, 88), (38087,))

### Split the data

In [179]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (30469, 88)
Test set size: (7618, 88)


### Training baseline

In [180]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(n_estimators=100, random_state=42, eval_metric="logloss", verbosity=2, enable_categorical=True)

calibrated_model = CalibratedClassifierCV(xgb_model, cv=3)
calibrated_model.fit(X_train, y_train)

y_pred_proba = calibrated_model.predict_proba(X_test)[:, 1]
logloss = log_loss(y_test, y_pred_proba)
print(f"Log Loss on the test set: {logloss}")


Log Loss on the test set: 0.6054233480238884


In [181]:
calibrated_model.fit(X, y)

### Saving the model

In [182]:
ASSETS_DIR = Path("../assets")
ASSETS_DIR.mkdir(exist_ok=True)

joblib.dump(calibrated_model, ASSETS_DIR / "calibrated_model_benchmark_v01.joblib")

['../assets/calibrated_model_benchmark_v01.joblib']

### Get the smoke dataset

In [3]:
DATA_PATH = Path("../literacy-screening-runtime/data")

In [None]:
sub_format = pd.read_csv(f"{DATA_PATH}/submission_format.csv", index_col="filename")

### Process the data with the same tool

In [8]:
feature_list = []
for filename in sub_format.index:
    features = smile.process_file(DATA_PATH / filename)
    feature_list.append(features.mean(axis=0))

In [9]:
features = pd.DataFrame(feature_list, index=sub_format.index)

### Get the trained model

In [10]:
model = joblib.load("../literacy-screening-runtime/submission_src/assets/calibrated_model_benchmark_v01.joblib")

### Make predictions on the smoke dataset

In [12]:
preds =  model.predict_proba(features)[:, 1]

### Get the real values for the smoke dataset

In [20]:
preds_real = labels[labels['filename'].isin(filenames_smoke)]['score']

### Perform metric check

In [21]:
ll = log_loss(preds_real, preds)
print(f"Log Loss: {ll}")

Log Loss: 0.5069924169443882


In [None]:
preds