In [None]:
import pandas as pd
import numpy as np

# Translation table that deletes the square brackets wrapping each tempo value.
_TEMPO_BRACKETS = str.maketrans('', '', '[]')


def _load_metadata(csv_path):
    """Read one metadata CSV and return it indexed by 'Id' with basic cleaning applied.

    - 'sampling_rate' is dropped (per the original author: same value for all samples).
    - 'path' is dropped (not useful for the regression task).
    - 'tempo' is converted from the string format '[float]' to a float column.
    """
    frame = pd.read_csv(csv_path).set_index('Id')
    frame = frame.drop(columns=['sampling_rate', 'path'])
    frame['tempo'] = frame['tempo'].map(lambda raw: raw.translate(_TEMPO_BRACKETS)).astype('float')
    return frame


df_dev = _load_metadata('data/development.csv')
df_eval = _load_metadata('data/evaluation.csv')

df_dev.head()

Plot age distribution

In [None]:
import matplotlib.pyplot as plt

# Bin edges every 10 units, starting at 0 and extending past the largest age value;
# the same edges are reused as x tick positions.
age_edges = range(0, int(max(df_dev["age"]) + 10), 10)

fig, ax = plt.subplots()
ax.hist(df_dev["age"], bins=age_edges)
ax.set_xticks(age_edges)
ax.set(title="Age Distribution", xlabel="Age", ylabel="Count")
ax.grid(alpha=0.2)

Ethnicity distribution

In [None]:
# Compare the distinct ethnicity labels found in the development and evaluation sets
dev_ethnicities = set(df_dev["ethnicity"])
eval_ethnicities = set(df_eval["ethnicity"])
shared_ethnicities = dev_ethnicities & eval_ethnicities

print(f"Number of ethnicities in development set: {len(dev_ethnicities)}")
print(f"Number of ethnicities in evaluation set: {len(eval_ethnicities)}")
print(f"Number of ethnicities in both sets: {len(shared_ethnicities)}")
print(shared_ethnicities)

In [4]:
# Remove the 'ethnicity' column from both splits (in place, on the existing frames).
for frame in (df_dev, df_eval):
    frame.drop(columns=['ethnicity'], inplace=True)

Gender

In [5]:
# Label -> integer code used to encode the 'gender' column.
# "famale" is mapped to the same code as "female" -- presumably a misspelling
# that occurs in the raw data; verify against the CSVs.
gender = dict(male=1, female=0, famale=0)

In [None]:
from collections import Counter

# Count how often each raw gender label occurs in each split.
gender_dev = Counter(df_dev["gender"])
gender_eval = Counter(df_eval["gender"])

# Overlay both splits as semi-transparent bars on a single axes.
fig, ax = plt.subplots()
for counts, split_label, colour in (
    (gender_dev, "Development", "orange"),
    (gender_eval, "Evaluation", "blue"),
):
    ax.bar(counts.keys(), counts.values(), label=split_label, color=colour, alpha=0.5)
ax.set(title="Gender Distribution", xlabel="Gender", ylabel="Count")
ax.legend()
ax.grid(alpha=0.2, axis="y")

In [7]:
# Encode the gender labels as integers using the `gender` mapping.
# Series.map turns any label that is not a key of the mapping into NaN.
for frame in (df_dev, df_eval):
    frame['gender'] = frame['gender'].map(gender)

In [8]:
import os
import librosa

# Number of leading MFCC rows that are summarised per recording.
mfcc_num = 13


def extract_audio_features(audio_dir):
    """Build one row of summary audio features for every file in ``audio_dir``.

    Each file is loaded at its native sampling rate (``sr=None``) and reduced to:

    * ``duration`` as reported by ``librosa.get_duration``;
    * ``spectral_bandwidth`` and ``spectral_rolloff`` averaged along axis 1.
      These are deliberately kept as 1-element arrays (the result of
      ``.mean(axis=1)``) because a later cell unwraps them;
    * the 95th, 5th and 50th percentiles (along axis 1) of the first
      ``mfcc_num`` rows of the MFCC matrix and of its first- and second-order
      deltas, stored as ``MFCC-<k>-<q>``, ``MFCCD-<k>-<q>``, ``MFCCDD-<k>-<q>``.

    The row ``Id`` is the integer before the first ``.`` of the file name,
    minus one.

    Files that cannot be processed are still skipped (best effort), but a
    warning naming the file and the error is emitted instead of silently
    discarding it, and only ``Exception`` subclasses are caught so that
    interrupting the kernel actually stops the loop.

    Parameters
    ----------
    audio_dir : str
        Directory whose entries are passed to ``librosa.load``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed file, indexed by ``Id`` and sorted
        by index.

    Raises
    ------
    RuntimeError
        If no file in ``audio_dir`` could be processed.
    """
    import warnings  # local import so this cell does not depend on another cell's imports

    rows = []
    for f in os.listdir(audio_dir):
        try:
            y, sr = librosa.load(f"{audio_dir}/{f}", sr=None)
            index = int(f.split(".")[0]) - 1
            # librosa's default number of coefficients is used here; only the
            # first `mfcc_num` rows are summarised below.
            mfcc = librosa.feature.mfcc(y=y, sr=sr)
            duration = librosa.get_duration(y=y, sr=sr)
            mfdd = librosa.feature.delta(mfcc)
            mfddd = librosa.feature.delta(mfcc, order=2)
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr).mean(axis=1)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr).mean(axis=1)

            # Same keys, in the same order, as the original chain of dict
            # comprehensions; each percentile is now computed once per matrix
            # instead of once per coefficient.
            d = {}
            for prefix, coeffs in (("MFCC", mfcc), ("MFCCD", mfdd), ("MFCCDD", mfddd)):
                for q in (95, 5, 50):
                    per_row = np.percentile(coeffs, q, axis=1)
                    for el in range(mfcc_num):
                        d[f"{prefix}-{el+1}-{q}"] = per_row[el]

            rows.append({'Id': index, "duration": duration, "spectral_bandwidth": spectral_bandwidth, "spectral_rolloff": spectral_rolloff, **d})
        except Exception as exc:
            # Keep going, but make the skipped file visible: a skipped file
            # leaves its row without audio features after the later concat.
            warnings.warn(f"Skipping {f!r} in {audio_dir!r}: {exc!r}", RuntimeWarning, stacklevel=2)

    if not rows:
        raise RuntimeError(f"No audio file in {audio_dir!r} could be processed")
    return pd.DataFrame(rows).set_index('Id').sort_index()


df1 = extract_audio_features("data/audios_development")
df2 = extract_audio_features("data/audios_evaluation")

In [9]:
# Append the extracted audio features to the metadata of each split.
# Concatenating along the column axis aligns rows on the frames' index.
df_dev, df_eval = (
    pd.concat([metadata, audio_features], axis=1)
    for metadata, audio_features in ((df_dev, df1), (df_eval, df2))
)

Distribution of the number of words and characters

In [None]:
# 10-wide bins starting at 0; each split gets edges reaching past its own maximum.
words_bin_width = 10
words_edges_dev = range(0, int(max(df_dev["num_words"]) + words_bin_width), words_bin_width)
words_edges_eval = range(0, int(max(df_eval["num_words"]) + words_bin_width), words_bin_width)

fig, ax = plt.subplots()
ax.hist(df_dev["num_words"], bins=words_edges_dev, label="Development", color="orange", alpha=0.5)
ax.hist(df_eval["num_words"], bins=words_edges_eval, label="Evaluation", color="blue", alpha=0.5)
# Tick positions follow the development-set edges.
ax.set_xticks(words_edges_dev)
ax.set(title="Distribution of the number of words", xlabel="Number of words", ylabel="Count")
ax.legend()
ax.grid(alpha=0.2)

In [None]:
# 20-wide bins starting at 0; each split gets edges reaching past its own maximum.
chars_bin_width = 20
chars_edges_dev = range(0, int(max(df_dev["num_characters"]) + chars_bin_width), chars_bin_width)
chars_edges_eval = range(0, int(max(df_eval["num_characters"]) + chars_bin_width), chars_bin_width)

fig, ax = plt.subplots(figsize=(10, 10))
ax.hist(df_dev["num_characters"], bins=chars_edges_dev, label="Development", color="orange", alpha=0.5)
ax.hist(df_eval["num_characters"], bins=chars_edges_eval, label="Evaluation", color="blue", alpha=0.5)
# Tick positions follow the development-set edges.
ax.set_xticks(chars_edges_dev)
ax.set(title="Distribution of the number of characters", xlabel="Number of characters", ylabel="Count")
ax.legend()
ax.grid(alpha=0.2)

In [12]:
def _add_speech_ratio_features(frame):
    """Add three ratio columns to ``frame`` in place.

    - mean_silence: silence_duration divided by num_pauses
    - silence_ratio: silence_duration divided by duration
    - wps: num_words divided by duration
    """
    # NOTE(review): rows with num_pauses == 0 (or duration == 0) would produce
    # inf/NaN here -- confirm whether such rows exist in the data.
    frame['mean_silence'] = frame['silence_duration'] / frame['num_pauses']
    frame['silence_ratio'] = frame['silence_duration'] / frame['duration']
    frame['wps'] = frame['num_words'] / frame['duration']


for frame in (df_dev, df_eval):
    _add_speech_ratio_features(frame)

In [13]:
def _first_scalar(value):
    """Return the first element of ``value`` as a Python float.

    The spectral features are stored per row as 1-element arrays. Going through
    ``np.ravel`` also accepts plain scalars, so the conversion works for:

    * rows that hold NaN instead of an array (e.g. rows without extracted
      audio features after the column-wise concat), where ``x[0]`` would raise
      ``TypeError``;
    * columns that were already converted, which makes re-running this cell safe.
    """
    return float(np.ravel(value)[0])


# Convert the array-valued spectral columns to plain float columns in both splits.
for frame in (df_dev, df_eval):
    for column in ('spectral_bandwidth', 'spectral_rolloff'):
        frame[column] = frame[column].apply(_first_scalar).astype('float')

In [14]:
# Persist the processed splits; the 'Id' index is written as the first column.
for frame, out_path in (
    (df_dev, 'data/development_processed.csv'),
    (df_eval, 'data/evaluation_processed.csv'),
):
    frame.to_csv(out_path)