In [11]:
import pandas as pd
import numpy as np

# Columns removed right after loading:
#   - 'sampling_rate': noted as identical for every sample, so it carries no signal
#   - 'path': a file location, not a predictor for the regression task
df_dev = (
    pd.read_csv('data/development.csv')
    .set_index('Id')
    .drop(columns=['sampling_rate', 'path'])
)
df_eval = (
    pd.read_csv('data/evaluation.csv')
    .set_index('Id')
    .drop(columns=['sampling_rate', 'path'])
)

# 'tempo' arrives as text shaped like '[float]'; remove the brackets, then cast to float
for frame in (df_dev, df_eval):
    frame['tempo'] = frame['tempo'].map(
        lambda raw: raw.replace('[', '').replace(']', '')
    ).astype('float')

df_dev.head()

Unnamed: 0_level_0,age,gender,ethnicity,mean_pitch,max_pitch,min_pitch,jitter,shimmer,energy,zcr_mean,spectral_centroid_mean,tempo,hnr,num_words,num_characters,num_pauses,silence_duration
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,24.0,female,arabic,1821.6906,3999.717,145.43066,0.013795,0.082725,0.002254,0.210093,3112.257251,151.999081,-123.999726,69,281,39,23.846893
1,22.5,female,hungarian,1297.8187,3998.859,145.37268,0.025349,0.096242,0.007819,0.078849,1688.016389,129.199219,-86.928478,69,281,21,19.388662
2,22.0,female,portuguese,1332.8524,3998.8025,145.42395,0.019067,0.119456,0.002974,0.105365,2576.901706,117.453835,-98.45067,69,281,1,21.640998
3,22.0,female,english,1430.3499,3998.451,147.98083,0.017004,0.102389,0.022371,0.173701,3269.751413,117.453835,-56.459762,69,281,9,19.644127
4,22.0,male,dutch,1688.7234,3998.6113,145.44772,0.028027,0.124831,0.005369,0.107279,1930.897375,112.347147,-80.349204,69,281,11,18.041905


In [12]:
# 'ethnicity' is removed from both splits before any further processing
df_dev = df_dev.drop(columns=['ethnicity'])
df_eval = df_eval.drop(columns=['ethnicity'])

In [13]:
# Lookup from the raw gender label to a 0/1 code; the misspelling "famale"
# is mapped to the same code as 'female'
gender = {
    'male': 1,
    'female': 0,
    "famale": 0,
}

# Replace the text labels by their numeric code in both splits
# (labels missing from the lookup come out of .map as NaN)
for frame in (df_dev, df_eval):
    frame['gender'] = frame['gender'].map(gender)

In [14]:
import os
import librosa

mfcc_num = 13  # number of leading MFCC coefficients summarised per clip
_PERCENTILES = (95, 5, 50)  # order fixes the column order: -95, then -5, then -50


def _percentile_columns(prefix, coefficients):
    """Summarise every row of ``coefficients`` by the percentiles in ``_PERCENTILES``.

    Parameters
    ----------
    prefix : str
        Column-name prefix ("MFCC", "MFCCD" or "MFCCDD").
    coefficients : np.ndarray
        2-D array with one row per coefficient; percentiles are taken along axis 1.

    Returns
    -------
    dict
        Maps ``f"{prefix}-{k}-{q}"`` (``k`` is the 1-based row number, ``q`` the
        percentile) to the corresponding value, grouped by percentile first.
    """
    # A single call yields all percentiles for all rows at once:
    # shape (len(_PERCENTILES), n_rows).
    summary = np.percentile(coefficients, _PERCENTILES, axis=1)
    return {
        f"{prefix}-{row + 1}-{q}": summary[q_pos, row]
        for q_pos, q in enumerate(_PERCENTILES)
        for row in range(coefficients.shape[0])
    }


def _clip_features(path, n_mfcc):
    """Compute the feature dict (everything except 'Id') for one audio file.

    Parameters
    ----------
    path : str
        Location of the audio file.
    n_mfcc : int
        How many leading MFCC coefficients to keep.

    Returns
    -------
    dict
        'duration', 'spectral_bandwidth', 'spectral_rolloff' followed by the
        percentile summaries of the MFCCs and of their first- and second-order deltas.
    """
    y, sr = librosa.load(path, sr=None)
    # Only the first n_mfcc rows are ever reported, so they are selected up front.
    # The deltas are taken along the frame axis, row by row, which leaves the
    # retained rows unaffected by the slicing.
    mfcc = librosa.feature.mfcc(y=y, sr=sr)[:n_mfcc]

    features = {
        "duration": librosa.get_duration(y=y, sr=sr),
        "spectral_bandwidth": librosa.feature.spectral_bandwidth(y=y, sr=sr).mean(axis=1).item(),
        "spectral_rolloff": librosa.feature.spectral_rolloff(y=y, sr=sr).mean(axis=1).item(),
    }
    features |= _percentile_columns("MFCC", mfcc)
    features |= _percentile_columns("MFCCD", librosa.feature.delta(mfcc))
    features |= _percentile_columns("MFCCDD", librosa.feature.delta(mfcc, order=2))
    return features


def _audio_feature_frame(directory, n_mfcc=mfcc_num):
    """Build one feature row per readable audio file found in ``directory``.

    Files that cannot be processed (non-numeric name, unreadable audio, clip too
    short for the delta computation, ...) are skipped, as before, but they are
    now reported in a single warning instead of being dropped silently.

    Parameters
    ----------
    directory : str
        Folder containing the audio files; the part of each file name before
        the first '.' must be an integer.
    n_mfcc : int, optional
        Number of leading MFCC coefficients to summarise.

    Returns
    -------
    pd.DataFrame
        Indexed by 'Id' and sorted by it.

    Raises
    ------
    RuntimeError
        If no file in ``directory`` could be processed.
    """
    import warnings

    records, skipped = [], []
    for filename in os.listdir(directory):
        try:
            # The numeric file stem minus one is used as 'Id'
            # (presumably file names are 1-based while the CSV ids start at 0 -- confirm).
            clip_id = int(filename.split(".")[0]) - 1
            record = _clip_features(os.path.join(directory, filename), n_mfcc)
        except Exception as exc:
            # `Exception` rather than a bare `except`, so that Ctrl-C / kernel
            # interrupts still stop this long-running loop.
            skipped.append(f"{filename} ({type(exc).__name__}: {exc})")
            continue
        records.append({"Id": clip_id, **record})

    if skipped:
        preview = "; ".join(skipped[:5])
        remainder = len(skipped) - 5
        suffix = f"; ... and {remainder} more" if remainder > 0 else ""
        warnings.warn(
            f"{len(skipped)} file(s) in {directory!r} were skipped: {preview}{suffix}"
        )
    if not records:
        raise RuntimeError(f"No audio features could be extracted from {directory!r}")

    return pd.DataFrame(records).set_index("Id").sort_index()


df1 = _audio_feature_frame("data/audios_development")
df2 = _audio_feature_frame("data/audios_evaluation")


  y, sr = librosa.load(f"data/audios_development/{f}", sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(f"data/audios_evaluation/{f}", sr=None)


In [15]:
# Put the audio-derived columns next to the tabular ones for each split;
# both sides are indexed by 'Id', so rows are matched on that index.
df_dev, df_eval = (
    pd.concat(pair, axis=1)
    for pair in ([df_dev, df1], [df_eval, df2])
)

In [16]:
def _add_speech_rate_features(frame):
    """Return ``frame`` with the derived columns 'mean_silence', 'silence_ratio' and 'wps'.

    - mean_silence  = silence_duration / num_pauses
    - silence_ratio = silence_duration / duration
    - wps           = num_words / duration

    pandas does not raise on division by zero: it yields +/-inf (or NaN for
    0/0), which would end up in the exported CSV. The zero denominators are
    therefore handled explicitly:

    - a clip with zero pauses gets ``mean_silence = 0.0`` (no pause, hence no
      pause length -- a modelling choice; change it here if NaN is preferred);
    - a clip whose duration is zero gets NaN for both duration-based ratios.

    Rows where an input is genuinely missing keep NaN, as before.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain 'silence_duration', 'num_pauses', 'num_words' and 'duration'.

    Returns
    -------
    pd.DataFrame
        A new frame; the three derived columns are appended (or overwritten if
        they already exist), the input is not modified.
    """
    no_pauses = frame['num_pauses'] == 0
    # Hide zero denominators so the division yields NaN rather than inf.
    pauses = frame['num_pauses'].mask(no_pauses)
    duration = frame['duration'].mask(frame['duration'] == 0)

    return frame.assign(
        mean_silence=(frame['silence_duration'] / pauses).mask(no_pauses, 0.0),
        silence_ratio=frame['silence_duration'] / duration,
        wps=frame['num_words'] / duration,
    )


df_dev = _add_speech_rate_features(df_dev)
df_eval = _add_speech_rate_features(df_eval)

In [17]:
# Write each processed split to its own CSV file (the 'Id' index is written too)
for frame, destination in (
    (df_dev, 'data/development_processed.csv'),
    (df_eval, 'data/evaluation_processed.csv'),
):
    frame.to_csv(destination)