In [1]:
# imports
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from scipy.io import wavfile
from scipy.signal import resample
from tqdm import tqdm
import librosa
from librosa import yin
from librosa.effects import time_stretch
from librosa.effects import trim
from librosa.effects import pitch_shift
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold

In [8]:
## functions def

# Global column groups shared by the encoding and decoding helpers below.
#
# Nominal feature sets that were tried during experimentation (kept for reference):
#   ["gender", "ageRange", "First Language spoken", "Current language used for work/school"]
#   ["gender", "ageRange"]
# The set actually in use is the single column below.
nominal = ["ageRange"]

# NOTE: the trailing space is part of the column name.
ordinal = ["Self-reported fluency level "]


def create_encoders(df_original):
    """Build and fit the encoders used for the categorical columns and the target.

    Three encoders are created:

    * a dense ``OneHotEncoder`` fitted on ``df_original[nominal]``;
    * an ``OrdinalEncoder`` for the fluency level with the fixed order
      ``basic < intermediate < advanced < native``;
    * a ``LabelEncoder`` fitted on a new ``"label"`` column, built by
      concatenating the ``"action"`` and ``"object"`` columns.

    Note that ``df_original`` is modified in place (the ``"label"`` column is
    added) and the same object is returned.

    Returns
    -------
    tuple
        ``(df_original, ohe, fluency_enc, class_enc)``.
    """
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword; fall back to the old spelling only on older releases.
    try:
        ohe = OneHotEncoder(sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(sparse=False)
    ohe.fit(df_original[nominal])

    # The category order is given explicitly, so fitting on the four levels
    # themselves is enough and does not depend on the data.
    fluency_enc = OrdinalEncoder(categories=[["basic", "intermediate", "advanced", "native"]])
    fluency_enc.fit([["basic"], ["intermediate"], ["advanced"], ["native"]])

    # Class label = action and object strings joined without a separator.
    df_original["label"] = df_original["action"] + df_original["object"]
    class_enc = LabelEncoder()
    class_enc.fit(df_original["label"])

    return df_original, ohe, fluency_enc, class_enc


def encode_x(df, ohe, fluency_enc):
    """Encode the categorical columns of ``df`` into a numeric matrix.

    The ``nominal`` columns are one-hot encoded with ``ohe`` and the
    ``ordinal`` columns are encoded with ``fluency_enc``; the two results are
    stacked column-wise, nominal columns first.

    Previously the stacked array was computed but thrown away, so the ordinal
    feature never reached the returned matrix.
    """
    nominal_part = ohe.transform(df[nominal])
    ordinal_part = fluency_enc.transform(df[ordinal])
    return np.hstack((nominal_part, ordinal_part))


def feature_extraction_1(df, db=20):
    """Load the audio files and extract number of samples, max amplitude and pitch.

    For every path in ``df["path"]`` the recording is read with
    ``scipy.io.wavfile``, resampled to 16 kHz if its sample rate differs,
    truncated to at most 64000 samples, trimmed with
    ``librosa.effects.trim(top_db=db)`` and time-stretched with rate
    ``len(audio) / 21000`` (i.e. towards a common length of 21000 samples).

    Parameters
    ----------
    df : DataFrame with a ``"path"`` column of wav file paths.
    db : threshold passed to ``trim`` as ``top_db``.

    Returns
    -------
    tuple of four numpy arrays
        ``(Xtime, samples_per_audio, max_amp_per_audio, pitch_per_audio)``:
        the stretched signals, the sample count after trimming (before
        stretching), the maximum sample value of the signal before it is
        converted to float/truncated/trimmed, and the ``yin`` pitch track of the
        stretched signal.
    """
    target_sr = 16000
    Xtime = []
    samples_per_audio = []
    max_amp_per_audio = []
    pitch_per_audio = []
    for path in tqdm(df["path"].values):
        rate, audio = wavfile.read(path)
        # Any rate other than the target is resampled (the original code only
        # handled 22050 Hz and silently treated every other rate as 16 kHz).
        if rate != target_sr:
            audio = np.array(resample(audio, int(len(audio) * (target_sr / rate))), dtype="int16")
        max_amp_per_audio.append(np.max(audio))
        audio = audio.astype("float32")
        audio = audio[:64000]
        audio = trim(audio, top_db=db)[0]
        samples_per_audio.append(len(audio))
        audio = time_stretch(audio, rate=len(audio) / 21000)
        # fmin/fmax are keyword-only from librosa 0.10 on; keywords also work
        # on older releases.
        pitch_per_audio.append(yin(audio, fmin=50, fmax=300, sr=target_sr))
        Xtime.append(audio)
    return np.array(Xtime), np.array(samples_per_audio), np.array(max_amp_per_audio), np.array(pitch_per_audio)


def feature_extraction_2(Xtime, param={"sr":16000, "n_fft":2048, "hop_length":512, "fmin":50, "n_mfcc":10}):
    """Extract the MFCCs and their first and second order deltas for each signal.

    Parameters
    ----------
    Xtime : iterable of 1-D audio signals.
    param : keyword arguments forwarded to ``librosa.feature.mfcc``.
        The default dict is never modified here.

    Returns
    -------
    numpy array with one row per signal: the flattened MFCC matrix, followed by
    the flattened delta and the flattened second-order delta.
    """
    mfccs = []
    deltas = []
    deltas2 = []
    for audio in Xtime:
        # `y` is keyword-only from librosa 0.10 on; passing it by keyword works
        # on older releases as well.
        mfcc = librosa.feature.mfcc(y=audio, **param)
        delta = librosa.feature.delta(mfcc)
        delta2 = librosa.feature.delta(mfcc, order=2)
        mfccs.append(mfcc.flatten())
        deltas.append(delta.flatten())
        deltas2.append(delta2.flatten())
    coeff = np.hstack((mfccs, deltas, deltas2))
    return coeff


def data_augmentation(df, Xtime, Xenc, sam, max, y):
    """Quadruple the data set by adding three pitch-shifted copies of every signal.

    Signals whose ``"gender"`` is ``"female"`` are shifted down by 2, 4 and 6
    steps; all others are shifted up by 2, 4 and 6 steps. The pitch track is
    recomputed with ``yin`` for the original and shifted signals, while the
    remaining per-sample features and the labels are simply repeated four times
    in the same order.

    Parameters
    ----------
    df : DataFrame with a ``"gender"`` column; row ``i`` (by position) must
        describe ``Xtime[i]``.
    Xtime : 2-D array of equal-length signals.
    Xenc : encoded categorical features, one row per signal.
    sam, max : per-signal sample counts and maximum amplitudes.
        (``max`` shadows the builtin inside this function; the name is kept
        because it is part of the call signature.)
    y : encoded labels.

    Returns
    -------
    tuple
        ``(Xtime, Xenc, sam, max, pit, y)`` for the augmented set.
    """
    steps = (2, 4, 6)
    # Positional lookup: rows are matched to signals by order, so this also
    # works when df does not carry a default 0..n-1 index.
    genders = df["gender"].to_numpy()
    shifted = {step: [] for step in steps}
    for i, audio in enumerate(Xtime):
        direction = -1 if genders[i] == "female" else 1
        signal = audio.astype("float32")
        for step in steps:
            shifted[step].append(pitch_shift(signal, sr=16000, n_steps=direction * step))
    Xtime = np.vstack((Xtime, shifted[2], shifted[4], shifted[6]))
    # fmin/fmax are keyword-only from librosa 0.10 on.
    pit = np.array([yin(audio, fmin=50, fmax=300, sr=16000) for audio in Xtime])
    Xenc = np.vstack((Xenc, Xenc, Xenc, Xenc))
    sam = np.hstack((sam, sam, sam, sam))
    max = np.hstack((max, max, max, max))
    y = np.hstack((y, y, y, y))
    return Xtime, Xenc, sam, max, pit, y

In [9]:
# plot of MFCC.png
import matplotlib.pyplot as plt
import librosa.display
import seaborn as sns
# NOTE: everything between the triple quotes below is a plain string literal,
# so none of it is executed. Only the imports above and the final `pass` run.
# The disabled code loads development.csv, computes the MFCCs and their two
# deltas for the recording at position 506, draws them in three stacked panels
# and saves the figure as mfcc.png.
"""
par = {"sr":16000, "n_fft":2048, "hop_length":512, "fmin":50, "n_mfcc":10}
config = {'C': 2, 'gamma': 'auto'}
# df = pd.read_csv("/content/drive/MyDrive/project/dsl_data/development.csv")
df = pd.read_csv("development.csv")
Xtime, samples_per_audio, max_amp_per_audio, pitch_per_audio = feature_extraction_1(df)

mfcc = librosa.feature.mfcc(Xtime[506], **par)
mfcc_delta = librosa.feature.delta(mfcc)
mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
fig, ax = plt.subplots(nrows=3, sharex=True, sharey=False)
img1 = librosa.display.specshow(mfcc, ax=ax[0], x_axis='ms', y_axis='mel')
ax[0].set(title='MFCC')
ax[0].label_outer()
img2 = librosa.display.specshow(mfcc_delta, ax=ax[1], x_axis='ms', y_axis='mel')
ax[1].set(title=r'MFCC-$\Delta$')
ax[1].label_outer()
img3 = librosa.display.specshow(mfcc_delta2, ax=ax[2], x_axis='ms', y_axis='mel')
ax[2].set(title=r'MFCC-$\Delta$-$\Delta$')
ax[2].label_outer()


plt.tight_layout()
fig.colorbar(img1, ax=[ax[0]])
fig.colorbar(img2, ax=[ax[1]])
fig.colorbar(img3, ax=[ax[2]])
plt.savefig(f"mfcc.png")

plt.show()
"""
# No-op statement closing the cell.
pass

In [10]:
from scipy.fft import fft
# NOTE: the block below is a string literal and is never executed; only the
# import above and the final `pass` run. The disabled code reads the recording
# at position 506 of df["path"], takes the first 8000 FFT bins, converts the
# squared magnitude to dB and saves the resulting line plot as fft.png.
"""makeitcooler = wavfile.read(df["path"].values[506])[1]
# makeitcooler = makeitcooler/np.max(makeitcooler)
makefft = fft(makeitcooler)[:8000]
amp_fft = np.abs(makefft**2)
db_fft = 20*np.log10(amp_fft)
plt.figure(figsize=(6,3))
plt.plot(db_fft, linewidth=0.2)
plt.xlim((-50,8050))
plt.grid(alpha=0.2)
plt.title("Energy spectrum density")
plt.xlabel("Frequency (Hz)")
plt.ylabel("Magnitude (db)")
plt.tight_layout()

plt.savefig(f"fft.png")"""
# No-op statement closing the cell.
pass

In [11]:
# NOTE: the block below is a string literal and is never executed. The disabled
# code re-loads every recording (resampling 22050 Hz files to 16 kHz) without
# trimming or time-stretching, collecting the signals in Xtime and their
# lengths in samples_per_audio.
"""Xtime = []
samples_per_audio = []
for i, files in tqdm(enumerate(df["path"].values)):
    audio = wavfile.read(files)
    if audio[0]==22050:
        audio = np.array(resample(audio[1], int(len(audio[1]) * (16000 / 22050))), dtype="int16")
    else:
        audio = audio[1]
    audio = audio.astype("float32")
    # audio = trim(audio, top_db=db)[0]
    samples_per_audio.append(len(audio))
    # audio = time_stretch(audio, rate=len(audio)/21000)
    Xtime.append(audio)
"""
# No-op statement closing the cell.
pass

In [12]:
# NOTE: both blocks below are string literals and are never executed.
# First disabled snippet: histogram of recording durations (len / 16000 seconds)
# with the average marked, saved as distr.png.
"""plt.hist([len(audio)/16000 for audio in Xtime], bins=100)
plt.grid(alpha=0.2)
avg = sum([len(audio)/16000 for audio in Xtime])/len(Xtime)
plt.axvline(x = avg, color='orange', linestyle="--", label="average duration")
plt.xlabel("time (s)")
plt.legend()
plt.title("Distribution of duration of recordings before trimming")
plt.savefig("distr.png")
plt.show()
print(avg)
"""
# Second disabled snippet: the same histogram after trimming each signal with
# top_db=20 (shown only, not saved).
"""Xtrim = []
for audio in Xtime:
    audio_trim = trim(audio, top_db=20)[0]
    Xtrim.append(audio_trim)
plt.hist([len(audio)/16000 for audio in Xtrim], bins=100)
plt.grid(alpha=0.2)
avg = sum([len(audio)/16000 for audio in Xtrim])/len(Xtrim)
plt.axvline(x = avg, color='orange', linestyle="--", label="average duration")
plt.xlabel("time (s)")
plt.legend()
plt.title("Distribution of duration of recordings after trimming")
plt.show()
print(avg)"""
# No-op statement closing the cell.
pass

In [13]:
## X train: preprocessing and model fitting
# Fixed seed so the shuffle and the forest are reproducible across runs.
SEED = 42
par = {'sr': 16000, 'n_fft': 2048, 'hop_length': 512, 'fmin': 50, 'n_mfcc': 10}
# Configuration of an earlier SVC experiment, kept for reference: {'C': 2}
config = {'bootstrap': False, 'class_weight': 'balanced', 'criterion': 'gini',
          'max_depth': None, 'max_features': 'log2', 'min_samples_split': 10, 'n_estimators': 1000}

# df = pd.read_csv("/content/drive/MyDrive/project/dsl_data/development.csv")
df = pd.read_csv("development.csv")
# Optional filter (disabled): drop categories not present in the evaluation set
# df = df[df["Self-reported fluency level "]=="native"]
# df = df[df['ageRange']!="65+"]
# df = df[df['Current language used for work/school']=="English (United States)"]

# shuffle the whole data set (lower `frac` to work on a fraction of it);
# reset_index keeps row i aligned with the i-th extracted signal
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# create encoders
df, ohe, fluency_enc, class_enc = create_encoders(df)
print("----- Preprocessing X train -----")
# encode training set
Xtrain_encoded = encode_x(df, ohe, fluency_enc)
# LabelEncoder expects a 1-D target, so pass the Series rather than a frame
ytrain = class_enc.transform(df["label"])
Xtime, samples_per_audio, max_amp_per_audio, pitch_per_audio = feature_extraction_1(df)
Xtime_train, Xencoded_train, samples_per_audio_train, max_amp_per_audio_train, pitch_per_audio_train, y_train = \
            data_augmentation(df, Xtime, Xtrain_encoded, samples_per_audio, max_amp_per_audio, ytrain)
mfcc_delta_train = feature_extraction_2(Xtime_train, par)
# Column order must match the one used for the evaluation set.
X_train = np.hstack((
    mfcc_delta_train, Xencoded_train,
    np.array(samples_per_audio_train)[:, np.newaxis],
    np.array(max_amp_per_audio_train)[:, np.newaxis],
    pitch_per_audio_train
    ))
std = StandardScaler()
X_train = std.fit_transform(X_train)
# train the random forest (the variable keeps its historical name `svc`
# because later cells refer to it)
print("----- Training Model -----")
svc = RandomForestClassifier(random_state=SEED, **config)
svc.fit(X_train, y_train)


----- Preprocessing X train -----


9854it [03:11, 51.44it/s]


----- Training Model -----


RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       max_features='log2', min_samples_split=10,
                       n_estimators=1000)

In [14]:
# NOTE: the block below is a string literal and is never executed. The disabled
# code plots a 100-bin histogram of the signal lengths in Xtime.
"""durations = [len(audio) for audio in Xtime]
plt.hist(durations,bins=100)"""
# No-op statement closing the cell.
pass

In [15]:
# X test
# The evaluation set goes through the same steps as the training set, reusing
# the encoders, MFCC parameters and scaler that were prepared earlier.
print("----- Preprocessing X test -----")
df_test = pd.read_csv("evaluation.csv")
Xtest_encoded = encode_x(df_test, ohe, fluency_enc)
Xtime_t, sam_t, max_t, pit_t = feature_extraction_1(df_test)
mfcc_d_t = feature_extraction_2(Xtime_t, par)
# Assemble the columns in the training order, then standardise in one go.
X_test = std.transform(
    np.concatenate(
        [
            mfcc_d_t,
            Xtest_encoded,
            np.reshape(sam_t, (-1, 1)),
            np.reshape(max_t, (-1, 1)),
            pit_t,
        ],
        axis=1,
    )
)
print("----- Predicting X test -----")
y_pred = svc.predict(X_test)

----- Preprocessing X test -----


1455it [00:26, 55.20it/s]


----- Predicting X test -----


In [16]:
# Write the submission file: one row per evaluation Id with the decoded label.
import os  # only needed here, to make sure the output folder exists

submission_path = "submission/g18.csv"
# Decode into a new name so the cell can be re-run: overwriting y_pred with
# strings would make a second inverse_transform fail.
y_pred_labels = class_enc.inverse_transform(y_pred)
final = pd.DataFrame({"Id": [int(i) for i in df_test["Id"]], "Predicted": y_pred_labels})
final = final.sort_values(by=["Id"])
# to_csv does not create missing directories.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
final.to_csv(submission_path, index=False)
print("------ ALL DONE ------")

------ ALL DONE ------
