# Import

In [None]:
# Environment setup: upgrade librosa, install pysndfx, and install sox through apt
# (presumably the command-line backend pysndfx drives -- TODO confirm).
# NOTE(review): no versions are pinned, so the APIs used below depend on
# whatever release is current when this cell runs.
!pip3 install librosa --upgrade
!pip install pysndfx --upgrade
!apt install sox --upgrade

In [None]:
import ast
import os
import librosa
import librosa.display
import pandas as pd
import numpy as np
import seaborn as sns
import zipfile
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

from google.colab import drive

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from pysndfx import AudioEffectsChain
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope

# Mount Google Drive at /content/drive/; the archives and cached CSVs are read from "My Drive/DSL_Data".
drive.mount('/content/drive/')


In [None]:
# Show which librosa release is installed (the install cell upgrades to an unpinned version).
print(librosa.__version__)

In [None]:
def from_np_array(array_string):
    """Parse the printed form of a NumPy array (e.g. from a CSV cell) back into an array.

    The whitespace-separated items are rewritten as a comma-separated Python
    literal, which is evaluated with ``ast.literal_eval`` and wrapped in
    ``np.array``.
    """
    # Remove the padding after an opening bracket so the first element stays
    # attached to it when the text is split on whitespace.
    compact = array_string.replace('[ ', '[')
    tokens = compact.split()
    literal = ','.join(tokens)
    parsed = ast.literal_eval(literal)
    return np.array(parsed)

# Data structure

In [None]:
# Column layout of the development table: 40 MFCC columns, 12 chroma columns,
# four scalar spectral descriptors, then the class label.
mfcc_header = np.array(["mfccs" + str(k + 1) for k in range(40)])
chroma_names = ["chroma" + str(k + 1) for k in range(12)]
scalar_names = ["zero_crossing", "spectral_bandwidth", "spectral_centroids", "roll_off"]

dev_headers = np.array([*mfcc_header, *chroma_names, *scalar_names, "label"])

# Empty frame that fixes the column order used by the extraction cells.
dfDevelopment = pd.DataFrame(columns=dev_headers)

In [None]:
# Column layout of the evaluation table: the same 56 feature columns as the
# development table, followed by the clip identifier instead of a label.
mfcc_header = np.array(["mfccs" + str(k + 1) for k in range(40)])
chroma_names = ["chroma" + str(k + 1) for k in range(12)]
scalar_names = ["zero_crossing", "spectral_bandwidth", "spectral_centroids", "roll_off"]

eval_headers = np.array([*mfcc_header, *chroma_names, *scalar_names, "Id"])

# Empty frame that fixes the column order used by the extraction cells.
dfEvaluation = pd.DataFrame(columns=eval_headers)

# Data Exploration

## Take data

Extract the development archive, load each audio file, and compute its features (MFCCs, chroma, zero crossings, spectral bandwidth, spectral centroid and roll-off) with the Librosa library.

In [None]:
# Extract the development data from the Drive zip into a temporary folder.
# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile("/content/drive/My Drive/DSL_Data/development.zip", 'r') as zip_ref:
    zip_ref.extractall("/dev_tmp")

In [None]:
if(os.path.isfile('drive/My Drive/DSL_Data/dev.csv') == False ):
  for folder in os.listdir("/dev_tmp/development"):
    new_path = "/dev_tmp/development/" + folder
    for file in os.listdir(new_path):
      name = file.split(".")[0]
      data,sample_rate=librosa.load(new_path + "/" + file)

      extractedMFCC = np.mean(librosa.feature.mfcc(y=data,sr=sample_rate,n_mfcc=40).T,axis=0)
      S, phase = librosa.magphase(librosa.stft(data))
      rolloff = np.mean(librosa.feature.spectral_rolloff(S = S, sr = sample_rate).T, axis = 0)
      spectral_centroids = np.mean(librosa.feature.spectral_centroid(data, sr = sample_rate).T, axis = 0)
      spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(data, sr = sample_rate).T, axis = 0)
      zero_crossings = np.sum(librosa.zero_crossings(data, pad=False), axis = 0)
      chromagram = np.mean(librosa.feature.chroma_stft(data, sr = sample_rate).T,axis = 0)

      feature=np.hstack((extractedMFCC,chromagram,zero_crossings,spectral_bandwidth,spectral_centroids,rolloff,folder)).reshape(1,57)
      dfDevelopment = dfDevelopment.append(pd.DataFrame(feature,columns=dev_headers))

  dfDevelopment.to_csv('dev.csv',index=False)
  !cp dev.csv "drive/My Drive/DSL_Data"
else:
  dfDevelopment = pd.read_csv('drive/My Drive/DSL_Data/dev.csv')

In [None]:
dfDevelopment.head()

Extract the evaluation archive, load each audio file, and compute the same features with the Librosa library.

In [None]:
# Extract the evaluation data from the Drive zip into a temporary folder.
# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile("/content/drive/My Drive/DSL_Data/evaluation.zip", 'r') as zip_ref:
    zip_ref.extractall("/ev_tmp")

In [None]:
if(os.path.isfile('drive/My Drive/DSL_Data/eval.csv') == False ):
  for o in os.listdir("/ev_tmp/evaluation"):
    name = o.split(".")[0]
    data,sample_rate=librosa.load("/ev_tmp/evaluation/" + o)
    #data = librosa.util.normalize(data)

    extractedMFCC = np.mean(librosa.feature.mfcc(y=data,sr=sample_rate,n_mfcc=40).T,axis=0)
    S, phase = librosa.magphase(librosa.stft(data))
    rolloff = np.mean(librosa.feature.spectral_rolloff(S = S, sr = sample_rate).T, axis = 0)
    spectral_centroids = np.mean(librosa.feature.spectral_centroid(data, sr = sample_rate).T, axis = 0)
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(data, sr = sample_rate).T, axis = 0)
    zero_crossings = np.sum(librosa.zero_crossings(data, pad=False), axis = 0)
    chromagram = np.mean(librosa.feature.chroma_stft(data, sr = sample_rate).T,axis = 0)

    feature=np.hstack((extractedMFCC,chromagram,zero_crossings,spectral_bandwidth,spectral_centroids,rolloff, name)).reshape(1,57)
    dfEvaluation = dfEvaluation.append( pd.DataFrame(feature, columns= eval_headers))
  dfEvaluation.to_csv('eval.csv',index=False)

  !cp eval.csv "drive/My Drive/DSL_Data"
else:
  dfEvaluation = pd.read_csv('drive/My Drive/DSL_Data/eval.csv')

In [None]:
dfEvaluation.head()

## Labels count-plot

In [None]:
# Class balance of the development set. The column is passed as `x=` because
# seaborn >= 0.12 only accepts it by keyword.
sns.countplot(x='label', data=dfDevelopment)
plt.title('Label distribuition among the dataset', fontsize=14)
plt.show()

## Outliers detection

In [None]:
# Duration of every development clip, collected in directory-listing order.
durations = []
for folder in os.listdir("/dev_tmp/development"):
  new_path = "/dev_tmp/development/" + folder
  if folder == 'desktop.ini':
    continue
  for file in os.listdir(new_path):
    name = file.split(".")[0]
    if name == 'desktop':
      continue
    durations.append(librosa.get_duration(filename= new_path +  '/' + name + '.wav'))
length = np.array(durations, dtype=float)

In [None]:
# Mean, longest and shortest clip duration, printed in that order.
for summarise in (np.mean, np.max, np.min):
  print( summarise(length) )

In [None]:
result = np.where(length >= 0.45)

In [None]:
a = pd.Series(result)

In [None]:
dfDevelopment = dfDevelopment.take( a[0] )

In [None]:
dfDevelopment.describe(percentiles=[.25,.5,.75])

In [None]:
# Keep only the rows whose roll_off value is not zero.
dfDevelopment = dfDevelopment.loc[dfDevelopment['roll_off'] != 0]

In [None]:
# Flag outliers with Local Outlier Factor on the feature columns only; the
# result holds -1 for rows flagged as outliers and 1 for rows that are kept.
# Alternatives that were tried: IsolationForest(random_state=42), EllipticEnvelope().
devFeatures = dfDevelopment.drop('label', axis=1)
outDec = LocalOutlierFactor(n_neighbors=300).fit_predict(devFeatures)

In [None]:
c = pd.Series(outDec)
dfDevelopment = dfDevelopment.reset_index( )

dfDevelopment[c == -1].describe() 

In [None]:
dfDevelopment[c == -1].groupby('label').count()

In [None]:
dfDevelopment = dfDevelopment[c == 1].drop('index', axis = 1)

In [None]:
dfDevelopment.head()

In [None]:
# Class balance after outlier removal. The column is passed as `x=` because
# seaborn >= 0.12 only accepts it by keyword.
sns.countplot(x='label', data=dfDevelopment)
plt.title('Label distribuition among the dataset', fontsize=14)
plt.show()

# Preprocessing


## Noise Reduction

This part must be run in place of the "Take data" part: it rebuilds the feature tables from noise-reduced audio.

In [None]:
def reduce_noise_centroid_s(y, sr):
    """Filter a signal with shelf filters whose corners come from its spectral centroid.

    A low-shelf and a high-shelf filter (both -12 dB, slope 0.5) followed by a
    limiter (gain 6.0) are applied to ``y`` through a pysndfx effects chain.

    Parameters
    ----------
    y : audio samples, as accepted by ``librosa.feature.spectral_centroid``.
    sr : sampling rate of ``y``.

    Returns
    -------
    The output of the effects chain applied to ``y``.
    """
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)

    # NOTE(review): min(..., 100) means the high-shelf corner can never exceed
    # 100, while the low-shelf corner is at least 100 -- confirm this is the
    # intended band and that the high threshold was not meant to use max().
    high_corner = min(np.max(centroid), 100)
    low_corner = max(np.min(centroid), 100)

    chain = AudioEffectsChain()
    chain = chain.lowshelf(gain=-12.0, frequency=low_corner, slope=0.5)
    chain = chain.highshelf(gain=-12.0, frequency=high_corner, slope=0.5)
    chain = chain.limiter(gain=6.0)

    return chain(y)

In [None]:
if(os.path.isfile('drive/My Drive/DSL_Data/cleaned_dev.csv') == False ):
  for folder in os.listdir("/dev_tmp/development"):
    new_path = "/dev_tmp/development/" + folder
    for file in os.listdir(new_path):
      name = file.split(".")[0]
      data,sample_rate=librosa.load(new_path + "/" + file)
      data = reduce_noise_centroid_s(data, sample_rate)

      extractedMFCC = np.mean(librosa.feature.mfcc(y=data,sr=sample_rate,n_mfcc=40).T,axis=0)
      S, phase = librosa.magphase(librosa.stft(data))
      rolloff = np.mean(librosa.feature.spectral_rolloff(S = S, sr = sample_rate).T, axis = 0)
      spectral_centroids = np.mean(librosa.feature.spectral_centroid(data, sr = sample_rate).T, axis = 0)
      spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(data, sr = sample_rate).T, axis = 0)
      zero_crossings = np.sum(librosa.zero_crossings(data, pad=False), axis = 0)
      chromagram = np.mean(librosa.feature.chroma_stft(data, sr = sample_rate).T,axis = 0)

      feature=np.hstack((extractedMFCC,chromagram,zero_crossings,spectral_bandwidth,spectral_centroids,rolloff,folder)).reshape(1,57)
      dfDevelopment = dfDevelopment.append(pd.DataFrame(feature,columns=headers))


  dfDevelopment.to_csv('cleaned_dev.csv',index=False)
  !cp cleaned_dev.csv "drive/My Drive/DSL_Data"
else:
  dfDevelopment = pd.read_csv('drive/My Drive/DSL_Data/cleaned_dev.csv', converters={'features': from_np_array}) 

In [None]:
if(os.path.isfile('drive/My Drive/DSL_Data/cleaned_eval.csv') == False ):
  dfEvaluation = pd.DataFrame(columns=['Id', 'features'])  

  for o in os.listdir("/ev_tmp/evaluation"):
    name = o.split(".")[0]
    data,sample_rate=librosa.load("/ev_tmp/evaluation/" + o)
    data = reduce_noise_centroid_s(data, sample_rate)

    extractedMFCC = np.mean(librosa.feature.mfcc(y=data,sr=sample_rate,n_mfcc=40).T,axis=0)
    S, phase = librosa.magphase(librosa.stft(data))
    rolloff = np.mean(librosa.feature.spectral_rolloff(S = S, sr = sample_rate).T, axis = 0)
    spectral_centroids = np.mean(librosa.feature.spectral_centroid(data, sr = sample_rate).T, axis = 0)
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(data, sr = sample_rate).T, axis = 0)
    zero_crossings = np.sum(librosa.zero_crossings(data, pad=False), axis = 0)
    chromagram = np.mean(librosa.feature.chroma_stft(data, sr = sample_rate).T,axis = 0)
    feature=np.hstack((extractedMFCC,chromagram,zero_crossings,spectral_bandwidth,spectral_centroids,rolloff, name)).reshape(1,57)
    dfEvaluation = dfEvaluation.append( pd.DataFrame(feature, columns= dfEvaluation.columns.values))
    
  dfEvaluation.to_csv('cleaned_eval.csv',index=False)
  !cp cleaned_eval.csv "drive/My Drive/DSL_Data"
else:
  dfEvaluation = pd.read_csv('drive/My Drive/DSL_Data/cleaned_eval.csv', converters={'features': from_np_array})

### Visualization noise reduction

In [None]:
# Load one example clip and pass it through the noise-reduction chain so the
# two signals can be compared.
example_wav = "drive/My Drive/DSL_Data/EsempioNoiseReduction/0e1c80f230c3bac84e834abc1e8055dd2e7487fd317f90ad84eed42167e1eb9f.wav"
data, sample_rate = librosa.load(example_wav)
data_nr = reduce_noise_centroid_s(data, sample_rate)

In [None]:
# librosa 0.10 removed `waveplot`; use its replacement `waveshow` when it
# exists and fall back to `waveplot` on older releases.
plot_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot

# Original and noise-reduced waveforms drawn on the same axes.
plt.figure()
plot_wave(data, sr=sample_rate)
plot_wave(data_nr, sr=sample_rate)
plt.title('Monophonic')

## Standardization

In [None]:
scaler = StandardScaler()

In [None]:
featuresDev = dfDevelopment.loc[:, dfDevelopment.columns != 'label']

In [None]:
featuresEv = dfEvaluation.loc[:, dfEvaluation.columns != 'Id']

In [None]:
featuresEv

In [None]:
scaler = scaler.fit(featuresDev)

In [None]:
scaledDevDF = scaler.transform(featuresDev)

In [None]:
scaledEvDF = scaler.transform(featuresEv )

## Features Selection

In [None]:
kbest = SelectKBest(f_classif, k=45).fit(scaledDevDF, dfDevelopment['label'])

In [None]:
scaledDevDF = kbest.transform(scaledDevDF)
scaledEvDF = kbest.transform(scaledEvDF)

In [None]:
kbest.get_support()

In [None]:
for i, el in enumerate( kbest.get_support() ):
  if el == True:
    print(dev_headers[i])

## Dimensionality Reduction

In [None]:
dfPCA_dev = PCA(.97).fit( scaledDevDF )

In [None]:
pcaDev = dfPCA_dev.transform(  scaledDevDF )

In [None]:
pcaEv = dfPCA_dev.transform(  scaledEvDF )

In [None]:
plt.plot(np.cumsum(dfPCA_dev.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

# Classification algorithms

In [None]:
# Hold out 33% of the development set for validation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    pcaDev,
    dfDevelopment['label'].tolist(),
    test_size=0.33,
    random_state=42,
)

### Random Forest

In [None]:
# Random Forest used to predict the speaker from the constructed dataset.
# The seed makes the forest, and therefore the reported score and the
# submission, repeatable; it matches the seed of the train/test split.
clfRF = RandomForestClassifier(criterion='entropy', random_state=42)
dfRFC = dfEvaluation.copy()

In [None]:
# Validation score: fit on the training split only. f1_score takes the ground
# truth first and the predictions second.
clfRF.fit(X_train, y_train)
print(metrics.f1_score(y_test, clfRF.predict(X_test), average='macro'))

# Submission model: a separate estimator with the same settings, trained on
# the whole development set. clfRF stays fitted on the training split, so the
# confusion matrix drawn on X_test is not computed on data the model has seen.
clfRF_full = RandomForestClassifier(**clfRF.get_params())
res = clfRF_full.fit(pcaDev, dfDevelopment['label'].tolist()).predict( pcaEv )
dfRFC['Predicted'] = res

In [None]:
dfRFC[['Id','Predicted']].to_csv('RF_solutions.csv',index=False)
!cp RF_solutions.csv "drive/My Drive/DSL_Data"

In [None]:
# `plot_confusion_matrix` was removed in scikit-learn 1.2. Keep the name
# available for the plotting cells by falling back to
# ConfusionMatrixDisplay.from_estimator, which takes the same
# (estimator, X, y) arguments and the cmap / ax / normalize keywords.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Draw the confusion matrix of a fitted estimator on (X, y_true).

        Stand-in for the removed ``sklearn.metrics.plot_confusion_matrix``;
        keyword arguments are forwarded unchanged and the resulting
        ``ConfusionMatrixDisplay`` is returned.
        """
        return metrics.ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

In [None]:
# Confusion matrix of the Random Forest on the held-out split, normalised
# over the true labels.
fig, ax = plt.subplots(figsize=(15, 15))
cm_options = dict(cmap=plt.cm.Blues, ax=ax, normalize='true')
plot_confusion_matrix(clfRF, X_test, y_test, **cm_options)
plt.show()

### Support vector machine

In [None]:
# RBF-kernel support vector classifier with fixed hyper-parameters.
clfSVC = svm.SVC(C=100, gamma=.01, kernel='rbf')

In [None]:
# Validation macro-F1 of the SVM, fitted on the training split only.
# f1_score takes the ground truth first and the predictions second.
clfSVC.fit(X_train, y_train)
print(metrics.f1_score(y_test, clfSVC.predict(X_test), average='macro'))

In [None]:
# Confusion matrix of the SVM on the held-out split, normalised over the
# true labels.
fig, ax = plt.subplots(figsize=(15, 15))
cm_options = dict(cmap=plt.cm.Blues, ax=ax, normalize='true')
plot_confusion_matrix(clfSVC, X_test, y_test, **cm_options)
plt.show()

#### Fine tuning

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [None]:
hypers = { 'C': np.logspace(-1,2,4), 'gamma': np.logspace(-2,-1,2), 'kernel' : ['rbf', 'poly', 'linear']}
clfSVC = svm.SVC()
dfSVM = dfEvaluation.copy()

In [None]:
gridSVM = GridSearchCV(clfSVC, hypers, cv = 3)

In [None]:
gridSVM.fit(pcaDev, dfDevelopment['label'].tolist())

In [None]:
print("The best parameters are %s with a score of %0.2f"
      % (gridSVM.best_params_, gridSVM.best_score_))

In [None]:
gridResult = pd.concat([pd.DataFrame(gridSVM.cv_results_["params"]),pd.DataFrame(gridSVM.cv_results_["mean_test_score"], columns=["Accuracy"])],axis=1)

In [None]:
print(gridResult.to_string()) 

In [None]:
dfSVM.head()

In [None]:
dfSVM[['Id','Predicted']].to_csv('SVM_solutions.csv',index=False)
!cp SVM_solutions.csv "drive/My Drive/DSL_Data"

### KNN

In [None]:
clfKNN = KNeighborsClassifier(n_neighbors = 7)
dfKNN = dfEvaluation.copy()

In [None]:
print(metrics.f1_score(clfKNN.fit(X_train, y_train).predict( X_test ), y_test,average='macro'))

# res = clfKNN.fit(pcaDev, dfDevelopment['label'].tolist()).predict( pcaEv )
# dfKNN['Predicted'] = res

In [None]:
dfSVM[['Id','Predicted']].to_csv('KNN_solutions.csv',index=False)
!cp KNN_solutions.csv "drive/My Drive/DSL_Data"

In [None]:
# Confusion matrix of the KNN classifier on the held-out split, normalised
# over the true labels. The estimator plotted is clfKNN: clfSVC belongs to
# the SVM section and is an unfitted SVC by this point.
fig, ax = plt.subplots(figsize=(15, 15))
plot_confusion_matrix(clfKNN, X_test, y_test, cmap = plt.cm.Blues, ax = ax, normalize = 'true') 
plt.show()

### Logistic Regression

In [None]:
clfLR = LogisticRegression(max_iter = 100, C = 10, solver = 'newton-cg')
dfLR = dfEvaluation.copy()

In [None]:
print(metrics.f1_score(clfLR.fit(X_train, y_train).predict( X_test ), y_test,average='macro'))

#res = clfLR.fit(pcaDev, dfDevelopment['label'].tolist()).predict( pcaEv )
#dfLR['Predicted'] = res

In [None]:
dfSVM[['Id','Predicted']].to_csv('LR_solutions.csv',index=False)
!cp LR_solutions.csv "drive/My Drive/DSL_Data"

In [None]:
# Confusion matrix of the logistic regression on the held-out split,
# normalised over the true labels.
fig, ax = plt.subplots(figsize=(15, 15))
cm_options = dict(cmap=plt.cm.Blues, ax=ax, normalize='true')
plot_confusion_matrix(clfLR, X_test, y_test, **cm_options)
plt.show()

### Naive Bayes

In [None]:
clfNB = GaussianNB()
dfNB = dfEvaluation.copy()

In [None]:
print(metrics.f1_score(clfNB.fit(X_train, y_train).predict( X_test ), y_test,average='macro'))

# res = clfNB.fit(pcaDev, dfDevelopment['label'].tolist()).predict( pcaEv )
# dfNB['Predicted'] = res

In [None]:
dfNB[['Id','Predicted']].to_csv('NB_solutions.csv',index=False)
!cp NB_solutions.csv "drive/My Drive/DSL_Data"

In [None]:
# Confusion matrix of the naive Bayes classifier on the held-out split,
# normalised over the true labels.
fig, ax = plt.subplots(figsize=(15, 15))
cm_options = dict(cmap=plt.cm.Blues, ax=ax, normalize='true')
plot_confusion_matrix(clfNB, X_test, y_test, **cm_options)
plt.show()