# LightGBM vs XGBoost

In [6]:
import json

import numpy as np  
import random

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, train_test_split
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, accuracy_score, f1_score, fbeta_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# carica la struttura dati
with open('data_structure.json') as json_file:
    ds = json.load(json_file)

In [3]:
# prendi le label del dizionario e rimescolale
keys = list(ds.keys())
random.shuffle(keys)

shuffled_ds = {}
for key in keys:
    shuffled_ds.update({key: ds[key]})

In [4]:
# separa le labels(y) dai vettori di embedding (x)
labels = list(shuffled_ds.keys())
embeddings = list(shuffled_ds.values())

# delle label prendi solo il nome dello speaker andando ad eliminare la parte '-numerofile.wav'
for i in range(len(labels)):
    sep = '-'
    labels[i] = labels[i].split(sep, 1)[0]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.33, random_state=42)

n_train_samples = len(X_train)
n_test_samples = len(X_test)

### XGBoost

In [9]:
clf = XGBClassifier(n_estimators=100,learning_rate=0.03,eval_metric='mae')

In [10]:
clf.fit(np.reshape(np.array(X_train), [n_train_samples,512]), y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mae',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.03, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [11]:
y_pred = clf.predict(np.reshape(np.array(X_test), [n_test_samples,512]))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
fb = fbeta_score(y_test, y_pred, 0.5, average='weighted')

In [12]:
print('Accuracy Score is: ', accuracy)
print('F1 score is: ', f1)
print('Fbeta Score is: ', fb)

Accuracy Score is:  0.9191919191919192
F1 score is:  0.9190137628517716
Fbeta Score is:  0.9201854207113762


In [17]:
# stampa tutte le classificazioni errate sul test set (360 samples)
for i in range(0,len(y_test)):
    if y_test[i] != y_pred[i]:
        print('Speaker is: ', y_test[i])
        print('Prediction is: ', y_pred[i])

Speaker is:  Roberta_S
Prediction is:  Luigi_Saetta
Speaker is:  Luigi_Saetta
Prediction is:  Jens_Stoltenberg
Speaker is:  Lorenzo_DeMarchis
Prediction is:  Jens_Stoltenberg
Speaker is:  Roberta_S
Prediction is:  Luigi_Saetta
Speaker is:  Lorenzo_DeMarchis
Prediction is:  Jens_Stoltenberg
Speaker is:  Lorenzo_DeMarchis
Prediction is:  Jens_Stoltenberg
Speaker is:  Nelson_Mandela
Prediction is:  Julia_Gillard
Speaker is:  Magaret_Tarcher
Prediction is:  Jens_Stoltenberg
Speaker is:  Lorenzo_DeMarchis
Prediction is:  Benjamin_Netanyau
Speaker is:  Lorenzo_DeMarchis
Prediction is:  Benjamin_Netanyau
Speaker is:  Lorenzo_DeMarchis
Prediction is:  Benjamin_Netanyau
Speaker is:  Luigi_Saetta
Prediction is:  Magaret_Tarcher
Speaker is:  Jens_Stoltenberg
Prediction is:  Benjamin_Netanyau
Speaker is:  Luigi_Saetta
Prediction is:  Roberta_S
Speaker is:  Benjamin_Netanyau
Prediction is:  Magaret_Tarcher
Speaker is:  Luigi_Saetta
Prediction is:  Claudio_Tesei
Speaker is:  Julia_Gillard
Prediction

### LightGbm

In [18]:
light_clf = LGBMClassifier(n_estimators=100, learning_rate=0.03)

In [19]:
light_clf.fit(np.reshape(np.array(X_train), [n_train_samples,512]), y_train, verbose=True)

LGBMClassifier(learning_rate=0.03)

In [20]:
y_pred_light = light_clf.predict(np.reshape(np.array(X_test), [n_test_samples,512]))
accuracy = accuracy_score(y_test, y_pred_light)
f1 = f1_score(y_test, y_pred_light, average='weighted')
fb = fbeta_score(y_test, y_pred_light, 0.5, average='weighted')

In [21]:
print('Accuracy Score is: ', accuracy)
print('F1 score is: ', f1)
print('Fbeta Score is: ', fb)

Accuracy Score is:  0.9259259259259259
F1 score is:  0.9255697822722236
Fbeta Score is:  0.9266309681807839


In [22]:
# stampa tutte le classificazioni errate sul test set (360 samples)
for i in range(0,len(y_test)):
    if y_test[i] != y_pred_light[i]:
        print('Speaker is: ', y_test[i])
        print('Prediction is: ', y_pred_light[i])

Speaker is:  Roberta_S
Prediction is:  Luigi_Saetta
Speaker is:  Luigi_Saetta
Prediction is:  Jens_Stoltenberg
Speaker is:  Lorenzo_DeMarchis
Prediction is:  Jens_Stoltenberg
Speaker is:  Roberta_S
Prediction is:  Luigi_Saetta
Speaker is:  Lorenzo_DeMarchis
Prediction is:  Jens_Stoltenberg
Speaker is:  Lorenzo_DeMarchis
Prediction is:  Luigi_Saetta
Speaker is:  Lorenzo_DeMarchis
Prediction is:  Benjamin_Netanyau
Speaker is:  Lorenzo_DeMarchis
Prediction is:  Benjamin_Netanyau
Speaker is:  Lorenzo_DeMarchis
Prediction is:  Benjamin_Netanyau
Speaker is:  Roberta_S
Prediction is:  Luigi_Saetta
Speaker is:  Jens_Stoltenberg
Prediction is:  Benjamin_Netanyau
Speaker is:  Luigi_Saetta
Prediction is:  Roberta_S
Speaker is:  Benjamin_Netanyau
Prediction is:  Magaret_Tarcher
Speaker is:  Luigi_Saetta
Prediction is:  Claudio_Tesei
Speaker is:  Julia_Gillard
Prediction is:  Magaret_Tarcher
Speaker is:  Jens_Stoltenberg
Prediction is:  Nelson_Mandela
Speaker is:  Luigi_Saetta
Prediction is:  Rober