# Classification Model - XGBoost with previous PCA

In [5]:
#Importing all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import os
from itertools import combinations
from scipy.stats import chi2_contingency
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier, callback
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from collections import Counter

In [84]:
print(os.getcwd())

c:\Users\ntama\Documentos\Data Science\Codecademy\projects\Date-A-Scientist - Final Project\OKCupid-Date-A-Scientist-Starter\date-a-scientist-project\notebooks


## Importing the datasets from PCA

In [85]:
X_train = pd.read_csv("../data/X_train_pca.csv", index_col=False)
X_train = X_train.loc[:, ~X_train.columns.str.contains('^Unnamed')]

X_val = pd.read_csv("../data/X_val_pca.csv", index_col=False)
X_val = X_val.loc[:, ~X_val.columns.str.contains('^Unnamed')]

X_test = pd.read_csv("../data/X_test_pca.csv", index_col=False)
X_test = X_test.loc[:, ~X_test.columns.str.contains('^Unnamed')]

y_train = pd.read_csv("../data/y_train.csv", index_col=False)
y_train = y_train.loc[:, ~y_train.columns.str.contains('^Unnamed')]

y_val = pd.read_csv("../data/y_val.csv", index_col=False)
y_val = y_val.loc[:, ~y_val.columns.str.contains('^Unnamed')]

y_test = pd.read_csv("../data/y_test.csv", index_col=False)
y_test = y_test.loc[:, ~y_test.columns.str.contains('^Unnamed')]

In [86]:
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)

print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

(41960, 6)
(8992, 6)
(8991, 6)
(41960, 1)
(8992, 1)
(8991, 1)


## Training the model on PCA subsets

In [87]:
print(y_train.value_counts())

body_type
0            19353
1            16397
2             6210
Name: count, dtype: int64


Since classes are imbalanced, let's assign a weight for the class 2

In [88]:
count_class_2 = np.sum(y_train['body_type'] == 2)
weight_class_2 = (len(y_train) - count_class_2) / count_class_2

weight_dict = {
    0: 1,
    1: 1,
    2: weight_class_2
}

weights = y_train['body_type'].map(weight_dict)


In [89]:
print(X_val.shape)
print(np.array(y_val).shape)

(8991, 6)
(8991, 1)


In [90]:
Xnp_train = np.array(X_train)
Xnp_val = np.array(X_val)
ynp_train = np.array(y_train)
ynp_val = np.array(y_val)

dtrain = xgb.DMatrix(Xnp_train, label=y_train, weight=weights)
dval   = xgb.DMatrix(Xnp_val,   label=y_val)

In [91]:
params = {
    "objective": "multi:softprob",
    "num_class": 3, 
    "eval_metric": "mlogloss", 
    "eta": 0.1, 
    "max_depth": 3,  
    "verbosity": 1,
}


In [92]:
watchlist = [(dtrain, 'train'), (dval, 'eval')]

In [93]:
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=200,
    evals=watchlist,
    early_stopping_rounds=20
)


[0]	train-mlogloss:1.07943	eval-mlogloss:1.10052
[1]	train-mlogloss:1.06330	eval-mlogloss:1.10337


[2]	train-mlogloss:1.04962	eval-mlogloss:1.10677
[3]	train-mlogloss:1.03803	eval-mlogloss:1.11073
[4]	train-mlogloss:1.02814	eval-mlogloss:1.11489
[5]	train-mlogloss:1.01957	eval-mlogloss:1.11922
[6]	train-mlogloss:1.01223	eval-mlogloss:1.12372
[7]	train-mlogloss:1.00595	eval-mlogloss:1.12817
[8]	train-mlogloss:1.00057	eval-mlogloss:1.13261
[9]	train-mlogloss:0.99598	eval-mlogloss:1.13704
[10]	train-mlogloss:0.99205	eval-mlogloss:1.14145
[11]	train-mlogloss:0.98865	eval-mlogloss:1.14572
[12]	train-mlogloss:0.98563	eval-mlogloss:1.14967
[13]	train-mlogloss:0.98303	eval-mlogloss:1.15341
[14]	train-mlogloss:0.98074	eval-mlogloss:1.15695
[15]	train-mlogloss:0.97880	eval-mlogloss:1.16041
[16]	train-mlogloss:0.97707	eval-mlogloss:1.16361
[17]	train-mlogloss:0.97554	eval-mlogloss:1.16661
[18]	train-mlogloss:0.97427	eval-mlogloss:1.16955
[19]	train-mlogloss:0.97306	eval-mlogloss:1.17211
[20]	train-mlogloss:0.97200	eval-mlogloss:1.17456


Turning probabilities into classes

In [94]:
y_pred_probs = bst.predict(dval)
y_pred = np.argmax(y_pred_probs, axis=1)

Getting metrics

In [95]:
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[2335   25 1787]
 [1558   39 1917]
 [ 284    6 1040]]
              precision    recall  f1-score   support

           0       0.56      0.56      0.56      4147
           1       0.56      0.01      0.02      3514
           2       0.22      0.78      0.34      1330

    accuracy                           0.38      8991
   macro avg       0.45      0.45      0.31      8991
weighted avg       0.51      0.38      0.32      8991

