In [18]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn import metrics

In [19]:
# load numpy array
data = np.load('./data/data_pca_50_target.npz')
data.files

['arr_0', 'arr_1']

In [20]:
## Load data
data.allow_pickle = True

In [21]:
X = data['arr_0'] # pca data with 50 components
y = data['arr_1'] # target or dependent variable

In [22]:
X.shape,y.shape

((4319, 50), (4319,))

# Split the data into train and test set

In [23]:
x_train,x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(3455, 50) (864, 50) (3455,) (864,)


# Training machine learning model

In [24]:
model_svc = SVC(probability=True)

param_grid = {'C':[0.5,1,10,20,30,50],
             'kernel':['rbf','poly'],
             'gamma':[0.1,0.05,0.01,0.001,0.002,0.005],
             'coef0':[0,1]
             }

In [25]:
np.random.seed(42)

model_grid = GridSearchCV(model_svc,
                          param_grid=param_grid,
                          scoring='accuracy',cv=3,verbose=2
                          )

In [26]:
model_grid.fit(x_train,y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   3.1s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.4s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.5s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.6s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.6s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.7s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   2.1s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   2.2s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   2.1s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.4s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.5s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.4s
[CV] END .............C=0.5,

In [27]:
model_grid.best_params_

{'C': 1, 'coef0': 1, 'gamma': 0.01, 'kernel': 'poly'}

In [28]:
final_model = model_grid.best_estimator_

In [29]:
final_model.get_params()

{'C': 1,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 1,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 0.01,
 'kernel': 'poly',
 'max_iter': -1,
 'probability': True,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

# Model Evaluation
- Classification Report
    - Precision, Recall, F1-Score
- Kappa Score
    - -ve (worst model)
    - 0 to 0.5 (bad model)
    - 0.5 to 0.7 (Good Model)
    - 0.7 to 0.9 (Excellent Model)
    - 0.9 to 1.0 (Perfect Model)
- AUC
    - Less than 0.5 (Worst Model)
    - 0.5 to 0.6 (Bad Model)
    - 0.6 to 0.8 (Good Model)
    - 0.8 to 0.9 (Excellent Model)
    - 0.9 to 1.0 (Perfect Model)

In [30]:
# predict on test data
y_pred = final_model.predict(x_test)
y_pred

array(['male', 'male', 'female', 'male', 'female', 'female', 'male',
       'female', 'female', 'male', 'female', 'female', 'female', 'female',
       'female', 'male', 'female', 'female', 'female', 'male', 'male',
       'female', 'male', 'female', 'female', 'female', 'male', 'female',
       'male', 'male', 'female', 'female', 'female', 'male', 'female',
       'male', 'male', 'female', 'male', 'female', 'male', 'male', 'male',
       'female', 'female', 'male', 'male', 'female', 'female', 'female',
       'male', 'male', 'female', 'female', 'female', 'female', 'female',
       'male', 'female', 'female', 'female', 'female', 'female', 'male',
       'male', 'female', 'male', 'female', 'female', 'male', 'male',
       'male', 'male', 'male', 'female', 'male', 'female', 'male',
       'female', 'female', 'male', 'male', 'female', 'male', 'male',
       'male', 'male', 'female', 'male', 'male', 'female', 'male',
       'female', 'male', 'male', 'male', 'male', 'male', 'male', 'female',


In [31]:
cr = metrics.classification_report(y_test,y_pred,output_dict=True)
pd.DataFrame(cr).T

Unnamed: 0,precision,recall,f1-score,support
female,0.827004,0.821803,0.824395,477.0
male,0.782051,0.788114,0.785071,387.0
accuracy,0.806713,0.806713,0.806713,0.806713
macro avg,0.804528,0.804958,0.804733,864.0
weighted avg,0.806869,0.806713,0.806781,864.0


# Kappa Score

In [32]:
metrics.cohen_kappa_score(y_test,y_pred)

0.6094709149371529

In [33]:
metrics.roc_auc_score(np.where(y_test=="male",1,0),
                      np.where(y_pred=="male",1,0)
                      )

0.8049583150504608

# Save model

In [34]:
import pickle

pickle.dump(final_model ,open('model/model_svm.pickle',mode='wb'))