In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC
from sklearn import metrics

In [3]:
#load numpy array
data = np.load("./Data/data_pca_50.npz")
data.files

['arr_0', 'arr_1']

In [5]:
data.allow_pickle = True

In [7]:
X = data["arr_0"]
Y = data["arr_1"]

In [9]:
X.shape,Y.shape

((4315, 50), (4315,))

In [15]:
Y

array(['Female', 'Female', 'Female', ..., 'Male', 'Male', 'Male'],
      dtype=object)

In [19]:
# split the data into train and test
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,stratify=Y)

In [21]:
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(3452, 50) (863, 50) (3452,) (863,)


#### Hyperparameter Tuning

In [24]:
model_svc = SVC(probability=True)
param_grid = {
    "C":[0.5,1,10,20,30,50],
    "kernel":["rbf","poly"],
    "gamma":[0.1,0.05,0.01,0.001,0.002,0.005],
    "coef0":[0,1]
}



In [26]:
model_grid = GridSearchCV(model_svc,
                          param_grid=param_grid,
                          scoring="accuracy",
                          cv=3, verbose=2)

In [28]:
model_grid.fit(x_train,y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.5s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.4s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.5s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.5s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.5s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.6s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   2.1s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   3.0s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   3.0s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.6s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.7s
[CV] END ............C=0.5, coef0=0, gamma=0.0

In [30]:
model_grid.best_params_

{'C': 1, 'coef0': 1, 'gamma': 0.01, 'kernel': 'poly'}

In [32]:
model_final = model_grid.best_estimator_

In [36]:
model_final.get_params()

{'C': 1,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 1,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 0.01,
 'kernel': 'poly',
 'max_iter': -1,
 'probability': True,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

## Model Evaluation

- Classification report
   - Precision
   - Recall
   - F1-score

- Kappa score ( mainly on multiclass problems)
    - -ve (worst model)
    - 0 to 0.5(bad model)
    - 0.5 to 0.7 (Good model)
    - 0.7 to 0.9 (Excellent model)
    - 0.9 to 1(Perfect model)

- AUC(area under curve)
    - less than 0.5(worst model)
    - 0.5 to 0.6(bad model)
    - 0.6 to 0.8(Good model)
    - 0.8 to 0.9(Excellent model)
    - 0.9 to 1(perfect model)
  

In [40]:
y_pred = model_final.predict(x_test)
y_pred

array(['Female', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male',
       'Female', 'Male', 'Female', 'Female', 'Female', 'Female', 'Male',
       'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Female',
       'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Male',
       'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male',
       'Male', 'Male', 'Female', 'Female', 'Female', 'Male', 'Female',
       'Male', 'Female', 'Male', 'Male', 'Female', 'Female', 'Female',
       'Female', 'Male', 'Female', 'Female', 'Female', 'Female', 'Female',
       'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male',
       'Female', 'Female', 'Female', 'Female', 'Female', 'Female',
       'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Female',
       'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male',
       'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Male',
       'Female', 'Male', 'Male', 'Female', 'Male', 'Female', 'Female',
       'Fema

In [44]:
# classification report
cr = metrics.classification_report(y_test,y_pred,output_dict=True)
pd.DataFrame(cr).T  # T implies for transverse

Unnamed: 0,precision,recall,f1-score,support
Female,0.787686,0.777778,0.7827,477.0
Male,0.729592,0.740933,0.735219,386.0
accuracy,0.761298,0.761298,0.761298,0.761298
macro avg,0.758639,0.759355,0.758959,863.0
weighted avg,0.761702,0.761298,0.761463,863.0


In [46]:
# kappa score
metrics.cohen_kappa_score(y_test,y_pred)   # good model

0.5179424604788634

In [54]:
# area under curve (auc)
metrics.roc_auc_score(np.where(y_test=="Male",1,0),np.where(y_pred=="Male",1,0))  #excellent model


0.7593552101324121

#### Save Model

In [57]:
import pickle

In [59]:
pickle.dump(model_final,open("./Model/model_svm.pickle",mode="wb"))