In [35]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, recall_score, precision_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight

In [52]:
feature_set='feature_states_norm_train_aug'
feature_set_test = feature_set.replace('train', 'test')
print(feature_set)
print(feature_set_test)

feature_states_norm_train_aug
feature_states_norm_test_aug


In [57]:
train_df=pd.read_csv(f'/content/drive/MyDrive/CancerVsLaryngitis/FeatureSets/feature_states_train_aug.csv', index_col=0)
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,504,505,506,507,508,509,510,511,file,pathology
0,-0.120713,-0.63317,-0.266847,0.381911,0.378875,0.186353,-0.661571,-0.331602,0.050681,-0.39215,...,2.019294,0.135942,0.627053,-0.591418,0.567487,0.43507,-0.854245,0.479537,940-phrase_speed.wav,Cancer
1,0.02515,-0.561448,0.018797,0.194035,0.593945,0.26103,-0.549476,-0.265667,-0.034633,-0.658216,...,0.824452,0.23926,0.560523,-0.472824,0.4631,0.562366,-0.700494,0.219516,940-phrase.wav,Cancer
2,0.083659,-0.727587,-0.773671,0.404416,0.432034,0.115656,-0.387319,-0.688972,0.930321,0.352522,...,3.046769,0.078175,0.547484,-0.651069,0.150797,0.474411,-0.828002,0.251287,1942-phrase_speed.wav,Cancer
3,0.012563,-0.495998,-0.315859,0.277518,0.316574,0.299874,-0.479337,-0.482175,0.802551,0.007793,...,3.205146,1.059886,0.586349,-0.795936,0.857375,0.426386,-0.423451,0.088447,1560-phrase_pitch.wav,Cancer
4,0.394916,-0.442807,0.717502,0.136139,0.43847,0.152567,-0.332735,-0.891878,-0.323862,-0.111114,...,0.636067,0.020065,0.569033,-0.604564,0.680637,0.256006,-0.443291,-0.156645,1273-phrase.wav,Cancer


In [58]:
X = train_df.drop(['file', 'pathology'], axis=1)
y = train_df['pathology']

In [59]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf', 'linear', 'sigmoid', 'poly']}

grid = GridSearchCV(SVC(random_state=42), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.688 total time=   0.1s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.703 total time=   0.1s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.703 total time=   0.1s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.703 total time=   0.1s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.703 total time=   0.1s
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=1.000 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.891 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.922 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.938 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.969 total time=   0.0s
[CV 1/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.688 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=1, kernel=sigmoi

In [60]:
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
SVC(C=100, gamma=0.001, random_state=42)


In [61]:
test_df=pd.read_csv(f'/content/drive/MyDrive/CancerVsLaryngitis/FeatureSets/feature_states_test.csv', index_col=0)
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,504,505,506,507,508,509,510,511,file,pathology
0,0.172979,-0.363699,1.842731,0.041766,0.479898,0.397434,-0.531031,-0.936717,-0.722212,-0.295034,...,-0.240645,-0.746416,0.553921,-0.324544,0.665001,0.447248,-0.016285,-0.317483,2402-phrase.wav,Cancer
1,0.453035,-0.655606,1.561639,0.041225,0.559176,0.211827,-0.268881,-0.945386,-0.567863,-0.550519,...,-0.266303,-0.415646,0.543781,-0.601966,0.716743,0.481418,-0.248686,-0.142528,1403-phrase.wav,Cancer
2,0.167406,-0.078759,0.498476,0.224008,0.500203,0.285295,-0.213153,-0.085809,-0.674559,-0.451193,...,0.10274,0.282924,0.59948,-0.669555,0.449027,0.321411,-0.280022,-0.159335,820-phrase.wav,Cancer
3,0.21578,-0.509523,2.087462,0.006096,0.448333,0.275221,-0.189902,-1.036782,-0.223437,-0.224473,...,-0.11818,-0.415785,0.369609,-0.596403,0.786977,0.355175,0.013314,-0.337218,1451-phrase.wav,Cancer
4,0.395836,-0.490914,-0.290234,0.282078,0.533619,0.433206,-0.344187,-0.442995,-0.345965,-0.305066,...,0.449944,0.189775,0.431429,-0.3101,0.340192,0.358336,0.376992,0.134569,2343-phrase.wav,Cancer


In [62]:
X_test = test_df.drop(['file', 'pathology'], axis=1)
y_test = test_df['pathology']

In [63]:

grid_predictions = grid.predict(X_test)

# print classification report
print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

      Cancer       0.90      0.64      0.75        14
  Laryngitis       0.83      0.96      0.89        26

    accuracy                           0.85        40
   macro avg       0.87      0.80      0.82        40
weighted avg       0.86      0.85      0.84        40



In [64]:
confusion_matrix(y_test, grid_predictions)

array([[ 9,  5],
       [ 1, 25]])

In [65]:
balanced_accuracy_score(y_test, grid_predictions)

0.8021978021978022

In [66]:
precision_score(y_test, grid_predictions, pos_label='Cancer')

0.9

In [67]:
recall_score(y_test, grid_predictions, pos_label='Cancer')

0.6428571428571429

In [49]:
import pickle

In [50]:
pickle.dump(grid, open(f'/content/drive/MyDrive/CancerVsLaryngitis/Models/feature_states_svm.sav', 'wb'))