In [73]:
%matplotlib inline

import numpy as np
import scipy as sp
import scipy.signal as signal
import pandas as pd
import visuals as vs
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

random_state = 6
np.random.seed(random_state)

In [64]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
rfc = RandomForestClassifier(criterion='entropy', random_state = random_state, class_weight='balanced')
params = {
    'n_estimators': [10, 20, 50, 100],
    'max_depth': [1, 2, 3, 4, 5],
    'min_samples_split': [10, 20, 40, 50]
}

grid_clf = GridSearchCV(estimator = rfc, param_grid = params, scoring = 'accuracy', cv = cv)

In [65]:
# we have to find out best optimized parameters
grid_clf.fit(X_train, Y_train.astype(int))

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=6, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=6, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 50, 100], 'max_depth': [1, 2, 3, 4, 5], 'min_samples_split': [10, 20, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [68]:
# filename = 'seismogram_data_raw.csv'
filename = 'seismogram_data_cents_db.csv'
# filename = 'seismogram_data_cents_db_mfcc_delta.csv'

df = pd.read_csv(filename)
n_feature = np.shape(df)[1] - 2

X = df.iloc[:,:n_feature]
Y = df.iloc[:,-1]

X, Y = shuffle(X, Y, random_state = random_state)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state=random_state)
mms = StandardScaler()
X_train = mms.fit_transform(X_train)
X_test = mms.fit_transform(X_test)

In [69]:
class_weight = {0:1, 1:2}

model = RandomForestClassifier(bootstrap=True, class_weight=class_weight,
            criterion='entropy', max_depth=4, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=random_state, verbose=0, warm_start=False)

model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

print('The accuracy of the model is {}'.format(accuracy_score(Y_test, Y_pred)))
print('Classification report \n {}'.format(classification_report(Y_test, Y_pred)))
print('Confusion matrix \n {}'.format(confusion_matrix(Y_pred, Y_test)))

The accuracy of the model is 0.8059701492537313
Classification report 
              precision    recall  f1-score   support

        0.0       0.88      0.83      0.85       276
        1.0       0.67      0.76      0.71       126

avg / total       0.82      0.81      0.81       402

Confusion matrix 
 [[228  30]
 [ 48  96]]


# Support Vector machine 

In [30]:
from sklearn import svm

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state = random_state)
svm_clss = svm.SVC(class_weight = class_weight, random_state = random_state) 

svm_params = {
    'C': [1, 10, 100, 1000, 5000, 10000, 100000], 
    'gamma': [0.01, 0.001, 0.0001, 0.00001, 0.000001], 
    'kernel': ['rbf', 'poly'],
    
}

grid_clf = GridSearchCV(estimator = svm_clss, param_grid = svm_params, scoring = 'accuracy', cv = cv)

In [31]:
grid_clf.fit(X_train, Y_train.astype(int))

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=6, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight={0: 1, 1: 2}, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=6, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1, 10, 100, 1000, 5000, 10000, 100000], 'gamma': [0.01, 0.001, 0.0001, 1e-05, 1e-06], 'kernel': ['rbf', 'poly']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [32]:
print(grid_clf.best_params_)
print(grid_clf.best_score_)
print(grid_clf.best_estimator_)

{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.908637873754
SVC(C=1000, cache_size=200, class_weight={0: 1, 1: 2}, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=6, shrinking=True,
  tol=0.001, verbose=False)


In [75]:
model = svm.SVC(C=1000, cache_size=200, class_weight={0: 1, 1: 2}, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=random_state, shrinking=True,
  tol=0.001, verbose=False)

model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

print('1. The accuracy of the model is {}\n'.format(accuracy_score(Y_test, Y_pred)))
print('2. Classification report \n {} \n'.format(classification_report(Y_test, Y_pred)))
print('3. Confusion matrix \n {} \n'.format(confusion_matrix(Y_pred, Y_test)))
print('4. Roc_Auc score \n {}'.format(roc_auc_score(Y_pred, Y_test)))

1. The accuracy of the model is 0.9154228855721394

2. Classification report 
              precision    recall  f1-score   support

        0.0       0.94      0.94      0.94       276
        1.0       0.87      0.87      0.87       126

avg / total       0.92      0.92      0.92       402
 

3. Confusion matrix 
 [[259  17]
 [ 17 109]] 

4. Roc_Auc score 
 0.9017425810904072
