In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import scipy.signal as signal
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import seaborn as sns
import warnings
import sys, os

warnings.filterwarnings('ignore')
random_state = 6
np.random.seed(random_state)

In [3]:
filename = 'seismogram_data_62.csv'
df = pd.read_csv(filename)

In [4]:
df.describe()

Unnamed: 0,mfccs_0,mfccs_1,mfccs_2,mfccs_3,mfccs_4,mfccs_5,mfccs_6,mfccs_7,mfccs_8,mfccs_9,...,centiroid,max_amplitude,mean_amplitude,moment,variation,skew,var,autocr,kurto,target
count,1606.0,1606.0,1606.0,1606.0,1606.0,1606.0,1606.0,1606.0,1606.0,1606.0,...,1606.0,1606.0,1606.0,1606.0,1606.0,1606.0,1606.0,1606.0,1606.0,1606.0
mean,941.665095,75.003741,9.231527,50.089629,9.011473,33.163889,11.137501,21.313839,13.738046,14.940774,...,620.513687,1055193000.0,1110251.0,0.0,278.243072,-0.18807,3743952000000.0,5.021586e+17,31.972242,0.323163
std,225.061154,37.099307,35.261332,19.673425,23.210687,12.591131,14.331587,7.673028,8.545568,5.140691,...,737.102635,4208591000.0,5156415.0,0.0,7662.201633,4.893058,27810320000000.0,4.202786e+18,763.247083,0.46783
min,487.638894,0.590916,-95.650386,9.991572,-63.162621,5.273298,-38.585637,0.4955,-20.88389,-4.159762,...,3.609066,30786.43,203.9792,0.0,-18424.601562,-148.536407,1.145826,164742000.0,-1.557898,0.0
25%,775.150225,46.703585,-14.839313,36.362204,-7.899532,24.523875,2.047651,16.663696,9.773171,12.017408,...,134.614808,1197471.0,3374.144,0.0,-0.564912,-0.066578,108912.5,47261520000.0,-0.056514,0.0
50%,900.926968,74.069958,12.228716,49.900223,12.684427,31.645115,14.229327,20.592639,15.489321,15.137249,...,319.441259,5082282.0,12805.47,0.0,0.114271,-0.00416,1769261.0,768203200000.0,0.241702,0.0
75%,1095.790912,100.281783,35.329806,63.719153,28.028384,40.849423,22.416433,25.514643,19.2473,18.234024,...,797.667762,204197500.0,265340.3,0.0,2.218386,0.057529,659358200.0,821713100000000.0,1.950677,1.0
max,1572.073401,202.898203,98.485401,123.096246,64.457973,81.809122,47.306912,52.77485,39.409499,35.151009,...,4656.880334,38143460000.0,55824710.0,0.0,300014.3125,11.173301,309805500000000.0,6.909585e+19,27426.243439,1.0


In [5]:
Y = df['target']
X = df.drop(['target'], axis=1)

print(np.shape(X), np.shape(Y))

X, Y = shuffle(X, Y, random_state = random_state)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state=random_state)
mms = StandardScaler()
X_train = mms.fit_transform(X_train)
X_test = mms.fit_transform(X_test)

(1606, 61) (1606,)


## LightGBM

In [6]:
import lightgbm as lgb

lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.5,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_data_in_leaf=40, min_split_gain=0.0,
        n_estimators=10, n_jobs=-1, num_leaves=20, objective=None,
        random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=50000, subsample_freq=1)

lgbm_model.fit(X_train, Y_train)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.5,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_data_in_leaf=40, min_split_gain=0.0,
        n_estimators=10, n_jobs=-1, num_leaves=20, objective=None,
        random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=50000, subsample_freq=1)

In [7]:
Y_pred = lgbm_model.predict(X_test)

print('1. The accuracy of the model is {}\n'.format(accuracy_score(Y_test, Y_pred)))
print('2. Classification report \n {} \n'.format(classification_report(Y_test, Y_pred)))
print('3. Confusion matrix \n {} \n'.format(confusion_matrix(Y_pred, Y_test)))
print('4. Roc_Auc score \n {}'.format(roc_auc_score(Y_pred, Y_test)))

1. The accuracy of the model is 0.7810945273631841

2. Classification report 
              precision    recall  f1-score   support

        0.0       0.79      0.93      0.85       276
        1.0       0.74      0.46      0.57       126

avg / total       0.78      0.78      0.76       402
 

3. Confusion matrix 
 [[256  68]
 [ 20  58]] 

4. Roc_Auc score 
 0.7668566001899335


## xgboost

In [9]:
import xgboost as xgb 
xgb_model = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.10, max_delta_step=0,
       max_depth=5, min_child_weight=11, missing=-999, n_estimators=2000,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1337, silent=1,
       subsample=0.8)

xgb_model.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=11, missing=-999, n_estimators=2000,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1337, silent=1,
       subsample=0.8)

In [10]:
Y_pred = xgb_model.predict(X_test)

print('1. The accuracy of the model is {}\n'.format(accuracy_score(Y_test, Y_pred)))
print('2. Classification report \n {} \n'.format(classification_report(Y_test, Y_pred)))
print('3. Confusion matrix \n {} \n'.format(confusion_matrix(Y_pred, Y_test)))
print('4. Roc_Auc score \n {}'.format(roc_auc_score(Y_pred, Y_test)))

1. The accuracy of the model is 0.7985074626865671

2. Classification report 
              precision    recall  f1-score   support

        0.0       0.81      0.93      0.86       276
        1.0       0.76      0.52      0.62       126

avg / total       0.79      0.80      0.79       402
 

3. Confusion matrix 
 [[256  61]
 [ 20  65]] 

4. Roc_Auc score 
 0.786138430135461


## Random forest classifier

In [17]:
rf_model = RandomForestClassifier()

rf_model.fit(X_train, Y_train)
Y_pred = rf_model.predict(X_test)

print('1. The accuracy of the model is {}\n'.format(accuracy_score(Y_test, Y_pred)))
print('2. Classification report \n {} \n'.format(classification_report(Y_test, Y_pred)))
print('3. Confusion matrix \n {} \n'.format(confusion_matrix(Y_pred, Y_test)))
print('4. Roc_Auc score \n {}'.format(roc_auc_score(Y_pred, Y_test)))

1. The accuracy of the model is 0.7562189054726368

2. Classification report 
              precision    recall  f1-score   support

        0.0       0.76      0.93      0.84       276
        1.0       0.71      0.37      0.49       126

avg / total       0.75      0.76      0.73       402
 

3. Confusion matrix 
 [[257  79]
 [ 19  47]] 

4. Roc_Auc score 
 0.7385010822510822


## Gaussian Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB
gb_model = GaussianNB()

gb_model.fit(X_train, Y_train)
Y_pred = gb_model.predict(X_test)

print('1. The accuracy of the model is {}\n'.format(accuracy_score(Y_test, Y_pred)))
print('2. Classification report \n {} \n'.format(classification_report(Y_test, Y_pred)))
print('3. Confusion matrix \n {} \n'.format(confusion_matrix(Y_pred, Y_test)))
print('4. Roc_Auc score \n {}'.format(roc_auc_score(Y_pred, Y_test)))


1. The accuracy of the model is 0.6865671641791045

2. Classification report 
              precision    recall  f1-score   support

        0.0       0.69      1.00      0.81       276
        1.0       0.50      0.01      0.02       126

avg / total       0.63      0.69      0.56       402
 

3. Confusion matrix 
 [[275 125]
 [  1   1]] 

4. Roc_Auc score 
 0.59375


## Support Vector Machine

In [19]:
from sklearn import svm
svm_model = svm.SVC()

svm_model.fit(X_train, Y_train)
Y_pred = svm_model.predict(X_test)

print('1. The accuracy of the model is {}\n'.format(accuracy_score(Y_test, Y_pred)))
print('2. Classification report \n {} \n'.format(classification_report(Y_test, Y_pred)))
print('3. Confusion matrix \n {} \n'.format(confusion_matrix(Y_pred, Y_test)))
print('4. Roc_Auc score \n {}'.format(roc_auc_score(Y_pred, Y_test)))

1. The accuracy of the model is 0.8507462686567164

2. Classification report 
              precision    recall  f1-score   support

        0.0       0.88      0.91      0.89       276
        1.0       0.78      0.73      0.75       126

avg / total       0.85      0.85      0.85       402
 

3. Confusion matrix 
 [[250  34]
 [ 26  92]] 

4. Roc_Auc score 
 0.8299713535449988
