In [1]:
## Packages
# system
import os
import math
# data wrangling
import numpy as np
import pandas as pd
from scipy import stats
from dfply import *
import statistics
# plotting
import matplotlib.pyplot as plt
import seaborn as sns
# models
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold, StratifiedKFold, cross_val_score 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, recall_score, make_scorer, accuracy_score, balanced_accuracy_score, f1_score, precision_score, roc_auc_score
from sklearn.preprocessing import normalize, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
## Settings
# Notebook-wide display configuration; these calls change global state for every later cell.
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# None removes the column/row truncation limit, so displayed frames are printed in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None) 
# Width (in characters) pandas may use before wrapping a printed frame.
pd.set_option('display.width', 1000)

  import pandas.util.testing as tm


## Data Import

In [2]:
# Read the provider table, discard the leftover 'Unnamed: 0' column and index rows by 'Provider'.
df = (
    pd.read_csv('../processed_data/provider_final_new.csv')
    .drop(columns=['Unnamed: 0'])
    .set_index('Provider')
)
display(df.shape)

# Make dummies
# A column is treated as text only when every one of its cells is non-real:
# cells that are real become NaN under the mask, and dropna(axis=1) then
# removes any column containing such a NaN.
is_text_cell = ~df.applymap(np.isreal)
encode_cols = df[is_text_cell].dropna(axis=1).columns.to_list()
print(encode_cols)

# Everything after the first two text columns is recoded to 1 for 'yes'
# (case-insensitive) and 0 otherwise. The first two are handled below:
# 'Serves' is one-hot encoded and 'PotentialFraud' becomes the 0/1 target.
# NOTE(review): this relies on those two columns coming first in encode_cols — confirm.
flag_cols = encode_cols[2:]
df[flag_cols] = df[flag_cols].applymap(lambda cell: int(cell.lower() == 'yes'))

dummies = pd.get_dummies(df['Serves'], prefix='Serves_', drop_first=True)
df_dum = pd.concat([df.drop(columns='Serves'), dummies], axis=1)
display(df_dum.shape)

# this is for pos_label for precision & recall for gridsearchcv
df_dum['PotentialFraud'] = np.where(df_dum['PotentialFraud'].eq('Yes'), 1, 0)

(5410, 46)

['PotentialFraud', 'Serves', 'DupClaim_IP', 'DupClaimMultiState_IP', 'DupClaimMultiProvider_IP', 'DupClaimMultiStartDt_IP', 'DupClaim_OP', 'DupClaimMultiState_OP', 'DupClaimMultiProvider_OP', 'DupClaimMultiStartDt_OP']


(5410, 47)

## SVM

### train-test-split

In [3]:
# remove Duration MedianIP
df_dum_drop = df_dum.drop(['DurationMedianIP'], axis=1)

# train-test-split
# Features are every remaining column except the 0/1 target 'PotentialFraud'.
x = df_dum_drop.drop(['PotentialFraud'], axis=1)
y = df_dum_drop.PotentialFraud
# Stratify on y so the 80/20 split keeps the same class proportions in both parts.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=4, stratify=y)

# upsampling by SMOTE
# Only the training part is resampled; xtest/ytest stay untouched.
# fit_resample replaces the deprecated fit_sample alias, which was removed
# from imbalanced-learn in 0.8.
sm = SMOTE(random_state=4)
xtrain, ytrain = sm.fit_resample(xtrain, ytrain)



In [6]:
svc = SVC()

# svc_params = [{'kernel':['linear', 'poly', 'rbf'], 'C':[0.1, 1, 10] , 'degree':[2,3,4]}]
# svc_params = [{'kernel':['linear'], 'C':[0.1, 1, 10]]
svc_params = [{'kernel':['poly', 'rbf', 'sigmoid'], 'C':[0.01, 0.1, 1, 10, 100], 'degree':[3,4,5,6,7,8]}]

kf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 4) # cv = 5

scores = { 'recall': make_scorer(recall_score, pos_label = 1, average = 'binary'), 
          'f1': make_scorer(f1_score, pos_label = 1, average = 'binary'), # use binary not weighted
         'precision': make_scorer(precision_score, pos_label = 1, average = 'binary'),
         'accuracy': make_scorer(accuracy_score), # not use balanced_accurarcy_score
         'specificity': make_scorer(recall_score, pos_label = 0, average = 'binary'), 
         'roc_auc_score': make_scorer(roc_auc_score, average = 'macro')
         }


svc_grid_nln = GridSearchCV(estimator=svc, param_grid=svc_params, return_train_score=True, 
                       cv = kf, verbose=2, scoring=scores, refit=False)

%time svc_grid_nln.fit(xtrain, ytrain)

Fitting 10 folds for each of 90 candidates, totalling 900 fits
[CV] C=0.01, degree=3, kernel=poly ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................... C=0.01, degree=3, kernel=poly, total=   4.6s
[CV] C=0.01, degree=3, kernel=poly ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.7s remaining:    0.0s


[CV] .................... C=0.01, degree=3, kernel=poly, total=   5.3s
[CV] C=0.01, degree=3, kernel=poly ...................................
[CV] .................... C=0.01, degree=3, kernel=poly, total=   4.0s
[CV] C=0.01, degree=3, kernel=poly ...................................
[CV] .................... C=0.01, degree=3, kernel=poly, total=   3.9s
[CV] C=0.01, degree=3, kernel=poly ...................................
[CV] .................... C=0.01, degree=3, kernel=poly, total=   3.2s
[CV] C=0.01, degree=3, kernel=poly ...................................
[CV] .................... C=0.01, degree=3, kernel=poly, total=   3.4s
[CV] C=0.01, degree=3, kernel=poly ...................................
[CV] .................... C=0.01, degree=3, kernel=poly, total=   3.1s
[CV] C=0.01, degree=3, kernel=poly ...................................
[CV] .................... C=0.01, degree=3, kernel=poly, total=   4.7s
[CV] C=0.01, degree=3, kernel=poly ...................................
[CV] .

[CV] ................. C=0.01, degree=4, kernel=sigmoid, total=   3.1s
[CV] C=0.01, degree=5, kernel=poly ...................................
[CV] .................... C=0.01, degree=5, kernel=poly, total=   5.5s
[CV] C=0.01, degree=5, kernel=poly ...................................
[CV] .................... C=0.01, degree=5, kernel=poly, total=   5.3s
[CV] C=0.01, degree=5, kernel=poly ...................................
[CV] .................... C=0.01, degree=5, kernel=poly, total=   4.9s
[CV] C=0.01, degree=5, kernel=poly ...................................
[CV] .................... C=0.01, degree=5, kernel=poly, total=   4.7s
[CV] C=0.01, degree=5, kernel=poly ...................................
[CV] .................... C=0.01, degree=5, kernel=poly, total=   5.9s
[CV] C=0.01, degree=5, kernel=poly ...................................
[CV] .................... C=0.01, degree=5, kernel=poly, total=   4.7s
[CV] C=0.01, degree=5, kernel=poly ...................................
[CV] .

[CV] ................. C=0.01, degree=6, kernel=sigmoid, total=   3.1s
[CV] C=0.01, degree=6, kernel=sigmoid ................................
[CV] ................. C=0.01, degree=6, kernel=sigmoid, total=   2.4s
[CV] C=0.01, degree=6, kernel=sigmoid ................................
[CV] ................. C=0.01, degree=6, kernel=sigmoid, total=   2.9s
[CV] C=0.01, degree=7, kernel=poly ...................................
[CV] .................... C=0.01, degree=7, kernel=poly, total=   3.9s
[CV] C=0.01, degree=7, kernel=poly ...................................
[CV] .................... C=0.01, degree=7, kernel=poly, total=   4.3s
[CV] C=0.01, degree=7, kernel=poly ...................................
[CV] .................... C=0.01, degree=7, kernel=poly, total=   4.2s
[CV] C=0.01, degree=7, kernel=poly ...................................
[CV] .................... C=0.01, degree=7, kernel=poly, total=   5.1s
[CV] C=0.01, degree=7, kernel=poly ...................................
[CV] .

[CV] ................. C=0.01, degree=8, kernel=sigmoid, total=   2.7s
[CV] C=0.01, degree=8, kernel=sigmoid ................................
[CV] ................. C=0.01, degree=8, kernel=sigmoid, total=   2.6s
[CV] C=0.01, degree=8, kernel=sigmoid ................................
[CV] ................. C=0.01, degree=8, kernel=sigmoid, total=   2.6s
[CV] C=0.01, degree=8, kernel=sigmoid ................................
[CV] ................. C=0.01, degree=8, kernel=sigmoid, total=   2.6s
[CV] C=0.01, degree=8, kernel=sigmoid ................................
[CV] ................. C=0.01, degree=8, kernel=sigmoid, total=   2.6s
[CV] C=0.1, degree=3, kernel=poly ....................................
[CV] ..................... C=0.1, degree=3, kernel=poly, total=   3.9s
[CV] C=0.1, degree=3, kernel=poly ....................................
[CV] ..................... C=0.1, degree=3, kernel=poly, total=   3.4s
[CV] C=0.1, degree=3, kernel=poly ....................................
[CV] .

[CV] .................. C=0.1, degree=4, kernel=sigmoid, total=   2.1s
[CV] C=0.1, degree=4, kernel=sigmoid .................................
[CV] .................. C=0.1, degree=4, kernel=sigmoid, total=   2.4s
[CV] C=0.1, degree=4, kernel=sigmoid .................................
[CV] .................. C=0.1, degree=4, kernel=sigmoid, total=   2.4s
[CV] C=0.1, degree=4, kernel=sigmoid .................................
[CV] .................. C=0.1, degree=4, kernel=sigmoid, total=   2.1s
[CV] C=0.1, degree=4, kernel=sigmoid .................................
[CV] .................. C=0.1, degree=4, kernel=sigmoid, total=   2.2s
[CV] C=0.1, degree=4, kernel=sigmoid .................................
[CV] .................. C=0.1, degree=4, kernel=sigmoid, total=   2.1s
[CV] C=0.1, degree=4, kernel=sigmoid .................................
[CV] .................. C=0.1, degree=4, kernel=sigmoid, total=   1.9s
[CV] C=0.1, degree=5, kernel=poly ....................................
[CV] .

[CV] .................. C=0.1, degree=6, kernel=sigmoid, total=   2.2s
[CV] C=0.1, degree=6, kernel=sigmoid .................................
[CV] .................. C=0.1, degree=6, kernel=sigmoid, total=   2.1s
[CV] C=0.1, degree=6, kernel=sigmoid .................................
[CV] .................. C=0.1, degree=6, kernel=sigmoid, total=   1.9s
[CV] C=0.1, degree=6, kernel=sigmoid .................................
[CV] .................. C=0.1, degree=6, kernel=sigmoid, total=   2.1s
[CV] C=0.1, degree=6, kernel=sigmoid .................................
[CV] .................. C=0.1, degree=6, kernel=sigmoid, total=   2.1s
[CV] C=0.1, degree=6, kernel=sigmoid .................................
[CV] .................. C=0.1, degree=6, kernel=sigmoid, total=   2.1s
[CV] C=0.1, degree=6, kernel=sigmoid .................................
[CV] .................. C=0.1, degree=6, kernel=sigmoid, total=   2.2s
[CV] C=0.1, degree=6, kernel=sigmoid .................................
[CV] .

KeyboardInterrupt: 

In [None]:
# Collect the mean validation/train score of every grid candidate into one table.
# cv_results_ names the held-out fold scores 'mean_test_*'; they are exposed here
# as 'mean_validation_*' to keep them distinct from the final test split.
cv_res = svc_grid_nln.cv_results_
metric_names = ['recall', 'f1', 'precision', 'accuracy', 'specificity', 'roc_auc_score']

score_columns = {'params': cv_res['params']}
for metric in metric_names:
    # Validation column first, then its train counterpart, one pair per metric.
    score_columns['mean_validation_' + metric] = cv_res['mean_test_' + metric]
    score_columns['mean_train_' + metric] = cv_res['mean_train_' + metric]

svc_grid_nln_scores = pd.DataFrame(score_columns)
# svc_grid_nln_scores.to_csv('../output/svc_grid_nln_scores_dropDurationMedianIP.csv')
svc_grid_nln_scores

In [None]:
# Split the score table by column name: one frame holding the validation
# metrics and one holding the train metrics (the 'params' column matches neither).
validation_cols = svc_grid_nln_scores.columns.str.contains('validation')
svc_grid_nln_scores_validation = svc_grid_nln_scores.loc[:, validation_cols]
display(svc_grid_nln_scores_validation)

train_cols = svc_grid_nln_scores.columns.str.contains('train')
svc_grid_nln_scores_train = svc_grid_nln_scores.loc[:, train_cols]
display(svc_grid_nln_scores_train)

In [None]:
# plot validation scores
# One line per validation metric; the x axis is the row index of the score
# table, i.e. the grid-candidate number.
# The explicit Axes interface is used so the cell does not end on a pyplot call
# whose return value would be echoed into the output.
fig, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(data=svc_grid_nln_scores_validation, ax=ax)
ax.legend(loc=5)  # loc=5 is 'right'; alternative: bbox_to_anchor=(1.05,1)
ax.set_xlabel('model#', fontsize=18)
ax.set_ylabel('score', fontsize=18)
ax.set_title('SVC NonLinear Grid Search Model Scores: Validation', fontsize=18)
ax.tick_params(axis='both', labelsize=16)

# plt.savefig('../output/svc_grid_nln_validation_allscore_dropDurationMedianIP_cv10.png', bbox_inches='tight')
# plt.xlim(180, 190)
# plt.savefig('../output/svc_grid_nln_validation_allscore_dropDurationMedianIP_cv10_zoom.png', bbox_inches='tight')

In [None]:
# plot train scores
# One line per train metric; the x axis is the row index of the score table,
# i.e. the grid-candidate number.
# The explicit Axes interface is used so the cell does not end on a pyplot call
# whose return value would be echoed into the output.
fig, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(data=svc_grid_nln_scores_train, ax=ax)
ax.legend(loc=5)  # loc=5 is 'right'; alternative: bbox_to_anchor=(1.05,1)
ax.set_xlabel('model#', fontsize=18)
ax.set_ylabel('score', fontsize=18)
ax.set_title('SVC NonLinear Grid Search Model Scores: Train', fontsize=18)
ax.tick_params(axis='both', labelsize=16)

# plt.savefig('../output/svc_grid_nln_train_allscore_dropDurationMedianIP_cv10.png', bbox_inches='tight')
# plt.xlim(180, 190)
# plt.savefig('../output/svc_grid_nln_train_allscore_dropDurationMedianIP_cv10_zoom.png', bbox_inches='tight')

In [None]:
# Absolute gap between mean validation recall and mean train recall per candidate.
svc_grid_nln_scores['tvdiff_recall'] = (
    svc_grid_nln_scores['mean_validation_recall'] - svc_grid_nln_scores['mean_train_recall']
).abs()

# Candidate with the smallest gap.
# NOTE(review): 'lof' presumably stands for "least overfit" — confirm.
lof_row = svc_grid_nln_scores['tvdiff_recall'].idxmin()

# idxmin() returns an index label, so look the row up with .loc. Positional
# .iloc only matched while the index happened to be the default 0..n-1 range.
display(svc_grid_nln_scores_validation.loc[[lof_row]])
display(svc_grid_nln_scores_train.loc[[lof_row]])
display(svc_grid_nln_scores.loc[lof_row, 'params'])

In [None]:
# Rank candidates by mean validation recall; idxmax/idxmin return index labels.
best_row = svc_grid_nln_scores.mean_validation_recall.idxmax()
worst_row = svc_grid_nln_scores.mean_validation_recall.idxmin()
print('best SVC row number', best_row)
print('worst SVC row number', worst_row)
print('best SVC coarse', svc_grid_nln_scores.params[best_row])

# Show the winning candidate's validation and train metrics from the SVC score
# table. This previously referenced a score table from a different model that is
# never defined in this notebook.
validation_cols = svc_grid_nln_scores.columns.str.contains('mean_validation')
train_cols = svc_grid_nln_scores.columns.str.contains('mean_train')
display(svc_grid_nln_scores.loc[[best_row], validation_cols])
display(svc_grid_nln_scores.loc[[best_row], train_cols])

# Hyper-parameters of the winning candidate, reused when refitting the final model.
best_params = svc_grid_nln_scores.params[best_row]
svc_grid_nln_kernel = best_params['kernel']
svc_grid_nln_C = best_params['C']
svc_grid_nln_degree = best_params['degree']

print(svc_grid_nln_kernel, svc_grid_nln_C, svc_grid_nln_degree)

In [None]:
# scores
# Refit a single SVC with the hyper-parameters picked from the grid search
# (the search itself ran with refit=False, so it holds no fitted estimator).
svc_nln_best_fine = SVC(kernel=svc_grid_nln_kernel, C=svc_grid_nln_C, degree=svc_grid_nln_degree)
svc_nln_best_fine.fit(xtrain, ytrain)

# .score() on a classifier is mean accuracy. xtrain/ytrain are the
# SMOTE-resampled training data, while xtest/ytest are the untouched hold-out split.
svc_nln_best_train = svc_nln_best_fine.score(xtrain, ytrain)
svc_nln_best_test = svc_nln_best_fine.score(xtest, ytest)
print('svc nln coarse best train', round(svc_nln_best_train, 4))
print('svc nln coarse best test', round(svc_nln_best_test, 4))