In [1]:
## Packages
# system
import os
import math
# data wrangling
import numpy as np
import pandas as pd
from scipy import stats
from dfply import *
import statistics
# plotting
import matplotlib.pyplot as plt
import seaborn as sns
# models
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold, StratifiedKFold, cross_val_score 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, recall_score, make_scorer, accuracy_score, balanced_accuracy_score, f1_score, precision_score, roc_auc_score
from sklearn.preprocessing import normalize, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
## Settings
%matplotlib inline
# Display settings: do not truncate the columns or rows of a rendered frame,
# and use a display width of 1000 characters.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = 1000

  import pandas.util.testing as tm


## Data Import

In [2]:
# Load the provider-level table, discard the leftover 'Unnamed: 0' column and
# key the frame by 'Provider'.
df = pd.read_csv('../processed_data/provider_final_new.csv')
df = df.drop(columns=['Unnamed: 0']).set_index('Provider')
display(df.shape)

# Make dummies
# Masking with ~isreal turns every numeric cell into NaN, so dropna(axis=1)
# keeps only the columns in which every value is non-numeric (text columns).
encode_cols = df[~df.applymap(np.isreal)].dropna(axis=1).columns.to_list()
print(encode_cols)

# Pick the Yes/No flag columns by name rather than by position (encode_cols[2:]):
# a positional slice silently encodes the wrong columns if the CSV column order
# ever changes. 'PotentialFraud' and 'Serves' are encoded separately below.
flag_cols = [col for col in encode_cols if col not in ('PotentialFraud', 'Serves')]
if flag_cols:
    df[flag_cols] = df[flag_cols].applymap(lambda x: 1 if x.lower() == 'yes' else 0)

# One-hot encode 'Serves', dropping the first level.
# NOTE(review): prefix 'Serves_' combined with get_dummies' default '_' separator
# produces 'Serves__<level>' column names; left unchanged in case later cells
# reference them.
dummies = pd.get_dummies(df['Serves'], prefix='Serves_', drop_first=True)
df_dum = pd.concat([df.drop('Serves', axis=1), dummies], axis=1)
display(df_dum.shape)

# this is for pos_label for precision & recall for gridsearchcv:
# the target becomes 1 for 'Yes' and 0 for anything else.
df_dum['PotentialFraud'] = np.where(df_dum['PotentialFraud'] == 'Yes', 1, 0)

(5410, 46)

['PotentialFraud', 'Serves', 'DupClaim_IP', 'DupClaimMultiState_IP', 'DupClaimMultiProvider_IP', 'DupClaimMultiStartDt_IP', 'DupClaim_OP', 'DupClaimMultiState_OP', 'DupClaimMultiProvider_OP', 'DupClaimMultiStartDt_OP']


(5410, 47)

## SVM

### train-test-split

In [4]:
# remove Duration MedianIP
df_dum_drop = df_dum.drop(['DurationMedianIP'], axis=1)

# train-test-split: hold out 20% of the rows, stratified on the target so both
# splits keep the same class ratio.
x = df_dum_drop.drop(['PotentialFraud'], axis=1)
y = df_dum_drop.PotentialFraud
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=4, stratify=y)

# upsampling by SMOTE, applied to the training split only (xtest / ytest are
# left untouched).
# fit_resample is the supported API; the old fit_sample alias was deprecated and
# has been removed from imbalanced-learn.
# NOTE(review): the resampled xtrain / ytrain are later passed to a cross-validated
# grid search, so synthetic samples can land in validation folds and inflate the
# CV scores. Consider moving SMOTE into an imblearn Pipeline inside the search.
sm = SMOTE(random_state=4)
xtrain, ytrain = sm.fit_resample(xtrain, ytrain)



In [None]:
svc = SVC()

# svc_params = [{'kernel':['linear', 'poly', 'rbf'], 'C':[0.1, 1, 10] , 'degree':[2,3,4]}]
svc_params = [{'kernel':['linear'], 'C':[0.1, 1, 10]}]
# svc_params = [{'kernel':['poly', 'rbf'], 'C':[0.1, 1, 10], 'degree':[3,4,5]}]

kf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 4) # cv = 5

scores = { 'recall': make_scorer(recall_score, pos_label = 1, average = 'binary'), 
          'f1': make_scorer(f1_score, pos_label = 1, average = 'binary'), # use binary not weighted
         'precision': make_scorer(precision_score, pos_label = 1, average = 'binary'),
         'accuracy': make_scorer(accuracy_score), # not use balanced_accurarcy_score
         'specificity': make_scorer(recall_score, pos_label = 0, average = 'binary'), 
         'roc_auc_score': make_scorer(roc_auc_score, average = 'macro')
         }


svc_grid_ln = GridSearchCV(estimator=svc, param_grid=svc_params, return_train_score=True, 
                       cv = kf, verbose=2, scoring=scores, refit=False)

%time svc_grid_ln.fit(xtrain, ytrain)

Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
