In [1]:
#import basic modules
import pandas as pd 
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt        
%matplotlib inline

#import feature selection modules
from sklearn.feature_selection import mutual_info_classif,RFE,RFECV

#import classification modules
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#import classification evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve 
from sklearn.metrics import f1_score

In [2]:
#data load function
def load_data():
    """Read ``medicalfraud.csv`` (resolved against the working directory) into a DataFrame."""
    return pd.read_csv("medicalfraud.csv")

In [3]:
#data cleaning function
def cleaningup(dmfraud):
    """Placeholder cleaning step.

    No cleaning rules are implemented yet: the function prints a status
    message and hands back the very same object it received.
    """
    # cleaning rules go here, before the status message
    status = "dmfraud is all cleaned up.."
    print(status)
    return dmfraud

In [5]:
#basic analysis
def basicanalysis(dmfraud):
    """Print the shape, column index, dtypes and ``describe()`` summary of ``dmfraud``.

    Nothing is returned; each report is written to stdout under its own heading.
    """
    reports = (
        ("Shape is:\n", lambda frame: frame.shape),
        ("Columns are:\n", lambda frame: frame.columns),
        ("Types are:\n", lambda frame: frame.dtypes),
        ("Statistical Analysis of Numerical Columns:\n", lambda frame: frame.describe()),
    )
    for heading, compute in reports:
        # each value is computed right before it is printed, one report at a time
        print(heading, compute(dmfraud))

In [6]:
#string column analysis
def stringcolanalysis(dmfraud):
    """Draw a horizontal bar chart of value counts for every non-numeric column.

    All charts share one figure laid out two per row, and the finished figure
    is written once to ``Categorical.png``.

    Parameters
    ----------
    dmfraud : pandas.DataFrame
        Frame whose non-numeric columns (``select_dtypes(exclude=np.number)``)
        are plotted.

    Returns
    -------
    None. When the frame has no non-numeric column, nothing is drawn or saved.
    """
    stringcols = dmfraud.select_dtypes(exclude=np.number)
    n_plots = stringcols.shape[1]
    if n_plots == 0:
        return

    ncols = 2
    # keep the 4x2 layout for up to eight columns and add rows beyond that,
    # so a frame with more columns no longer overflows the grid
    nrows = max(4, -(-n_plots // ncols))
    fig = plt.figure(figsize=(8, 2.5 * nrows))
    for i, col in enumerate(stringcols.columns):
        ax = fig.add_subplot(nrows, ncols, i + 1)
        dmfraud[col].value_counts().plot(kind='barh', color='black', fontsize=10, ax=ax)
        ax.set_title(col)

    # lay out and save only after every subplot (and its title) exists;
    # saving inside the loop wrote the file before the last chart was drawn
    fig.tight_layout()
    fig.savefig('Categorical.png')

In [7]:
#numerical analysis
#histograms and boxplots for all numerical columns
#scatter plots (seaborn heatmaps)
def numcolanalysis(dmfraud):
    """Draw one box plot per numeric column and save them all to ``Numerical.png``.

    The plots are panels of a single figure (up to three per row), so the saved
    file contains every numeric column. Previously each column got its own
    figure and every figure was written to the same file name, leaving only
    the last column on disk.

    Parameters
    ----------
    dmfraud : pandas.DataFrame
        Frame whose numeric columns (``select_dtypes(include=np.number)``)
        are plotted.

    Returns
    -------
    None. When the frame has no numeric column, nothing is drawn or saved.
    """
    numcols = dmfraud.select_dtypes(include=np.number)
    n_plots = numcols.shape[1]
    if n_plots == 0:
        return

    ncols = min(3, n_plots)
    nrows = -(-n_plots // ncols)  # ceiling division
    fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 5 * nrows), squeeze=False)
    panels = axes.ravel()
    for ax, col in zip(panels, numcols.columns):
        # the column is passed by keyword so it is always treated as the x variable
        sb.boxplot(x=dmfraud[col], color='grey', linewidth=1, ax=ax)
        ax.set_title(col)
    # hide the unused panels of the last row
    for ax in panels[n_plots:]:
        ax.set_visible(False)

    fig.tight_layout()
    fig.savefig("Numerical.png")

In [14]:
# Preview the first rows of the frame.
# NOTE(review): no cell above this one assigns `dmfraud`; this preview presumably relied on a
# frame left in the kernel by an earlier interactive run - confirm it survives Restart & Run All.
dmfraud.head()

Unnamed: 0,amount_paid_to_date,number_presc_to_date,max_presc_to_date,max_presc_per_doctor,max_presc_per_hospital,max_presc_per_year,id,FRAUD_LABEL,amount_paid_per_year,amount_paid_per_hospital,amount_paid_per_doctor,amount_paid_to_prescription,amount_paid_total,number_presc_per_year,number_presc_per_hospital,number_presc_per_doctor,number_presc_to_prescription,number_presc_total
0,100.0,0,1,1,0,0,21,False,109.111328,71.316356,24.658738,40.935309,400.0,120,37,30,35,400
1,100.58801,9,1,1,9,9,34,False,0.942975,48.316478,27.854219,24.471927,402.352042,285,55,35,46,402
2,100.402561,0,1,1,0,0,35,False,159.592229,53.969161,26.888177,37.658748,401.610244,156,64,21,32,402
3,100.0,3,1,1,84,9,38,False,59.203803,43.843904,48.933583,19.248757,400.0,253,27,35,46,400
4,100.0,22,1,1,22,22,24,False,95.780922,38.195557,43.876891,40.236061,400.0,206,47,49,46,400


In [None]:
#splitting manually
def traintestsplit(dmfraud,split,random, label_col=''):
    """Separate the label column from the features and make one train/test split.

    Parameters
    ----------
    dmfraud : pandas.DataFrame
        Frame holding the features and the label column.
    split : forwarded to ``train_test_split`` as ``test_size``.
    random : forwarded to ``train_test_split`` as ``random_state``.
    label_col : str
        Name of the label column. The default ``''`` is only a placeholder,
        so callers are expected to pass the real column name.

    Returns
    -------
    tuple
        ``(X, trainX, testX, trainY, testY)`` where ``X`` is the full feature
        frame (``dmfraud`` without the label column).

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``dmfraud``.
    """
    # fail with an explicit message instead of the bare KeyError: '' that the
    # placeholder default used to produce
    if label_col not in dmfraud.columns:
        raise KeyError(
            "label_col={!r} is not a column of the frame; pass the name of "
            "the label column via label_col".format(label_col)
        )

    #make a copy of the label column and store in y
    y = dmfraud[label_col].copy()

    #feature frame: everything except the label (dmfraud itself is not modified)
    X = dmfraud.drop(label_col,axis=1)

    #manual split
    trainX, testX, trainY, testY= train_test_split(X, y, test_size=split, random_state=random)

    return X, trainX, testX, trainY, testY

In [None]:
#splitting through cross validation (you can try out different splitting methods)
#KFold, RepeatedKFold, StratifiedKFold, StratifiedShuffleSplit
def cross_valid(X,y,split,repeat,random):
    """Return the train/test partition of the last ``RepeatedKFold`` split.

    ``split``, ``repeat`` and ``random`` are forwarded as ``n_splits``,
    ``n_repeats`` and ``random_state``. Every split is generated, but only
    the final one is kept, so the result is a single partition rather than
    a cross-validated set of folds.

    Returns ``(trainX, trainY, testX, testY)`` - note the order, which
    differs from ``train_test_split``.
    """
    splitter = RepeatedKFold(n_splits=split, n_repeats=repeat, random_state=random)
    # exhaust the generator; the loop variables keep the indices of the last split
    for fit_rows, holdout_rows in splitter.split(X):
        pass
    return X.iloc[fit_rows], y.iloc[fit_rows], X.iloc[holdout_rows], y.iloc[holdout_rows]

In [None]:
def validationmetrics(model,testX,testY):
    """Print hold-out classification metrics for a fitted binary classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    testX : features passed to ``model.predict``.
    testY : true labels for ``testX``; precision and recall are computed with
        ``1`` as the positive (fraud) class.

    Returns
    -------
    None. The prediction vector, accuracy, precision, recall (all three
    scaled to percent), AUC, F-score and confusion matrix are printed.
    """
    predictions = model.predict(testX)
    print("Prediction Vector: \n", predictions)

    #Accuracy
    print("Accuracy: \n", accuracy_score(testY, predictions)*100)

    #Precision
    print("Precision of Fraud Happening: \n", precision_score(testY, predictions,pos_label=1,labels=[0,1])*100)

    #Recall
    print("Recall of Fraud Happening: \n", recall_score(testY, predictions,pos_label=1,labels=[0,1])*100)

    #AUC
    #roc_auc_score replaces roc_curve + auc: `auc` was never imported, so the
    #old call raised NameError. The score is still computed from the hard
    #class predictions, exactly as the roc_curve call did.
    #NOTE(review): scoring predict_proba output instead would give a
    #threshold-independent AUC - confirm which one is wanted.
    print("AUC of Fraud Happening: \n",roc_auc_score(testY, predictions))

    #F-Score
    print("F-Score OF Fraud Happening:\n", f1_score(testY, predictions))

    #confusion Matrix (rows are true labels, columns are predictions, order [0, 1])
    print("Confusion Matrix: \n", confusion_matrix(testY, predictions,labels=[0,1]))

## Random Forest Feature Selection (RFFS)

In [None]:
#determine the important features given by RFFS
def RFfeatureimportance(dmfraud, trainX, testX, trainY, testY, trees, random):
    """Fit a random forest, print its hold-out metrics and its feature importances.

    Parameters
    ----------
    dmfraud : pandas.DataFrame
        Used for the feature names only when ``trainX`` has no ``columns``.
    trainX, trainY : training features and labels the forest is fitted on.
    testX, testY : hold-out features and labels passed to ``validationmetrics``.
    trees : forwarded to ``RandomForestClassifier`` as ``n_estimators``.
    random : forwarded to ``RandomForestClassifier`` as ``random_state``.

    Returns
    -------
    None. Importances are printed in percent, largest first.
    """
    clf  = RandomForestClassifier(n_estimators=trees, random_state=random)
    clf.fit(trainX,trainY)
    validationmetrics(clf,testX,testY)

    # feature_importances_ has one entry per fitted column, so label it with
    # trainX's columns; dmfraud.columns only matches when dmfraud holds exactly
    # those columns (for example, not when it still contains the label)
    feature_names = getattr(trainX, "columns", None)
    if feature_names is None:
        feature_names = dmfraud.columns.values
    importances = pd.Series(clf.feature_importances_, index=feature_names)
    print(importances.sort_values(ascending=False)*100)

In [None]:
#select features with importance >=threshold
def MachineLearningwithRFFS(impftrs=None):
    """Train and evaluate LogReg and KNN on the features picked by RF importance.

    Parameters
    ----------
    impftrs : sequence of str, optional
        Names of the selected feature columns followed by the label column
        as the LAST entry.

    Raises
    ------
    ValueError
        If ``impftrs`` is missing or empty. The empty list that used to be
        hard-coded here always ended in a ``KeyError`` inside ``traintestsplit``.
    """
    if not impftrs:
        raise ValueError(
            "impftrs is empty: pass the selected feature columns with the "
            "label column as the last entry"
        )
    impftrs = list(impftrs)

    dmfraud = load_data()
    dmfraud = cleaningup(dmfraud)
    # NOTE(review): stringcolencoding is not defined in the visible notebook - confirm it exists
    dmfraud = stringcolencoding(dmfraud)
    dmfraud = dmfraud[impftrs]
    # the last selected column is the label, so name it explicitly for the split;
    # dmfraud is rebound to the feature-only frame that traintestsplit returns first
    dmfraud, trainX, testX, trainY, testY = traintestsplit(dmfraud,0.2,91,label_col=impftrs[-1])

    print("\n\n Results for Logistic Regression.....")
    LogReg(dmfraud, trainX, testX, trainY, testY)

    print("\n\n Results for KNN.....")
    KNN(dmfraud, trainX, testX, trainY, testY)

## Mutual Information Feature Selection (MIFS)

In [None]:
#determine the important features given by MIFS
def mutualinformation(dmfraud, label_col='FRAUD_LABEL'):
    """Rank the features by mutual information with the label and save the ranking.

    Parameters
    ----------
    dmfraud : pandas.DataFrame
        Frame holding the features and the label column.
    label_col : str
        Name of the label column. Defaults to ``'FRAUD_LABEL'``, the label
        column shown in this notebook's data preview; the blank name that
        was hard-coded here before could never match a column.

    Returns
    -------
    None. The scores (times 100, largest first) are written to
    ``sortedfeatures.csv``.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``dmfraud``.
    """
    if label_col not in dmfraud.columns:
        raise KeyError("label column {!r} not found in the frame".format(label_col))

    #make a copy of the label column and store in y
    y = dmfraud[label_col].copy()
    X = dmfraud.drop(label_col,axis=1)

    mutual_info = mutual_info_classif(X,y,random_state=35)
    results = pd.Series(mutual_info, index=X.columns).sort_values(ascending=False)*100
    results.to_csv("sortedfeatures.csv")

In [None]:
#select features with importance >=threshold
def MachineLearningwithMIFS(impftrs=None):
    """Train and evaluate LogReg and KNN on the features picked by mutual information.

    Parameters
    ----------
    impftrs : sequence of str, optional
        Names of the selected feature columns followed by the label column
        as the LAST entry.

    Raises
    ------
    ValueError
        If ``impftrs`` is missing or empty. The empty list that used to be
        hard-coded here always ended in a ``KeyError`` inside ``traintestsplit``.
    """
    if not impftrs:
        raise ValueError(
            "impftrs is empty: pass the selected feature columns with the "
            "label column as the last entry"
        )
    impftrs = list(impftrs)

    dmfraud = load_data()
    dmfraud = cleaningup(dmfraud)
    # NOTE(review): stringcolencoding is not defined in the visible notebook - confirm it exists
    dmfraud = stringcolencoding(dmfraud)
    dmfraud = dmfraud[impftrs]
    # the last selected column is the label, so name it explicitly for the split;
    # dmfraud is rebound to the feature-only frame that traintestsplit returns first
    dmfraud, trainX, testX, trainY, testY = traintestsplit(dmfraud,0.2,91,label_col=impftrs[-1])

    print("\n\n Results for Logistic Regression.....")
    LogReg(dmfraud, trainX, testX, trainY, testY)

    print("\n\n Results for KNN.....")
    KNN(dmfraud, trainX, testX, trainY, testY)

## Recursive Feature Elimination (RFE) with Cross-Validation

In [None]:
def XYsplit(dmfraud, label_col='FRAUD_LABEL'):
    """Split a frame into its feature columns and its label column.

    Parameters
    ----------
    dmfraud : pandas.DataFrame
        Frame holding the features and the label column.
    label_col : str
        Name of the label column. Defaults to ``'FRAUD_LABEL'``, the label
        column shown in this notebook's data preview (the name hard-coded
        here before does not appear in that preview).

    Returns
    -------
    tuple
        ``(X, y)``: the frame without the label column, and a copy of the
        label column. ``dmfraud`` itself is not modified.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``dmfraud``.
    """
    y = dmfraud[label_col].copy()
    X = dmfraud.drop(label_col,axis=1)
    return X,y

In [None]:
def LogRegRECV(X, y, random, split,repeat):
    """RFECV feature selection with logistic regression, then fit and score one split.

    The selector is fitted on the complete ``X``/``y`` with ``cv=split``; the
    surviving columns are then partitioned by ``cross_valid`` (which yields a
    single train/test partition) and a fresh logistic regression is fitted
    and reported through ``validationmetrics``.
    """
    selector = RFECV(
        estimator=LogisticRegression(solver='liblinear',penalty='l2'),
        step=1,
        cv=split,
    ).fit(X, y)

    # keep only the columns the selector marked as supporting features
    kept_columns = X.columns[selector.support_].tolist()
    X = X[kept_columns]

    trainX, trainY, testX, testY = cross_valid(X, y, split, repeat, random)
    final_model = LogisticRegression(solver='liblinear',penalty='l2')
    final_model.fit(trainX, trainY)
    validationmetrics(final_model, testX, testY)

In [None]:
def KNNRECV(X, y, trees, random, split,repeat):
    """RFECV feature selection driven by a random forest, then fit and score KNN.

    A forest with ``trees`` estimators ranks the features inside ``RFECV``
    (``cv=split``) on the complete ``X``/``y``. The surviving columns are
    partitioned by ``cross_valid`` (a single train/test partition) and a
    default ``KNeighborsClassifier`` is fitted and reported through
    ``validationmetrics``.
    """
    ranking_forest = RandomForestClassifier(n_estimators=trees, random_state=random)
    selector = RFECV(estimator=ranking_forest, step=1, cv=split).fit(X, y)

    # keep only the columns the selector marked as supporting features
    kept_columns = X.columns[selector.support_].tolist()
    X = X[kept_columns]

    trainX, trainY, testX, testY = cross_valid(X, y, split, repeat, random)
    neighbours = KNeighborsClassifier()
    neighbours.fit(trainX, trainY)
    validationmetrics(neighbours, testX, testY)
    

## Recursive Feature Elimination (RFE) without Cross-Validation

In [None]:
def XYsplit(dmfraud, label_col='FRAUD_LABEL'):
    """Split a frame into its feature columns and its label column.

    Parameters
    ----------
    dmfraud : pandas.DataFrame
        Frame holding the features and the label column.
    label_col : str
        Name of the label column. Defaults to ``'FRAUD_LABEL'``, the label
        column shown in this notebook's data preview; the blank name that
        was hard-coded here before could never match a column.

    Returns
    -------
    tuple
        ``(X, y)``: the frame without the label column, and a copy of the
        label column. ``dmfraud`` itself is not modified.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``dmfraud``.
    """
    y = dmfraud[label_col].copy()
    X = dmfraud.drop(label_col,axis=1)
    return X,y

In [None]:
def LogRegRE(X, y, random, split):
    """RFE feature selection with logistic regression, then fit and score on a hold-out.

    The selector is fitted on the complete ``X``/``y``. The surviving columns
    are split with ``train_test_split`` (``test_size=split``,
    ``random_state=random``) and a fresh logistic regression is fitted and
    reported through ``validationmetrics``.
    """
    selector = RFE(
        estimator=LogisticRegression(solver='liblinear',penalty='l2'),
        step=1,
    ).fit(X, y)

    # keep only the columns the selector marked as supporting features
    kept_columns = X.columns[selector.support_].tolist()
    X = X[kept_columns]

    trainX, testX, trainY, testY = train_test_split(X, y, test_size=split, random_state=random)
    final_model = LogisticRegression(solver='liblinear',penalty='l2')
    final_model.fit(trainX, trainY)
    validationmetrics(final_model, testX, testY)

In [None]:
def KNNRE(X, y, trees, random, split):
    """RFE feature selection driven by a random forest, then fit and score KNN.

    A forest with ``trees`` estimators ranks the features inside ``RFE`` on
    the complete ``X``/``y``. The surviving columns are split with
    ``train_test_split`` (``test_size=split``, ``random_state=random``) and a
    default ``KNeighborsClassifier`` is fitted and reported through
    ``validationmetrics``.
    """
    ranking_forest = RandomForestClassifier(n_estimators=trees, random_state=random)
    selector = RFE(estimator=ranking_forest, step=1).fit(X, y)

    # keep only the columns the selector marked as supporting features
    kept_columns = X.columns[selector.support_].tolist()
    X = X[kept_columns]

    trainX, testX, trainY, testY = train_test_split(X, y, test_size=split, random_state=random)
    neighbours = KNeighborsClassifier()
    neighbours.fit(trainX, trainY)
    validationmetrics(neighbours, testX, testY)
    

## Results for RFE without Cross Validation

In [None]:
# Reload, clean and encode the data, then run RFE + logistic regression on a 20% hold-out.
dmfraud = stringcolencoding(cleaningup(load_data()))
X, y = XYsplit(dmfraud)
print("LOGISTIC REGRESSION")
LogRegRE(X, y, random=65, split=0.2)

In [None]:
# Reload, clean and encode the data, then run forest-driven RFE + KNN on a 20% hold-out.
dmfraud = stringcolencoding(cleaningup(load_data()))
X, y = XYsplit(dmfraud)
print("KNNRE")
KNNRE(X, y, trees=100, random=59, split=0.2)

## Results for RFE with cross validation

In [None]:
# Reload, clean and encode the data, then run RFECV + logistic regression.
dmfraud = load_data()
dmfraud = cleaningup(dmfraud)
dmfraud = stringcolencoding(dmfraud)
X, y = XYsplit(dmfraud)
print("LOGISTIC REGRESSION")
# LogRegRECV takes (X, y, random, split, repeat): split is an integer fold count used for
# both RFECV's cv and RepeatedKFold's n_splits, and repeat is required. The previous call
# passed 0.2 as the fold count and omitted repeat, so it raised a TypeError.
# NOTE(review): split=10 and repeat=10 mirror the KNNRECV call in the next cell - confirm.
LogRegRECV(X, y, random=65, split=10, repeat=10)

In [None]:
# Uses the X and y already in the kernel (this cell does not reload the data).
# The positional arguments map to trees=200, random=70, split=10, repeat=10.
print("KNNRE")
KNNRECV(X,y,200,70,10,10)

## Algorithms

In [None]:
def LogReg(dmfraud, trainX, testX, trainY, testY):
    """Fit a default ``LogisticRegression`` on the training split and print its hold-out metrics.

    ``dmfraud`` is accepted for signature parity with the other model
    helpers but is not used.
    """
    model = LogisticRegression()
    model.fit(trainX, trainY)
    validationmetrics(model, testX, testY)

In [None]:
def KNN(dmfraud, trainX, testX, trainY, testY):
    """Fit a default ``KNeighborsClassifier`` on the training split and print its hold-out metrics.

    ``dmfraud`` is accepted for signature parity with the other model
    helpers but is not used.
    """
    model = KNeighborsClassifier()
    model.fit(trainX, trainY)
    validationmetrics(model, testX, testY)

## Control Center - II

In [None]:
#Control Center (Initiate)
dmfraud = load_data()
dmfraud = cleaningup(dmfraud)
#basicanalysis(dmfraud)
#stringcolanalysis(dmfraud)
#numcolanalysis(dmfraud)
# NOTE(review): stringcolencoding is not defined in the visible notebook - confirm it exists
dmfraud = stringcolencoding(dmfraud)
# Name the label column explicitly: traintestsplit's default label_col is the blank
# placeholder '', which raises KeyError. FRAUD_LABEL is the label column in the data preview.
# dmfraud is rebound to the feature-only frame that traintestsplit returns first.
dmfraud, trainX, testX, trainY, testY = traintestsplit(dmfraud,0.2,91,label_col='FRAUD_LABEL')
#applying different feature selection methods
#RFfeatureimportance(dmfraud, trainX, testX, trainY, testY, 1000, 65)
# NOTE(review): MLwithRFFtrImp and MLwithRFFtrImpCV are not defined in the visible notebook;
# confirm they exist (MachineLearningwithRFFS may be the intended name for the first one).
print("\n\n\n#########ML WITH RF WITHOUT CROSS VALIDATION#######\n\n\n")
MLwithRFFtrImp()
print("\n\n\n#########ML WITH RF WITH CROSS VALIDATION#######\n\n\n")
MLwithRFFtrImpCV(10,10,65)
