In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_curve, precision_score, recall_score, confusion_matrix, auc
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load

## Utility functions

In [3]:
def generate_data(x, models):
    '''Build the meta-feature matrix that is fed to the custom (stacked) model.

    Every base learner in ``models`` is asked to predict on ``x``; the k
    prediction vectors are stacked into one array and transposed, so that
    column j of the result holds the predictions of ``models[j]``
    (assuming each ``predict`` call returns a 1-D array).
    '''
    per_model_predictions = [learner.predict(x) for learner in models]
    return np.array(per_model_predictions).T

In [1]:
# Mount Google Drive (Colab only). The scaler, the saved models and the CSV used
# by the cells below are all read from paths under /content/drive, so this cell
# has to run before any of them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Final pipeline

In [11]:
def final_fun_1(X, artifact_dir=None):
    '''Predict, for every healthcare provider in X, whether it is a potential fraud.

    The pipeline scales the selected features with the saved scaler, lets every
    saved base learner predict on the scaled data, and feeds those predictions
    (one column per base learner) to the saved custom model.

    Parameters
    ----------
    X : pandas.DataFrame
        One row per provider. Must contain a 'Provider' id column and the 20
        feature columns listed in ``feature_columns`` below. Any other column
        is ignored. X itself is not modified.
    artifact_dir : str, optional
        Directory that holds 'scaler.pkl', 'best_custom_model.joblib' and a
        'BaseModel' sub-directory with one saved base learner per file.
        Defaults to the Google Drive folder used by this notebook.

    Returns
    -------
    pandas.DataFrame
        'Provider' as first column, then the 20 feature columns (unscaled, as
        given in X), then 'PotentialFraud' with the custom model's prediction.

    Raises
    ------
    KeyError
        If 'Provider' or any of the feature columns is missing from X.
    '''

    if artifact_dir is None:
        artifact_dir = '/content/drive/MyDrive/Colab Notebooks/Self_Case_study/1st case study'

    # Columns passed to the scaler, in this exact order.
    feature_columns = [
        'InscClaimAmtReimbursed',
        'PerProvider_mean_InscClaimAmtReimbursed',
        'PerOperatingPhysician_mean_InscClaimAmtReimbursed',
        'PerAttendingPhysician_mean_InscClaimAmtReimbursed',
        'Days_Admitted',
        'Hospitalization_Duration',
        'PerProvider_mean_Hospitalization_Duration',
        'ChronicCond_rheumatoidarthritis',
        'ChronicCond_ObstrPulmonary',
        'PerOtherPhysician_mean_Hospitalization_Duration',
        'PerAttendingPhysician_mean_DeductibleAmtPaid',
        'PerOtherPhysician_mean_InscClaimAmtReimbursed',
        'ChronicCond_stroke',
        'PerOperatingPhysician_mean_DeductibleAmtPaid',
        'ExtraClaimDays',
        'PerOperatingPhysician_mean_Hospitalization_Duration',
        'is_inpatient',
        'ChronicCond_Cancer',
        'ChronicCond_Alzheimer',
        'DeductibleAmtPaid',
    ]

    # Fail early, and with a readable message, before touching any file.
    missing = [col for col in ['Provider'] + feature_columns if col not in X.columns]
    if missing:
        raise KeyError(f"Input is missing required column(s): {missing}")

    # Provider ids are kept aside; they are not a model feature.
    provider_ids = X['Provider'].values
    features = X[feature_columns]

    # NOTE(security): pickle/joblib loading executes code stored in the file;
    # only point artifact_dir at files you trust.
    with open(os.path.join(artifact_dir, 'scaler.pkl'), 'rb') as f:
        scaler = pickle.load(f)
    X_scaled = scaler.transform(features)

    # Loading all base learners.
    # NOTE(review): os.listdir returns entries in arbitrary order, and the order
    # of `models` decides the column order of the meta features. Confirm it
    # matches the order used when the custom model was trained.
    base_model_dir = os.path.join(artifact_dir, 'BaseModel')
    models = [load(os.path.join(base_model_dir, name)) for name in os.listdir(base_model_dir)]

    # Loading custom model
    custom_model = load(os.path.join(artifact_dir, 'best_custom_model.joblib'))

    # Predictions: base-learner outputs become the custom model's input.
    x_meta = generate_data(X_scaled, models)
    y_pred = custom_model.predict(x_meta)

    # Work on an explicit copy so the caller's frame is never written to.
    all_predictions = features.copy()
    all_predictions['PotentialFraud'] = y_pred
    all_predictions.insert(0, "Provider", provider_ids)

    return all_predictions

In [12]:
def final_fun_2(X, Y):
    '''Evaluate the pipeline against known labels.

    Runs final_fun_1 on X and prints the F1 score, recall and precision of the
    predicted 'PotentialFraud' column compared with the actual values Y.
    Nothing is returned.
    '''
    y_pred = final_fun_1(X)['PotentialFraud']

    # (label, metric) pairs, reported in this order.
    reports = (
        ("F1 score for data: ", f1_score),
        ("Recall for data: ", recall_score),
        ("Precision for data: ", precision_score),
    )
    for label, metric in reports:
        print(label, metric(Y, y_pred))
    

In [6]:
# Load the final training table, then split it into the feature frame X
# (every column except the label) and the label array Y.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Self_Case_study/train_final.csv')
X = df.drop(columns=['PotentialFraud'])
Y = df['PotentialFraud'].values

In [None]:
# Take the first row as a one-row DataFrame and confirm its type.
single_x = X.iloc[:1]
print(type(single_x))

<class 'pandas.core.frame.DataFrame'>


In [13]:
# Run the pipeline on a single provider (first row only) to check that it
# also works for one-row input.
single_x = X.iloc[:1]
final_fun_1(single_x)

Unnamed: 0,Provider,InscClaimAmtReimbursed,PerProvider_mean_InscClaimAmtReimbursed,PerOperatingPhysician_mean_InscClaimAmtReimbursed,PerAttendingPhysician_mean_InscClaimAmtReimbursed,Days_Admitted,Hospitalization_Duration,PerProvider_mean_Hospitalization_Duration,ChronicCond_rheumatoidarthritis,ChronicCond_ObstrPulmonary,...,PerOtherPhysician_mean_InscClaimAmtReimbursed,ChronicCond_stroke,PerOperatingPhysician_mean_DeductibleAmtPaid,ExtraClaimDays,PerOperatingPhysician_mean_Hospitalization_Duration,is_inpatient,ChronicCond_Cancer,ChronicCond_Alzheimer,DeductibleAmtPaid,PotentialFraud
0,PRV51001,104640,104640.0,57048.346298,104022.857143,25,30.0,30.0,8,10,...,63109.467885,6,2973.295139,0.0,20.340776,5.0,5,15,5340.0,0


In [14]:
# Testing final_fun_1: score the full feature frame and preview the first rows
# (provider id, the feature columns used by the model, and the predicted
# PotentialFraud label).

results = final_fun_1(X)
results.head()

Unnamed: 0,Provider,InscClaimAmtReimbursed,PerProvider_mean_InscClaimAmtReimbursed,PerOperatingPhysician_mean_InscClaimAmtReimbursed,PerAttendingPhysician_mean_InscClaimAmtReimbursed,Days_Admitted,Hospitalization_Duration,PerProvider_mean_Hospitalization_Duration,ChronicCond_rheumatoidarthritis,ChronicCond_ObstrPulmonary,...,PerOtherPhysician_mean_InscClaimAmtReimbursed,ChronicCond_stroke,PerOperatingPhysician_mean_DeductibleAmtPaid,ExtraClaimDays,PerOperatingPhysician_mean_Hospitalization_Duration,is_inpatient,ChronicCond_Cancer,ChronicCond_Alzheimer,DeductibleAmtPaid,PotentialFraud
0,PRV51001,104640,104640.0,57048.346298,104022.857143,25,30.0,30.0,8,10,...,63109.467885,6,2973.295139,0.0,20.340776,5.0,5,15,5340.0,0
1,PRV51003,605670,605670.0,494155.806397,605660.0,320,382.0,382.0,38,41,...,141046.870912,12,46362.233854,0.0,296.882374,62.0,10,56,66286.0,1
2,PRV51004,52170,52170.0,77487.912419,95525.068627,0,0.0,0.0,46,41,...,128915.615873,17,5819.500347,0.0,30.478732,0.0,16,64,310.0,0
3,PRV51005,280910,280910.0,551685.694626,280360.954907,0,0.0,0.0,331,295,...,964639.629123,124,40368.465797,0.0,204.667576,0.0,165,426,3700.0,1
4,PRV51007,33710,33710.0,38903.610323,32038.632219,16,19.0,19.0,22,16,...,61137.034847,12,3588.456845,0.0,18.022327,3.0,12,26,3264.0,0


In [9]:
# Testing final_fun_2: print F1, recall and precision of the pipeline on X
# against the labels Y.
# NOTE(review): X and Y are read from train_final.csv, so these are presumably
# training-set metrics — confirm before quoting them as model performance.
# NOTE(review): the saved output of this cell (a probability array and an
# "AUC score" line) is not produced by the final_fun_2 defined above; it looks
# like it came from an earlier version of the function. Re-run the notebook
# top to bottom to refresh it.
final_fun_2(X, Y)

[[0.98332046 0.01667954]
 [0.55981481 0.44018519]
 [0.98332046 0.01667954]
 ...
 [0.98332046 0.01667954]
 [0.98332046 0.01667954]
 [0.98332046 0.01667954]]
F1 score for data:  0.5513784461152882
Recall for data:  0.8695652173913043
Precision for data:  0.4036697247706422
AUC score for data:  0.8954150923018396
