## **Overall predictive pipeline**

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("/home/lyrax/matplotlib-dracula/dracula.mplstyle")
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix, classification_report, RocCurveDisplay, roc_auc_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import datetime

from tqdm import tqdm, trange
import warnings; warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("data/data_clean.csv", parse_dates=['participant_date_of_birth', 'treatment_created_date',
                                                       'claim_finalized_date'])
data.head(3)

Unnamed: 0,Serial Number,claim_status,provider_type,provider_region,program_cover,participant_date_of_birth,participant_gender,treatment_created_date,claim_finalized_date,item_status,item_name,item_amount,item_quantity,total_item_amount,diagnoses
0,162967.0,Approved,Specialist,NAIROBI,DENTAL,1987-12-19,FEMALE,2021-01-19 10:45:21,2021-01-19 22:38:08,APPROVED,Simple Extraction Permanent,3000.0,1,3000.0,Dental caries
1,162967.0,Approved,Specialist,NAIROBI,DENTAL,1987-12-19,FEMALE,2021-01-19 10:45:21,2021-01-19 22:38:08,APPROVED,Dental Consultation,1500.0,1,1500.0,Dental caries
2,164883.0,Approved,Specialist,NAIROBI,DENTAL,1977-01-01,FEMALE,2021-01-22 10:49:08,2021-01-28 10:48:12,APPROVED,Dental consultation,1500.0,1,1500.0,Dental caries


In [4]:
# extract a test set from the dataframe
X_train, X_test, y_train, y_test = train_test_split(data.drop('claim_status', axis=1),
                                                    data.claim_status, test_size=0.1, stratify=data.claim_status, random_state=101)
data = X_train.join(y_train); data.reset_index(drop=True, inplace=True)
test = X_test.join(y_test); test.reset_index(drop=True, inplace=True)

In [5]:
#test.to_csv("test.csv", index=False)

### **Overall Feature Engineering function**

In [26]:
def feature_engineer(data):
    # label encoder object
    le = LabelEncoder()
    
    # new features from dates
    data['participant_age'] = ((pd.to_datetime(datetime.date.today()) - data.participant_date_of_birth).dt.days) / 364
    
    data['participant_yearOB'] = data.participant_date_of_birth.dt.year.astype(int)
    data['participant_monthOB'] = data.participant_date_of_birth.dt.month.astype(int)
    data['participant_dayOB'] = data.participant_date_of_birth.dt.day.astype(int)
    
    data['treat_cr_year'] = data.treatment_created_date.dt.year.astype(int)
    data['treat_cr_month'] = data.treatment_created_date.dt.month.astype(int)
    data['treat_cr_day'] = data.treatment_created_date.dt.day.astype(int)
    data['treat_cr_weekday'] = pd.Series(data.treatment_created_date.dt.weekday).apply(lambda x: 1 if x<5 else 0).astype(int)
    
    data['claim_final_year'] = data.claim_finalized_date.dt.year.astype(int)
    data['claim_final_month'] = data.claim_finalized_date.dt.month.astype(int)
    data['claim_final_day'] = data.claim_finalized_date.dt.day.astype(int)
    data['claim_final_weekday'] = pd.Series(data.claim_finalized_date.dt.weekday).apply(lambda x: 1 if x<5 else 0).astype(int)
    
    # days from treatment creation to claim finalization
    data['treat_claim_diff'] = (data.claim_finalized_date - data.treatment_created_date).dt.days
    
    # categorize some continuous variables from information on plots above
    data['totals_cat'] = np.select([
        data.total_item_amount <= 1200,
        (data.total_item_amount > 1200) & (data.total_item_amount <= 2300),
        (data.total_item_amount > 2300) & (data.total_item_amount <= 4500),
        (data.total_item_amount > 4500) & (data.total_item_amount <= 10000),
        data.total_item_amount > 10000
    ], [3, 5, 1, 2, 4])
    
    
    data['itemq_cat'] = np.select([
        data.item_quantity <= 200,
        data.item_quantity > 200
    ], [0, 1])
    
    
    data['age_cat'] = np.select([
        data.participant_age <= 20,
        (data.participant_age > 20) & (data.participant_age <= 40),
        (data.participant_age > 40) & (data.participant_age <= 60),
        data.participant_age > 60
    ], [3, 0, 2, 1])
    
    # some combination features
    data['prov_typeXreg'] = le.fit_transform(data.provider_type + data.provider_region)
    data['prov_typeXcover'] = le.fit_transform(data.provider_type + data.program_cover)
    data['prov_regXcover'] = le.fit_transform(data.provider_region + data.program_cover)
    data['prov_typeXgen'] = le.fit_transform(data.provider_type + data.participant_gender)
    data['prov_regXgen'] = le.fit_transform(data.provider_region + data.participant_gender)
    data['coverXgen'] = le.fit_transform(data.program_cover + data.participant_gender)
    data['regXstatus'] = le.fit_transform(data.provider_region + data.item_status)
    data['statusXgen'] = le.fit_transform(data.item_status + data.participant_gender)
    data['coverXstatus'] = le.fit_transform(data.program_cover + data.item_status)
    
    # encoding categorical features
    for col in ['provider_type', 'provider_region', 'program_cover', 'participant_gender', 'item_status', 'item_name']:
        data[col] = le.fit_transform(data[col])
    
    # convert 'continuous' columns with < 5 unique values to categorical
    for col in data.select_dtypes(np.number).columns:
        if data[col].nunique() < 5:
            data[col] = le.fit_transform(data[col])
        else:
            pass
        
    return data

In [27]:
# apply function to data
data = feature_engineer(data); test = feature_engineer(test)
display(data.sample(3))
print(data.shape)

Unnamed: 0,Serial Number,provider_type,provider_region,program_cover,participant_date_of_birth,participant_gender,treatment_created_date,claim_finalized_date,item_status,item_name,item_amount,item_quantity,total_item_amount,diagnoses,claim_status,participant_age,participant_yearOB,participant_monthOB,participant_dayOB,treat_cr_year,treat_cr_month,treat_cr_day,treat_cr_weekday,claim_final_year,claim_final_month,claim_final_day,claim_final_weekday,treat_claim_diff,totals_cat,itemq_cat,age_cat,prov_typeXreg,prov_typeXcover,prov_regXcover,prov_typeXgen,prov_regXgen,coverXgen,regXstatus,statusXgen,coverXstatus
81525,189769.0,0,29,4,1973-11-23,1,2021-03-01 14:51:53,2021-03-09 08:25:57,0,676,1376.0,1,1376.0,Acute nasopharyngitis [common cold],Approved,48.266484,1973,11,23,1,3,1,1,1,3,9,1,7,5,0,2,29,4,95,1,58,9,80,1,10
30259,168708.0,0,29,4,2013-03-03,0,2021-01-28 14:57:16,2021-01-28 19:19:29,0,676,1687.0,1,1687.0,Acute tonsillitis; Atopic dermatitis,Approved,8.857143,2013,3,3,1,1,28,1,1,1,28,1,0,5,0,3,29,4,95,0,57,8,80,0,10
149208,201710.0,0,29,4,1980-09-30,1,2021-03-19 09:08:20,2021-03-23 17:04:57,0,5089,52.0,30,1560.0,Metabolic syndrome,Approved,41.39011,1980,9,30,1,3,19,1,1,3,23,1,4,5,0,2,29,4,95,1,58,9,80,1,10


(150454, 40)


## **Algorithms testing**

In [35]:
X = data[data.select_dtypes(np.number).columns[1:]]
y = data.claim_status

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=101)

models = [
    LogisticRegression(),
    SGDClassifier(),
    MultinomialNB(),
    RandomForestClassifier(),
    CatBoostClassifier(verbose=False),
    LGBMClassifier(),
    XGBClassifier()
]

for model in models:
    print(model.__class__.__name__, "\n", "="*40)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print(confusion_matrix(y_test, pred), "\n\n", classification_report(y_test, pred), "\nRoc-Auc: ",
         roc_auc_score(y_test.replace({'Approved':1, 'Not Approved':0}), pd.Series(pred).replace({'Approved':1, 'Not Approved':0})))

LogisticRegression 
[[74104  1024]
 [ 3420  5038]] 

               precision    recall  f1-score   support

    Approved       0.96      0.99      0.97     75128
Not Approved       0.83      0.60      0.69      8458

    accuracy                           0.95     83586
   macro avg       0.89      0.79      0.83     83586
weighted avg       0.94      0.95      0.94     83586
 
Roc-Auc:  0.7910095091371954
SGDClassifier 
[[73867  1261]
 [ 6638  1820]] 

               precision    recall  f1-score   support

    Approved       0.92      0.98      0.95     75128
Not Approved       0.59      0.22      0.32      8458

    accuracy                           0.91     83586
   macro avg       0.75      0.60      0.63     83586
weighted avg       0.88      0.91      0.89     83586
 
Roc-Auc:  0.5991981031808024
MultinomialNB 
[[59361 15767]
 [ 4815  3643]] 

               precision    recall  f1-score   support

    Approved       0.92      0.79      0.85     75128
Not Approved       0.19  

Ensemble and boosting algorithms are pretty accurate given that the labels for this dataset are imbalanced.

## **Final Model**

In [11]:
data.claim_status.map({'Approved': 1, 'Not Approved': 0})

0         1
1         1
2         1
3         1
4         1
         ..
167167    0
167168    0
167169    0
167170    0
167171    0
Name: claim_status, Length: 167172, dtype: int64

In [33]:
features = data.select_dtypes(np.number).columns[1:]
X = data[features]
y = data.claim_status.map({'Approved': 1, 'Not Approved': 0})

#==============================================================================================
# KFOLD CV
#==============================================================================================
oofs_df = np.zeros(len(X))
preds_df = []
oofs_probs = np.zeros(len(X))

NFOLDS = 5
fold = KFold(n_splits=NFOLDS)

for fold, (train_index, test_index) in enumerate(fold.split(X,y)):
    print(f"Fold {fold+1}")
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = XGBClassifier(objective='binary:logistic', eval_metric='logloss')
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)
    
    preds = model.predict(X_test)
    oofs_df[test_index] = preds
    oofs_probs[test_index] = model.predict_proba(X_test)[:, 1]
    print("    Roc-Auc: ", roc_auc_score(y_test, preds))
    
    p2 = model.predict_proba(test[features])[:, 1]
    preds_df.append(p2)
    
print(f'\nOOF Roc-Auc : {roc_auc_score(y,oofs_df)} ')


Fold 1
    Roc-Auc:  0.9992697926929478
Fold 2
    Roc-Auc:  0.998318670758415
Fold 3
    Roc-Auc:  0.9980786350891395
Fold 4
    Roc-Auc:  0.998435516712279
Fold 5
    Roc-Auc:  0.9994804270513319


NameError: name 'oofs_df_lgbm' is not defined

In [36]:
np.round(np.mean(preds_df, axis=0))

array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)

In [38]:
# test on all data and save model
import pickle

model = XGBClassifier(objective='binary:logistic', eval_metric='logloss')
model.fit(data[features], data.claim_status)
pickle.dump(model, open(f'{model.__class__.__name__}_model.pkl', 'wb'))

In [56]:
# load model
model = pickle.load(open(f'{model.__class__.__name__}_model.pkl', 'rb'))
model.predict(feature_engineer(pd.DataFrame(test.iloc[0:5]))[features])

array(['Approved', 'Approved', 'Approved', 'Approved', 'Approved'],
      dtype=object)

In [59]:
test.columns

Index(['Serial Number', 'provider_type', 'provider_region', 'program_cover',
       'participant_date_of_birth', 'participant_gender',
       'treatment_created_date', 'claim_finalized_date', 'item_status',
       'item_name', 'item_amount', 'item_quantity', 'total_item_amount',
       'diagnoses', 'claim_status', 'participant_age', 'participant_yearOB',
       'participant_monthOB', 'participant_dayOB', 'treat_cr_year',
       'treat_cr_month', 'treat_cr_day', 'treat_cr_weekday',
       'claim_final_year', 'claim_final_month', 'claim_final_day',
       'claim_final_weekday', 'treat_claim_diff', 'totals_cat', 'itemq_cat',
       'age_cat', 'prov_typeXreg', 'prov_typeXcover', 'prov_regXcover',
       'prov_typeXgen', 'prov_regXgen', 'coverXgen', 'regXstatus',
       'statusXgen', 'coverXstatus'],
      dtype='object')

In [69]:
[list(arr) for arr in test.head().values]

[[171272.0,
  1,
  1,
  0,
  Timestamp('1997-04-20 00:00:00'),
  1,
  Timestamp('2021-02-02 05:45:47'),
  Timestamp('2021-02-02 11:55:44'),
  0,
  3,
  40.0,
  2,
  800.0,
  'Other specified septicemia; Other inflammation of eyelid',
  'Approved',
  24.77747252747253,
  1997,
  1,
  2,
  0,
  1,
  2,
  0,
  0,
  1,
  2,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  3,
  1,
  1,
  1,
  1,
  0],
 [180355.0,
  0,
  3,
  0,
  Timestamp('2019-03-05 00:00:00'),
  1,
  Timestamp('2021-02-16 08:58:31'),
  Timestamp('2021-02-16 20:34:03'),
  0,
  2,
  600.0,
  0,
  600.0,
  'Upper respiratory tract hypersensitivity reaction site unspecified',
  'Approved',
  2.82967032967033,
  2019,
  0,
  0,
  0,
  1,
  16,
  0,
  0,
  1,
  16,
  0,
  0,
  1,
  0,
  1,
  2,
  0,
  3,
  1,
  4,
  1,
  3,
  1,
  0],
 [193970.0,
  1,
  2,
  0,
  Timestamp('2020-10-31 00:00:00'),
  0,
  Timestamp('2021-03-09 09:01:15'),
  Timestamp('2021-03-18 08:23:48'),
  0,
  4,
  862.0,
  0,
  862.0,
  'Acute nasopharyngitis [com

In [70]:
features

Index(['provider_type', 'provider_region', 'program_cover',
       'participant_gender', 'item_status', 'item_name', 'item_amount',
       'item_quantity', 'total_item_amount', 'participant_age',
       'participant_yearOB', 'participant_monthOB', 'participant_dayOB',
       'treat_cr_year', 'treat_cr_month', 'treat_cr_day', 'treat_cr_weekday',
       'claim_final_year', 'claim_final_month', 'claim_final_day',
       'claim_final_weekday', 'treat_claim_diff', 'totals_cat', 'itemq_cat',
       'age_cat', 'prov_typeXreg', 'prov_typeXcover', 'prov_regXcover',
       'prov_typeXgen', 'prov_regXgen', 'coverXgen', 'regXstatus',
       'statusXgen', 'coverXstatus'],
      dtype='object')

In [28]:
test.to_csv("test.csv", index=False)