# Health Insurance | Job-A-Thon

# plan

Run PyCaret first, then move on to the next two using scikit-learn, then a neural net.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',50)

In [2]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / scikit-learn.
warnings.filterwarnings('ignore')

In [3]:
# df = df[df['Reco_Insurance_Type']=='Individual']
# df.drop(['ID','Region_Code','Reco_Insurance_Type','Lower_Age','Is_Spouse'],axis=1,inplace=True)

In [4]:
# Load the training data; the path is relative to the current working directory.
df = pd.read_csv('train.csv')

In [5]:
# Remove the ID and Region_Code columns from df in place.
df.drop(['ID','Region_Code'],axis=1,inplace=True)

In [6]:
# Single lookup shared by the three two-level columns below: string level -> integer code.
vals = {'Rented':1,'Owned':2,'Individual':1,'Joint':2,'No':0,'Yes':1}
cols = ['Accomodation_Type','Reco_Insurance_Type','Is_Spouse']

for col in cols:
    # replace() only touches values that are keys of `vals`; anything else is left as is.
    df[col] = df[col].replace(vals)

In [7]:
# Swap the open-ended '14+' label for the number 15 so the column can be cast to a numeric type later on.
df['Holding_Policy_Duration'] = df['Holding_Policy_Duration'].replace('14+',15)

#### fill nulls

In [8]:
# A missing duration is treated as an actual zero.
# A missing policy type is coded 0 (no existing policy), even though the column is handled as a category later.
cats = ['Holding_Policy_Duration', 'Holding_Policy_Type']
for cat in cats:
    df[cat] = df[cat].fillna(0)

#### feature type conversions

In [9]:
# Convert to numerical.
# The duration goes through float first and is then cast to int.
df['Holding_Policy_Duration'] = df['Holding_Policy_Duration'].astype(float).astype(int)
# astype(int) drops any fractional part of the premium.
df['Reco_Policy_Premium'] = df['Reco_Policy_Premium'].astype(int)

In [10]:
# Convert to category: object dtype marks these two columns as categorical for the
# dtype == 'O' scan that builds cat_vars further down.
df['Holding_Policy_Type'] = df['Holding_Policy_Type'].astype('O')
df['Reco_Policy_Cat'] = df['Reco_Policy_Cat'].astype('O')

In [11]:
# Separate the city column so it can be one-hot encoded later instead of being
# ordinal-encoded with the other object columns.
citydf = df['City_Code']
# citydf = pd.Dataframe(df['City_Code'])
# Series -> single-column DataFrame that keeps the original row index.
city = citydf.to_frame()

In [12]:
# Drop City_Code from df (in place) before the ordinal conversion; the `city` frame
# extracted above is concatenated back afterwards.
df.drop(['City_Code'],axis=1,inplace=True)

In [13]:
# Snapshot of df before the categorical encoding below.
# NOTE(review): df2 is not referenced again in the visible cells.
df2 = df.copy()

#### variable conversion / premium

In [14]:
# Names of all columns that are still stored with object dtype
cat_vars = df.select_dtypes(include=['object']).columns.tolist()

In [15]:
def replace_categories(df, var, target):
    """Ordinal-encode column ``var`` of ``df`` in place.

    Each category is replaced by its rank (starting at 0) when the categories
    are sorted by the mean of ``target`` within the category, lowest first.
    Values of ``var`` that have no entry in the ranking become NaN through
    ``Series.map``. Nothing is returned; ``df`` is modified.
    """
    # mean of the target per category, ascending
    mean_by_category = df.groupby([var])[target].mean()
    ranked_categories = mean_by_category.sort_values().index

    # category -> position in the ranking
    ordinal_label = dict(zip(ranked_categories, range(len(ranked_categories))))

    # overwrite the column with the integer ranks
    df[var] = df[var].map(ordinal_label)

In [16]:
# Ordinal-encode every object column in place, ranking its categories by mean Reco_Policy_Premium.
for var in cat_vars:
    replace_categories(df, var, 'Reco_Policy_Premium')

In [17]:
# df['Health Indicator'].describe()
# Fill the remaining missing Health Indicator values with the constant 2.
# NOTE(review): the choice of 2 is not explained here — presumably taken from the describe() output above; confirm.
df['Health Indicator'] = df['Health Indicator'].fillna(2)

In [18]:
# Cast to int; this relies on the fillna in the previous cell having removed every NaN.
df['Health Indicator'] = df['Health Indicator'].astype(int)

#### join tables

In [19]:
# Put City_Code back as the first column; concat along axis=1 aligns the two frames on their row index.
data = pd.concat([city,df],axis=1)

#### one hot encode

In [20]:
# One Hot Encode
# The dummy dtype is pinned to uint8: pandas >= 2.0 returns bool columns by default,
# while the next cell picks the dummy columns up by their uint8 dtype. On older pandas
# uint8 already was the default, so the result there is unchanged.
data = pd.get_dummies(data, drop_first=True, dtype='uint8')

In [24]:
# Convert the one-hot dummy columns to categorical dtype.
# get_dummies produces uint8 columns on pandas < 2.0 and bool columns from 2.0 on, so
# both dtypes are selected; selecting uint8 alone makes this loop a silent no-op on
# newer pandas.
for cat_cols in data.select_dtypes(include=['uint8', 'bool']).columns:
    data[cat_cols] = data[cat_cols].astype('category')

#### Test-Train Split | Standardize

In [28]:
# From here on df is the fully encoded frame (a copy of data, including the city dummies).
df = data.copy()

In [30]:
# Features are every column except the target; the target is the Response column
X = df.drop(columns=['Response'])
y = df['Response']

In [32]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

In [38]:
# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier

In [33]:
def model_visuals(model, X_test, y_test):
    '''Plot a row-normalised confusion matrix and the ROC curve of a fitted classifier.

    Parameters
    ----------
    model : fitted scikit-learn classifier handed to the plotting helpers
    X_test : features of the evaluation split
    y_test : true labels matching ``X_test``

    Returns
    -------
    None. The two-panel figure is displayed with ``plt.show()``.
    '''
    fig, axes = plt.subplots(figsize=(12, 6), ncols=2)

    # plot_confusion_matrix / plot_roc_curve were deprecated in scikit-learn 1.0 and
    # removed in 1.2. Use the Display.from_estimator constructors when the installed
    # version has them and fall back to the old helpers otherwise.
    cm_display = getattr(metrics, 'ConfusionMatrixDisplay', None)
    if hasattr(cm_display, 'from_estimator'):
        cm_display.from_estimator(model, X_test, y_test, normalize='true',
                                  cmap='Blues', ax=axes[0])
    else:
        metrics.plot_confusion_matrix(model, X_test, y_test, normalize='true',
                                      cmap='Blues', ax=axes[0])
    axes[0].set_title('Confusion Matrix')

    # ROC-AUC curve with the chance diagonal for reference
    roc_display = getattr(metrics, 'RocCurveDisplay', None)
    if hasattr(roc_display, 'from_estimator'):
        roc_display.from_estimator(model, X_test, y_test, ax=axes[1])
    else:
        metrics.plot_roc_curve(model, X_test, y_test, ax=axes[1])
    axes[1].plot([0, 1], [0, 1], ls=':')
    axes[1].set_title('ROC-AUC Plot')
    axes[1].grid()
    axes[1].legend()

    fig.tight_layout()
    plt.show()

In [34]:
# Test/Train split: 20% of the rows are held out; random_state pins the shuffle.
# NOTE(review): no stratify argument is passed although the class reports below show an
# imbalanced target (7794 vs 2383 test rows) — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [35]:
# Standardise the features: the scaling statistics are learned on the training
# split only and then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [39]:
# Gradient Boosting with default hyper-parameters, scored on the hold-out split
gb_clf = GradientBoostingClassifier().fit(X_train, y_train)
gb_prediction = gb_clf.predict(X_test)
gbclf_score = round(100 * accuracy_score(y_test, gb_prediction), 2)
print('Accuracy Percentage', gbclf_score, '\n')
print(classification_report(y_test, gb_prediction), '\n\n')
# model_visuals (gb_clf, X_test, y_test) # class report / plots

Accuracy Percentage 76.59 

              precision    recall  f1-score   support

           0       0.77      1.00      0.87      7794
           1       0.54      0.00      0.01      2383

    accuracy                           0.77     10177
   macro avg       0.65      0.50      0.44     10177
weighted avg       0.71      0.77      0.67     10177
 




In [40]:
# Logistic Regression with balanced class weights, scored on the hold-out split
logreg_clf = LogisticRegression(class_weight='balanced').fit(X_train, y_train)
logreg_prediction = logreg_clf.predict(X_test)
lrs = round(100 * accuracy_score(y_test, logreg_prediction), 2)
print('Accuracy Percentage', lrs, '\n')
print(classification_report(y_test, logreg_prediction), '\n\n')
# model_visuals (logreg_clf, X_test, y_test) # class report / plots

Accuracy Percentage 51.93 

              precision    recall  f1-score   support

           0       0.79      0.51      0.62      7794
           1       0.26      0.55      0.35      2383

    accuracy                           0.52     10177
   macro avg       0.52      0.53      0.48     10177
weighted avg       0.66      0.52      0.56     10177
 




In [41]:
# Random Forest with balanced class weights, scored on the hold-out split
ranfor_clf = RandomForestClassifier(class_weight='balanced').fit(X_train, y_train)
ranfor_prediction = ranfor_clf.predict(X_test)
random_forest_score = round(100 * accuracy_score(y_test, ranfor_prediction), 2)
print('Accuracy Percentage', random_forest_score, '\n')
print(classification_report(y_test, ranfor_prediction), '\n\n')
# model_visuals (ranfor_clf, X_test, y_test) # class report / plots

Accuracy Percentage 75.62 

              precision    recall  f1-score   support

           0       0.77      0.97      0.86      7794
           1       0.35      0.05      0.09      2383

    accuracy                           0.76     10177
   macro avg       0.56      0.51      0.47     10177
weighted avg       0.67      0.76      0.68     10177
 




In [42]:
# Support Vector Machine with balanced class weights, scored on the hold-out split
svm_clf = SVC(class_weight='balanced').fit(X_train, y_train)
svm_prediction = svm_clf.predict(X_test)
svm_score = round(100 * accuracy_score(y_test, svm_prediction), 2)
print('Accuracy Percentage', svm_score, '\n')
print(classification_report(y_test, svm_prediction), '\n\n')
# model_visuals (svm_clf, X_test, y_test) # class report / plots

Accuracy Percentage 51.18 

              precision    recall  f1-score   support

           0       0.80      0.49      0.60      7794
           1       0.26      0.60      0.36      2383

    accuracy                           0.51     10177
   macro avg       0.53      0.54      0.48     10177
weighted avg       0.67      0.51      0.55     10177
 




In [43]:
# K-Nearest Neighbors with default settings, scored on the hold-out split
knn_clf = KNeighborsClassifier().fit(X_train, y_train)
knn_prediction = knn_clf.predict(X_test)
knn_score = round(100 * accuracy_score(y_test, knn_prediction), 2)
print('Accuracy Percentage', knn_score, '\n')
print(classification_report(y_test, knn_prediction), '\n\n')
# model_visuals (knn_clf, X_test, y_test) # class report / plots

Accuracy Percentage 71.36 

              precision    recall  f1-score   support

           0       0.77      0.90      0.83      7794
           1       0.26      0.12      0.16      2383

    accuracy                           0.71     10177
   macro avg       0.51      0.51      0.49     10177
weighted avg       0.65      0.71      0.67     10177
 




In [44]:
# Gaussian Naive Bayes with default settings, scored on the hold-out split
gaussian_clf = GaussianNB().fit(X_train, y_train)
gaussian_prediction = gaussian_clf.predict(X_test)
gaussian_score = round(100 * accuracy_score(y_test, gaussian_prediction), 2)
print('Accuracy Percentage', gaussian_score, '\n')
print(classification_report(y_test, gaussian_prediction), '\n\n')
# model_visuals (gaussian_clf, X_test, y_test) # class report / plots

Accuracy Percentage 69.1 

              precision    recall  f1-score   support

           0       0.77      0.86      0.81      7794
           1       0.24      0.14      0.18      2383

    accuracy                           0.69     10177
   macro avg       0.50      0.50      0.49     10177
weighted avg       0.64      0.69      0.66     10177
 




In [45]:
# Decision Tree with balanced class weights, scored on the hold-out split
dectree_clf = DecisionTreeClassifier(class_weight='balanced').fit(X_train, y_train)
dectree_prediction = dectree_clf.predict(X_test)
decision_tree_score = round(100 * accuracy_score(y_test, dectree_prediction), 2)
print('Accuracy Percentage', decision_tree_score, '\n')
print(classification_report(y_test, dectree_prediction), '\n\n')
# model_visuals (dectree_clf, X_test, y_test) # class report / plots

Accuracy Percentage 66.15 

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      7794
           1       0.28      0.29      0.29      2383

    accuracy                           0.66     10177
   macro avg       0.53      0.53      0.53     10177
weighted avg       0.66      0.66      0.66     10177
 




In [46]:
# AdaBoost with default settings, scored on the hold-out split
adabst_clf = AdaBoostClassifier().fit(X_train, y_train)
adabst_prediction = adabst_clf.predict(X_test)
adabst_score = round(100 * accuracy_score(y_test, adabst_prediction), 2)
print('Accuracy Percentage', adabst_score, '\n')
print(classification_report(y_test, adabst_prediction), '\n\n')
# model_visuals (adabst_clf, X_test, y_test) # class report / plots

Accuracy Percentage 76.58 

              precision    recall  f1-score   support

           0       0.77      1.00      0.87      7794
           1       0.00      0.00      0.00      2383

    accuracy                           0.77     10177
   macro avg       0.38      0.50      0.43     10177
weighted avg       0.59      0.77      0.66     10177
 




#### interpret models

In [47]:
# One (model name, hold-out accuracy in percent) row per classifier fitted above
models = pd.DataFrame(
    [
        ('Logistic Regression', lrs),
        ('KNN', knn_score),
        ('Random Forest', random_forest_score),
        ('Gaussian Naive Bayes', gaussian_score),
        ('Support Vector Machine (SVC)', svm_score),
        ('Decision Tree', decision_tree_score),
        ('AdaBoost Classifier', adabst_score),
        ('Gradient Boosting Classifier', gbclf_score),
    ],
    columns=['Model', 'Score'],
)

# Best accuracy first; the sorted copy is only displayed, `models` keeps its original order
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
7,Gradient Boosting Classifier,76.59
6,AdaBoost Classifier,76.58
2,Random Forest,75.62
1,KNN,71.36
3,Gaussian Naive Bayes,69.1
5,Decision Tree,66.15
0,Logistic Regression,51.93
4,Support Vector Machine (SVC),51.18


#### Class Imbalance

#### Pycaret

In [49]:
# Independent copy of the encoded frame (features plus Response) for the PyCaret section.
dataset = df.copy()

In [50]:
import pycaret
import pycaret.preprocess as preprocess
from pycaret.datasets import get_data
from pycaret.classification import *
import pycaret.preprocess as preprocess

In [51]:
# Random 80% of the rows (fixed seed) go into the modelling set ...
data = dataset.sample(frac=0.80, random_state=786)
# ... and the rows that were not sampled are held back as unseen data, re-indexed from 0
data_unseen = dataset.drop(index=data.index).reset_index(drop=True)

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

Data for Modeling: (40706, 46)
Unseen Data For Predictions: (10176, 46)


In [53]:
# Initialise the PyCaret experiment on the modelling set with a fixed session seed;
# the columns listed under numeric_features are declared numeric explicitly
clf1 = setup(
    data=data,
    target='Response',
    session_id=123,
    numeric_features=[
        'Accomodation_Type',
        'Reco_Insurance_Type',
        'Is_Spouse',
        'Health Indicator',
        'Holding_Policy_Duration',
        'Holding_Policy_Type',
    ],
)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Binary
2,Label Encoded,"0: 0, 1: 1"
3,Original Data,"(40706, 46)"
4,Missing Values,False
5,Numeric Features,10
6,Categorical Features,35
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [54]:
# testpycar = setup(data=data,target='Response',fix_imbalance=True,session_id=123,normalize=True,n_jobs=-1,feature_ratio=False,use_gpu=True)

In [55]:
# Run PyCaret's model comparison on the experiment configured by setup().
# As the last expression of the cell, the call's return value is what the notebook displays.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Ada Boost Classifier,0.7594,0.6109,0.0007,0.35,0.0015,0.001,0.0119,1.1612
1,Logistic Regression,0.7593,0.5109,0.0,0.0,0.0,0.0,0.0,0.2516
2,Ridge Classifier,0.7593,0.0,0.0,0.0,0.0,0.0,0.0,0.0433
3,Linear Discriminant Analysis,0.7593,0.5126,0.0,0.0,0.0,0.0,0.0,0.2495
4,Extreme Gradient Boosting,0.7592,0.6377,0.0023,0.5375,0.0046,0.0023,0.0206,1.2572
5,CatBoost Classifier,0.7589,0.6491,0.0429,0.4913,0.0788,0.0418,0.0857,11.8974
6,Light Gradient Boosting Machine,0.7587,0.652,0.0296,0.4768,0.0557,0.0285,0.068,0.4156
7,Gradient Boosting Classifier,0.7586,0.6379,0.0019,0.2795,0.0038,0.0006,0.0041,2.9318
8,Naive Bayes,0.7556,0.5114,0.0052,0.2505,0.0102,-0.0018,-0.0032,0.0281
9,Random Forest Classifier,0.7362,0.5614,0.08,0.3125,0.1273,0.0324,0.043,0.1169


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=123)

In [None]:
# GBC = create_model('gbc')
# tuned_GBC = tune_model(GBC, optimize = 'AUC')
# print(tuned_GBC)
# plot_model(tuned_GBC, plot = 'auc')
# plot_model(tuned_GBC, plot = 'confusion_matrix')
# # Final check using the unseen data / test
# predict_model(tuned_GBC);

# # Final Model / trained on the complete dataset
# final_GBC = finalize_model(tuned_GBC)

# evaluate_model(final_GBC)

# plot_model(final_GBC, plot='feature')

# # Interpret model | SHAP
# interpret_model(final_GBC)

In [None]:
#### neural network

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense,Dropout
# from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# early_stop = EarlyStopping(monitor='val_loss',mode='min',verbose=1,patience=25)

In [None]:
# model = Sequential()

# model.add(Dense(21,activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(18,activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(15,activation='relu'))
# model.add(Dropout(0.5))

# # Binary Classification
# model.add(Dense(1,activation='sigmoid'))

# model.compile(loss='binary_crossentropy',optimizer='adam')

In [None]:
# model.fit(x=X_train,y=y_train,epochs=500,validation_data=(X_test,y_test),
#           callbacks=[early_stop])

In [None]:
# model_loss = pd.DataFrame(model.history.history)
# model_loss.plot(figsize=(10,6));

In [None]:
# model.summary()

In [None]:
# # model.predict_classes(X_test)
# predictions = model.predict_classes(X_test)

In [None]:
# print(classification_report(y_test,predictions))