<a href="https://colab.research.google.com/github/palakagl/CapstoneProject/blob/main/Insurance_Classification_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# Import library
import pandas  as pd #Data manipulation
import numpy as np #Data manipulation
import matplotlib.pyplot as plt # Visualization
import seaborn as sns #Visualization
import io

## for statistical tests
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import pickle

## for machine learning
from sklearn import model_selection, preprocessing, feature_selection, ensemble, linear_model, metrics, decomposition

from sklearn.metrics import accuracy_score,confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,r2_score,roc_auc_score

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [16]:
train_df = pd.read_csv('life_insurance_train.csv')
train_df = train_df.drop('Id',axis=1)
print('\nNumber of rows and columns in the data set: ',train_df.shape)
print('')

#Lets look into top few rows and columns in the dataset
train_df.head()


Number of rows and columns in the data set:  (31285, 127)



Unnamed: 0,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,...,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response
0,1,D3,10,0.076923,2,1,1,0.641791,0.581818,0.148536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
1,1,E1,26,0.076923,2,3,1,0.029851,0.745455,0.288703,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
2,1,D4,10,0.487179,2,3,1,0.164179,0.672727,0.205021,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
3,1,D2,26,0.230769,2,3,1,0.41791,0.654545,0.23431,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
4,1,D2,26,0.230769,3,1,1,0.507463,0.836364,0.299163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0


In [17]:
test_df = pd.read_csv('life_insurance_test.csv')
test_df = test_df.drop('Id',axis=1)
print('\nNumber of rows and columns in the data set: ',test_df.shape)
print('')

#Lets look into top few rows and columns in the dataset
test_df.head()


Number of rows and columns in the data set:  (17815, 126)



Unnamed: 0,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,...,Medical_Keyword_39,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48
0,1,A1,26,0.076923,2,3,1,0.059701,0.6,0.131799,...,0,0,0,0,0,0,0,0,0,0
1,1,D3,26,0.128205,2,3,1,0.537313,0.690909,0.309623,...,0,0,0,0,0,1,0,0,1,1
2,1,A2,26,0.102564,2,3,1,0.567164,0.618182,0.16318,...,0,0,0,0,0,0,0,0,0,0
3,1,D4,26,0.230769,2,3,1,0.179104,0.8,0.539749,...,0,0,0,0,0,0,0,0,0,0
4,1,A2,26,0.179487,2,3,1,0.164179,0.745455,0.288703,...,0,0,0,0,0,0,0,0,0,0


In [18]:
y_test = pd.read_csv('response_test.csv')
y_test = y_test.drop('Id',axis=1)
print('\nNumber of rows and columns in the data set: ',y_test.shape)
print('')

#Lets look into top few rows and columns in the dataset
y_test.head()


Number of rows and columns in the data set:  (17815, 1)



Unnamed: 0,Response
0,4
1,6
2,7
3,2
4,8


In [19]:
dftest = pd.read_csv('classfication_containerized_test.csv')

In [20]:
list_train = train_df.columns[train_df.isna().any()].tolist()

In [21]:
list_test =  test_df.columns[test_df.isna().any()].tolist()

In [22]:
for column in list_train:
    train_df[column].fillna(train_df[column].mean(), inplace=True)

In [23]:
for column in list_test:
    test_df[column].fillna(test_df[column].mean(), inplace=True)

In [24]:
X_test = test_df

In [25]:
num_columns = train_df.select_dtypes(np.number).columns.tolist()
cat_columns = train_df.select_dtypes('object').columns.tolist()
all_columns = num_columns + cat_columns  # this order will need to be preserved
print('Numerical columns:', ', '.join(num_columns))
print('Categorical columns:', ', '.join(cat_columns))

cat_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='N/A'),
    OneHotEncoder(handle_unknown='ignore', sparse=False)
)


Numerical columns: Product_Info_1, Product_Info_3, Product_Info_4, Product_Info_5, Product_Info_6, Product_Info_7, Ins_Age, Ht, Wt, BMI, Employment_Info_1, Employment_Info_2, Employment_Info_3, Employment_Info_4, Employment_Info_5, Employment_Info_6, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_5, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5, Medical_History_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_10, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_15, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_Hi

In [26]:
y_train = train_df[['Response']]# Dependent variable
X_train = train_df.drop('Response',axis=1) # Independet variable

In [27]:
X_train.shape
y_train.shape

(31285, 126)

(31285, 1)

Random Forest Classifier

In [28]:
# Fit/predict
clf6 = RandomForestClassifier(n_estimators= 850)
rfclassifier_model = make_pipeline(cat_pipe, clf6)
rfclassifier_model = rfclassifier_model.fit(X_train, y_train)
y_pred = rfclassifier_model.predict(X_test)
print(classification_report(y_test, y_pred))

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


ValueError: ignored

In [None]:
#Save to file in the current working directory
pkl_filename = "rfclassifier_model.pkl"
with open (pkl_filename, 'wb') as file:
  pickle.dump(rfclassifier_model, file)
#Load from file
with open (pkl_filename, 'rb' ) as file:
  pickle_model = pickle.load(file)

In [None]:
dftest.head()

In [None]:
# make predictions on the test set

y_predtest_rf = rfclassifier_model.predict(dftest)
print(y_predtest_rf)

In [None]:
# Fit/predict
clf7 = XGBClassifier(learning_rate= 0.05, max_depth= 500, n_estimators= 100)
xgbclassfier_model = make_pipeline(cat_pipe, clf7)

xgbclassfier_model = xgbclassfier_model.fit(X_train, y_train)
y_pred = xgbclassfier_model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
#Save to file in the current working directory
pkl_filename = "xgbclassfier_model.pkl"
with open (pkl_filename, 'wb') as file:
  pickle.dump(xgbclassfier_model, file)
#Load from file
with open (pkl_filename, 'rb' ) as file:
  pickle_model = pickle.load(file)

In [None]:
# make predictions on the test set

y_predtest_xgb = xgbclassfier_model.predict(dftest)
print(y_predtest_xgb)