#Getting Started


import and get a sense of data

In [8]:
import pandas as pd
import numpy as np

In [9]:
df = pd.read_csv("travel-insurance.csv", index_col=0) #import the dataset
df.head() #observe some data sample

Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,31,Government Sector,Yes,400000,6,1,No,No,0
1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0
2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1
3,28,Private Sector/Self Employed,Yes,700000,3,1,No,No,0
4,28,Private Sector/Self Employed,Yes,700000,8,1,Yes,No,0


Here follows the list of columns in the dataset:

* Age - Age of the customer
* Employment Type - The sector in which customer is employed
* GraduateOrNot - Whether the customer is college graduate or not
* AnnualIncome - The yearly income of the customer in indian rupees
* FamilyMembers - Number of members in customer's family
* ChronicDisease - Whether the customer suffers from any major disease or conditions like diabetes/high BP or asthama, etc.
* FrequentFlyer - Derived data based on customer's history of booking air tickets on atleast 4 different instances in the last 2 Years (2017-2019).
* EverTravelledAbroad - Has the customer ever travelled to a foreign country.
* TravelInsurance: (label) Did the customer buy travel insurance package during introductory offering held in the year 2019.

In [10]:
df.shape #get the overall shape of the data frame

(1987, 9)

In [11]:
df.info() #get the sense of the data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1987 entries, 0 to 1986
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  1987 non-null   int64 
 1   Employment Type      1987 non-null   object
 2   GraduateOrNot        1987 non-null   object
 3   AnnualIncome         1987 non-null   int64 
 4   FamilyMembers        1987 non-null   int64 
 5   ChronicDiseases      1987 non-null   int64 
 6   FrequentFlyer        1987 non-null   object
 7   EverTravelledAbroad  1987 non-null   object
 8   TravelInsurance      1987 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 155.2+ KB


# Cleaning the data

In [12]:
#checking the null data in the dataset
null_data = df.isnull().sum()
null_data

Age                    0
Employment Type        0
GraduateOrNot          0
AnnualIncome           0
FamilyMembers          0
ChronicDiseases        0
FrequentFlyer          0
EverTravelledAbroad    0
TravelInsurance        0
dtype: int64

We can see that the data is already clean

#Pre-process the Data

## Encode catergorical value

Since we have categorical values, we have to encode them

In [13]:
feature_names = df.columns.tolist() #get the list of all columns in the dataframe
feature_names

['Age',
 'Employment Type',
 'GraduateOrNot',
 'AnnualIncome',
 'FamilyMembers',
 'ChronicDiseases',
 'FrequentFlyer',
 'EverTravelledAbroad',
 'TravelInsurance']

In [14]:
feature_names = ['Employment Type',  
 'GraduateOrNot',
 'FrequentFlyer',
 'EverTravelledAbroad',
 'ChronicDiseases',
 'Age',
 'AnnualIncome',
 'FamilyMembers',
 'TravelInsurance']

df = df[feature_names] #rearrange the column so that all categorical columns are on the left of the dataframe
df.head()

Unnamed: 0,Employment Type,GraduateOrNot,FrequentFlyer,EverTravelledAbroad,ChronicDiseases,Age,AnnualIncome,FamilyMembers,TravelInsurance
0,Government Sector,Yes,No,No,1,31,400000,6,0
1,Private Sector/Self Employed,Yes,No,No,0,31,1250000,7,0
2,Private Sector/Self Employed,Yes,No,No,1,34,500000,4,1
3,Private Sector/Self Employed,Yes,No,No,1,28,700000,3,0
4,Private Sector/Self Employed,Yes,Yes,No,1,28,700000,8,0


In [15]:
from sklearn.preprocessing import LabelEncoder #import LabelEncoder
loc_encoder = LabelEncoder()

for i in range(4): #encode all categorical value
  col = feature_names[i]
  df[col] = loc_encoder.fit_transform(df[col])

df.head()

Unnamed: 0,Employment Type,GraduateOrNot,FrequentFlyer,EverTravelledAbroad,ChronicDiseases,Age,AnnualIncome,FamilyMembers,TravelInsurance
0,0,1,0,0,1,31,400000,6,0
1,1,1,0,0,0,31,1250000,7,0
2,1,1,0,0,1,34,500000,4,1
3,1,1,0,0,1,28,700000,3,0
4,1,1,1,0,1,28,700000,8,0


## determine feature label 



In [16]:
feature_names.remove("TravelInsurance") #remove the label columns
X = df[feature_names].values #assign data feature values to X

Y = df.TravelInsurance.values #assign label values to Y

In [17]:
# Show data feature shape
X.shape

(1987, 8)

In [18]:
# Show label shape
Y.shape

(1987,)

In [19]:
X

array([[      0,       1,       0, ...,      31,  400000,       6],
       [      1,       1,       0, ...,      31, 1250000,       7],
       [      1,       1,       0, ...,      34,  500000,       4],
       ...,
       [      1,       1,       0, ...,      28, 1150000,       6],
       [      1,       1,       1, ...,      34, 1000000,       6],
       [      1,       1,       0, ...,      34,  500000,       4]])

## Split train, test, validation sets

In [20]:
# Split the dataset to trainvalidation and test set
from sklearn.model_selection import train_test_split
X_trainandval, X_test, Y_trainandval, Y_test = train_test_split(X, Y, train_size=0.85, random_state=42)

In [21]:
# Split the dataset to train and validation set
X_train, X_validation, Y_train, Y_validation = train_test_split(X_trainandval, Y_trainandval, train_size=0.85, random_state=42)

Now we have 3 sets: Train, Validation and Test

# Train the model using the Gaussian without StandardScaler

## train the model

In [27]:
# Initialize and train Gaussian Naive Bayes model using X_train (data features) and y_train (data label)
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

naive_model = GaussianNB()
grid_search = {'var_smoothing': np.logspace(0,-9, num=100)}
nb_cv = GridSearchCV(naive_model, grid_search, cv=5)
nb_cv.fit(X_train, Y_train)

GridSearchCV(cv=5, estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e-02, 5.33669923e-02, 4.32876128e-02,
       3.51119173e-02, 2.84803587e-02, 2.31...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])})

##Validate the model

In [28]:
nb_cv.best_params_  # Show the best value of var_smoothing

{'var_smoothing': 0.43287612810830584}

In [29]:
nb_cv.best_score_ # Show the model performance with the best value of var_smoothing

0.7908067542213884

## test the model

In [30]:
naive_model = GaussianNB(var_smoothing = nb_cv.best_params_['var_smoothing']) #initialize the model with the best value of hyper parameter var_smoothing
naive_model.fit(X_train, Y_train) # train the model

GaussianNB(var_smoothing=0.43287612810830584)

In [31]:
# Impport libraries to calculate evaluation metrics: precision, recall, f1 score.
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
# Make prediction on the test data
predicted_label_gau_test = naive_model.predict(X_test)

# Calculate evaluation metrics by comparing the prediction with the data label y_test
print(precision_score(predicted_label_gau_test, Y_test))
print(recall_score(predicted_label_gau_test, Y_test))
print(f1_score(predicted_label_gau_test, Y_test))
print(classification_report(predicted_label_gau_test, Y_test))

0.44
0.9565217391304348
0.6027397260273973
              precision    recall  f1-score   support

           0       0.99      0.78      0.87       253
           1       0.44      0.96      0.60        46

    accuracy                           0.81       299
   macro avg       0.71      0.87      0.74       299
weighted avg       0.91      0.81      0.83       299



# Train the model using the Gaussian with StandardScaler

## train the model

In [32]:
# Standardize the data using Standard scaler
from sklearn.preprocessing import StandardScaler
normalizer = StandardScaler()
X_normal_train = normalizer.fit_transform(X_train) # Note that we use fit_transform() on training data so that it can learn the scaling parameters of that data.
X_normal_validation = normalizer.fit_transform(X_validation)     
X_normal_test = normalizer.transform(X_test)           # But we only transform() in test data using the learned scaling parameters.

In [33]:
# Initialize and train Gaussian Naive Bayes model using X_normal_train (data features) and y_train (data label)
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

naive_model = GaussianNB()
#naive_model.fit(X_normal_train, Y_train)
grid_search = {'var_smoothing': np.logspace(0,-9, num=100)}
nb_cv = GridSearchCV(naive_model, grid_search, cv=5)
nb_cv.fit(X_normal_train, Y_train)

GridSearchCV(cv=5, estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e-02, 5.33669923e-02, 4.32876128e-02,
       3.51119173e-02, 2.84803587e-02, 2.31...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])})

##Validate the model

In [34]:
nb_cv.best_params_  # Show the best value of var_smoothing

{'var_smoothing': 1.0}

In [35]:
nb_cv.best_score_ # Show the model performance with the best value of var_smoothing

0.7761628615287152

## test the model

In [36]:
naive_model = GaussianNB(var_smoothing = nb_cv.best_params_['var_smoothing']) #initialize the model with the best value of hyper parameter var_smoothing
naive_model.fit(X_normal_train, Y_train) # train the model

GaussianNB(var_smoothing=1.0)

In [37]:
# Make prediction on the test data
predicted_label_gau_test = naive_model.predict(X_normal_test)

# Calculate evaluation metrics by comparing the prediction with the data label y_test
print(precision_score(predicted_label_gau_test, Y_test))
print(recall_score(predicted_label_gau_test, Y_test))
print(f1_score(predicted_label_gau_test, Y_test))
print(classification_report(predicted_label_gau_test, Y_test))

0.5
0.819672131147541
0.6211180124223602
              precision    recall  f1-score   support

           0       0.94      0.79      0.86       238
           1       0.50      0.82      0.62        61

    accuracy                           0.80       299
   macro avg       0.72      0.80      0.74       299
weighted avg       0.85      0.80      0.81       299



We can see that StandardScaler does impact the model

# Train the Model using Mixed Naive Bayes

## train the model using train set

In [38]:
# Import mixed Naive Bayes library
!pip install git+https://github.com/remykarem/mixed-naive-bayes#egg=mixed_naive_bayes
from mixed_naive_bayes import MixedNB

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mixed_naive_bayes
  Cloning https://github.com/remykarem/mixed-naive-bayes to /tmp/pip-install-8ohrgx86/mixed-naive-bayes_d8470d3a70af49cebde4c538b804e102
  Running command git clone -q https://github.com/remykarem/mixed-naive-bayes /tmp/pip-install-8ohrgx86/mixed-naive-bayes_d8470d3a70af49cebde4c538b804e102
Building wheels for collected packages: mixed-naive-bayes
  Building wheel for mixed-naive-bayes (setup.py) ... [?25l[?25hdone
  Created wheel for mixed-naive-bayes: filename=mixed_naive_bayes-0.0.3-py3-none-any.whl size=10756 sha256=3d147c5f77f4cb9caba40c187ac96e0dc9550e2901de15b04a931b8a3d701d6e
  Stored in directory: /tmp/pip-ephem-wheel-cache-aeuf45si/wheels/f6/17/45/08ff7102e1201fe077c968291143479ecc63186e638bcce9f3
Successfully built mixed-naive-bayes
Installing collected packages: mixed-naive-bayes
Successfully installed mixed-naive-bayes-0.0.3


In [39]:
# Specify the indices of the features which are to follow the categorical distribution (first 5 columns).
clf = MixedNB(categorical_features=[0,1,2,3,4])

In [40]:
# Train the model using data features X_train and label Y_train
clf.fit(X_train,Y_train)

MixedNB(alpha=0.5, var_smoothing=1e-09)

##test the model

In [41]:
# Make prediction on the test data
predicted_label_mixed_test = clf.predict(X_test)

# Calculate evaluation metrics by comparing the prediction with the data label y_test
print(precision_score(predicted_label_mixed_test, Y_test))
print(recall_score(predicted_label_mixed_test, Y_test))
print(f1_score(predicted_label_mixed_test, Y_test))
print(classification_report(predicted_label_mixed_test, Y_test))

0.53
0.6973684210526315
0.6022727272727272
              precision    recall  f1-score   support

           0       0.88      0.79      0.83       223
           1       0.53      0.70      0.60        76

    accuracy                           0.77       299
   macro avg       0.71      0.74      0.72       299
weighted avg       0.79      0.77      0.78       299

