In [1]:
import pandas as pd
import numpy as np
import researchpy as rp
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [2]:
# Read the insurance case-study records; the path is relative to the working directory.
data = pd.read_csv('./NIIT-insurance_case_study.csv')

In [3]:
# Preview the first rows to sanity-check how the columns were parsed.
data.head()

Unnamed: 0,No of Vehicle,Vacations/Year,Vacation Type,Brand concious,Hobbies,Existing Insurance Coverage,Medical History,Living Region,bmi,Age,Gender,Salary,Insurance Type
0,1,2,domestic,yes,tennis,PMCAH,no,north,33.77,22,male,30000,type1
1,2,2,domestic,no,badminton,ECAH,yes,west,33.0,25,female,35000,type2
2,0,3,domestic,yes,tennis,CACAH,no,west,22.705,28,male,32000,type1
3,1,1,international,yes,tennis,ECAH,yes,east,28.88,27,female,45000,type2
4,4,4,international,yes,golf,BCACAH,no,south,25.74,30,male,70000,type3


In [4]:
# The data consists of 30 rows and 13 columns (rows first, columns second).
data.shape

(30, 13)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,No of Vehicle,Vacations/Year,bmi,Age,Salary
count,30.0,30.0,30.0,30.0,30.0
mean,1.766667,2.7,30.664833,29.733333,43433.333333
std,1.165106,1.4657,5.773851,4.653018,15902.034859
min,0.0,1.0,17.385,19.0,20000.0
25%,1.0,2.0,26.2375,27.0,32000.0
50%,1.5,2.0,31.35,30.0,37000.0
75%,2.75,3.0,34.325,33.75,53750.0
max,4.0,6.0,42.13,38.0,80000.0


In [6]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 13 columns):
No of Vehicle                  30 non-null int64
Vacations/Year                 30 non-null int64
Vacation Type                  30 non-null object
Brand concious                 30 non-null object
Hobbies                        30 non-null object
Existing Insurance Coverage    30 non-null object
Medical History                30 non-null object
Living Region                  30 non-null object
bmi                            30 non-null float64
Age                            30 non-null int64
Gender                         30 non-null object
Salary                         30 non-null int64
Insurance Type                 30 non-null object
dtypes: float64(1), int64(4), object(8)
memory usage: 3.1+ KB


In [7]:
# Number of missing values in each column.
data.isna().sum()

No of Vehicle                  0
Vacations/Year                 0
Vacation Type                  0
Brand concious                 0
Hobbies                        0
Existing Insurance Coverage    0
Medical History                0
Living Region                  0
bmi                            0
Age                            0
Gender                         0
Salary                         0
Insurance Type                 0
dtype: int64

In [8]:
# Frequency table (count and percent) for the categorical columns.
# All eight object-dtype columns of the raw frame are summarised, 'Gender' included.
rp.summary_cat(data[[
    'Vacation Type',
    'Brand concious',
    'Hobbies',
    'Existing Insurance Coverage',
    'Medical History',
    'Living Region',
    'Gender',
    'Insurance Type',
]])

Unnamed: 0,Variable,Outcome,Count,Percent
0,Vacation Type,domestic,19,63.33
1,,international,11,36.67
2,Brand concious,yes,19,63.33
3,,no,11,36.67
4,Hobbies,tennis,9,30.0
5,,gym,9,30.0
6,,golf,7,23.33
7,,badminton,5,16.67
8,Existing Insurance Coverage,PMCAH,9,30.0
9,,CACAH,8,26.67


In [9]:
# Descriptive statistics (N, mean, SD, SE, 95% confidence interval) for the numeric columns.
rp.summary_cont(
    data[['No of Vehicle', 'Vacations/Year', 'bmi', 'Age', 'Salary']]
)





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,No of Vehicle,30.0,1.766667,1.165106,0.212718,1.331609,2.201724
1,Vacations/Year,30.0,2.7,1.4657,0.267599,2.152699,3.247301
2,bmi,30.0,30.664833,5.773851,1.054156,28.508842,32.820825
3,Age,30.0,29.733333,4.653018,0.849521,27.995868,31.470799
4,Salary,30.0,43433.333333,15902.034859,2903.301067,37495.41593,49371.250736


In [10]:
# Factorize the dependent variable: the result is a pair of
# (integer codes per row, unique labels in order of first appearance).
ins_factors = pd.factorize(data['Insurance Type'])

In [11]:
# Overwrite the label strings with their integer codes (first element of the factorize result).
data['Insurance Type']=ins_factors[0]

In [12]:
# Keep the unique labels (second element of the factorize result);
# they are needed later to translate predicted codes back into label names.
ins_definations=ins_factors[1]

In [13]:
# One-hot encode the remaining categorical columns. With drop_first=True the
# first level of every feature is dropped, leaving k-1 indicator columns per
# k-level feature.
data = pd.get_dummies(data=data, drop_first=True)

In [14]:
# Inspect the frame after encoding: the categorical columns are now indicator columns.
data.head()

Unnamed: 0,No of Vehicle,Vacations/Year,bmi,Age,Salary,Insurance Type,Vacation Type_international,Brand concious_yes,Hobbies_golf,Hobbies_gym,Hobbies_tennis,Existing Insurance Coverage_CACAH,Existing Insurance Coverage_ECAH,Existing Insurance Coverage_PMCAH,Medical History_yes,Living Region_north,Living Region_south,Living Region_west,Gender_male
0,1,2,33.77,22,30000,0,0,1,0,0,1,0,0,1,0,1,0,0,1
1,2,2,33.0,25,35000,1,0,0,0,0,0,0,1,0,1,0,0,1,0
2,0,3,22.705,28,32000,0,0,1,0,0,1,1,0,0,0,0,0,1,1
3,1,1,28.88,27,45000,1,1,1,0,0,1,0,1,0,1,0,0,0,0
4,4,4,25.74,30,70000,2,1,1,1,0,0,0,0,0,0,0,1,0,1


In [15]:
# Confirm that every column is numeric after encoding and check the new shape.
data.dtypes,data.shape

(No of Vehicle                          int64
 Vacations/Year                         int64
 bmi                                  float64
 Age                                    int64
 Salary                                 int64
 Insurance Type                         int64
 Vacation Type_international            uint8
 Brand concious_yes                     uint8
 Hobbies_golf                           uint8
 Hobbies_gym                            uint8
 Hobbies_tennis                         uint8
 Existing Insurance Coverage_CACAH      uint8
 Existing Insurance Coverage_ECAH       uint8
 Existing Insurance Coverage_PMCAH      uint8
 Medical History_yes                    uint8
 Living Region_north                    uint8
 Living Region_south                    uint8
 Living Region_west                     uint8
 Gender_male                            uint8
 dtype: object, (30, 19))

In [16]:
# Dividing the data into train and test sets.
# NOTE(review): this import would normally sit with the other imports in the first cell.
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train, test = train_test_split(data, test_size=0.2, random_state=10)

In [18]:
# Row and column counts of the training and test partitions.
train.shape,test.shape

((24, 19), (6, 19))

In [19]:
# Separate the features from the target for the training partition.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
X_train = train.drop(columns='Insurance Type')
y_train = train['Insurance Type']

In [20]:
# Separate the features from the target for the test partition.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
X_test = test.drop(columns='Insurance Type')
y_test = test['Insurance Type']

In [21]:
# Unfitted standardiser; it is fitted on the training features before use.
scaler=StandardScaler()

In [22]:
# Scaling the independent variables. The scaler is fitted on the training
# features only and then applied unchanged to the test features, so no
# information from the test partition leaks into the fit.
# The columns are cast to float64 first: the frame mixes int64, uint8 and
# float64 columns, and older scikit-learn releases emit a
# DataConversionWarning when they have to convert integer input themselves.
X_train = scaler.fit_transform(X_train.astype('float64'))
X_test = scaler.transform(X_test.astype('float64'))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
# Random-forest classifier using the entropy split criterion, seeded with
# random_state=10 for repeatability.
# n_estimators is pinned to 10, the value this notebook was originally run
# with, because the library default changed to 100 in scikit-learn 0.22 and
# relying on it would change the results between versions.
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=10)

In [24]:
# Train the forest on the scaled training features and the encoded target.
classifier.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

In [25]:
# Predicting the test data with our trained model (the result holds integer class codes).
y_pred=classifier.predict(X_test)

In [26]:
# Reverse-factorizing the dependent variable so that the results are readable.
# The code -> label mapping is built from however many labels factorize found,
# rather than a hard-coded count of 3, so it stays complete if the number of
# insurance types in the data changes.
ins_reversefactor = dict(enumerate(ins_definations))
ins_reversefactor

{0: 'type1', 1: 'type2', 2: 'type3'}

In [27]:
# Translate the integer class codes of both the true and the predicted
# targets back into their label strings via the reverse-factor mapping.
y_test, y_pred = (
    np.vectorize(ins_reversefactor.get)(codes) for codes in (y_test, y_pred)
)

In [28]:
# Show the decoded actual and predicted labels side by side for a quick visual check.
print("This is y_test",y_test)
print("This is y_pred",y_pred)

This is y_test ['type2' 'type1' 'type2' 'type1' 'type2' 'type3']
This is y_pred ['type2' 'type2' 'type2' 'type1' 'type2' 'type3']


In [29]:
# Making a pandas cross table for visualizing the results: actual labels on
# the rows, predicted labels on the columns.
# The axis titles name the quantity being predicted here, the insurance type.
print(pd.crosstab(y_test, y_pred,
                  rownames=['Actual Insurance Type'],
                  colnames=['Predicted Insurance Type']))

Predicted Species  type1  type2  type3
Actual Species                        
type1                  1      1      0
type2                  0      3      0
type3                  0      0      1


In [30]:
# Empty comparison table with one column for the true labels and one for the predictions.
prediction = pd.DataFrame(columns=['Actual', 'Predicted'])

In [31]:
# Fill the comparison table: true labels next to the model's predictions.
prediction['Actual']=y_test
prediction['Predicted']=y_pred
# Bare expression so that the notebook renders the table.
prediction

Unnamed: 0,Actual,Predicted
0,type2,type2
1,type1,type2
2,type2,type2
3,type1,type1
4,type2,type2
5,type3,type3


In [32]:
# Checking the accuracy of the model

In [33]:
# Confusion matrix of the decoded labels (scikit-learn convention: rows are actual classes, columns are predicted classes).
print(confusion_matrix(y_test,y_pred))

[[1 1 0]
 [0 3 0]
 [0 0 1]]


In [34]:
# Overall accuracy on the test partition, expressed as a percentage.
print(accuracy_score(y_test,y_pred)*100)

83.33333333333334


In [35]:
# Per-class precision, recall, F1 and support; labels and sample_weight are
# left at their defaults (None).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       type1       1.00      0.50      0.67         2
       type2       0.75      1.00      0.86         3
       type3       1.00      1.00      1.00         1

   micro avg       0.83      0.83      0.83         6
   macro avg       0.92      0.83      0.84         6
weighted avg       0.88      0.83      0.82         6

