In [58]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
pd.pandas.set_option('display.max_columns', None)

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors    import KNeighborsClassifier
from sklearn.tree         import DecisionTreeClassifier
from sklearn.ensemble     import RandomForestClassifier
from sklearn.ensemble     import AdaBoostClassifier
from sklearn.ensemble     import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.metrics import roc_auc_score,roc_curve

In [3]:
df_train = pd.read_csv("train_insurance.csv")
df_test  = pd.read_csv("test_insurance.csv")

In [59]:
sum(df_train["OUTCOME"])/len(df_train["OUTCOME"])

0.4229592713524661

In [4]:
print(df_train.shape)
print(df_test.shape)

(87285, 18)
(17715, 17)


In [5]:
df_train.head()

Unnamed: 0,ID,AGE,GENDER,DRIVING_EXPERIENCE,EDUCATION,INCOME,CREDIT_SCORE,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,OUTCOME,TYPE_OF_VEHICLE
0,816393,40-64,female,20-29y,university,middle class,0.63805,0,after 2015,0,0,37379,11000,0,0,0,0,Sports Car
1,251762,26-39,male,20-29y,high school,middle class,0.475741,1,before 2015,1,0,10238,9000,0,0,0,1,HatchBack
2,481952,40-64,male,20-29y,none,middle class,0.839817,1,before 2015,1,1,10238,12000,0,0,0,1,Sedan
3,3506,40-64,male,20-29y,high school,upper class,0.682527,1,before 2015,0,1,92099,6000,1,0,0,1,Sedan
4,498013,40-64,female,20-29y,none,working class,0.572184,1,after 2015,1,1,32122,15000,0,0,1,0,Sedan


In [6]:
df_train.dtypes

ID                       int64
AGE                     object
GENDER                  object
DRIVING_EXPERIENCE      object
EDUCATION               object
INCOME                  object
CREDIT_SCORE           float64
VEHICLE_OWNERSHIP        int64
VEHICLE_YEAR            object
MARRIED                  int64
CHILDREN                 int64
POSTAL_CODE              int64
ANNUAL_MILEAGE           int64
SPEEDING_VIOLATIONS      int64
DUIS                     int64
PAST_ACCIDENTS           int64
OUTCOME                  int64
TYPE_OF_VEHICLE         object
dtype: object

In [7]:
# train
df_train["VEHICLE_OWNERSHIP"]      = df_train["VEHICLE_OWNERSHIP"].astype("object")
df_train["MARRIED"]                = df_train["MARRIED"].astype("object")
df_train["CHILDREN"]               = df_train["CHILDREN"].astype("object")
df_train["DUIS"]                   = df_train["DUIS"].astype("object")

# test
df_test["VEHICLE_OWNERSHIP"]      = df_test["VEHICLE_OWNERSHIP"].astype("object")
df_test["MARRIED"]                = df_test["MARRIED"].astype("object")
df_test["CHILDREN"]               = df_test["CHILDREN"].astype("object")
df_test["DUIS"]                   = df_test["DUIS"].astype("object")

In [8]:
for i in df_train:
    print(df_train[i].dtypes)
    print(df_train[i].value_counts())

int64
101       109
102        35
103         9
104         9
106         8
         ... 
858173      1
977142      1
607797      1
212047      1
164608      1
Name: ID, Length: 82657, dtype: int64
object
40-64    27967
65+      27398
26-39    16677
16-25    15243
Name: AGE, dtype: int64
object
male      54379
female    32906
Name: GENDER, dtype: int64
object
20-29y    31171
0-9y      27524
10-19y    20844
30y+       7746
Name: DRIVING_EXPERIENCE, dtype: int64
object
high school    38746
university     25966
none           22573
Name: EDUCATION, dtype: int64
object
upper class      42685
working class    20276
middle class     12203
poverty          12121
Name: INCOME, dtype: int64
float64
0.442071    2
0.583765    2
0.501793    2
0.505614    2
0.755054    2
           ..
0.688666    1
0.761721    1
0.528849    1
0.717558    1
0.763635    1
Name: CREDIT_SCORE, Length: 87262, dtype: int64
object
1    72251
0    15034
Name: VEHICLE_OWNERSHIP, dtype: int64
object
before 2015    47738
afte

In [9]:
df_train.isnull().sum()

ID                     0
AGE                    0
GENDER                 0
DRIVING_EXPERIENCE     0
EDUCATION              0
INCOME                 0
CREDIT_SCORE           0
VEHICLE_OWNERSHIP      0
VEHICLE_YEAR           0
MARRIED                0
CHILDREN               0
POSTAL_CODE            0
ANNUAL_MILEAGE         0
SPEEDING_VIOLATIONS    0
DUIS                   0
PAST_ACCIDENTS         0
OUTCOME                0
TYPE_OF_VEHICLE        0
dtype: int64

In [10]:
df_train.select_dtypes("object").nunique()

AGE                   4
GENDER                2
DRIVING_EXPERIENCE    4
EDUCATION             3
INCOME                4
VEHICLE_OWNERSHIP     2
VEHICLE_YEAR          2
MARRIED               2
CHILDREN              2
DUIS                  7
TYPE_OF_VEHICLE       4
dtype: int64

In [11]:
df_train.select_dtypes("object").head()

Unnamed: 0,AGE,GENDER,DRIVING_EXPERIENCE,EDUCATION,INCOME,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,DUIS,TYPE_OF_VEHICLE
0,40-64,female,20-29y,university,middle class,0,after 2015,0,0,0,Sports Car
1,26-39,male,20-29y,high school,middle class,1,before 2015,1,0,0,HatchBack
2,40-64,male,20-29y,none,middle class,1,before 2015,1,1,0,Sedan
3,40-64,male,20-29y,high school,upper class,1,before 2015,0,1,0,Sedan
4,40-64,female,20-29y,none,working class,1,after 2015,1,1,0,Sedan


In [12]:
# encoding trainng dataset
for i in df_train.select_dtypes("object"):
    if  df_train[i].nunique() == 2:
        LE             =   LabelEncoder()
        df_train[i]    =   LE.fit_transform(df_train[i])
    
    elif i             ==  "AGE":
        OE             =   OrdinalEncoder(categories=[["16-25","26-39","40-64","65+"]])
        df_train[i]    =   OE.fit_transform(df_train[[i]])
        
    elif i             == "EDUCATION":
        OE             =   OrdinalEncoder(categories=[["none","high school","university"]])
        df_train[i]    =   OE.fit_transform(df_train[[i]])
    
    elif i             == "DRIVING_EXPERIENCE":
        OE             =   OrdinalEncoder(categories=[["0-9y","10-19y","20-29y","30y+"]])
        df_train[i]    =   OE.fit_transform(df_train[[i]])
        
    elif i             == "INCOME":
        OE             =   OrdinalEncoder(categories=[["poverty","working class","middle class","upper class"]])
        df_train[i]    =   OE.fit_transform(df_train[[i]])
    
    elif i             in  ["DUIS","TYPE_OF_VEHICLE"]:
        dummy_df       =   pd.get_dummies(df_train[i], prefix="Category")
        df_train       =   df_train.drop(i,axis=1)
        df_train       =   pd.concat([df_train,dummy_df],axis=1)
        

  dummy_df       =   pd.get_dummies(df_train[i], prefix="Category")


In [13]:
# encoding for testing dataset
for i in df_test.select_dtypes("object"):
    if  df_test[i].nunique() == 2:
        LE             =   LabelEncoder()
        df_test[i]    =   LE.fit_transform(df_test[i])
    
    elif i             ==  "AGE":
        OE             =   OrdinalEncoder(categories=[["16-25","26-39","40-64","65+"]])
        df_test[i]    =   OE.fit_transform(df_test[[i]])
        
    elif i             == "EDUCATION":
        OE             =   OrdinalEncoder(categories=[["none","high school","university"]])
        df_test[i]    =   OE.fit_transform(df_test[[i]])
    
    elif i             == "DRIVING_EXPERIENCE":
        OE             =   OrdinalEncoder(categories=[["0-9y","10-19y","20-29y","30y+"]])
        df_test[i]    =   OE.fit_transform(df_test[[i]])
        
    elif i             == "INCOME":
        OE             =   OrdinalEncoder(categories=[["poverty","working class","middle class","upper class"]])
        df_test[i]    =   OE.fit_transform(df_test[[i]])
    
    elif i             in  ["DUIS","TYPE_OF_VEHICLE"]:
        dummy_df       =   pd.get_dummies(df_test[i], prefix="Category")
        df_test       =   df_test.drop(i,axis=1)
        df_test       =   pd.concat([df_test,dummy_df],axis=1)
        

  dummy_df       =   pd.get_dummies(df_test[i], prefix="Category")


In [14]:
df_train.head()

Unnamed: 0,ID,AGE,GENDER,DRIVING_EXPERIENCE,EDUCATION,INCOME,CREDIT_SCORE,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,PAST_ACCIDENTS,OUTCOME,Category_0,Category_1,Category_2,Category_3,Category_4,Category_5,Category_6,Category_HatchBack,Category_SUV,Category_Sedan,Category_Sports Car
0,816393,2.0,0,2.0,2.0,2.0,0.63805,0,0,0,0,37379,11000,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,251762,1.0,1,2.0,1.0,2.0,0.475741,1,1,1,0,10238,9000,0,0,1,1,0,0,0,0,0,0,1,0,0,0
2,481952,2.0,1,2.0,0.0,2.0,0.839817,1,1,1,1,10238,12000,0,0,1,1,0,0,0,0,0,0,0,0,1,0
3,3506,2.0,1,2.0,1.0,3.0,0.682527,1,1,0,1,92099,6000,1,0,1,1,0,0,0,0,0,0,0,0,1,0
4,498013,2.0,0,2.0,0.0,1.0,0.572184,1,0,1,1,32122,15000,0,1,0,1,0,0,0,0,0,0,0,0,1,0


In [15]:
df_train.columns

Index(['ID', 'AGE', 'GENDER', 'DRIVING_EXPERIENCE', 'EDUCATION', 'INCOME',
       'CREDIT_SCORE', 'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'MARRIED',
       'CHILDREN', 'POSTAL_CODE', 'ANNUAL_MILEAGE', 'SPEEDING_VIOLATIONS',
       'PAST_ACCIDENTS', 'OUTCOME', 'Category_0', 'Category_1', 'Category_2',
       'Category_3', 'Category_4', 'Category_5', 'Category_6',
       'Category_HatchBack', 'Category_SUV', 'Category_Sedan',
       'Category_Sports Car'],
      dtype='object')

In [16]:
target=df_train["OUTCOME"]

In [17]:
# train
X = df_train.drop("OUTCOME",axis=1)
y = df_train["OUTCOME"]

# test 
df_test.drop(["ID","POSTAL_CODE"],axis=1,inplace=True)

In [18]:
df_test.shape

(17715, 24)

## Defined function

In [19]:
def data_preprocess(X,y):
    X_train, X_test, y_train, y_test =train_test_split(X, y , test_size = 0.2, random_state=7)
    

    train_scaled = pd.DataFrame(X_train, columns = X.columns).reset_index(drop=True)
    test_scaled = pd.DataFrame(X_test, columns = X.columns).reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    
    scaler = StandardScaler(copy=True,with_mean=True, with_std=True)
    
    train_scaled["ANNUAL_MILEAGE"]=scaler.fit_transform(train_scaled[["ANNUAL_MILEAGE"]])

    train_scaled["ANNUAL_MILEAGE"]
    
    return (train_scaled,test_scaled,y_train,y_test)

train_scaled,test_scaled,y_train,y_test  =  data_preprocess(X,y)

In [20]:
scaler = StandardScaler(copy=True,with_mean=True, with_std=True)
    
df_test["ANNUAL_MILEAGE"]=scaler.fit_transform(df_test[["ANNUAL_MILEAGE"]])

df_test["ANNUAL_MILEAGE"]
df_test

Unnamed: 0,AGE,GENDER,DRIVING_EXPERIENCE,EDUCATION,INCOME,CREDIT_SCORE,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,PAST_ACCIDENTS,Category_0,Category_1,Category_2,Category_3,Category_4,Category_5,Category_6,Category_HatchBack,Category_SUV,Category_Sedan,Category_Sports Car
0,0.0,1,1.0,1.0,2.0,0.424958,1,1,1,0,0.649415,0,0,1,0,0,0,0,0,0,0,0,0,1
1,0.0,1,2.0,1.0,1.0,0.503478,0,1,1,1,0.988517,0,0,1,0,0,0,0,0,0,0,1,0,0
2,1.0,0,2.0,1.0,3.0,0.252246,1,1,1,0,-0.367892,0,0,1,0,0,0,0,0,0,1,0,0,0
3,0.0,0,2.0,2.0,2.0,0.464749,1,1,1,1,-1.385199,0,0,0,1,0,0,0,0,0,0,1,0,0
4,0.0,1,2.0,1.0,1.0,0.452968,1,0,0,1,0.649415,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17710,3.0,0,0.0,1.0,3.0,0.672255,1,1,0,0,-0.706994,2,0,1,0,0,0,0,0,0,0,1,0,0
17711,2.0,0,0.0,0.0,1.0,0.514193,1,0,1,0,-0.367892,0,0,1,0,0,0,0,0,0,0,0,1,0
17712,0.0,1,1.0,0.0,3.0,0.461942,1,0,1,0,-0.028790,0,0,1,0,0,0,0,0,0,0,0,1,0
17713,2.0,0,2.0,1.0,1.0,0.483571,1,1,0,0,-0.367892,0,0,1,0,0,0,0,0,0,1,0,0,0


In [21]:
def performance(model,X,y):
    y_pred = model.predict(X)
    y_pred_prob = model.predict_proba(X)[:,1]
    cm =confusion_matrix(y,y_pred)
    cl_report=classification_report(y,y_pred)
    print(cm)
    print(cl_report)
    print("AUC score",roc_auc_score(y,y_pred_prob))
    return cm

In [22]:
def roc_plot(model,X,y,label):
    y_pred = model.predict_proba(X)[:,1]
    
    fpr,tpr,thresholds = roc_curve(y,y_pred)
    
    auc_score = roc_auc_score(y,y_pred)
    
    plt.plot(fpr,tpr, label = label+'(AUC Score = %0.4f)'%auc_score)
    
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.0])
    
    plt.plot([0,1],[0,1],'r--')
    
    plt.title('ROC Curve', fontsize=15)
    plt.xlabel('False positive rate (1-specificity)', fontsize=15)
    plt.ylabel('True positive rate (Sensitivity)', fontsize=15)
    
    plt.legend(loc = 'lower right')
    
    plt.grid(True)

## MODEL BUILDING

In [46]:
KNN_model=  KNeighborsClassifier()
KNN_model.fit(train_scaled,y_train)

In [55]:
KNN_pred = KNN_model.predict(test_scaled)
print("KNN_model",f1_score(y_test,KNN_pred))

AttributeError: 'Flags' object has no attribute 'c_contiguous'

In [23]:
#LR_model =  LogisticRegression()
#KNN_model=  KNeighborsClassifier()
#DT_model =  DecisionTreeClassifier(random_state=10)
RF_model =  RandomForestClassifier(random_state=10)
AB_model =  AdaBoostClassifier(random_state=10)
GB_model =  GradientBoostingClassifier(random_state=10)


#LR_model.fit(train_scaled,y_train)
#KNN_model.fit(train_scaled,y_train)
#DT_model.fit(train_scaled,y_train)
RF_model.fit(train_scaled,y_train)
AB_model.fit(train_scaled,y_train)
GB_model.fit(train_scaled,y_train)

In [24]:
from sklearn.metrics import f1_score

In [None]:
RF_pred = RF_model.predict(test_scaled)
AB_pred = RF_model.predict(test_scaled)
GB_pred = RF_model.predict(test_scaled)
KNN_pred = KNN_model.predict(test_scaled)


print("random forest:",f1_score(y_test,RF_pred))
print("Ada boost",f1_score(y_test,AB_pred))
print("Gradient boost",f1_score(y_test,GB_pred))
print("Gradient boost",f1_score(y_test,KNN_pred))

In [36]:
param_grid = {
    "criterion":             ["gini"],
  "max_features"       :  ["sqrt","log2"],
  "n_estimators"       :  [100],
  "max_depth"          :  [3,5],
  "min_samples_split"  :  [10,15],
  "min_samples_leaf"   :  [5],
  "max_leaf_nodes"     :  [4,5]
 }

RF_model =  RandomForestClassifier(random_state=10)

grid_search = GridSearchCV(estimator=RF_model, param_grid=param_grid, cv=3, scoring='roc_auc')

grid_search.fit(train_scaled, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 5, 'max_features': 'log2', 'max_leaf_nodes': 5, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 100}


In [56]:
RF_model2 =  RandomForestClassifier(criterion = 'gini',
                        max_features= 'log2',max_leaf_nodes= 2, min_samples_leaf= 1, min_samples_split= 2,
                                    n_estimators= 25,random_state=10)
RF_model2.fit(train_scaled,y_train)

RF_pred2 = RF_model2.predict(test_scaled)
print("random forest:",f1_score(y_test,RF_pred2))

random forest: 0.0


In [57]:
performance(RF_model2,test_scaled,y_test)

[[10054     0]
 [ 7403     0]]
              precision    recall  f1-score   support

           0       0.58      1.00      0.73     10054
           1       0.00      0.00      0.00      7403

    accuracy                           0.58     17457
   macro avg       0.29      0.50      0.37     17457
weighted avg       0.33      0.58      0.42     17457

AUC score 0.5025327972968663


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[10054,     0],
       [ 7403,     0]], dtype=int64)

In [32]:
RF_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 10,
 'verbose': 0,
 'warm_start': False}

In [None]:
param_grid = {
    "criterion":             ["gini","entropy"],
  "max_features"       :  ["sqrt","log2"],
  "n_estimators"       :  [10, 15, 5, 20, 12],
  "max_depth"          :  range(2,10),
  "min_samples_split"  :  range(2,10),
  "min_samples_leaf"   :  range(1,10),
  "max_leaf_nodes"     :  range(1,10)
 }

RF_model =  RandomForestClassifier(random_state=10)

grid_search = GridSearchCV(estimator=RF_model, param_grid=param_grid, cv=3, scoring='roc_auc')

grid_search.fit(train_scaled, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Traceback (most recent call last):
  File "C:\Users\Amruta\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 459, in _score
    y_pred = method_caller(clf, "decision_function", X, pos_label=pos_label)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Amruta\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Amruta\anaconda3\Lib\site-packages\sklearn\utils\_response.py", line 73, in _get_response_values
    prediction_method = _check_response_method(estimator, response_method)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Amruta\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1940, in _check_response_method
    raise AttributeError(
AttributeError: RandomForestClassifier has none of the following attributes: decision_function.

During handling of the

In [None]:
param_grid = {
  'learning_rate': [0.01, 0.1, 0.2],  
  'n_estimators'       :  [10,25,50, 100, 200],
  "max_depth"          :  range(2,10),
  "min_samples_split"  :  range(2,10),
  "min_samples_leaf"   :  range(1,10),
  
 }

GB_model =  GradientBoostingClassifier(random_state=10)

grid_search = GridSearchCV(estimator=GB_model, param_grid=param_grid, cv=3, scoring='roc_auc')

grid_search.fit(train_scaled, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

In [95]:
performance(LR_model,test_scaled,y_test)
performance(DT_model,test_scaled,y_test)
performance(RF_model,test_scaled,y_test)
performance(AB_model,test_scaled,y_test)
performance(GB_model,test_scaled,y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[10054     0]
 [ 7403     0]]
              precision    recall  f1-score   support

           0       0.58      1.00      0.73     10054
           1       0.00      0.00      0.00      7403

    accuracy                           0.58     17457
   macro avg       0.29      0.50      0.37     17457
weighted avg       0.33      0.58      0.42     17457

AUC score 0.495486953189505
[[5882 4172]
 [4298 3105]]
              precision    recall  f1-score   support

           0       0.58      0.59      0.58     10054
           1       0.43      0.42      0.42      7403

    accuracy                           0.51     17457
   macro avg       0.50      0.50      0.50     17457
weighted avg       0.51      0.51      0.51     17457

AUC score 0.5022326687004588
[[8470 1584]
 [6213 1190]]
              precision    recall  f1-score   support

           0       0.58      0.84      0.68     10054
           1       0.43      0.16      0.23      7403

    accuracy                           0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[ 523, 9531],
       [ 411, 6992]], dtype=int64)

In [None]:
from sklearn.metrics import f1_score

In [61]:
df_train.head()

Unnamed: 0,ID,AGE,GENDER,DRIVING_EXPERIENCE,EDUCATION,INCOME,CREDIT_SCORE,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,PAST_ACCIDENTS,OUTCOME,Category_0,Category_1,Category_2,Category_3,Category_4,Category_5,Category_6,Category_HatchBack,Category_SUV,Category_Sedan,Category_Sports Car
0,816393,2.0,0,2.0,2.0,2.0,0.63805,0,0,0,0,37379,11000,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,251762,1.0,1,2.0,1.0,2.0,0.475741,1,1,1,0,10238,9000,0,0,1,1,0,0,0,0,0,0,1,0,0,0
2,481952,2.0,1,2.0,0.0,2.0,0.839817,1,1,1,1,10238,12000,0,0,1,1,0,0,0,0,0,0,0,0,1,0
3,3506,2.0,1,2.0,1.0,3.0,0.682527,1,1,0,1,92099,6000,1,0,1,1,0,0,0,0,0,0,0,0,1,0
4,498013,2.0,0,2.0,0.0,1.0,0.572184,1,0,1,1,32122,15000,0,1,0,1,0,0,0,0,0,0,0,0,1,0


In [99]:
SS_train=train_scaled[["AGE","DRIVING_EXPERIENCE","CREDIT_SCORE","VEHICLE_YEAR","ANNUAL_MILEAGE","SPEEDING_VIOLATIONS","PAST_ACCIDENTS"]]
SS_test =test_scaled[["AGE","DRIVING_EXPERIENCE","CREDIT_SCORE","VEHICLE_YEAR","ANNUAL_MILEAGE","SPEEDING_VIOLATIONS","PAST_ACCIDENTS"]]
SS_df_test = df_test[["AGE","DRIVING_EXPERIENCE","CREDIT_SCORE","VEHICLE_YEAR","ANNUAL_MILEAGE","SPEEDING_VIOLATIONS","PAST_ACCIDENTS"]]

In [96]:
LR_model =  LogisticRegression()
KNN_model=  KNeighborsClassifier()
DT_model =  DecisionTreeClassifier(random_state=10)

LR_model.fit(SS_train,y_train)
KNN_model.fit(SS_train,y_train)
DT_model.fit(SS_train,y_train)

In [97]:
LR_pred = LR_model.predict(SS_test)
KNN_pred = KNN_model.predict(SS_test)
DT_pred = DT_model.predict(SS_test)


print("Logistic Regression:",f1_score(y_test,LR_pred))
print("KNN",f1_score(y_test,KNN_pred))
print("Decision tree ",f1_score(y_test,DT_pred))


Logistic Regression: 0.0
KNN 0.37371572178283685
Decision tree  0.5167964283769805


In [104]:
from sklearn.metrics import f1_score
y_pred_test=DT_model.predict(SS_df_test)
y_pred_test= pd.Series(y_pred_test)
print(y_pred_test.shape)
y_pred_test.head()

df_test_id = pd.read_csv("test_insurance.csv")
df_test_id = df_test_id.iloc[:,0]
df_test_id.shape

output=DT_model.predict(SS_df_test)

output = pd.concat([df_test_id,y_pred_test],axis=1)
output.rename(columns={0: 'OUTCOME'}, inplace=True)
output.head()

(17715,)


Unnamed: 0,ID,OUTCOME
0,303713,0
1,141107,0
2,447316,0
3,196066,0
4,179947,1


In [106]:
output.to_csv('output_file5.csv', index=False)

In [68]:
performance(LR_model,SS_test,y_test)
performance(DT_model,SS_test,y_test)

[[10054     0]
 [ 7403     0]]
              precision    recall  f1-score   support

           0       0.58      1.00      0.73     10054
           1       0.00      0.00      0.00      7403

    accuracy                           0.58     17457
   macro avg       0.29      0.50      0.37     17457
weighted avg       0.33      0.58      0.42     17457

AUC score 0.49646001823840313
[[3836 6218]
 [2657 4746]]
              precision    recall  f1-score   support

           0       0.59      0.38      0.46     10054
           1       0.43      0.64      0.52      7403

    accuracy                           0.49     17457
   macro avg       0.51      0.51      0.49     17457
weighted avg       0.52      0.49      0.49     17457

AUC score 0.5113155675548177


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[3836, 6218],
       [2657, 4746]], dtype=int64)

In [91]:
DT_model =  DecisionTreeClassifier(criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    class_weight="balanced",
    ccp_alpha=0.0,
    random_state=10
  )

DT_model.fit(SS_train,y_train)

DT_pred = DT_model.predict(SS_test)

print("Decision tree ",f1_score(y_test,DT_pred))
performance(DT_model,SS_test,y_test)

Decision tree  0.3128689492325856
[[7454 2600]
 [5548 1855]]
              precision    recall  f1-score   support

           0       0.57      0.74      0.65     10054
           1       0.42      0.25      0.31      7403

    accuracy                           0.53     17457
   macro avg       0.49      0.50      0.48     17457
weighted avg       0.51      0.53      0.51     17457

AUC score 0.4924562032053791


array([[7454, 2600],
       [5548, 1855]], dtype=int64)

In [94]:

RF_model1 =  RandomForestClassifier(n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
    random_state=10)

RF_model1.fit(SS_train,y_train)


In [95]:
RF_pred1 = RF_model1.predict(SS_test)
print("f1 score:",f1_score(y_test,RF_pred1))
performance(RF_model1,SS_test,y_test)

f1 score: 0.21096531380327535
[[8663 1391]
 [6366 1037]]
              precision    recall  f1-score   support

           0       0.58      0.86      0.69     10054
           1       0.43      0.14      0.21      7403

    accuracy                           0.56     17457
   macro avg       0.50      0.50      0.45     17457
weighted avg       0.51      0.56      0.49     17457

AUC score 0.4970231867730546


array([[8663, 1391],
       [6366, 1037]], dtype=int64)

In [69]:
RF_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 10,
 'verbose': 0,
 'warm_start': False}

In [None]:
round1 = 
xgb_classifier2 = XGBClassifier(
    objective="binary:logistic",  
    eval_metric="logloss",         
    use_label_encoder=False,       
    n_estimators=1000,             
    early_stopping_rounds=10,      
    verbosity=1,
    gamma =  0,
    learning_rate=0.01,
    max_depth = 5,
    reg_lambda =10.0,
    scale_pos_weight=5  