In [59]:
import pandas as pd


In [60]:
dataset = pd.read_csv('/workspaces/BSMM-8740-1/DataSets/Car_Insurance_Claim.csv')

In [61]:
dataset

Unnamed: 0,ID,AGE,GENDER,RACE,DRIVING_EXPERIENCE,EDUCATION,INCOME,CREDIT_SCORE,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,VEHICLE_TYPE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,OUTCOME
0,569520,65+,female,majority,0-9y,high school,upper class,0.629027,1.0,after 2015,0.0,1.0,10238,12000.0,sedan,0,0,0,0.0
1,750365,16-25,male,majority,0-9y,none,poverty,0.357757,0.0,before 2015,0.0,0.0,10238,16000.0,sedan,0,0,0,1.0
2,199901,16-25,female,majority,0-9y,high school,working class,0.493146,1.0,before 2015,0.0,0.0,10238,11000.0,sedan,0,0,0,0.0
3,478866,16-25,male,majority,0-9y,university,working class,0.206013,1.0,before 2015,0.0,1.0,32765,11000.0,sedan,0,0,0,0.0
4,731664,26-39,male,majority,10-19y,none,working class,0.388366,1.0,before 2015,0.0,0.0,32765,12000.0,sedan,2,0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,323164,26-39,female,majority,10-19y,university,upper class,0.582787,1.0,before 2015,0.0,0.0,10238,16000.0,sedan,0,0,1,0.0
9996,910346,26-39,female,majority,10-19y,none,middle class,0.522231,1.0,after 2015,0.0,1.0,32765,,sedan,1,0,0,0.0
9997,468409,26-39,male,majority,0-9y,high school,middle class,0.470940,1.0,before 2015,0.0,1.0,10238,14000.0,sedan,0,0,0,0.0
9998,903459,26-39,female,majority,10-19y,high school,poverty,0.364185,0.0,before 2015,0.0,1.0,10238,13000.0,sedan,2,0,1,1.0


In [62]:
categorical_features = ['AGE', 'GENDER', 'RACE', 'DRIVING_EXPERIENCE', 
                        'EDUCATION', 'INCOME', 'VEHICLE_OWNERSHIP', 
                        'VEHICLE_YEAR', 'MARRIED', 'CHILDREN', 
                        'POSTAL_CODE', 'VEHICLE_TYPE']

numeric_features = ['CREDIT_SCORE', 'ANNUAL_MILEAGE', 'SPEEDING_VIOLATIONS', 
                    'DUIS', 'PAST_ACCIDENTS' ]

target = 'OUTCOME'

In [63]:
X = dataset[categorical_features + numeric_features]

In [64]:
y = dataset[target]

In [65]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [66]:
preprocessor = ColumnTransformer(
    transformers = [
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', SimpleImputer(strategy='median'), numeric_features)
    ]
)

In [67]:
X = preprocessor.fit_transform(X)

In [68]:
from sklearn.model_selection import train_test_split

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)


In [70]:
from sklearn.tree import DecisionTreeClassifier

In [71]:
model = DecisionTreeClassifier(random_state=42)

In [72]:
model.fit(X_train, y_train)

In [73]:
y_pred = model.predict(X_test)

In [74]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score


In [75]:
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('ROC AUC: {:.2f}'.format(roc_auc_score(y_test, y_pred)))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))


Accuracy: 0.78
ROC AUC: 0.74
Confusion Matrix:
 [[1161  206]
 [ 239  394]]


Unnamed: 0,0.0,1.0,accuracy,macro avg,weighted avg
precision,0.829286,0.656667,0.7775,0.742976,0.774652
recall,0.849305,0.622433,0.7775,0.735869,0.7775
f1-score,0.839176,0.639092,0.7775,0.739134,0.775849
support,1367.0,633.0,0.7775,2000.0,2000.0


In [76]:
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
pd.DataFrame(confusion_matrix(y_test, y_pred), columns=['Predicted No', 'Predicted Yes'], index=['Actual No', 'Actual Yes'])

Accuracy: 0.78


Unnamed: 0,Predicted No,Predicted Yes
Actual No,1161,206
Actual Yes,239,394


In [77]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


In [78]:
clf_dt = DecisionTreeClassifier(random_state=42)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 4, 6, 8, 10, 12]}
gs_dt = RandomizedSearchCV(clf_dt, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=20)
gs_dt.fit(X_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[CV 2/5; 1/10] START criterion=entropy, max_depth=2.............................
[CV 2/5; 1/10] END criterion=entropy, max_depth=2;, score=0.829 total time=   0.0s
[CV 1/5; 1/10] START criterion=entropy, max_depth=2.............................
[CV 3/5; 1/10] START criterion=entropy, max_depth=2.............................
[CV 1/5; 1/10] END criterion=entropy, max_depth=2;, score=0.831 total time=   0.0s
[CV 3/5; 1/10] END criterion=entropy, max_depth=2;, score=0.834 total time=   0.0s
[CV 4/5; 1/10] START criterion=entropy, max_depth=2.............................
[CV 4/5; 1/10] END criterion=entropy, max_depth=2;, score=0.830 total time=   0.0s
[CV 5/5; 1/10] START criterion=entropy, max_depth=2.............................
[CV 1/5; 2/10] START criterion=gini, max_depth=4................................
[CV 5/5; 1/10] END criterion=entropy, max_depth=2;, score=0.824 total time=   0.0s
[CV 2/5; 2/10] START criterion=gini, max_depth=4................................
[CV 1/5; 2/10] END

In [79]:
print('Best Score:', gs_dt.best_score_)
print('Best Params:', gs_dt.best_params_)
print('Best Estimator:', gs_dt.best_estimator_)

Best Score: 0.9074954545454545
Best Params: {'max_depth': 6, 'criterion': 'entropy'}
Best Estimator: DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=42)


In [80]:
dt_classifier = DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=6, random_state=42)
dt_classifier.fit(X_train, y_train)


In [84]:
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('ROC AUC: {:.2f}'.format(roc_auc_score(y_test, y_pred)))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))

Accuracy: 0.78
ROC AUC: 0.74
Confusion Matrix:
 [[1161  206]
 [ 239  394]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.83      0.85      0.84      1367
         1.0       0.66      0.62      0.64       633

    accuracy                           0.78      2000
   macro avg       0.74      0.74      0.74      2000
weighted avg       0.77      0.78      0.78      2000



Unnamed: 0,0.0,1.0,accuracy,macro avg,weighted avg
precision,0.829286,0.656667,0.7775,0.742976,0.774652
recall,0.849305,0.622433,0.7775,0.735869,0.7775
f1-score,0.839176,0.639092,0.7775,0.739134,0.775849
support,1367.0,633.0,0.7775,2000.0,2000.0


In [86]:
from sklearn.ensemble import RandomForestClassifier

In [88]:
rf_classifier = RandomForestClassifier(class_weight='balanced', criterion='gini', max_depth=6, n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)