In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
%matplotlib inline

In [34]:
Username = 'admin'
Password = 'amazing_people'
Host = 'alphacode-explore.ccwgqdqrrmvt.eu-west-1.rds.amazonaws.com'
Port = '1433'
Database = 'Mamello'

connection = create_engine(
    'mssql+pymssql://' +
    Username + ':' + Password + '@' + Host + ':' + Port + '/' + Database
)


In [35]:
sql_query_claims = "Select * from claims_data"
sql_query_test = "Select * from test_set"

In [36]:
df_claims = pd.read_sql_query(sql_query_claims, connection)
df_test = pd.read_sql_query(sql_query_test, connection)

df_claims_no_labels = df_claims.drop(['insurance_claim', 'claim_amount'], axis=1)
df_test_no_index = df_test.drop(['row_id'], axis=1)

df_combined = pd.concat([df_claims_no_labels, df_test_no_index],sort=False).reset_index(drop=True)

# Pre-processing

In [37]:
# Lables
y = df_claims['insurance_claim']

# Features
X = df_claims.drop(['insurance_claim', 'claim_amount'], axis=1)

# Index
r = df_test['row_id']

# Transforming the features

In [38]:
combined_transformed = pd.get_dummies(df_combined, drop_first=True)

# split the transformed DataFrame
X_transformed = combined_transformed.iloc[:len(y), :]
test_transformed = combined_transformed.iloc[len(y):, :]

# Train/Test Split

In [39]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=47)

# Training

In [40]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

lm = LogisticRegression()
lm.fit(X_train,y_train)

#K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

#Decision Tree
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=50)
tree.fit(X_train, y_train)

#Random Forest
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100, random_state=23)
forest.fit(X_train, y_train)

#Support Vector Machines (SVC)
from sklearn.svm import SVC

svm = SVC(random_state=23)
svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=23, shrinking=True, tol=0.001,
    verbose=False)

# Predicting

In [41]:
pred_lm = lm.predict(X_test)
pred_knn = knn.predict(X_test)
pred_tree = tree.predict(X_test)
pred_forest = forest.predict(X_test)
pred_svm = svm.predict(X_test)

# Testing

In [42]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

labels = ['No', 'Yes']

Confusion Matrix

In [43]:
print('Logistic Regression Confusion Matrix')
pd.DataFrame(data=confusion_matrix(y_test, pred_lm), index=labels, columns=labels)

Logistic Regression Confusion Matrix


Unnamed: 0,No,Yes
No,80,22
Yes,26,140


In [44]:
print('K-Nearest Neighbors Confusion Matrix')
pd.DataFrame(data=confusion_matrix(y_test, pred_knn), index=labels, columns=labels)

K-Nearest Neighbors Confusion Matrix


Unnamed: 0,No,Yes
No,61,41
Yes,25,141


In [45]:
print('Decision Tree Confusion Matrix')
pd.DataFrame(data=confusion_matrix(y_test, pred_tree), index=labels, columns=labels)

Decision Tree Confusion Matrix


Unnamed: 0,No,Yes
No,80,22
Yes,13,153


In [46]:
print('Random Forest Confusion Matrix')
pd.DataFrame(data=confusion_matrix(y_test, pred_forest), index=labels, columns=labels)

Random Forest Confusion Matrix


Unnamed: 0,No,Yes
No,77,25
Yes,16,150


In [47]:
print('Support Vector Machine Confusion Matrix')
pd.DataFrame(data=confusion_matrix(y_test, pred_svm), index=labels, columns=labels)

Support Vector Machine Confusion Matrix


Unnamed: 0,No,Yes
No,80,22
Yes,24,142


Classification Report

In [48]:
from sklearn.metrics import classification_report

In [49]:
print('Logistic Regression - Classification Report')
print(classification_report(y_test, pred_lm, target_names=['No', 'Yes']))

Logistic Regression - Classification Report
              precision    recall  f1-score   support

          No       0.75      0.78      0.77       102
         Yes       0.86      0.84      0.85       166

    accuracy                           0.82       268
   macro avg       0.81      0.81      0.81       268
weighted avg       0.82      0.82      0.82       268



In [50]:
print('K-Nearest Neighbor - Classification Report')
print(classification_report(y_test, pred_knn, target_names=['No', 'Yes']))

K-Nearest Neighbor - Classification Report
              precision    recall  f1-score   support

          No       0.71      0.60      0.65       102
         Yes       0.77      0.85      0.81       166

    accuracy                           0.75       268
   macro avg       0.74      0.72      0.73       268
weighted avg       0.75      0.75      0.75       268



In [51]:
print('Decision Tree - Classification Report')
print(classification_report(y_test, pred_tree, target_names=['No', 'Yes']))

Decision Tree - Classification Report
              precision    recall  f1-score   support

          No       0.86      0.78      0.82       102
         Yes       0.87      0.92      0.90       166

    accuracy                           0.87       268
   macro avg       0.87      0.85      0.86       268
weighted avg       0.87      0.87      0.87       268



In [52]:
print('Random Forest - Classification Report')
print(classification_report(y_test, pred_forest, target_names=['No', 'Yes']))

Random Forest - Classification Report
              precision    recall  f1-score   support

          No       0.83      0.75      0.79       102
         Yes       0.86      0.90      0.88       166

    accuracy                           0.85       268
   macro avg       0.84      0.83      0.83       268
weighted avg       0.85      0.85      0.85       268



In [53]:
print('SVM - Classification Report')
print(classification_report(y_test, pred_svm, target_names=['No', 'Yes']))

SVM - Classification Report
              precision    recall  f1-score   support

          No       0.77      0.78      0.78       102
         Yes       0.87      0.86      0.86       166

    accuracy                           0.83       268
   macro avg       0.82      0.82      0.82       268
weighted avg       0.83      0.83      0.83       268



# Tuning

Tuning KNN

In [54]:
knn_3 = KNeighborsClassifier(n_neighbors=3)
knn_5 = KNeighborsClassifier(n_neighbors=5)
knn_20 = KNeighborsClassifier(n_neighbors=20)
knn_25 = KNeighborsClassifier(n_neighbors=25)

knn_3.fit(X_train, y_train)
knn_5.fit(X_train, y_train)
knn_20.fit(X_train, y_train)
knn_25.fit(X_train, y_train)

pred_knn_3 = knn_3.predict(X_test)
pred_knn_5 = knn_5.predict(X_test)
pred_knn_20 = knn_20.predict(X_test)
pred_knn_25 = knn_25.predict(X_test)

labels = ['No', 'Yes']

pred = {'k=3':pred_knn_3, 'k=5':pred_knn_5, 'k=20':pred_knn_20, 'k=25':pred_knn_25}

for k,v in pred.items():
    print('{}'.format(k))
    print(pd.DataFrame(data=confusion_matrix(y_test, v), index=labels, columns=labels))
    print('\n')
    
pred = {'k=3':pred_knn_3, 'k=5':pred_knn_5, 'k=20':pred_knn_20, 'k=25':pred_knn_25}

for k,v in pred.items():
    print('{}'.format(k))
    print(classification_report(y_test, v, target_names=['No', 'Yes']))
    print('\n')

k=3
     No  Yes
No   57   45
Yes  27  139


k=5
     No  Yes
No   61   41
Yes  25  141


k=20
     No  Yes
No   78   24
Yes  30  136


k=25
     No  Yes
No   70   32
Yes  17  149


k=3
              precision    recall  f1-score   support

          No       0.68      0.56      0.61       102
         Yes       0.76      0.84      0.79       166

    accuracy                           0.73       268
   macro avg       0.72      0.70      0.70       268
weighted avg       0.73      0.73      0.73       268



k=5
              precision    recall  f1-score   support

          No       0.71      0.60      0.65       102
         Yes       0.77      0.85      0.81       166

    accuracy                           0.75       268
   macro avg       0.74      0.72      0.73       268
weighted avg       0.75      0.75      0.75       268



k=20
              precision    recall  f1-score   support

          No       0.72      0.76      0.74       102
         Yes       0.85      0.82     

Tuning Decision Tree

In [55]:
# max_depth=5 and min_samples_leaf=5
tree_0 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, random_state=50)

# max_depth=2 and min_samples_leaf=5
tree_1 = DecisionTreeClassifier(max_depth=2, min_samples_leaf=5, random_state=50)

# max_depth=5 and min_samples_leaf=4
tree_2 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=4, random_state=50)

# max_depth=7 and min_samples_leaf=3
tree_3 = DecisionTreeClassifier(max_depth=7, min_samples_leaf=3, random_state=50)

tree_0.fit(X_train, y_train)
tree_1.fit(X_train, y_train)
tree_2.fit(X_train, y_train)
tree_3.fit(X_train, y_train)

pred_tree_0 = tree_1.predict(X_test)
pred_tree_1 = tree_1.predict(X_test)
pred_tree_2 = tree_2.predict(X_test)
pred_tree_3 = tree_3.predict(X_test)

pred = {'max_depth=5; min_samples_leaf=5':pred_tree_0, 
        'max_depth=2; min_samples_leaf=5':pred_tree_1, 
        'max_depth=5; min_samples_leaf=4':pred_tree_2, 
        'max_depth=7; min_samples_leaf=3':pred_tree_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(pd.DataFrame(data=confusion_matrix(y_test, v), index=labels, columns=labels))
    print('\n')

pred = {'max_depth=5; min_samples_leaf=5':pred_tree_0, 
        'max_depth=2; min_samples_leaf=5':pred_tree_1, 
        'max_depth=5; min_samples_leaf=4':pred_tree_2, 
        'max_depth=7; min_samples_leaf=3':pred_tree_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(classification_report(y_test, v, target_names=['No', 'Yes']))
    print('\n')

max_depth=5; min_samples_leaf=5
     No  Yes
No   19   83
Yes   6  160


max_depth=2; min_samples_leaf=5
     No  Yes
No   19   83
Yes   6  160


max_depth=5; min_samples_leaf=4
     No  Yes
No   78   24
Yes  34  132


max_depth=7; min_samples_leaf=3
     No  Yes
No   53   49
Yes  18  148


max_depth=5; min_samples_leaf=5
              precision    recall  f1-score   support

          No       0.76      0.19      0.30       102
         Yes       0.66      0.96      0.78       166

    accuracy                           0.67       268
   macro avg       0.71      0.58      0.54       268
weighted avg       0.70      0.67      0.60       268



max_depth=2; min_samples_leaf=5
              precision    recall  f1-score   support

          No       0.76      0.19      0.30       102
         Yes       0.66      0.96      0.78       166

    accuracy                           0.67       268
   macro avg       0.71      0.58      0.54       268
weighted avg       0.70      0.67      0.60

Tuning Random Forest

In [56]:
# 5 trees in forest
forest_1 = RandomForestClassifier(n_estimators=5, random_state=23)

# 20 trees in forest
forest_2 = RandomForestClassifier(n_estimators=20, random_state=23)

# 100 trees in forest
forest_3 = RandomForestClassifier(n_estimators=100, random_state=23)

forest_1.fit(X_train, y_train)
forest_2.fit(X_train, y_train)
forest_3.fit(X_train, y_train)

pred_forest_1 = forest_1.predict(X_test)
pred_forest_2 = forest_2.predict(X_test)
pred_forest_3 = forest_3.predict(X_test)

labels = ['No', 'Yes']

pred = {'trees = 5':pred_forest_1, 
        'trees = 20':pred_forest_2, 
        'trees = 100':pred_forest_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(pd.DataFrame(data=confusion_matrix(y_test, v), index=labels, columns=labels))
    print('\n')

pred = {'trees = 5':pred_forest_1, 
        'trees = 20':pred_forest_2, 
        'trees = 100':pred_forest_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(classification_report(y_test, v, target_names=['No', 'Yes']))
    print('\n')

trees = 5
     No  Yes
No   70   32
Yes  25  141


trees = 20
     No  Yes
No   79   23
Yes  23  143


trees = 100
     No  Yes
No   77   25
Yes  16  150


trees = 5
              precision    recall  f1-score   support

          No       0.74      0.69      0.71       102
         Yes       0.82      0.85      0.83       166

    accuracy                           0.79       268
   macro avg       0.78      0.77      0.77       268
weighted avg       0.79      0.79      0.79       268



trees = 20
              precision    recall  f1-score   support

          No       0.77      0.77      0.77       102
         Yes       0.86      0.86      0.86       166

    accuracy                           0.83       268
   macro avg       0.82      0.82      0.82       268
weighted avg       0.83      0.83      0.83       268



trees = 100
              precision    recall  f1-score   support

          No       0.83      0.75      0.79       102
         Yes       0.86      0.90      0.88 

Tuning SVM

In [57]:
# kernel=linear, C=1, gamma=0.01
svm_0 = SVC(kernel='linear', C=1, gamma=0.01, random_state=23)

# kernel=linear, C=1, gamma=0.1
svm_1 = SVC(kernel='linear', C=1, gamma=0.1, random_state=23)

# kernel=rbf, C=10, gamma=0.01
svm_2 = SVC(kernel='rbf', C=10, gamma=0.01, random_state=23)

# kernel=poly, C=100, gamma=1
svm_3 = SVC(kernel='poly', C=100, gamma=1, random_state=23)

svm_0.fit(X_train, y_train)
svm_1.fit(X_train, y_train)
svm_2.fit(X_train, y_train)
svm_3.fit(X_train, y_train)

pred_svm_0 = svm_0.predict(X_test)
pred_svm_1 = svm_1.predict(X_test)
pred_svm_2 = svm_2.predict(X_test)
pred_svm_3 = svm_3.predict(X_test)

labels = ['No', 'Yes']

pred = {'kernel=linear, C=1, gamma=0.01':pred_svm_0, 
        'kernel=linear, C=1, gamma=0.1':pred_svm_1, 
        'kernel=rbf, C=10, gamma=0.01':pred_svm_2, 
        'kernel=poly, C=100, gamma=1':pred_svm_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(pd.DataFrame(data=confusion_matrix(y_test, v), index=labels, columns=labels))
    print('\n')

pred = {'kernel=linear, C=1, gamma=0.01':pred_svm_0, 
        'kernel=linear, C=1, gamma=0.1':pred_svm_1, 
        'kernel=rbf, C=10, gamma=0.01':pred_svm_2, 
        'kernel=poly, C=100, gamma=1':pred_svm_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(classification_report(y_test, v, target_names=['No', 'Yes']))
    print('\n')

kernel=linear, C=1, gamma=0.01
     No  Yes
No   82   20
Yes  23  143


kernel=linear, C=1, gamma=0.1
     No  Yes
No   82   20
Yes  23  143


kernel=rbf, C=10, gamma=0.01
     No  Yes
No   78   24
Yes  24  142


kernel=poly, C=100, gamma=1
     No  Yes
No   70   32
Yes  18  148


kernel=linear, C=1, gamma=0.01
              precision    recall  f1-score   support

          No       0.78      0.80      0.79       102
         Yes       0.88      0.86      0.87       166

    accuracy                           0.84       268
   macro avg       0.83      0.83      0.83       268
weighted avg       0.84      0.84      0.84       268



kernel=linear, C=1, gamma=0.1
              precision    recall  f1-score   support

          No       0.78      0.80      0.79       102
         Yes       0.88      0.86      0.87       166

    accuracy                           0.84       268
   macro avg       0.83      0.83      0.83       268
weighted avg       0.84      0.84      0.84       268




# KNN Scaled

In [58]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_transformed)
X_standardize = pd.DataFrame(X_scaled,columns=X_transformed.columns)

X_train, X_test, y_train, y_test = train_test_split(X_standardize, y, test_size=0.2, random_state=47)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

pred_knn = knn.predict(X_test)

labels = ['No', 'Yes']

print('Confusion Matrix')
print(pd.DataFrame(data=confusion_matrix(y_test, pred_knn), index=labels, columns=labels))

print('Classification Report')
print(classification_report(y_test, pred_knn, target_names=['No', 'Yes']))

Confusion Matrix
     No  Yes
No   63   39
Yes  73   93
Classification Report
              precision    recall  f1-score   support

          No       0.46      0.62      0.53       102
         Yes       0.70      0.56      0.62       166

    accuracy                           0.58       268
   macro avg       0.58      0.59      0.58       268
weighted avg       0.61      0.58      0.59       268



# Deciding on the algorithms (& tunings)

Decision Tree has the best f1 score.

In [59]:
test_pred_tree = tree.predict(test_transformed)

In [60]:
test_pred_tree

array(['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
       'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
       'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no', 'yes',
       'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
       'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes',
       'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'yes',
       'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'no', 'yes',
       'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'no',
       'yes', 'yes', 'no', 'yes', 'yes', 'no', 'yes', 'no', 'no', 'yes',
       'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no', 'yes', 'yes',
       'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes',
       'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no',
       'no', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'yes',
       'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', '

In [61]:
r

0        1
1        2
2        3
3        4
4        5
      ... 
295    296
296    297
297    298
298    299
299    300
Name: row_id, Length: 300, dtype: object

In [62]:
type(r)

pandas.core.series.Series

In [63]:
type(test_pred_tree)

numpy.ndarray

In [64]:
submission = pd.DataFrame({'row_id':r.values,'prediction': test_pred_tree})
submission.head(50)

Unnamed: 0,row_id,prediction
0,1,yes
1,2,yes
2,3,yes
3,4,yes
4,5,yes
5,6,yes
6,7,yes
7,8,yes
8,9,yes
9,10,yes


# Save the DataFrame to SQL

In [65]:
submission.to_sql('classification_model_results', con=connection, if_exists='append', index=False,method='multi',chunksize=500)

In [66]:
connection.table_names()

['claims_data',
 'classification_model_results',
 'football_players-a-1298',
 'test_set']