In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
data = pd.read_csv('carInsurance_train.csv')

In [3]:
data

Unnamed: 0,Id,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,DaysPassed,PrevAttempts,Outcome,CallStart,CallEnd,CarInsurance
0,1,32,management,single,tertiary,0,1218,1,0,telephone,28,jan,2,-1,0,,13:45:20,13:46:30,0
1,2,32,blue-collar,married,primary,0,1156,1,0,,26,may,5,-1,0,,14:49:03,14:52:08,0
2,3,29,management,single,tertiary,0,637,1,0,cellular,3,jun,1,119,1,failure,16:30:24,16:36:04,1
3,4,25,student,single,primary,0,373,1,0,cellular,11,may,2,-1,0,,12:06:43,12:20:22,1
4,5,30,management,married,tertiary,0,2694,0,0,cellular,3,jun,1,-1,0,,14:35:44,14:38:56,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3996,28,technician,single,tertiary,0,0,1,0,cellular,25,may,1,40,2,failure,17:46:28,17:50:57,1
3996,3997,49,admin.,divorced,secondary,0,124,1,1,cellular,29,jul,19,-1,0,,14:49:16,14:51:21,0
3997,3998,27,admin.,single,secondary,0,-400,0,1,cellular,8,jul,1,-1,0,,12:19:03,12:23:53,0
3998,3999,36,entrepreneur,single,tertiary,0,658,1,0,cellular,29,jan,1,227,3,failure,11:27:35,11:29:14,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                4000 non-null   int64 
 1   Age               4000 non-null   int64 
 2   Job               3981 non-null   object
 3   Marital           4000 non-null   object
 4   Education         3831 non-null   object
 5   Default           4000 non-null   int64 
 6   Balance           4000 non-null   int64 
 7   HHInsurance       4000 non-null   int64 
 8   CarLoan           4000 non-null   int64 
 9   Communication     3098 non-null   object
 10  LastContactDay    4000 non-null   int64 
 11  LastContactMonth  4000 non-null   object
 12  NoOfContacts      4000 non-null   int64 
 13  DaysPassed        4000 non-null   int64 
 14  PrevAttempts      4000 non-null   int64 
 15  Outcome           958 non-null    object
 16  CallStart         4000 non-null   object
 17  CallEnd       

## 1. PREPROCESSING

In [6]:
{column: len(data[column].unique()) for column in data.columns}

{'Id': 4000,
 'Age': 70,
 'Job': 12,
 'Marital': 3,
 'Education': 4,
 'Default': 2,
 'Balance': 2178,
 'HHInsurance': 2,
 'CarLoan': 2,
 'Communication': 3,
 'LastContactDay': 31,
 'LastContactMonth': 12,
 'NoOfContacts': 35,
 'DaysPassed': 330,
 'PrevAttempts': 20,
 'Outcome': 4,
 'CallStart': 3777,
 'CallEnd': 3764,
 'CarInsurance': 2}

In [5]:
def onehot_encode(df, column):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

In [7]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Drop Id column - 4000 unique values
    df = df.drop('Id', axis=1)
    
    # Drop Outcome column (too many missing values)
    df = df.drop('Outcome', axis=1)
    
    # Fill categorical missing values with column modes
    for column in ['Job', 'Education', 'Communication']:
        df[column] = df[column].fillna(df[column].mode()[0])
    
    # Extract duration feature
    df['CallDuration'] = (pd.to_datetime(df['CallEnd']) - pd.to_datetime(df['CallStart'])).apply(lambda x: x.seconds)
    df = df.drop(['CallStart', 'CallEnd'], axis=1)
    
    # Binary encoding
    df['Communication'] = df['Communication'].replace({'telephone': 0, 'cellular': 1})
    
    # Ordinal encoding
    df['Education'] = df['Education'].replace({'primary': 0, 'secondary': 1, 'tertiary': 2})
    df['LastContactMonth'] = df['LastContactMonth'].replace({
        'jan': 0, 'feb': 1, 'mar': 2, 'apr': 3, 'may': 4, 'jun': 5, 'jul': 6, 'aug': 7, 'sep': 8, 'oct': 9, 'nov': 10, 'dec': 11
    })
    
    # One-hot encoding
    for column in ['Job', 'Marital']:
        df = onehot_encode(df, column)
    
    # Split df into X and y
    y = df['CarInsurance']
    X = df.drop('CarInsurance', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [9]:
X_train

Unnamed: 0,Age,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,...,Job_management,Job_retired,Job_self-employed,Job_services,Job_student,Job_technician,Job_unemployed,Marital_divorced,Marital_married,Marital_single
1850,0.819191,-1.794479,-0.115721,-0.232386,1.015114,-0.389602,0.269047,1.674151,-0.485851,-0.196439,...,-0.539368,3.738083,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
2293,-0.980287,1.262741,-0.115721,-0.467176,1.015114,2.566725,0.269047,-0.812400,0.297780,-0.196439,...,-0.539368,-0.267517,5.196152,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
576,0.305054,-0.265869,-0.115721,-0.424115,1.015114,2.566725,0.269047,1.555744,-0.485851,0.120764,...,-0.539368,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
2731,0.990570,1.262741,-0.115721,-0.285504,-0.985111,-0.389602,0.269047,0.134857,1.865043,-0.513642,...,1.854023,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
1051,-0.637530,1.262741,-0.115721,-0.305620,-0.985111,-0.389602,0.269047,-0.575586,0.689596,0.120764,...,1.854023,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3839,0.047986,-1.794479,-0.115721,-0.428516,-0.985111,-0.389602,0.269047,-1.167622,0.689596,-0.196439,...,-0.539368,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
1096,-0.380461,-0.265869,-0.115721,-0.436373,1.015114,-0.389602,0.269047,-0.457179,-0.485851,-0.513642,...,-0.539368,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
3980,0.647812,-1.794479,-0.115721,-0.441402,1.015114,-0.389602,-3.716829,-0.220364,-0.485851,-0.196439,...,-0.539368,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,-1.163160,1.521055
235,-0.123393,1.262741,-0.115721,-0.446745,-0.985111,-0.389602,0.269047,0.490079,1.865043,-0.513642,...,-0.539368,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438


In [11]:
y_train

1850    0
2293    0
576     0
2731    1
1051    0
       ..
3839    0
1096    0
3980    0
235     0
1061    0
Name: CarInsurance, Length: 2800, dtype: int64

## 2. TRAINING

In [12]:
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.


## 3. RESULTS

In [13]:
for name, model in models.items():
    print(name + ": {:.2f}%".format(model.score(X_test, y_test) * 100))

                   Logistic Regression: 78.58%
                   K-Nearest Neighbors: 73.17%
                         Decision Tree: 76.42%
Support Vector Machine (Linear Kernel): 78.42%
   Support Vector Machine (RBF Kernel): 78.75%
                        Neural Network: 79.58%
                         Random Forest: 83.67%
                     Gradient Boosting: 83.42%
