# Import Packages and Classifiers

In [1]:
#!pip install --user scikit-learn

In [2]:
#!pip install --user matplotlib

In [3]:
#!pip install --user pandas

In [4]:
#!pip install --user numpy

In [5]:
#!pip install --user seaborn

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm, metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns


In [7]:
import pandas as pd 
import numpy as np

https://scikit-learn.org/stable/modules/cross_validation.html

# Import Data 

In [8]:
hr_data = pd.read_csv("hr_data.csv")
insurance_data = pd.read_csv("insurance_data.csv")

# First Dataset 
https://www.kaggle.com/arashnic/imbalanced-data-practice?select=aug_train.csv

## Prep Data for Modeling 

In [10]:
insurance_data.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [11]:
le = preprocessing.LabelEncoder()
le.fit(insurance_data.Gender.unique())
insurance_data['Gender'] = le.transform(insurance_data.Gender)

#Vehicle_Age 
le1 = preprocessing.LabelEncoder()
le1.fit(insurance_data.Vehicle_Age.unique())
insurance_data['Vehicle_Age'] = le1.transform(insurance_data.Vehicle_Age)

#Vehicle_Damage 
le2 = preprocessing.LabelEncoder()
le2.fit(insurance_data.Vehicle_Damage.unique())
insurance_data['Vehicle_Damage'] = le2.transform(insurance_data.Vehicle_Damage)

In [12]:
target = 'Response'
cols = ['Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage']

#splitn into training and testing data sets
train, test = train_test_split(insurance_data, test_size=0.3)

## Decision Tree with Pruning 
https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html

https://www.analyticsvidhya.com/blog/2020/10/cost-complexity-pruning-decision-trees/

In [13]:
decision_tree = DecisionTreeClassifier(random_state=0)
path = decision_tree.cost_complexity_pruning_path(train[cols], train[target])
alphas, impurities = path.ccp_alphas, path.impurities


In [14]:
from sklearn.metrics import accuracy_score
accuracy_train, accuracy_test = [], []

for i in alphas:
    tree = DecisionTreeClassifier(ccp_alpha = i)
    
    tree.fit(train[cols], train[target])
    y_train_pred = tree.predict(train[cols])
    y_test_pred = tree.predict(test[cols])
    
    accuracy_train.append(accuracy_score(train[target], y_train_pred))
    accuracy_test.append(accuracy_score(test[target], y_test_pred))
    
    
sns.set()
plt.figure(figsize=(14,7))
sns.lineplot(y = accuracy_train, x = alphas, label = "Train Accuracy")
sns.lineplot(y = accuracy_test, x = alphas, label = "Test Accuracy")

KeyboardInterrupt: 

## Neural Network
https://scikit-learn.org/stable/modules/neural_networks_supervised.html

In [31]:
nn = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

nn.fit(train[cols], train[target])

accuracy_score(test[target], nn.predict(test[cols]))

0.8360968887105638

## Boosting 
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#:~:text=An%20AdaBoost%20%5B1%5D%20classifier%20is,focus%20more%20on%20difficult%20cases.

In [32]:
ada = AdaBoostClassifier(n_estimators=100, random_state=0)
ada.fit(train[cols], train[target])

accuracy_score(test[target], ada.predict(test[cols]))

0.8428916587437961

## Support Vector Machines 
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html#sphx-glr-auto-examples-classification-plot-digits-classification-py

In [None]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(train[cols], train[target])

accuracy_score(test[target], clf.predict(test[cols]))

## K Nearest Neighbors
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html


In [None]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train[cols], train[target])

accuracy_score(test[target], neigh.predict(test[cols]))

# Second Dataset   

https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists?select=aug_train.csv

## Prep Data for Modeling 

In [None]:
hr_data.columns

In [None]:
target = ''
cols = []

#splitn into training and testing data sets
train, test = train_test_split(hr_data, test_size=0.3)

## Decision Tree with Pruning 

In [None]:
accuracy_train, accuracy_test = [], []

for i in alphas:
    tree = DecisionTreeClassifier(ccp_alpha = i)
    
    tree.fit(train[cols], train[target])
    y_train_pred = tree.predict(train[cols])
    y_test_pred = tree.predict(test[cols])
    
    accuracy_train.append(accuracy_score(train[target], y_train_pred))
    accuracy_test.append(accuracy_score(test[target], y_test_pred))
    
    
sns.set()
plt.figure(figsize=(14,7))
sns.lineplot(y = accuracy_train, x = alphas, label = "Train Accuracy")
sns.lineplot(y = accuracy_test, x = alphas, label = "Test Accuracy")

## Neural Network

In [None]:
nn = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

nn.fit(train[cols], train[target])

accuracy_score(test[target], nn.predict(test[cols]))

## Boosting 

In [None]:
ada = AdaBoostClassifier(n_estimators=100, random_state=0)
ada.fit(train[cols], train[target])

accuracy_score(test[target], ada.predict(test[cols]))

## Support Vector Machines 

In [None]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(train[cols], train[target])

accuracy_score(test[target], clf.predict(test[cols]))

## K Nearest Neighbors

In [None]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train[cols], train[target])

accuracy_score(test[target], neigh.predict(test[cols]))