In [None]:
import numpy as np 
import pandas as pd

## Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings('ignore')

## Loading Data

In [None]:
# Read the train and test CSVs plus a third CSV that is appended to the
# training frame further down.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# NOTE(review): 'hurn_Modelling.csv' looks like a truncated 'Churn_Modelling.csv'
# -- confirm the file name on disk before relying on this path.
org_data = pd.read_csv('hurn_Modelling.csv')

In [None]:
# Keep only the rows of org_data that have no missing values.
org_data = org_data.dropna()

In [None]:
# Display org_data after the rows with missing values were dropped.
org_data

In [None]:
# Relabel org_data's columns with train_df's column names. This is a purely
# positional relabel: it requires the same number of columns in both frames.
# NOTE(review): assumes both files list their columns in the same order -- confirm.
org_data.columns = train_df.columns

In [None]:
# Append the rows of org_data to the training frame. ignore_index=True rebuilds
# a clean 0..n-1 index; without it both frames keep their own row labels and
# the combined frame ends up with duplicate index values.
train_df = pd.concat([train_df, org_data], axis=0, ignore_index=True)

In [None]:
# Display the training frame after org_data was appended.
train_df

In [None]:
# Number of rows for each distinct value of the 'Exited' column.
train_df.groupby('Exited').size()

In [None]:
def drop_columns(df, columns):
    """Return a copy of ``df`` without the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : label or list of labels
        Column name(s) to remove. A missing name raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        New frame containing every column of ``df`` except ``columns``.
    """
    return df.drop(columns=columns)

In [None]:
# Columns removed from both the training and the test frame.
columns = ['id', 'CustomerId']
train_df, test_df = (drop_columns(frame, columns) for frame in (train_df, test_df))

In [None]:
# Display train_df after the 'id' and 'CustomerId' columns were dropped.
train_df

In [None]:
# Columns that are label-encoded to integers further down; every other
# non-target column is collected in `numericals`.
categories = ['Geography', 'Surname', 'Gender']

In [None]:
# Every column of train_df that is neither categorical nor the 'Exited' target,
# in the frame's own column order.
numericals = [col for col in train_df.columns if col not in {*categories, 'Exited'}]

In [None]:
# Display the list of non-categorical, non-target column names.
numericals

In [None]:
label_encoder = LabelEncoder()

In [None]:
def transform_categorical_cols(df, cols):
    for i in range(len(cols)):
        df[cols[i]] = label_encoder.fit_transform(df[cols[i]])

In [None]:
transform_categorical_cols(train_df, categories)

In [None]:
transform_categorical_cols(test_df, categories)

In [None]:
# Heatmap of the pairwise Pearson correlations between all columns of train_df
# (the categorical columns have been label-encoded by this point).
sns.heatmap(train_df.corr(method='pearson'))

In [None]:
# Hold-out fraction and the seed reused by every seeded step below.
validation_size = 0.20
seed = 42

# Predictors are every column except the 'Exited' target; both are taken as
# plain NumPy arrays.
X = train_df.drop(columns='Exited').values
y = train_df['Exited'].values

# Stratified split so both parts keep the class proportions of y.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=validation_size,
    random_state=seed,
    stratify=y,
)

## Evaluate Algorithms

In [None]:
# Number of stratified folds and the scorer name passed to the
# cross-validation calls below.
num_folds = 10
scoring = 'roc_auc'

In [None]:
# (short name, estimator) pairs; each one is wrapped in a scaling pipeline.
# random_state is fixed where the estimator accepts one so that re-running the
# cell reproduces the same scores (an unseeded decision tree does not).
candidates = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('XGB', XGBClassifier(random_state=seed)),
    ('LGBM', LGBMClassifier(random_state=seed)),
]
pipelines = [
    ('Scaled' + label, Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in candidates
]
results = []
names = []

# One splitter serves all models: with a fixed random_state it yields the same
# folds on every use, so rebuilding it per model is unnecessary.
strat_kfold = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)
for name, model in pipelines:
    cv_results = cross_val_score(model, X_train, y_train, scoring=scoring, cv=strat_kfold)
    results.append(cv_results)
    names.append(name)
    msg = f'{name}, {cv_results.mean()}, {cv_results.std()}'
    print(msg)

## Metrics for the Algorithms

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [None]:
# Models compared on the hold-out set; the boosters are seeded so the printed
# metrics are reproducible.
models = [
    ('LDA', LinearDiscriminantAnalysis()),
    ('XGB', XGBClassifier(random_state=seed)),
    ('LGBM', LGBMClassifier(random_state=seed)),
    ('KNN', KNeighborsClassifier()),
    ('LR', LogisticRegression()),
    ('NB', GaussianNB()),
]

# Scaling does not depend on the model, so fit the scaler once on the training
# split and reuse the transformed arrays for every model.
scaler = StandardScaler()
rescaled_X = scaler.fit_transform(X_train)
rescaled_validation_X = scaler.transform(X_validation)

for name, model in models:
    model.fit(rescaled_X, y_train)
    # Hard class labels: F1 and the confusion matrix are computed from these.
    predictions = model.predict(rescaled_validation_X)
    print(name)
    print('*'*50)
    print(f1_score(y_validation, predictions))
    print(confusion_matrix(y_validation, predictions))
    print('*'*50)

## Algorithm Tuning

### Tuning LDA

In [None]:
scaler = StandardScaler()
rescaled_X = scaler.fit_transform(X_train)

# Shrinkage is only supported by the 'lsqr' (and 'eigen') solvers. Crossing
# 'svd' with every shrinkage value makes those fits fail and get scored as NaN,
# so the grid is split: 'svd' is tried without shrinkage, 'lsqr' with all values.
param_grid = [
    {'solver': ['svd'], 'shrinkage': [None]},
    {'solver': ['lsqr'], 'shrinkage': [None] + [x / 10 for x in range(0, 11)] + ['auto']},
]
model = LinearDiscriminantAnalysis()
strat_kfold = StratifiedKFold(n_splits = num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator = model, param_grid = param_grid, cv=strat_kfold, scoring=scoring)
grid_results = grid.fit(rescaled_X, y_train)

In [None]:
# Best cross-validated score and the parameter combination that achieved it.
print('{}, {}'.format(grid_results.best_score_, grid_results.best_params_))

## Ensemble Methods

In [None]:
# (short name, estimator) pairs; all four ensembles are randomised, so each gets
# a fixed random_state to make the cross-validation scores reproducible.
ensemble_candidates = [
    ('AB', AdaBoostClassifier(random_state=seed)),
    ('GBM', GradientBoostingClassifier(random_state=seed)),
    ('RF', RandomForestClassifier(random_state=seed)),
    ('ET', ExtraTreesClassifier(random_state=seed)),
]
ensembles = [
    ('Scaled' + label, Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in ensemble_candidates
]

results = []
names = []

# One splitter serves all models: with a fixed random_state it yields the same
# folds on every use.
strat_kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
for name, model in ensembles:
    cv_results = cross_val_score(model, X_train, y_train, cv=strat_kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = f"{name}, {cv_results.mean()}, {cv_results.std()}"
    print(msg)

## Checking the metrics for Ensemble Models

In [None]:
# Ensembles compared on the hold-out set; seeded so the printed metrics are
# reproducible.
models = [
    ('AB', AdaBoostClassifier(random_state=seed)),
    ('GBM', GradientBoostingClassifier(random_state=seed)),
]

# Scaling does not depend on the model, so fit the scaler once on the training
# split and reuse the transformed arrays for every model.
scaler = StandardScaler()
rescaled_X = scaler.fit_transform(X_train)
rescaled_validation_X = scaler.transform(X_validation)

for name, model in models:
    model.fit(rescaled_X, y_train)
    # Hard class labels: F1 and the confusion matrix are computed from these.
    predictions = model.predict(rescaled_validation_X)
    print(name)
    print('*'*50)
    print(f1_score(y_validation, predictions))
    print(confusion_matrix(y_validation, predictions))
    print('*'*50)

## Tuning Ensemble Models

### Tuning Gradient Boosting Classifier

In [None]:
scaler = StandardScaler()
rescaled_X = scaler.fit_transform(X_train)

# Candidate tree counts: 50, 100, ..., 450 (np.arange excludes the stop value 500).
estimators = np.arange(50, 500, 50)
param_grid = dict(n_estimators = estimators)

# Seeded so the selected n_estimators does not change between runs.
model = GradientBoostingClassifier(random_state = seed)

# This search uses 4 folds rather than num_folds.
strat_kfold = StratifiedKFold(n_splits = 4, shuffle = True, random_state = seed)
grid = GridSearchCV(model, param_grid = param_grid, cv = strat_kfold, scoring = scoring)
grid_results = grid.fit(rescaled_X, y_train)
print(grid_results.best_score_, grid_results.best_params_)

## Finalize Model

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Fit the scaler on the training split, then scale that same split with it.
scaler = StandardScaler().fit(X_train)
rescaled_X = scaler.transform(X_train)

xgb_model = XGBClassifier()
lgbm_model = LGBMClassifier()

# Soft-voting ensemble of the two boosters: the members' predicted class
# probabilities are averaged.
voting_model = VotingClassifier(
    estimators=[('xgb', xgb_model), ('lgbm', lgbm_model)],
    voting='soft',
)
voting_model.fit(rescaled_X, y_train)

In [None]:
# Scale the hold-out predictors with the scaler fitted on the training split.
rescaled_validation_X = scaler.transform(X_validation)

# Second column of predict_proba: the probability of the class at index 1.
predictions = voting_model.predict_proba(rescaled_validation_X)[:, 1]

print(roc_auc_score(y_true=y_validation, y_score=predictions))

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# NOTE(review): this cell repeats, statement for statement, the hold-out ROC AUC
# evaluation two cells above; it recomputes the same values.
rescaled_validation_X = scaler.transform(X_validation)
# Second column of predict_proba: the probability of the class at index 1.
predictions = voting_model.predict_proba(rescaled_validation_X)[:, 1]
print(roc_auc_score(y_validation, predictions))

## Model for Competition

In [None]:
# Score only the predictor columns. A later cell writes the predictions back
# into test_df as 'Exited'; dropping that column here (when present) keeps this
# cell re-runnable, otherwise the scaler would receive one column too many.
# .values hands the scaler a plain array, matching how it was fitted.
test_features = test_df.drop(columns='Exited', errors='ignore')
rescaled_X_test_df = scaler.transform(test_features.values)

# Second column of predict_proba: the probability of the class at index 1.
predictions = voting_model.predict_proba(rescaled_X_test_df)[:, 1]

In [None]:
# Store the current `predictions` array as the 'Exited' column of test_df
# (added if missing, overwritten otherwise).
test_df['Exited'] = predictions