# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries imported above — consider narrowing the filter.
warnings.filterwarnings(action='ignore')

# Read the raw data set; the relative path is resolved against the current
# working directory.
churn = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")

In [2]:
# Remove rows with missing values and exact duplicate rows, then drop the
# identifier columns, which are not used as features.
#
# `errors="ignore"` keeps this cell re-runnable: `churn` is reassigned in place,
# so on a second execution the identifier columns are already gone and a plain
# `.drop([...])` would raise KeyError.
#
# NOTE(review): duplicates are checked while RowNumber is still present; if
# RowNumber is unique per row, drop_duplicates() can never remove anything —
# confirm whether de-duplication should ignore that column.
churn = (
    churn
    .dropna()
    .drop_duplicates()
    .drop(columns=["RowNumber", "CustomerId", "Surname"], errors="ignore")
)

# Training/Testing Split

In [3]:
# Target: the "Exited" column.
y = churn["Exited"]
# Features: every column from "CreditScore" through "EstimatedSalary"
# (label-based slice, both ends inclusive).
X = churn.loc[:, "CreditScore":"EstimatedSalary"]

# Hold out 25% of the rows for testing; stratifying on the target keeps the
# class proportions the same in both partitions.
X_train1, X_test, y_train1, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Imbalanced Classes

In [4]:
# Random over-sampling with a fixed seed so the resampled set is reproducible.
ros = RandomOverSampler(random_state=42)

# Only the training partition (X_train1 / y_train1) is resampled; X_test and
# y_test are left untouched.
X_train, y_train = ros.fit_resample(X=X_train1, y=y_train1)

# Validation Folds

In [5]:
# Five stratified, shuffled folds with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The folds are built from the un-resampled training split (X_train1 / y_train1)
# and only the training part of each fold is over-sampled. Splitting the already
# over-sampled X_train instead would put copies of the same minority row on both
# sides of a fold, leaking training rows into validation.
#
# Every fold is stored in `cv_folds` as
# (X_train_fold, X_val_fold, y_train_fold, y_val_fold); previously each loop
# iteration overwrote the last, so only the final fold was kept. The four
# *_fold names still hold the last fold after the loop.
cv_folds = []
for train_index, val_index in skf.split(X_train1, y_train1):
    # Validation data keeps the original class distribution.
    X_val_fold, y_val_fold = X_train1.iloc[val_index], y_train1.iloc[val_index]
    # Balance classes using only this fold's training rows.
    X_train_fold, y_train_fold = RandomOverSampler(random_state=42).fit_resample(
        X_train1.iloc[train_index], y_train1.iloc[train_index]
    )
    cv_folds.append((X_train_fold, X_val_fold, y_train_fold, y_val_fold))

# Pre-processing

In [6]:
# Numeric features: standardized by StandardScaler.
# NumOfProducts is listed so it is not silently discarded: ColumnTransformer
# drops every input column that is not assigned to a transformer (its default
# is remainder='drop').
# NOTE(review): assumes Churn_Modelling.csv contains a NumOfProducts column
# inside the CreditScore..EstimatedSalary feature slice — confirm with
# X_train.columns.
num_vars = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
# Categorical features: one-hot encoded.
cat_vars = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

numeric_transformer = StandardScaler()
# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros at transform time instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Apply each transformer to its own column list and concatenate the results
# (numeric columns first, then the one-hot columns).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_vars),
        ('cat', categorical_transformer, cat_vars)
    ])

In [7]:
# Fit the preprocessor on the over-sampled training features and print the
# transformed matrix as a quick sanity check.
X_train_transformed = preprocessor.fit_transform(X_train)
print(X_train_transformed)

[[ 0.15707436  0.08233622  0.69233076 ...  1.          1.
   0.        ]
 [ 0.78274786 -0.48503    -1.36075642 ...  1.          1.
   0.        ]
 [-0.63271022  1.0279466   0.35014957 ...  1.          0.
   1.        ]
 ...
 [ 1.71612963  1.50075178  0.00796837 ...  0.          1.
   0.        ]
 [-1.02247403  0.74426348  1.37669316 ...  0.          1.
   0.        ]
 [-0.6532241  -1.05239623  1.03451196 ...  1.          1.
   0.        ]]


# Fitting Models

Logistic regression

K-nearest neighbors (KNN)

Decision tree

Random forest

XGBoost
