<a href="https://colab.research.google.com/github/louisisaacdiouf/Churn-Prediction-with-Artificial-Neural-Network/blob/main/churn_prediction_with_artificial_neural_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DEEP LEARNING DE A à Z - UDEMY

In [1]:
# Mount Google Drive into the Colab filesystem so files under it can be read by later cells.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install scikeras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.metrics import accuracy_score, confusion_matrix
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
import multiprocessing

In [4]:
# Read the churn dataset from the mounted Drive and preview its first rows.
data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/__udemy-datasets/Churn_Modelling.csv'
)
data.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
# Two-column table: column name (left-padded to 30 characters) and its number of distinct values.
row_template = "{0:30} {1}"
print(row_template.format('Colonnes','Valeurs uniques'))
for col in data.columns:
    n_distinct = len(np.unique(data[col]))
    print(row_template.format(col, n_distinct))

Colonnes                       Valeurs uniques
RowNumber                      10000
CustomerId                     10000
Surname                        2932
CreditScore                    460
Geography                      3
Gender                         2
Age                            70
Tenure                         11
Balance                        6382
NumOfProducts                  4
HasCrCard                      2
IsActiveMember                 2
EstimatedSalary                9999
Exited                         2


In [6]:
# Count how often each surname occurs, then show the distinct occurrence counts.
surname_counts = np.unique(data['Surname'], return_counts=True)[1]
print(np.unique(surname_counts))

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 28 29 32]


In [7]:
# CPU count reported by the multiprocessing module; the bare name below
# makes the notebook display the value as the cell output.
n_cpus: int = multiprocessing.cpu_count()
n_cpus

2

In [45]:
class ANN:
    """Binary classifier (dense Keras network) with three training strategies.

    The instance wraps a DataFrame whose last column is the binary target and
    whose remaining columns (minus ``ID_COLUMNS``) are the features. Three
    entry points train a model and store their results on the instance:

    * ``training()``   -> ``model``, ``training_evaluation``, ``training_confusion_matrix``
    * ``kFold()``      -> ``kFoldClassifier``, ``kFold_score``, ``kFold_confusion_matrix``
    * ``GridSearch()`` -> ``gridsearch``, ``gridsearch_confusion_matrix``

    Each entry point re-fits the preprocessing (``cat_encoder``, ``scaler``)
    on its own training split, so ``predict`` always uses the preprocessing of
    the most recently run strategy.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; the last column is used as the target.
    n_neuron_in : int
        Units of the first Dense layer.
    n_neuron_layers : sequence of int
        Units of each following hidden Dense layer, in order.
    n_neuron_out : int
        Units of the final sigmoid layer.
    epochs, batch_size, validation_split
        Passed to the Keras / scikeras fit calls.
    threshold : float
        Cut-off applied to the network output in ``training()`` when building
        the confusion matrix.
    """

    # Identifier columns that are removed before building features.
    ID_COLUMNS = ('RowNumber', 'CustomerId', 'Surname')

    def __init__(self, df, n_neuron_in=6, n_neuron_layers=[32,32], n_neuron_out=1, epochs=2, batch_size=50, validation_split=0.3, threshold=0.8):
        self.df = df
        self.n_neuron_in = n_neuron_in
        # Copy the sequence: the default list object is shared by every call,
        # so storing it by reference would let one instance mutate another's layers.
        self.n_neuron_layers = list(n_neuron_layers)
        self.n_neuron_out = n_neuron_out
        self.epochs = epochs
        self.batch_size = batch_size
        self.validation_split = validation_split
        self.threshold = threshold

    # ------------------------------------------------------------------
    # Preprocessing
    # ------------------------------------------------------------------
    def train_preprocessing(self, X):
        """Fit the one-hot encoder and the scaler on ``X`` and return the transformed array.

        Object-dtype columns are one-hot encoded and placed first, followed by
        the remaining columns; the whole matrix is then standardised. The
        fitted transformers and the column lists are stored on the instance
        for later use by ``predict_preprocessing``.
        """
        cat_columns = list(X.select_dtypes('object').columns)
        num_columns = [col for col in X.columns if col not in cat_columns]

        # Categories absent from the fitting data are encoded as all zeros
        # instead of raising when they show up at transform time.
        cat_encoder = OneHotEncoder(handle_unknown='ignore')
        x_categorical = cat_encoder.fit_transform(X[cat_columns]).toarray()

        # Assemble the new table: encoded categorical block, then the other columns.
        features = np.concatenate((x_categorical, np.array(X[num_columns])), axis=1)

        # Standardise the assembled features.
        scaler = StandardScaler()
        features = scaler.fit_transform(features)

        self.cat_columns = cat_columns
        self.num_columns = num_columns
        self.cat_encoder = cat_encoder
        self.scaler = scaler
        return features

    def predict_preprocessing(self,X):
        """Transform ``X`` with the encoder and scaler fitted by ``train_preprocessing``.

        Columns are selected by the names recorded at fit time, so the column
        order of ``X`` does not matter.
        """
        x_categorical = self.cat_encoder.transform(X[self.cat_columns]).toarray()
        features = np.concatenate((x_categorical, np.array(X[self.num_columns])), axis=1)
        return self.scaler.transform(features)

    def _split_features_target(self):
        """Return ``(X, y)`` from ``self.df``: ID columns dropped, last column as target."""
        # Use the frame given to the constructor, not a notebook-level global.
        df = self.df.drop(columns=list(self.ID_COLUMNS), errors='ignore')
        return df.iloc[:, :-1], df.iloc[:, -1]

    def _prepare_data(self):
        """Split the data 70/30 (random_state=0) and preprocess both parts.

        The split is done on the raw frame first so that the encoder and the
        scaler are fitted on the training rows only; the test rows are merely
        transformed, which keeps test statistics out of the preprocessing.
        """
        X, y = self._split_features_target()
        self.X = X

        X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
        X_train = self.train_preprocessing(X_train_raw)
        X_test = self.predict_preprocessing(X_test_raw)
        return X_train, X_test, y_train, y_test

    # ------------------------------------------------------------------
    # Model construction
    # ------------------------------------------------------------------
    def _build_model(self, n_vars, dropout_rate=None, optimizer='adam'):
        """Build and compile the dense network for ``n_vars`` input features.

        When ``dropout_rate`` is given, a Dropout layer follows every
        ReLU Dense layer (not the output layer).
        """
        model = Sequential()
        model.add(Dense(units=self.n_neuron_in, activation='relu', input_dim=n_vars))
        if dropout_rate is not None:
            model.add(Dropout(rate=dropout_rate))

        for units in self.n_neuron_layers:
            model.add(Dense(units=units, activation='relu'))
            if dropout_rate is not None:
                model.add(Dropout(rate=dropout_rate))

        model.add(Dense(units=self.n_neuron_out, activation='sigmoid'))
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        return model

    def _make_classifier(self, n_vars):
        """Return a scikeras ``KerasClassifier`` around the dropout variant of the network."""
        def build_classifier(compile_kwargs):
            # scikeras hands the wrapper's compile arguments to a build function
            # that declares a ``compile_kwargs`` parameter; reading the optimizer
            # from it is what lets a grid search over ``optimizer`` take effect.
            return self._build_model(n_vars, dropout_rate=0.1, optimizer=compile_kwargs['optimizer'])

        return KerasClassifier(model=build_classifier, optimizer='adam', batch_size=self.batch_size, epochs=self.epochs)

    # ------------------------------------------------------------------
    # Training strategies
    # ------------------------------------------------------------------
    def training(self):
        """Train the plain network once and evaluate it on the held-out split.

        Stores ``model``, ``training_evaluation`` (result of ``model.evaluate``
        on the test split) and ``training_confusion_matrix`` (test split,
        outputs cut at ``self.threshold``).
        """
        X_train, X_test, y_train, y_test = self._prepare_data()

        model = self._build_model(n_vars=X_train.shape[1])
        # use_multiprocessing is not passed: Keras documents it as applying to
        # generator / Sequence inputs only, and the data here are in-memory arrays.
        model.fit(X_train, y_train, batch_size=self.batch_size, epochs=self.epochs, validation_split=self.validation_split)

        y_pred = (model.predict(X_test) > self.threshold).astype(int)

        self.training_confusion_matrix = confusion_matrix(y_test, y_pred)
        self.training_evaluation = model.evaluate(X_test, y_test, verbose=0)
        self.model = model

    def kFold(self):
        """Cross-validate the dropout network (10 folds), then fit it on the training split.

        Prints the mean and standard deviation of the fold scores and stores
        ``kFoldClassifier``, ``kFold_score`` and ``kFold_confusion_matrix``
        (the latter two computed on the held-out test split).
        """
        X_train, X_test, y_train, y_test = self._prepare_data()

        classifier = self._make_classifier(n_vars=X_train.shape[1])

        fold_scores = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
        print(f"Precision mean : {fold_scores.mean()}")
        print(f"Precision STD : {fold_scores.std()}")

        # The validation split is only wanted for the final fit; cross_val_score
        # worked on clones, so setting it now does not affect the fold scores.
        classifier.set_params(validation_split=self.validation_split)
        classifier.fit(X_train, y_train)

        # The wrapper's predict() yields class labels, so no probability
        # threshold is applied here (comparing labels to 0.8 had no effect).
        y_pred = classifier.predict(X_test)

        self.kFold_confusion_matrix = confusion_matrix(y_test, y_pred)
        self.kFold_score = classifier.score(X_test, y_test)
        self.kFoldClassifier = classifier

    def GridSearch(self, bs, ep, optim, cross_val):
        """Grid-search batch size, epochs and optimizer with ``cross_val``-fold CV.

        Parameters
        ----------
        bs : list
            Candidate batch sizes.
        ep : list
            Candidate epoch counts.
        optim : list
            Candidate optimizers (as accepted by Keras ``compile``).
        cross_val : int or CV splitter
            Passed to ``GridSearchCV`` as ``cv``.

        Stores ``gridsearch`` (the fitted ``GridSearchCV``) and
        ``gridsearch_confusion_matrix`` (held-out test split).
        """
        X_train, X_test, y_train, y_test = self._prepare_data()

        classifier = self._make_classifier(n_vars=X_train.shape[1])
        parameters = {
            "batch_size": bs,
            "epochs": ep,
            "optimizer": optim,
        }

        gridsearch = GridSearchCV(estimator=classifier,
                                  param_grid=parameters,
                                  scoring="accuracy",
                                  cv=cross_val)
        gridsearch.fit(X_train, y_train)

        # predict() of the refitted best estimator yields class labels.
        y_pred = gridsearch.predict(X_test)
        self.gridsearch_confusion_matrix = confusion_matrix(y_test, y_pred)
        self.gridsearch = gridsearch

    # ------------------------------------------------------------------
    # Inference
    # ------------------------------------------------------------------
    def predict(self,X,strategy='classic'):
        """Preprocess ``X`` and return the output of the model trained by ``strategy``.

        ``strategy`` is one of ``'classic'`` (``self.model.predict``),
        ``'kFold'`` (``self.kFoldClassifier.predict_proba``) or
        ``'gridSearch'`` (``self.gridsearch.predict_proba``). The return value
        is whatever that call returns; its shape differs between strategies.

        Raises
        ------
        ValueError
            If ``strategy`` is not one of the three names above.
        """
        known_strategies = ('classic', 'kFold', 'gridSearch')
        if strategy not in known_strategies:
            raise ValueError(f"Unknown strategy {strategy!r}; expected one of {known_strategies}")

        X = self.predict_preprocessing(X)
        if strategy == 'classic':
            return self.model.predict(X)
        if strategy == 'kFold':
            return self.kFoldClassifier.predict_proba(X)
        return self.gridsearch.predict_proba(X)

In [46]:
# Build the model wrapper on the loaded frame: five hidden layers, 150 epochs, batches of 128.
ai = ANN(
    data,
    n_neuron_layers=[16, 8, 4, 8, 16],
    epochs=150,
    batch_size=128,
)

In [47]:
# Feature names: every column except the identifier columns and the last (target) column.
feature_frame = data.drop(columns=['RowNumber','CustomerId','Surname'])
cols = list(feature_frame.columns[:-1])

# One hypothetical customer, with values listed in the same order as `cols`.
client_values = [619, 'France', 'Male', 40, 3, 60000, 2, 1, 1, 50000]
client = pd.DataFrame(data=[client_values], columns=cols)
client

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Male,40,3,60000,2,1,1,50000


In [48]:
# Run the single-fit strategy; results are stored on `ai`
# (model, training_evaluation, training_confusion_matrix).
ai.training()

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [49]:
# `training_evaluation` is produced by model.evaluate on the held-out test split,
# so the two values are reported as test metrics rather than training metrics.
test_loss, test_accuracy = ai.training_evaluation
print(f"test loss = {test_loss}\ntest accuracy = {test_accuracy}")
print("\n---------------------------------------------------------")
print("----------------- CONFUSION MATRIX ----------------------")
print("---------------------------------------------------------")
print(ai.training_confusion_matrix)
# First output value for the single client row, scaled to a percentage.
print(f"\nProbability of churn = {ai.predict(client)[0][0] * 100} %")

training loss = 0.35500919818878174 
training accuracy = 0.8523333072662354

---------------------------------------------------------
----------------- CONFUSION MATRIX ----------------------
---------------------------------------------------------
[[2354   25]
 [ 458  163]]

Probability of churn = 2.004053071141243 %


In [50]:
# Run the cross-validation strategy; results are stored on `ai`
# (kFoldClassifier, kFold_score, kFold_confusion_matrix).
ai.kFold()

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [53]:
# Show the stored kFold score, its confusion matrix, and the prediction for the sample client.
kfold_banner = (
    "\n---------------------------------------------------------",
    "----------------- CONFUSION MATRIX ----------------------",
    "---------------------------------------------------------",
)
print(f"kFold score = {ai.kFold_score}")
print(*kfold_banner, sep="\n")
print(ai.kFold_confusion_matrix)

# Element 1 of the kFold prediction for the client, scaled to a percentage.
kfold_churn_pct = ai.predict(client,strategy='kFold')[1] * 100
print(f"\nkFold Probability of churn = {kfold_churn_pct} %")

kFold score = 0.855

---------------------------------------------------------
----------------- CONFUSION MATRIX ----------------------
---------------------------------------------------------
[[2261  118]
 [ 317  304]]

kFold Probability of churn = 6.7180342972278595 %


In [None]:
# Grid search over batch size, epoch count and optimizer with 5-fold cross-validation.
ai.GridSearch(
    bs=[32, 64, 80],
    ep=[100, 200],
    optim=["adam", "rmsprop"],
    cross_val=5,
)

In [None]:
# Summarise the fitted grid search: best parameters, estimator and score,
# then the stored confusion matrix and the prediction for the sample client.
section_break = "\n-----------------------------------\n"
search = ai.gridsearch

print("Best Parameters :")
print(search.best_params_)
print(section_break)
print("Best Estimator :")
print(search.best_estimator_)
print(section_break)
print("Best Score :")
print(search.best_score_)
print(
    "\n---------------------------------------------------------",
    "----------------- CONFUSION MATRIX ----------------------",
    "---------------------------------------------------------",
    sep="\n",
)
print(ai.gridsearch_confusion_matrix)

# Element 1 of the grid-search prediction for the client, scaled to a percentage.
gridsearch_churn_pct = ai.predict(client,strategy='gridSearch')[1] * 100
print(f"\nGridSearch Probability of churn = {gridsearch_churn_pct} %")

### Réponse : Le client étudié a de fortes chances de rester avec la banque