# Importation des librairies

In [1]:
# 'os' module provides functions for interacting with the operating system 
import os
# 'Numpy' is used for mathematical operations on large, multi-dimensional arrays and matrices
import numpy as np
# 'Pandas' is used for data manipulation and analysis
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
import mlxtend.feature_selection as fs
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
import pickle

# Importation des données

In [2]:
# Load the raw customer table from the Excel workbook and preview its first rows.
data = pd.read_excel('banques.xlsx')
data.head()


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,OUI
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,NON
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,OUI
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,NON
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,NON


# Information des données

In [3]:
# Column names, non-null counts and dtypes of the raw table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  object 
dtypes: float64(2), int64(8), object(4)
memory usage: 1.1+ MB


# Résumé statistique des données

In [4]:
# Summary statistics of the numeric columns.
data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48


# Résumé des données catégorielles

In [5]:
# Summary (count, unique, top, freq) of the non-numeric columns.
data.describe(exclude=[np.number])

Unnamed: 0,Surname,Geography,Gender,Exited
count,10000,10000,10000,10000
unique,2932,3,2,2
top,Smith,France,Male,NON
freq,32,5014,5457,7963


# Vérification de la présence de données nulles

In [6]:
# Number of missing values per column.
data.isnull().sum() 

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

Aucune donnée nulle dans ce dataset

# Séparation des données et de la classe cible


# Encodage des données

In [7]:
#Encodage "geography" 
# One encoder per categorical column: the two explanatory variables are
# binarized, the target 'Exited' is label-encoded.
geo_encoder = LabelBinarizer()
gender_encoder = LabelBinarizer()
encoder_label = LabelEncoder()

# Fit each encoder on the full raw column so that every category is known.
geo_encoder.fit(data['Geography'])
gender_encoder.fit(data['Gender'])
encoder_label.fit(data['Exited'])


In [8]:

# Encode the target and the two categorical features with the fitted encoders.
label = encoder_label.transform(data['Exited'])
geo = geo_encoder.transform(data['Geography'])
gender = gender_encoder.transform(data['Gender'])

# One indicator column per geography class, named after the class.
column_geography = ['Geography_{}'.format(val) for val in geo_encoder.classes_]
df_geography = pd.DataFrame(data=geo, index=data.index, columns=column_geography)
df_genre = pd.DataFrame(data=gender, index=data.index, columns=['Gender'])

# Model frame: numeric features, then encoded categoricals, then the target
# as the LAST column (later cells rely on that position).
df = pd.concat(
    [
        data.loc[:, ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                     'HasCrCard', 'IsActiveMember', 'EstimatedSalary']],
        df_geography,
        df_genre,
    ],
    axis=1,
)
df['Exited'] = label





In [9]:
# Display the encoded model frame (features followed by the 'Exited' target).
df

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender,Exited
0,619,42,2,0.00,1,1,1,101348.88,1,0,0,0,1
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0,0
2,502,42,8,159660.80,3,1,0,113931.57,1,0,0,0,1
3,699,39,1,0.00,2,0,0,93826.63,1,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.10,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,39,5,0.00,2,1,0,96270.64,1,0,0,1,0
9996,516,35,10,57369.61,1,1,1,101699.77,1,0,0,1,0
9997,709,36,7,0.00,1,0,1,42085.58,1,0,0,0,1
9998,772,42,3,75075.31,2,1,0,92888.52,0,1,0,1,1


# Normalisation des données

In [10]:
def normalize(data, method="Min-Max"):
    '''
    Return a normalized copy of a DataFrame.

    The input frame is never modified: the previous implementation wrote the
    scaled values back into ``data``, so re-running a cell silently rescaled
    already-scaled data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to rescale; every column is passed to the scaler.
    method : str
        "Min-Max" (default) uses MinMaxScaler, "z-score" uses StandardScaler.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``data`` holding the
        scaled values. For any other ``method`` a message is printed and an
        unscaled copy of ``data`` is returned.
    '''
    if method == "Min-Max":
        scaler = MinMaxScaler()
    elif method == "z-score":
        scaler = StandardScaler()
    else:
        print("la methode entree est invalide. veuillez regarder la documentation.")
        return data.copy()

    scaled_values = scaler.fit_transform(data)
    # Rebuild a frame so that column labels and row index survive the scaler,
    # which only returns a bare array.
    return pd.DataFrame(scaled_values, columns=data.columns, index=data.index)

In [11]:
# X = normalize(X,'z-score')
# X

# Division en données de test et d'entrainement et sauvegarde


In [12]:
# Stratified 80/20 split on the target, seeded so the partition is repeatable.
data_train, data_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['Exited'],
    random_state=1,
)

# Persist both partitions next to the notebook.
data_test.to_excel('banque_test.xlsx')
data_train.to_excel('banque_train.xlsx')



In [13]:
# Features are every column except the last one; the target is the last column.
X_train = data_train[data_train.columns[:-1]].to_numpy()
X_test = data_test[data_test.columns[:-1]].to_numpy()
y_train = data_train[data_train.columns[-1]].tolist()
y_test = data_test[data_test.columns[-1]].tolist()

# Grid search cv


In [14]:

# tuned_parameters =  [
#     [{'penalty': ['l2', None], 'tol' : [1e-4,1e-5], 'max_iter' : [10,100,1000], 'fit_intercept' : [True, False]}],
#     [{'max_depth': list(range(10, 15)), 'max_features': list(range(0,4))}],
#     [{'n_estimators' : [50,100,200],'random_state' : [None], 'learning_rate' : [1.,0.8,0.5],'algorithm' : ['SAMME','SAMME.R']}],
#     [{'n_estimators':[10,20,100],'max_samples':[0.5,1.0],'max_features':[0.5,1.0],'random_state':[None]}]
#     ]
# algorithms = [LogisticRegression(),RandomForestClassifier(),AdaBoostClassifier(),BaggingClassifier()]
# algorithm_names = ["LogisticRegression","RandomForest","AdaBoost","Bagging classifier"]

# dataset = []
# model_names = []
# best_parametre = []
# score = []
# for i in range(len(algorithms)):
#     model = algorithms[i]
#     clf = GridSearchCV(model, tuned_parameters[i], cv = 5, scoring='accuracy')
#     clf.fit(X_train, y_train)
#     model_names.append(algorithm_names[i])
#     best_parametre.append(clf.best_params_)
#     score.append(clf.best_score_ )


# new_data = pd.DataFrame({"Algorithme": model_names, "best parametre": best_parametre, "score" : score})



In [15]:
# Random forest n_estimator 300
# rfc = RandomForestClassifier()

# forest_params = [{'n_estimators':list(range(0,300)) }]

# clf = GridSearchCV(rfc, forest_params, cv = 10, scoring='accuracy', n_jobs = 8)

# clf.fit(X_train, y_train)

# print(clf.best_params_)

# print(clf.best_score_)

In [16]:
def do_features_selection(modele, x_train, y_train, select_method = "forward", cv = 5, feature_names=None, scoring = 'accuracy'):
    """
    Run mlxtend sequential feature selection and plot the score per subset size.

    Parameters
    ----------
    modele : estimator
        Estimator handed to ``SequentialFeatureSelector``.
    x_train, y_train
        Training features (must expose ``.shape``) and target.
    select_method : str
        "backward" runs backward elimination down to 1 feature; any other
        value runs forward selection up to all features.
    cv : int
        Number of cross-validation folds.
    feature_names : list, optional
        Accepted for backward compatibility; not used by the selection.
    scoring : str
        Scoring metric forwarded to the selector.

    Returns
    -------
    pandas.DataFrame
        One row per selection step with the columns ``feature_names`` and
        ``avg_score`` (the mean cross-validated score, as a float).
    """
    forward = select_method != "backward"
    k_features = x_train.shape[1] if forward else 1

    selector = fs.SequentialFeatureSelector(modele, k_features=k_features, forward=forward, scoring=scoring, cv=cv)
    selector = selector.fit(x_train, y_train)

    # Compute the metric dictionary once and reuse it for print, plot and table.
    metrics = selector.get_metric_dict()
    print(metrics)
    plot_sfs(metrics, kind='std_dev')

    direction = 'forward' if forward else 'backward'
    plt.title('Sequential {} Selection (w. StdDev)'.format(direction))
    plt.grid()
    plt.show()

    results = pd.DataFrame.from_dict(metrics).T
    # Report the score exactly as the scorer produced it. The former square
    # root inflated accuracy values and produced NaN for negative scorers.
    results["avg_score"] = results["avg_score"].astype(float)
    return results[["feature_names", "avg_score"]]

In [17]:
#test de la selection des variables
# model_rfc = clf.best_estimator_
# model_rfc
# feature = do_features_selection(modele = model_rfc,x_train = X_train, y_train = y_train, feature_names = list(X_train.columns) )
# feature


In [18]:
#Backward selection
# feature_backward = do_features_selection(modele = model_rfc,x_train = X_train, y_train = y_train,select_method = "backward", feature_names = list(X_train.columns) )
# feature_backward

Pour l'interprétation, on a deux possibilités :
    -- créer un modèle d'arbre de décision
    -- retenir les 5 variables voulues, puis faire l'analyse de données

### Arbre de decision

In [19]:
# rfc = DecisionTreeClassifier()

# forest_params = [{'max_depth' : [10,100,1000,10000], 'min_samples_split' : [2,10,100], 'min_samples_leaf': [1,5,10], 'max_features' : ["sqrt","log2"]}]

# clf = GridSearchCV(rfc, forest_params, cv = 10, scoring='accuracy', n_jobs = 8)

# clf.fit(X_train, y_train)

# print(clf.best_params_)

# print(clf.best_score_)

# Prise en compte de la sélection des variables

In [20]:
# my  = ['Age', 'Balance', 'NumOfProducts', 'IsActiveMember', 'Geography_Spain']
# data_feature = X_train.loc[:,my]

In [21]:
# rfc_features = RandomForestClassifier()

# forest_params = [{'n_estimators':list(range(0,300)) }]

# clf_feature = GridSearchCV(rfc_features, forest_params, cv = 10, scoring='accuracy', n_jobs = 8)

# clf_feature.fit(data_feature, y_train)

# print(clf_feature.best_params_)

# print(clf_feature.best_score_)

In [22]:
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test,y_test)
# result

In [23]:
# X = X_test.iloc[[10]]
# loaded_model.predict(X)
# encoder.inverse_transform([1])


In [24]:
# def predict (X) :
#     y_pred = loaded_model.predict(X)
#     return encoder.inverse_transform([int(elt) for elt in y_pred])[-1]


# Sauvegarde de la fonction et du modèle

In [25]:
# encoder = LabelEncoder()
# encoder.fit(data['Exited'])

In [26]:
# LabelBinarizer().fit_transform(data['Geography'])


In [27]:
# A revoir pour adapter avec les encoders definit en haut (le dummies ne fait pas l'affaire a moins d'ecrire une fonction avec stack())
# def preprocessing(X) :
#     # verifier si le fichier fournit est bon
#     if type(X) is dict :
#         to_delete = ['Surname', 'CustomerId','RowNumber']
#         #Supprimer les variables inutiles
#         X = pd.DataFrame.from_dict([X])
#         X = X.drop(to_delete,axis=1)
#         #Encodage des données
#         for var in X.select_dtypes('object')  :
#             data_encoded = pd.get_dummies(X[var], prefix=var)
#             # Ajouter les nouvelles colonnes binaires au DataFrame
#             X = pd.concat([X, data_encoded], axis=1)
#             # Supprimer la colonne originale de la variable catégorielle
#             X.drop(var, axis=1, inplace=True)
            
#         #on met les colonnes inexistante du a l'encodage
#         if('Geography_France') in X.columns :
#             pass
#         else : 
#             X['Geography_France'] = 0
            
#         if('Geography_Germany') in X.columns :
#             pass
#         else : 
#             X['Geography_Germany'] = 0
        
    
#         if('Geography_Germany') in X.columns :
#             pass
#         else : 
#             X['Geography_Germany'] = 0
            
#         if('Geography_Spain') in X.columns :
#             pass
#         else : 
#             X['Geography_Spain'] = 0
        
#         if('Gender_Female') in X.columns :
#             pass
#         else : 
#             X['Gender_Female'] = 0
        
#         if('Gender_Male') in X.columns :
#             pass
#         else : 
#             X['Gender_Male'] = 0
#         return X
        
#     else : 
#         print("Le fichier fournit n'est pas un dictionnaire")
#         return 0

In [28]:
# save_all_data = {'function' : preprocessing ,'model' : clf }
# pickle.dump(save_all_data,open('model_function_all_data.sav', 'wb'))

# save_feature_data = {'function' : preprocessing ,'model' : clf_feature }
# pickle.dump(save_feature_data,open('model_function_feature_data.sav', 'wb'))


In [29]:
# Raw record of a single customer, with the same fields as the source workbook
# (target excluded). Key order is kept as originally written.
X = {
    'CustomerId': 15634602,
    'RowNumber': 1,
    'Surname': 'Hargrave',
    'CreditScore': 619,
    'Geography': 'France',
    'Gender': 'Female',
    'Age': 42,
    'Tenure': 2,
    'Balance': 0.00,
    'NumOfProducts': 1,
    'HasCrCard': 1,
    'IsActiveMember': 1,
    'EstimatedSalary': 101348.88,
}


In [30]:
# fonction que j'avais fait pour predire les données 
# def result_prediction(X,filename='model_function_all_data.sav') : 
#     loaded_model = pickle.load(open(filename, 'rb'))
#     data = loaded_model['function'](X)
#     if(filename == 'model_function_all_data.sav') :
#         if  isinstance(data, pd.core.frame.DataFrame) :
#             y_pred = loaded_model['model'].predict(data)
#             result = encoder.inverse_transform([int(elt) for elt in y_pred])[-1] 
#             proba = loaded_model['model'].predict_proba(data)[:, 1]
#             return {'resultat' : result, 'proba' : proba}
#         else :
#             pass
#     else :
#         if  isinstance(data, pd.core.frame.DataFrame) :
#             my  = ['Age', 'Balance', 'NumOfProducts', 'IsActiveMember', 'Geography_Spain']
#             data = data.loc[:,my]
#             y_pred = loaded_model['model'].predict(data)
#             result = encoder.inverse_transform([int(elt) for elt in y_pred])[-1] 
#             proba = loaded_model['model'].predict_proba(data)[:, 1]
#             return {'resultat' : result, 'proba' : proba}
#         else :
#             pass

In [31]:
# Seed the forest (same seed as the train/test split) so that a fresh
# Restart & Run All reproduces the same model and the same reported score.
model = RandomForestClassifier(n_estimators=100,max_depth=10,criterion='gini', min_samples_split=2, random_state=1)
model.fit(X_train, y_train)

In [32]:
# Score of the fitted forest on the held-out test partition.
model.score(X_test,y_test)

0.8655

In [33]:
class Model :
    """
    Inference bundle: the fitted encoders plus the trained classifier.

    Keeping both in one object lets a single pickle carry everything needed
    to score a raw customer record.

    Parameters
    ----------
    encoders : dict
        Fitted encoders under the keys 'geo' (Geography), 'gender' (Gender)
        and 'labels' (target).
    model : estimator
        Fitted classifier exposing ``predict_proba`` and ``classes_``.
    """

    # Numeric columns in the order used to build the training frame; the
    # encoded geography columns and then the gender column follow them.
    NUMERIC_FEATURES = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                        'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

    def __init__(self,encoders,model) :
        self.encoders = encoders
        self.model = model

    def _build_features(self, records):
        """Turn raw records (DataFrame) into the numeric matrix the model was trained on."""
        n_rows = len(records)
        numeric = records[self.NUMERIC_FEATURES].to_numpy(dtype=float)
        geo = np.asarray(self.encoders['geo'].transform(records['Geography']))
        gender = np.asarray(self.encoders['gender'].transform(records['Gender']))
        return np.hstack([numeric, geo.reshape(n_rows, -1), gender.reshape(n_rows, -1)])

    def predict(self,X) :
        """
        Predict the decoded target label and its probability.

        Parameters
        ----------
        X : dict or pandas.DataFrame
            One raw customer record as a dict, or several as a DataFrame.
            Must contain the numeric feature columns plus 'Geography' and
            'Gender'; any other field is ignored.

        Returns
        -------
        tuple or list of tuple
            ``(label, probability)`` for a dict input, a list of such tuples
            (one per row) for a DataFrame input.
        """
        single = isinstance(X, dict)
        records = pd.DataFrame([X]) if single else pd.DataFrame(X)

        probabilities = self.model.predict_proba(self._build_features(records))
        winners = probabilities.argmax(axis=1)
        # predict_proba columns follow model.classes_, so go through it rather
        # than assuming column position equals the encoded label.
        encoded = np.asarray(self.model.classes_)[winners]
        labels = self.encoders['labels'].inverse_transform(encoded)

        results = [(label, float(p)) for label, p in zip(labels, probabilities.max(axis=1))]
        return results[0] if single else results
    

In [34]:
# Bundle the three fitted encoders and the trained forest into one object.
clf = Model(encoders={'geo' : geo_encoder, 'gender' : gender_encoder, 'labels': encoder_label},model = model )

In [35]:
# Use a context manager so the file is flushed and closed before the next
# cell reads it back.
with open('model.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [36]:
# Reload the bundle written by this notebook. Only unpickle files you trust:
# pickle.load can execute arbitrary code.
with open('model.sav', 'rb') as model_file:
    inf = pickle.load(model_file)

In [37]:
# Check that the reloaded bundle still carries a usable geography encoder.
inf.encoders['geo'].transform(['France'])

array([[1, 0, 0]])

In [38]:
# X_test was built with .values, so it is a bare array without column names.
type(X_test)

numpy.ndarray

In [39]:
# The forest was fitted on a bare NumPy array, so predict on .values as well
# to stay consistent with training (no feature-name mismatch).
prob = model.predict_proba(df.iloc[[0], :-1].values)
# predict_proba columns follow model.classes_: map the winning column through
# it instead of using the column position as the encoded label.
label_ind, proba = model.classes_[prob.argmax(axis=1)[0]], prob.max()
encoder_label.inverse_transform([label_ind])[-1], proba




('NON', 0.6894089050726778)

In [40]:
# Show the first encoded row (features and true target) for comparison with the prediction.
df.iloc[[0]] #.values.reshape(-1, 1)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender,Exited
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0,1


In [41]:
# Decode the predicted class code back to its original label.
encoder_label.inverse_transform([label_ind])[-1] 

'NON'