In [2]:
# Librairies
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import pickle


In [3]:
# Chargement des données
df = pd.read_csv("https://assets-datascientest.s3-eu-west-1.amazonaws.com/de/total/churn.csv")
df = df.reset_index()

In [4]:
# Identification des valeurs manquantes pour TotalCharges:
df = df.replace(' ', np.nan)
df['TotalCharges'] = df['TotalCharges'].astype('float64')

In [5]:
# On remplace les valeurs nulles par 0 pour la variable TotalCharges, ces données correspondent aux factures totales de nouveaux clients.
df.TotalCharges = df.TotalCharges.fillna(0)

In [6]:
# Mise à jour de la variable SeniorCitizen
df['SeniorCitizen'].replace({0:'No',1:'Yes'}, inplace=True)
# Mise à jour de la variable Churn
df['Churn'].replace({'No':0,'Yes':1}, inplace=True)
# Conversion de la variable TotalCharges en numérique
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

In [7]:
# Définition des variables pour la construction des pipelines
# Définition des variables catégorielles
colonnes_categorielles = ['gender',
                          'SeniorCitizen',
                          'Partner',
                          'Dependents',
                          'PhoneService',
                          'MultipleLines',
                          'OnlineSecurity',
                          'Contract',
                          'PaperlessBilling',
                          'PaymentMethod',
                          'StreamingMovies',
                          'StreamingTV',
                          'TechSupport',
                          'DeviceProtection',
                          'InternetService',
                          'OnlineBackup']

# Variables numériques
colonnes_numeriques =['tenure',
                      'MonthlyCharges',
                      'TotalCharges']

# Variable à supprimer
colonnes_a_supprimer='customerID'

# Variable cible
colonne_cible = 'Churn'

In [8]:
# Suppression des colonnes qui non nécessaires aux modèles notemment 'customerID'
df = df.drop('index',axis='columns')

In [9]:
# Pipeline de construction du modèle et sauvegarde

# Varaibles explicatives
X = df.drop([colonne_cible, colonnes_a_supprimer], axis='columns')

# Variable cible
y = df[colonne_cible]

# Transformation des données et construction du pipeline du modèle de regressionlogistique
numeric_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='mean')),
       ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='constant')),
       ('onehot',OneHotEncoder())
])

preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, colonnes_numeriques),
    ('categorical', categorical_transformer, colonnes_categorielles)
]) 

pipeline_reglog = Pipeline(steps = [
               ('preprocessor', preprocessor),
               ('Oversampling', SMOTE()),
               ('regressor',LogisticRegression())
           ])
# Construction du modèle de regression logistique
model_reglog = pipeline_reglog.fit(X, y)
# Sauvegarde du modèle de regression logistique
pickle.dump(model_reglog, open('model_regressionlogistique.pkl','wb'))

# Pipeline adboost
pipeline_adboost = Pipeline(steps = [
               ('preprocessor', preprocessor),
               ('Oversampling', SMOTE()),
               ('regressor',AdaBoostClassifier())
           ])
# Construction du modèle AdboostClassifier
model_adboost = pipeline_adboost.fit(X,y)
# Sauvegarde du modèle
pickle.dump(model_adboost, open('model_AdboostClassifier.pkl', 'wb'))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
