In [407]:
from __future__ import print_function
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import numpy as np

import sklearn
import sklearn.datasets

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

from scipy.sparse import vstack

%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format

In [324]:
data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv', na_values=' ')

data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Посмотрим размер таблицы:

In [325]:
data.shape


(7043, 21)

Посмотрим информацию о каждом столбце:

In [326]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


Видим, что только один столбец имеет пропущенные значения - это столбец TotalCharges и пропущенных значений не много. В данном случая я думаю можно просто удалить строки с пропущенными значениями данного столбца, т.к. пропущенные значения чисто индивидуальные и нет возможности вывести их из других значений.

In [327]:
data = data.dropna(how="any", axis=0)

In [328]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


Сейчас види, что все столбцы имеют значения.

Посмотрим корректность значений каждого столбца типа object.

In [329]:
for col_name in data.columns:
  if data[col_name].dtype.name == "object":
    print(col_name, ":\n", data[col_name].unique(), "\n")

customerID :
 ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK'] 

gender :
 ['Female' 'Male'] 

Partner :
 ['Yes' 'No'] 

Dependents :
 ['No' 'Yes'] 

PhoneService :
 ['No' 'Yes'] 

MultipleLines :
 ['No phone service' 'No' 'Yes'] 

InternetService :
 ['DSL' 'Fiber optic' 'No'] 

OnlineSecurity :
 ['No' 'Yes' 'No internet service'] 

OnlineBackup :
 ['Yes' 'No' 'No internet service'] 

DeviceProtection :
 ['No' 'Yes' 'No internet service'] 

TechSupport :
 ['No' 'Yes' 'No internet service'] 

StreamingTV :
 ['No' 'Yes' 'No internet service'] 

StreamingMovies :
 ['No' 'Yes' 'No internet service'] 

Contract :
 ['Month-to-month' 'One year' 'Two year'] 

PaperlessBilling :
 ['Yes' 'No'] 

PaymentMethod :
 ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)'] 

Churn :
 ['No' 'Yes'] 



Видим, что у столбца MultipleLines имеется три значения: 'No phone service', 'No', 'Yes'. В данном случае значение 'No phone service' является поясняющим и его нужно преобразовать в значение 'No'. Аналогично поступаем со значением 'No internet service' в других столбцах.

In [330]:
data["MultipleLines"] = data["MultipleLines"].str.replace("No phone service", "No")

for col_name in ["OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]:
  data[col_name] = data[col_name].str.replace("No internet service", "No")

Преобразуем категориальные переменные.

In [331]:
# Формирем список имен столбцов с категориальными признаками
col_names = [col_name for col_name in data.columns if data[col_name].dtype.name == "object"]

label_encoder = LabelEncoder()

for col_name in col_names:
  label_encoder.fit(data[col_name])
  data[col_name] = label_encoder.transform(data[col_name])

In [332]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,5365,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,2,29.85,29.85,0
1,3953,1,0,0,0,34,1,0,0,1,0,1,0,0,0,1,0,3,56.95,1889.5,0
2,2558,1,0,0,0,2,1,0,0,1,1,0,0,0,0,0,1,3,53.85,108.15,1
3,5524,1,0,0,0,45,0,0,0,1,0,1,1,0,0,1,0,0,42.3,1840.75,0
4,6500,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1


Столбец customerID не требуется для анализа данных, удаляем его.

In [333]:
table_ids = data["customerID"].copy()

data.drop("customerID", axis=1, inplace=True)

In [334]:
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,1,0,1,0,0,0,1,0,3,56.95,1889.5,0
2,1,0,0,0,2,1,0,0,1,1,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,0,0,1,0,1,1,0,0,1,0,0,42.3,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1


Вроде все, можно приступать к обучению.
Еще, ядумаю, можно бинарные столбца с содержимиым Yes, No объединить в один столбец с соотвествующими наименованиями, тем самым уменьшив размерность данных, но т.к. у меня еще прозрачный пояс по ML, попробую так.

**Алгоритмы**

Разбиваем наш датасет на тренироваочные и тестовые данные.

In [335]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], data.iloc[:, -1], test_size=0.2, random_state=123)

Нормализуем данные

In [336]:
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

Для данных алгоритмов нет разницы нормализованы ли данные или нет, поэтому далее будем использовать не нормализованные.

In [337]:
clf = GradientBoostingClassifier().fit(X_train, y_train)

y_test_pred = clf.predict(X_test)

accuracy_score(y_test, y_test_pred)

0.8052594171997157

**XGBoost**

In [338]:
xgb_params = {'booster': 'gbtree', 'max_depth': 3, 'eta': 0.1, 
     'objective': 'binary:logistic', 'nthread': 1, 'eval_metric' : "error"}

bst = xgb.train(xgb_params, xgb.DMatrix(X_train, label=y_train), num_boost_round=100)

y_test_pred = np.where(bst.predict(xgb.DMatrix(X_test, label=y_test)) > 0.5, 1, 0) 

accuracy_score(y_test, y_test_pred)

0.8031272210376688

**LigthGBM**

In [439]:
train_data = lgb.Dataset(X_train, label=y_train)

param = {'objective': 'binary'}
param['metric'] = 'auc'

bst = lgb.train(param, train_data)

y_test_pred = np.where(bst.predict(X_test) > 0.5, 1, 0)

accuracy_score(y_test, y_test_pred)

0.7953091684434968

**CatBoost**

In [340]:
model = CatBoostClassifier(loss_function="Logloss", eval_metric="AUC", custom_metric="F1", 
                           random_seed=42, logging_level="Silent", use_best_model=False)

model.fit(X_train, y_train,
          eval_set=(X_test, y_test),
          plot=False)

y_test_pred = model.predict(X_test)

accuracy_score(y_test, y_test_pred)



0.8002842928216063

На тестовых данных все модели показывают примерно одинаковые результаты - 80 %

**Настойка гиперпараметров**

In [423]:
gs = GridSearchCV(estimator=GradientBoostingClassifier(),
                  param_grid=[{"n_estimators": [30, 50, 70, 100, 150],
                               "max_depth": [2, 3, 4, 6]}],
                  scoring="accuracy",
                  cv=10)
gs.fit(data.iloc[:, :-1], data.iloc[:, -1])

print(gs.best_score_, gs.best_params_)

0.8057424350187506 {'max_depth': 3, 'n_estimators': 100}


In [425]:
model = CatBoostClassifier(loss_function="Logloss", eval_metric="AUC", custom_metric="F1", 
                           random_seed=42, logging_level="Silent", use_best_model=False)

gs = GridSearchCV(estimator=model,
                  param_grid=[{"iterations": [50, 70, 100, 150, 200],
                               "depth": [2, 3, 4, 6, 8]}],
                  scoring="accuracy",
                  cv=10)
gs.fit(data.iloc[:, :-1], data.iloc[:, -1])

print(gs.best_score_, gs.best_params_)

0.8078763497349023 {'depth': 4, 'iterations': 150}


In [432]:
X = data.iloc[:, :-1]; y = data.iloc[:, -1]

In [437]:
best_score = 0; best_iter = 0; best_depth = 0

for n_iter in [70, 100, 150, 200, 250, 300]:
  for depth in [2, 3, 4, 6]:
    scores = []
    for idxs_train, idxs_test in StratifiedKFold(n_splits=10).split(X, y):
      xgb_params = {'booster': 'gbtree', 'max_depth': depth, 'eta': 0.1, 
        'objective': 'binary:logistic', 'nthread': 1, 'eval_metric' : "error"}

      bst = xgb.train(xgb_params, xgb.DMatrix(X.values[idxs_train], label=y.values[idxs_train]), num_boost_round=n_iter)

      y_test_pred = np.where(bst.predict(xgb.DMatrix(X.values[idxs_test], label=y.values[idxs_test])) > 0.5, 1, 0)

      scores.append(accuracy_score(y.values[idxs_test], y_test_pred))

    sc = np.mean(scores)
    if sc > best_score:
      best_score = sc
      best_iter = n_iter
      best_depth = depth

print(best_score, best_iter, best_depth)





0.8057432432432433 200 2


In [438]:
best_score = 0; best_iter = 0; best_depth = 0

for n_iter in [70, 100, 150, 200, 250, 300]:
  for depth in [2, 3, 4, 6]:
    scores = []
    for idxs_train, idxs_test in StratifiedKFold(n_splits=10).split(X, y):
      train_data = lgb.Dataset(X.values[idxs_train], label=y.values[idxs_train])

      param = {'num_leaves': 2**depth, 'objective': 'binary'}
      param['metric'] = 'auc'

      num_round = n_iter

      bst = lgb.train(param, train_data, num_round)

      y_test_pred = np.where(bst.predict(X.values[idxs_test]) > 0.5, 1, 0)

      scores.append(accuracy_score(y.values[idxs_test], y_test_pred))

    sc = np.mean(scores)
    if sc > best_score:
      best_score = sc
      best_iter = n_iter
      best_depth = depth

print(best_score, best_iter, best_depth)

0.8047469045001939 70 3


Всех лучше показал себя CastBoost - 80.78 % при 150 деревьях и глубине 4

При самых лучших настройках модели практически не показывают себя лучше, очевидно необходимо уменьшать размерность.