# 통신사 고객 이탈 예측 

### 목적
* 파악한 특성들을 이용하여 기대 가치가 큰 고객의 이탈 징후를 잘 탐지하는 예측 모델 생성.
[참고](https://brunch.co.kr/@gimmesilver/53)
    - 예측 모델의 목표는 오차를 최소화하는 것이 아니다.
        - 실제 서비스에서 예측 모델을 사용하는 본질적인 목적은 정답을 잘 맞추겠다가 아니라 예측 모델을 실전에 적용함으로써 이익을 얻는 것. 
        - 따라서, 실제 예측 모델이 목표로 해야할 것은 오차를 최소화하는 것이 아니라 모델 적용을 통해 기대되는 이익을 최대화하는 것. 
        
### 문제 정의
* 지도학습, 분류

### 성능 지표
* 재현율
    * 실제 이탈하는 사람 중 이탈이라고 예측하는 것이 중요하기 때문.
    * 실제 이탈하지 않은 사람을 이탈이라고 예측하면 회사 측에서 손해액 발생

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder  # 더미변수 생성 

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import time

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  # 분류
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
df = pd.read_csv("../../data/WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.shape

(7043, 21)

In [4]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
X = df.drop(["Churn"],axis = 1)
y = df["Churn"]

In [6]:
y.replace("No",0,inplace = True)
y.replace("Yes",1,inplace = True)

In [7]:
y.value_counts()/len(y)

0    0.73463
1    0.26537
Name: Churn, dtype: float64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 20171490)

In [9]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5634, 20)
(1409, 20)
(5634,)
(1409,)


In [10]:
cate_features = []

for column in df.columns:
    if df[column].dtypes==object:
        cate_features.append(column)
        
cate_features

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'TotalCharges']

In [11]:
num_features = []

for feature in list(df.columns.values):
    if feature not in cate_features:
        num_features.append(feature)
        
num_features

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'Churn']

In [12]:
cate_features.remove("customerID")
cate_features.remove("TotalCharges")
num_features.remove("Churn")

In [13]:
scaler_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', MinMaxScaler(feature_range=(0,1))),
])

In [14]:
selector = ColumnTransformer([
    ('scaler', scaler_pipeline, num_features),
    ('one_hot', OneHotEncoder(), cate_features),  # 나열되지 않은 나머지 열은 삭제됨. 
],remainder="drop")

X_prepared = selector.fit_transform(X_train)

In [15]:
X_train.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
4275,2692-PFYTJ,Female,0,No,No,1,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Mailed check,25.75,25.75
4626,0929-PECLO,Female,1,No,No,63,Yes,Yes,Fiber optic,No,No,Yes,No,No,Yes,Month-to-month,Yes,Bank transfer (automatic),89.6,5538.8
2487,6620-HVDUJ,Male,0,No,No,24,Yes,No,DSL,Yes,Yes,No,Yes,No,No,Month-to-month,No,Bank transfer (automatic),60.45,1440.75
2163,4021-RQSNY,Male,1,Yes,No,29,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.5,3004.15
2652,4727-MCYZG,Male,0,No,No,1,Yes,No,DSL,No,No,No,No,No,Yes,Month-to-month,No,Mailed check,55.55,55.55


In [16]:
X_prepared.shape

(5634, 44)

### 여러 모델 적용하여 비교하기 

In [18]:
models = [LogisticRegression(), RandomForestClassifier(n_estimators = 500, max_leaf_nodes = 16, n_jobs = -1), GradientBoostingClassifier(), AdaBoostClassifier(), XGBClassifier()]
names = ["Logistic","RF","GB","Ada","XGB"]

for model, name in zip(models, names):
    print(name)
    start = time.time()
    cv_predict = cross_val_predict(model, X_prepared, y_train,cv=10)

    print(classification_report(y_train, cv_predict))
    print("--------------------------------------")

Logistic
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      4129
           1       0.66      0.55      0.60      1505

    accuracy                           0.80      5634
   macro avg       0.75      0.72      0.73      5634
weighted avg       0.79      0.80      0.80      5634

--------------------------------------
RF
              precision    recall  f1-score   support

           0       0.82      0.93      0.87      4129
           1       0.68      0.43      0.53      1505

    accuracy                           0.79      5634
   macro avg       0.75      0.68      0.70      5634
weighted avg       0.78      0.79      0.78      5634

--------------------------------------
GB
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      4129
           1       0.65      0.52      0.58      1505

    accuracy                           0.80      5634
   macro avg       0.75      0.71  

In [30]:
from sklearn.model_selection import GridSearchCV

grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg_cv = GridSearchCV(LogisticRegression(),grid,cv=10,scoring = "recall")
logreg_cv.fit(X_prepared, y_train)

Traceback (most recent call last):
  File "C:\Users\a0105\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\a0105\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\a0105\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\a0105\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\a0105\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\

GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2']},
             scoring='recall')

In [39]:
pd.DataFrame(logreg_cv.cv_results_)["mean_test_score"].sort_values(ascending = False)

7     0.547510
13    0.546185
11    0.546185
9     0.546185
5     0.539541
3     0.479104
1     0.083687
0          NaN
2          NaN
4          NaN
6          NaN
8          NaN
10         NaN
12         NaN
Name: mean_test_score, dtype: float64

In [None]:
param_xgb = {'booster': ['gbtree'],
                  'objective' : ['binary:logistic'], 
                  'eval_metric' : ['error'],
                  'n_estimators':[100],
                  'max_depth': [3,5,7,9],
                  'min_child_weight':[1,3,5,7],
                  'learning_rate' : [0.01,0.05,0.1],
                  'scale_pos_weight':[1,3,5],
                  'colsample_bytree':[0.5,0.7,0.9]}

grid = GridSearchCV(estimator = XGBClassifier(), param_grid=param_xgb, scoring="recall", verbose=1, n_jobs=-1, cv=10)
grid.fit(X_prepared, y_train)

Fitting 10 folds for each of 432 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   59.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 11.1min


In [None]:
param_ada = {
 'n_estimators': [50, 100],
 'learning_rate' : [0.01,0.05,0.1,0.3,1],
 'loss' : ['linear', 'square', 'exponential']
 }

Ada_cv = RandomizedSearchCV(AdaBoostClassifier(), param_distributions = param_ada,cv=10, n_iter = 10, n_jobs=-1)

Ada_cv.fit(X_prepared, y_train)

feature engineering이 필요할 것으로 예상됨.