In [105]:
import pandas as pd
import tensorflow as tf
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [96]:
data = pd.read_csv('customer_churn.csv')
data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [97]:
data.drop('customerID',axis=1,inplace=True)
#data.TotalCharges = pd.to_numeric(data.TotalCharges,errors='coerce')

data.replace('No internet service','No',inplace=True)
data.replace('No phone service','No',inplace=True)

data.TotalCharges = pd.to_numeric(data.TotalCharges,errors='coerce')
data.TotalCharges.fillna(data.TotalCharges.median(),inplace=True)

data = pd.get_dummies(data,drop_first=True)

In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   tenure                                 7043 non-null   int64  
 2   MonthlyCharges                         7043 non-null   float64
 3   TotalCharges                           7043 non-null   float64
 4   gender_Male                            7043 non-null   bool   
 5   Partner_Yes                            7043 non-null   bool   
 6   Dependents_Yes                         7043 non-null   bool   
 7   PhoneService_Yes                       7043 non-null   bool   
 8   MultipleLines_Yes                      7043 non-null   bool   
 9   InternetService_Fiber optic            7043 non-null   bool   
 10  InternetService_No                     7043 non-null   bool   
 11  Onli

In [99]:
data.isnull().sum()

SeniorCitizen                            0
tenure                                   0
MonthlyCharges                           0
TotalCharges                             0
gender_Male                              0
Partner_Yes                              0
Dependents_Yes                           0
PhoneService_Yes                         0
MultipleLines_Yes                        0
InternetService_Fiber optic              0
InternetService_No                       0
OnlineSecurity_Yes                       0
OnlineBackup_Yes                         0
DeviceProtection_Yes                     0
TechSupport_Yes                          0
StreamingTV_Yes                          0
StreamingMovies_Yes                      0
Contract_One year                        0
Contract_Two year                        0
PaperlessBilling_Yes                     0
PaymentMethod_Credit card (automatic)    0
PaymentMethod_Electronic check           0
PaymentMethod_Mailed check               0
Churn_Yes  

In [100]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [101]:
print(data['Churn_Yes'].value_counts())

data0 = data[data['Churn_Yes']==False]
data1 = data[data['Churn_Yes']==True]


Churn_Yes
False    5174
True     1869
Name: count, dtype: int64


In [133]:
def set_model(data):
    X = data.drop(columns=['Churn_Yes'],axis=1)
    y= data['Churn_Yes']
    
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
    
    log = LogisticRegression()
    log.fit(X_train,y_train)
    print(log.score(X_test,y_test))
    
    return log.predict(X_test)
    


Undersampling

In [136]:
under_data0 = data0.sample(data1.shape[0])
under_data = pd.concat([under_data0,data1],axis=0)

pred = set_model(under_data)

0.7486631016042781


Oversampling

In [138]:
over_data1 = data1.sample(data0.shape[0],replace=True)
over_data = pd.concat([data0,over_data1],axis=0)

pred = set_model(over_data)

0.7705314009661836


SMOTE

In [111]:
from imblearn.over_sampling import SMOTE

In [137]:
smote = SMOTE(sampling_strategy='minority')

X = data.drop(columns=['Churn_Yes'],axis='columns')
y = data.Churn_Yes

X, y= smote.fit_resample(X,y)
smote_data = pd.concat([X,y],axis=1)
smote_data

pred = set_model(smote_data)

0.8009661835748793


Use of Ensemble with undersampling

In [142]:
data0.shape[0]/(data0.shape[0]/data1.shape[0])

1869.0

In [168]:
predictions=[]
for i in range(0,data0.shape[0],1869):
    ensemble_data = pd.concat([data0[i:i+1869],data1])
    pred = set_model(ensemble_data)
    predictions.append(pred)


0.7754010695187166
0.7780748663101604
0.75642965204236


In [228]:
def majority_boolean(bool1, bool2, bool3):
    # OR işlemiyle en az bir True değeri kontrolü
    print(bool1)
    print(bool2)
    print(bool3)
    any_true = bool1 or bool2 or bool3
    
    # AND işlemiyle çoğunluğun True olduğu kontrolü
    majority_true = (bool1 and bool2) or (bool1 and bool3) or (bool2 and bool3)
    
    # Sonucu döndürme
    return any_true and majority_true

In [236]:
index = 5
majority_boolean(predictions[0][index], predictions[1][index],predictions[2][index])

False
False
True


False