In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('../dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df.shape

(7043, 21)

In [4]:
df.drop(columns=['customerID'], axis=1, inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [6]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [7]:
df.dropna(how='any', inplace=True)

In [8]:
df.select_dtypes('object').columns.tolist()

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn']

In [9]:
df.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [None]:
df = pd.get_dummies(df, dtype=int, drop_first=True)
df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,29.85,29.85,0,1,0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
1,0,34,56.95,1889.5,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,2,53.85,108.15,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,1
3,0,45,42.3,1840.75,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,0,2,70.7,151.65,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,1


In [None]:
x = df.drop(columns=['Churn_Yes'], axis=1)
y = df['Churn_Yes']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [16]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)

model = LogisticRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
print(round(accuracy_score(y_test, y_pred)*100, 2))

79.24


### PCA

In [27]:
from sklearn.decomposition import PCA

pca = PCA(n_components=8)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [28]:
explaind_variance = pca.explained_variance_ratio_

print(explaind_variance)

[9.99858069e-01 1.24090191e-04 1.71403599e-05 1.23908957e-07
 6.46782051e-08 5.36963154e-08 4.90502886e-08 4.47010962e-08]


In [29]:
model_pca = LogisticRegression()
model_pca.fit(x_train_pca, y_train)

y_pred_pca = model_pca.predict(x_test_pca)
print(round(accuracy_score(y_test, y_pred_pca)*100, 2))

79.34


In [30]:
x_train_pca

array([[-1.73557980e+03,  1.14703762e+01, -3.14509391e+00, ...,
        -9.41485806e-01, -4.71213527e-01,  1.26464983e+00],
       [-6.93381879e+02,  2.74968043e+00,  1.08863716e+00, ...,
        -1.48587905e-01,  6.21563306e-01, -6.56392665e-01],
       [-2.05491013e+03,  2.56021665e+01,  3.60852622e-01, ...,
        -2.11093675e-02, -6.55972990e-01,  1.24716501e+00],
       ...,
       [-1.49080352e+03,  3.08552940e+01,  4.01382997e+00, ...,
         3.36697712e-02,  5.51501000e-01, -6.58005540e-01],
       [-1.28638201e+03, -4.31904367e+01,  1.10625476e+01, ...,
        -6.38138126e-01, -4.75423899e-01,  2.37975364e-01],
       [-8.11772162e+02,  1.67076366e+01,  2.49118303e+00, ...,
         8.15092669e-01,  4.33861558e-01, -5.68807435e-01]])