# Churn prediction using logistic regression 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 


In [2]:
df = pd.read_csv("churndata.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [5]:
df.shape

(7043, 21)

In [6]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## keeping important columns 

In [7]:
columns_to_keep = ['gender','SeniorCitizen','Partner','Dependents','tenure','PhoneService','MultipleLines','Contract','TotalCharges','Churn']


df = df[columns_to_keep]

In [8]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,Month-to-month,29.85,No
1,Male,0,No,No,34,Yes,No,One year,1889.5,No
2,Male,0,No,No,2,Yes,No,Month-to-month,108.15,Yes
3,Male,0,No,No,45,No,No phone service,One year,1840.75,No
4,Female,0,No,No,2,Yes,No,Month-to-month,151.65,Yes


## Encode binary variables(E.g. Yes/NO columns)

binary_columns = ['partner','Dependents','PhoneService','PaperlessBilling','Churn']

In [9]:
# user label Encoder
from sklearn.preprocessing import LabelEncoder
# making object of label encoder
label_encoder = LabelEncoder()
#list of columns to label encode 
categorical_cols = ['gender','Partner','Dependents','PhoneService','MultipleLines','Contract','Churn']

for cols in categorical_cols :
    df[cols] = label_encoder.fit_transform(df[cols])

In [10]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,29.85,0
1,1,0,0,0,34,1,0,1,1889.5,0
2,1,0,0,0,2,1,0,0,108.15,1
3,1,0,0,0,45,0,1,1,1840.75,0
4,0,0,0,0,2,1,0,0,151.65,1
...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,1,1990.5,0
7039,0,0,1,1,72,1,2,1,7362.9,0
7040,0,0,1,1,11,0,1,0,346.45,0
7041,1,1,1,0,4,1,2,0,306.6,1


## Other easy technique 

In [11]:
# binary_column = ['Partner','Dependents','PhoneService','Churn']

# df[binary_column] = df[binary_column].replace({'Yes': 1,'No':0})

## Split data into training and testing set 

In [12]:
x = df.drop('Churn', axis = 1)
y = df['Churn']


In [13]:
#split data into training and testing 
from sklearn.model_selection import train_test_split 

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 42)


In [14]:
# convert 'TotalCharges' column to float and handle errors ='coerce' to replace non-numeric values with NaN
x_train['TotalCharges'] = pd.to_numeric(x_train['TotalCharges'],errors='coerce')
x_test['TotalCharges'] = pd.to_numeric(x_test['TotalCharges'],errors='coerce')

In [15]:
x_train.isnull().sum()

gender            0
SeniorCitizen     0
Partner           0
Dependents        0
tenure            0
PhoneService      0
MultipleLines     0
Contract          0
TotalCharges     10
dtype: int64

In [16]:
# replace missing values in 'TotalCharges' column with the mean of the column

x_train['TotalCharges'].fillna(x_train['TotalCharges'].mean(), inplace=True)

x_test['TotalCharges'].fillna(x_test['TotalCharges'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_train['TotalCharges'].fillna(x_train['TotalCharges'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_test['TotalCharges'].fillna(x_test['TotalCharges'].mean(), inplace=True)


In [17]:
x_train.isnull().sum()

gender           0
SeniorCitizen    0
Partner          0
Dependents       0
tenure           0
PhoneService     0
MultipleLines    0
Contract         0
TotalCharges     0
dtype: int64

# Standardize features 

In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [19]:
x_train

array([[-1.02516569, -0.4377492 , -0.96957859, ..., -1.00053704,
         0.37290835, -0.42210502],
       [-1.02516569, -0.4377492 , -0.96957859, ...,  1.10833901,
         1.5775905 ,  1.25536015],
       [ 0.97545208, -0.4377492 ,  1.03137591, ...,  0.05390099,
        -0.83177379, -1.00299144],
       ...,
       [ 0.97545208, -0.4377492 ,  1.03137591, ..., -1.00053704,
        -0.83177379, -0.87799925],
       [ 0.97545208,  2.28441306, -0.96957859, ...,  1.10833901,
        -0.83177379, -0.48254445],
       [ 0.97545208, -0.4377492 , -0.96957859, ..., -1.00053704,
         0.37290835, -0.81110232]])

# logistic regression

In [20]:
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression()

lg.fit(x_train,y_train)

y_pred = lg.predict(x_test)

In [21]:
y_pred 

array([1, 0, 0, ..., 0, 0, 1])

# Accuracy Score 

In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7757274662881476

# Sava Model 

In [23]:
import pickle 
pickle.dump(lg, open('churn_logistic_model.pkl','wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))