In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report , confusion_matrix , recall_score , precision_score , roc_auc_score

In [None]:
# Load the Telco customer-churn CSV into a DataFrame.
# NOTE(review): the absolute /content/ path looks environment-specific (e.g. a hosted
# notebook runtime) — confirm or adjust it before running elsewhere.
data = pd.read_csv("/content/Telco-Customer-Churn.csv")

In [None]:
# Summary statistics (count, mean, std, quartiles) for the columns pandas parsed as numeric.
data.describe()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [None]:
# Per-column non-null counts and dtypes, to spot missing values and mis-typed columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
data.shape

(7043, 21)

In [None]:
# TotalCharges does not load as a number, so coerce it: entries that cannot be
# parsed become NaN instead of raising.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors="coerce")
original_rows = len(data)

# Keep only the rows whose TotalCharges parsed, then report how many were removed.
data = data[data['TotalCharges'].notna()]
print(f"Cleaned Total Charges : {original_rows - len(data)} rows with NaN")

Cleaned Total Charges : 11 rows with NaN


In [None]:
# List the column names available for choosing the features and the target.
data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [None]:
# Column to predict.
target_feature = 'Churn'

# Continuous columns: these are imputed and scaled.
numeric_features = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]

# Discrete columns: these are one-hot encoded. SeniorCitizen is stored as a 0/1
# integer but is treated as a category here.
categorical_features = [
    # customer profile
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    # subscribed services
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    # contract and billing
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

In [None]:
# Feature matrix: the numeric columns first, followed by the categorical ones.
X = data[[*numeric_features, *categorical_features]]
# Target vector: the single column named by target_feature.
y = data[target_feature]

In [None]:
# Count how many rows carry each target label to check the class balance.
from collections import Counter
Counter(y)

Counter({'No': 5163, 'Yes': 1869})

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Stratifying on y keeps the churn
# proportion the same in both splits, which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Encode the churn labels as integers: 'Yes' -> 1, 'No' -> 0.
churn_codes = {'Yes': 1, 'No': 0}


def encode_churn(labels):
    """Return `labels` with 'Yes'/'No' replaced by 1/0.

    Series.map turns every value that is missing from the mapping into NaN, so
    mapping an already-encoded Series a second time would silently wipe the
    target. Input that already holds only 0/1 is therefore returned unchanged,
    which makes this cell safe to re-run.

    Args:
        labels: pandas Series of churn labels.

    Returns:
        A Series of 0/1 values with the same index as `labels`.

    Raises:
        ValueError: if a value is neither 'Yes'/'No' nor already 0/1.
    """
    if labels.isin(list(churn_codes.values())).all():
        return labels  # already encoded by an earlier run of this cell
    encoded = labels.map(churn_codes)
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(f"Unexpected churn labels: {unexpected}")
    return encoded


y_train = encode_churn(y_train)
y_test = encode_churn(y_test)
y = encode_churn(y)

In [None]:
# The mean of the 0/1 labels is the churn rate; after a stratified split it
# should be nearly identical in the full data and in both splits.
print(
    f"Original data churn rate: {y.mean():.4f}",
    f"Training data churn rate: {y_train.mean():.4f}",
    f"Testing data churn rate: {y_test.mean():.4f}",
    sep="\n",
)


Original data churn rate: 0.2658
Training data churn rate: 0.2657
Testing data churn rate: 0.2659


In [None]:
from sklearn.compose import ColumnTransformer

# Step 1 - numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Step 2 - categorical branch: fill gaps with the most frequent value, then
# one-hot encode. Categories not seen during fit are ignored at predict time
# instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Step 3 - route each column list to its branch; any column not listed is dropped.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# Step 4 - full pipeline: preprocessing followed by a logistic regression whose
# class weights are balanced to offset the minority churn class.
clf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced', random_state=42)),
])




In [None]:
# Fit the preprocessing steps and the classifier together, on the training split only.
clf_pipeline.fit(X_train,y_train)

In [None]:
# Hard class predictions (0 or 1) for the held-out test rows.
y_predict = clf_pipeline.predict(X_test)
print(y_predict)

[0 1 0 ... 1 0 1]


In [None]:
# Predicted probability of churn (class 1) for every test row.
# predict_proba returns one column per class in `classes_` order, so column 1 is
# the positive class. Note the index is `[:, 1]` (second column of every row),
# not `[:1]`, which would keep only the first row's pair of probabilities.
y_prob = clf_pipeline.predict_proba(X_test)[:, 1]
print(y_prob)

[[0.60282891 0.39717109]]


In [None]:
# Confusion matrix on the test split: rows are the true classes and columns the
# predicted classes, both in label order (0, 1).
print(confusion_matrix(y_test,y_predict))

[[1110  439]
 [ 115  446]]


In [None]:
# Recall for the churn class (label 1): the share of actual churners the model flags.
print(recall_score(y_test,y_predict))

0.7950089126559715


In [None]:
# Precision for the churn class (label 1): the share of flagged customers who actually churned.
print(precision_score(y_test , y_predict))

0.503954802259887
