In [1]:
#importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings("ignore")

In [2]:
#loading the dataset
df = pd.read_csv("/Users/parakhchokshi/Documents/Portfolio/Telecom/Telecom.csv")

In [3]:
#displaying first 5 rows
df.sample(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
1707,4854-CIDCF,Female,1,No,No,3,Yes,No,Fiber optic,No,...,Yes,No,No,No,Month-to-month,No,Electronic check,73.85,196.4,No
2553,9137-NOQKA,Male,1,No,No,2,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Electronic check,19.2,37.2,No
6930,5570-PTWEH,Female,0,Yes,No,3,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Credit card (automatic),75.15,216.75,Yes
344,5924-SNGKP,Female,0,No,Yes,41,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Credit card (automatic),20.25,865.0,No
566,6967-QIQRV,Male,0,Yes,Yes,15,Yes,No,Fiber optic,Yes,...,No,Yes,Yes,Yes,One year,No,Electronic check,101.9,1667.25,No


In [4]:
#general information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
#customerID has very high cardinality (all unique values) hence we are dropping it
df.drop('customerID',axis='columns',inplace=True)

In [6]:
df['gender'].value_counts()

gender
Male      3555
Female    3488
Name: count, dtype: int64

In [7]:
df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})

In [8]:
df['Partner'].value_counts()

Partner
No     3641
Yes    3402
Name: count, dtype: int64

In [9]:
df['Partner'] = df['Partner'].map({'No': 0, 'Yes': 1})

In [10]:
df['Dependents'].value_counts()

Dependents
No     4933
Yes    2110
Name: count, dtype: int64

In [11]:
df['Dependents'] = df['Dependents'].map({'No': 0, 'Yes': 1})

In [12]:
df['PhoneService'].value_counts()

PhoneService
Yes    6361
No      682
Name: count, dtype: int64

In [13]:
df['PhoneService'] = df['PhoneService'].map({'No': 0, 'Yes': 1})

In [14]:
df['MultipleLines'].value_counts()

MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64

In [15]:
mapping_dict = {'No phone service': 0, 'No': 0, 'Yes': 1}

df['MultipleLines'] = df['MultipleLines'].map(mapping_dict)

In [16]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [17]:
df.InternetService = le.fit_transform(df.InternetService)

In [18]:
df.Contract = le.fit_transform(df.Contract)

In [19]:
df.PaymentMethod = le.fit_transform(df.PaymentMethod)

In [20]:
df.sample(4)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
3102,1,0,0,0,5,1,0,0,Yes,No,No,Yes,No,No,0,No,2,54.2,308.25,Yes
1006,1,0,0,0,18,1,1,1,No,No,No,Yes,No,Yes,0,Yes,2,90.1,1612.75,Yes
105,0,0,0,0,5,0,0,0,No,No,No,No,No,No,0,No,3,24.3,100.2,No
5559,1,0,0,0,13,0,0,0,No,Yes,No,No,No,No,0,Yes,2,31.65,389.95,No


In [21]:
df.replace('No internet service','No',inplace=True)
df.replace('No phone service','No',inplace=True)

In [22]:
yes_no_columns = ['Partner','Dependents','PhoneService','MultipleLines','OnlineSecurity','OnlineBackup',
                  'DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','Churn']
for col in yes_no_columns:
    df[col].replace({'Yes': 1,'No': 0},inplace=True)

In [23]:
pd.to_numeric(df.TotalCharges,errors='coerce').isnull().sum()

11

In [24]:
df[pd.to_numeric(df.TotalCharges,errors='coerce').isnull()]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,1,0,1,1,0,0,0,0,1,0,1,1,1,0,2,1,0,52.55,,0
753,0,0,0,1,0,1,0,2,0,0,0,0,0,0,2,0,3,20.25,,0
936,1,0,1,1,0,1,0,0,1,1,1,0,1,1,2,0,3,80.85,,0
1082,0,0,1,1,0,1,1,2,0,0,0,0,0,0,2,0,3,25.75,,0
1340,1,0,1,1,0,0,0,0,1,1,1,1,1,0,2,0,1,56.05,,0
3331,0,0,1,1,0,1,0,2,0,0,0,0,0,0,2,0,3,19.85,,0
3826,0,0,1,1,0,1,1,2,0,0,0,0,0,0,2,0,3,25.35,,0
4380,1,0,1,1,0,1,0,2,0,0,0,0,0,0,2,0,3,20.0,,0
5218,0,0,1,1,0,1,0,2,0,0,0,0,0,0,1,1,3,19.7,,0
6670,1,0,1,1,0,1,1,0,0,1,1,1,1,0,2,0,3,73.35,,0


In [25]:
df.shape

(7043, 20)

In [26]:
#Removing rows with spaces in Total Charge
df = df[df.TotalCharges!=' ']
df.shape

(7032, 20)

In [27]:
from sklearn.model_selection import train_test_split

X = df.drop('Churn', axis=1)  # Features
y = df['Churn']  # Target variable

# Spliting the data into training and holdout sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, roc_auc_score, f1_score, recall_score, accuracy_score, precision_score

In [29]:
# Create a dictionary of classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Boosted Trees': GradientBoostingClassifier(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(probability=True)  # Note: probability=True for calculating ROC AUC
}


In [30]:
scoring = {
    'ROC AUC': 'roc_auc',
    'F1 Score': 'f1',
    'Recall': 'recall',
    'Accuracy': 'accuracy',
    'Precision': 'precision'
}

In [31]:
results = {}

for clf_name, clf in classifiers.items():
    scores = cross_validate(clf, X, y, scoring=scoring, cv=5)
    results[clf_name] = scores

# Display the results
for clf_name, scores in results.items():
    print(f"\nResults for {clf_name}:")
    for metric, values in scores.items():
        print(f"{metric}: {values.mean():.4f} (std: {values.std():.4f})")


Results for Logistic Regression:
fit_time: 0.0826 (std: 0.0335)
score_time: 0.0134 (std: 0.0041)
test_ROC AUC: 0.8424 (std: 0.0098)
test_F1 Score: 0.5953 (std: 0.0133)
test_Recall: 0.5527 (std: 0.0123)
test_Accuracy: 0.8002 (std: 0.0078)
test_Precision: 0.6453 (std: 0.0202)

Results for Decision Tree:
fit_time: 0.0290 (std: 0.0047)
score_time: 0.0074 (std: 0.0019)
test_ROC AUC: 0.6545 (std: 0.0199)
test_F1 Score: 0.4921 (std: 0.0272)
test_Recall: 0.5019 (std: 0.0286)
test_Accuracy: 0.7247 (std: 0.0153)
test_Precision: 0.4830 (std: 0.0276)

Results for Random Forest:
fit_time: 0.3496 (std: 0.0153)
score_time: 0.0337 (std: 0.0006)
test_ROC AUC: 0.8234 (std: 0.0107)
test_F1 Score: 0.5466 (std: 0.0086)
test_Recall: 0.4799 (std: 0.0085)
test_Accuracy: 0.7883 (std: 0.0075)
test_Precision: 0.6355 (std: 0.0251)

Results for Boosted Trees:
fit_time: 0.5270 (std: 0.0016)
score_time: 0.0090 (std: 0.0001)
test_ROC AUC: 0.8438 (std: 0.0103)
test_F1 Score: 0.5853 (std: 0.0167)
test_Recall: 0.5222 (

In [32]:
X_holdout, y_holdout = X_holdout, y_holdout

In [38]:
# Assuming classifiers is a dictionary of models
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Boosted Trees': GradientBoostingClassifier(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(probability=True)
}

# Train each model on the training set and evaluate on the holdout set
results_holdout = {}

for clf_name, clf in classifiers.items():
    # Train the model on the training set
    clf.fit(X_train, y_train)
    
    # Make predictions on the holdout set
    y_pred_holdout = clf.predict(X_holdout)
    
    # Calculate metrics on the holdout set
    f1_holdout = f1_score(y_holdout, y_pred_holdout)
    accuracy_holdout = accuracy_score(y_holdout, y_pred_holdout)
    precision_holdout = precision_score(y_holdout, y_pred_holdout)
    recall_holdout = recall_score(y_holdout, y_pred_holdout)

    # Calculate ROC AUC on the holdout set
    roc_auc_holdout = roc_auc_score(y_holdout, clf.predict_proba(X_holdout)[:, 1])
    
    # Store the results in a dictionary
    results_holdout[clf_name] = {
        'F1 Score': f1_holdout,
        'Accuracy': accuracy_holdout,
        'Precision': precision_holdout,
        'Recall': recall_holdout,
        'ROC AUC': roc_auc_holdout
    }

# Display the results for all models on the holdout set
for clf_name, metrics in results_holdout.items():
    print(f"\nResults for {clf_name} on Holdout Set:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")



Results for Logistic Regression on Holdout Set:
F1 Score: 0.5593
Accuracy: 0.7861
Precision: 0.6181
Recall: 0.5107
ROC AUC: 0.8274

Results for Decision Tree on Holdout Set:
F1 Score: 0.4852
Accuracy: 0.7164
Precision: 0.4688
Recall: 0.5027
ROC AUC: 0.6472

Results for Random Forest on Holdout Set:
F1 Score: 0.5385
Accuracy: 0.7868
Precision: 0.6341
Recall: 0.4679
ROC AUC: 0.8119

Results for Boosted Trees on Holdout Set:
F1 Score: 0.5477
Accuracy: 0.7875
Precision: 0.6307
Recall: 0.4840
ROC AUC: 0.8320

Results for KNN on Holdout Set:
F1 Score: 0.5061
Accuracy: 0.7697
Precision: 0.5887
Recall: 0.4439
ROC AUC: 0.7359

Results for SVM on Holdout Set:
F1 Score: 0.0000
Accuracy: 0.7342
Precision: 0.0000
Recall: 0.0000
ROC AUC: 0.7880


In [None]:
#Logistic regression classifier model performs the best for Cross Validation and Holdout considering different metrics
#Values for ROC AUC and F1 are the best for logistic regression classifier model

In [48]:
# Initialize the model
model = LogisticRegression()

# Feature names
feature_names = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 
                 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 
                 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 
                 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 
                 'MonthlyCharges', 'TotalCharges']

# Fit the model on the training data
model.fit(X_train, y_train)

# Get coefficients
coefficients = model.coef_[0]

# Get absolute coefficients to find the magnitude of impact
absolute_coefficients = np.abs(coefficients)

# Get indices of top 5 features
top_indices = np.argsort(absolute_coefficients)[::-1][:5]

# Get names of top 5 features (assuming you have feature names)
top_features = [feature_names[i] for i in top_indices]

print("Top 5 impactful features:")
for feature in top_features:
    print(feature)

Top 5 impactful features:
Contract
PhoneService
OnlineSecurity
TechSupport
SeniorCitizen
