In [23]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split #splitting data into training and test sets
from sklearn.ensemble import RandomForestClassifier # classification model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix #model evaluation
from sklearn.preprocessing import StandardScaler, OneHotEncoder #data preprocessing (scaling, encoding)
from sklearn.compose import ColumnTransformer #apply different transformers to different columns
from sklearn.pipeline import Pipeline #chain preprocessing and modeling steps

# Load the Telco customer-churn dataset (CSV downloaded from Kaggle).
file_path = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): exit() would end the whole session,
    # whereas an exception stops only this cell and shows the message below.
    raise FileNotFoundError(
        f"Error: The file '{file_path}' was not found. Please ensure the CSV file is in the correct directory."
    ) from err
print("Dataset loaded successfully.")

# Quick look at the raw data: first rows, then column dtypes / non-null counts.
print("\nFirst 5 rows of the dataset:\n", df.head())
print("\nGeneral information about the dataset:\n")
df.info()

Dataset loaded successfully.

First 5 rows of the dataset:
    customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  

In [24]:
# Data Analysis & Preprocessing
#
# Every step in this cell is written so the cell can be re-run safely on an
# already-processed `df`.

# Drop the 'customerID' column if it is still present.
if 'customerID' in df.columns:
    df = df.drop('customerID', axis=1)
    print("\n'customerID' column dropped.")

# errors='coerce' turns entries that cannot be parsed as numbers into NaN;
# those rows are removed by the dropna() below.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

print("\n'TotalCharges' column converted to numeric.")

# Verify if there are any NaNs *before* dropping them
print("\nChecking for NaNs before dropping:")
print(df.isnull().sum())

# Handle missing values
df.dropna(inplace=True)  # Remove rows with any NaN values
print(f"Number of rows after dropping NaNs: {df.shape[0]}")

# Re-verify that no NaNs remain in the DataFrame after dropping
print("\nChecking for NaNs AFTER dropping:")
print(df.isnull().sum())

# Convert target label 'Churn' to numerical (0 and 1).
# Only map while the column still holds the raw 'Yes'/'No' text: Series.map
# yields NaN for keys missing from the dict, so mapping the already-encoded
# 0/1 values a second time would wipe out the whole target column.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
    print("\n'Churn' column converted to 0 and 1.")
print("Distribution of 'Churn' after conversion:\n", df['Churn'].value_counts())

# Split the frame into the target vector (y) and the feature matrix (X).
y = df['Churn']
X = df.drop(columns=['Churn'])

# Report whether the target contains any missing labels before splitting.
if not y.isna().any():
    print("\nNo NaNs found in target variable (y). Ready for splitting.")
else:
    print("\nWARNING: NaNs still present in the target variable (y)! This will cause issues with stratify.")

# Group the feature columns by dtype: int64/float64 columns are treated as
# numerical, object/bool columns as categorical.
categorical_features = X.select_dtypes(include=['object', 'bool']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

print(f"\nNumerical features: {numerical_features.tolist()}")
print(f"Categorical features: {categorical_features.tolist()}")

# Build the preprocessing + modeling pipeline.
# Numerical columns are standardized; categorical columns are one-hot encoded
# with handle_unknown='ignore' set on the encoder.
scale_numeric = ('num', StandardScaler(), numerical_features)
encode_categorical = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[scale_numeric, encode_categorical])

# Chain preprocessing and the classifier so both are fitted/applied together.
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

print("\nPreprocessing and modeling pipeline created.")

# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report split sizes and the class balance of each split.
print("\nNumber of samples in training set (X_train):", len(X_train))
print("Number of samples in test set (X_test):", len(X_test))
print(f"Churn class distribution in y_train:\n{y_train.value_counts(normalize=True)}")
print(f"Churn class distribution in y_test:\n{y_test.value_counts(normalize=True)}")


'customerID' column dropped.

'TotalCharges' column converted to numeric.

Checking for NaNs before dropping:
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64
Number of rows after dropping NaNs: 7032

Checking for NaNs AFTER dropping:
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessB

In [25]:
# Train the model: fit the full pipeline (preprocessing + classifier) on the
# training split. fit() returns the pipeline itself, so the name is rebound to
# the same fitted object.
model_pipeline = model_pipeline.fit(X_train, y_train)

print("\nModel pipeline (including preprocessing and Random Forest) trained successfully.")


Model pipeline (including preprocessing and Random Forest) trained successfully.


In [26]:
# Evaluate the model on the held-out test split.
y_pred = model_pipeline.predict(X_test)
print("\nPredictions on the test set completed.")

# Overall accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")

# Per-class precision, recall and F1-score.
churn_report = classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=['No Churn', 'Churn'],
)
print("\nClassification Report:\n", churn_report)

# Counts of correct and incorrect predictions for each class.
churn_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", churn_confusion)

# How to read the confusion matrix:
# [[True Negatives (TN)  False Positives (FP)]
#  [False Negatives (FN) True Positives (TP)]]
# TN: did not churn, predicted 'No Churn'.
# FP: did not churn, predicted 'Churn' (Type I error).
# FN: did churn, predicted 'No Churn' (Type II error - often the most critical in churn prediction).
# TP: did churn, predicted 'Churn'.


Predictions on the test set completed.

Model Accuracy: 0.79

Classification Report:
               precision    recall  f1-score   support

    No Churn       0.83      0.90      0.86      1033
       Churn       0.63      0.48      0.55       374

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.70      1407
weighted avg       0.78      0.79      0.78      1407


Confusion Matrix:
 [[927 106]
 [193 181]]


In [27]:
# Make predictions with the trained model on new data.
#
# One hypothetical customer, written as column -> value so each value stays
# visibly paired with its feature name. Keys follow the column order of the
# feature columns.
new_customer = {
    'gender': 'Male',
    # Integer flag, not the string '0': the dataset preview shows SeniorCitizen
    # as a 0/1 numeric column, so the new row should use the same type.
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 24,
    'PhoneService': 'Yes',
    'MultipleLines': 'No',
    'InternetService': 'Fiber optic',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'Yes',
    'TechSupport': 'Yes',
    'StreamingTV': 'Yes',
    'StreamingMovies': 'Yes',
    'Contract': 'One year',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 70.00,
    'TotalCharges': 1680.00,
}
new_customer_data = pd.DataFrame([new_customer])

print(f"\nNew customer data for prediction:\n{new_customer_data}")

# Predict churn for the new customer data using the pipeline.
predicted_churn_id = model_pipeline.predict(new_customer_data)
predicted_churn_proba = model_pipeline.predict_proba(new_customer_data)  # Probabilities for each class

# Convert the numerical prediction (0 or 1) to a human-readable label.
predicted_churn_label = ['No Churn', 'Churn'][predicted_churn_id[0]]

print(f"Predicted churn status for this customer: {predicted_churn_label}")
print(f"Probability of 'No Churn': {predicted_churn_proba[0][0]:.2f}, Probability of 'Churn': {predicted_churn_proba[0][1]:.2f}")


New customer data for prediction:
  gender SeniorCitizen Partner Dependents  tenure PhoneService MultipleLines  \
0   Male             0     Yes         No      24          Yes            No   

  InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport  \
0     Fiber optic             No          Yes              Yes         Yes   

  StreamingTV StreamingMovies  Contract PaperlessBilling     PaymentMethod  \
0         Yes             Yes  One year              Yes  Electronic check   

   MonthlyCharges  TotalCharges  
0            70.0        1680.0  
Predicted churn status for this customer: No Churn
Probability of 'No Churn': 0.73, Probability of 'Churn': 0.27
