<a href="https://colab.research.google.com/github/pauljungdev/telco-churn-prediction/blob/main/c964_Capstone_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

## DATA ##

# The Kaggle churn CSV is hosted on GitHub so the notebook can fetch it by URL.
csv_url = 'https://raw.githubusercontent.com/thejunglife/churn-data/refs/heads/main/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Read the remote CSV straight into a DataFrame.
df = pd.read_csv(filepath_or_buffer=csv_url)

# Discard any row that already contains a missing value.
df = df.dropna()

# customerID only identifies the row, so it is not used as a model feature.
df = df.drop(columns=['customerID'])

# Encode the Yes/No columns as 1/0, one column at a time.
yes_no_codes = {'Yes': 1, 'No': 0}
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for binary_col in binary_cols:
    df[binary_col] = df[binary_col].map(yes_no_codes)

# Encode gender as 1 for 'Male' and 0 for every other value.
df['gender'] = (df['gender'] == 'Male').astype(int)

# One-hot encode the multi-valued categorical columns. drop_first=True omits
# the first category of each column, so that category is represented by all
# of the column's indicators being 0.
multi_category_cols = [
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod',
]
df = pd.get_dummies(df, columns=multi_category_cols, drop_first=True)

# Turn every boolean (True/False) column into 0/1 integers.
df = df.astype({name: int for name, kind in df.dtypes.items() if kind == 'bool'})

# TotalCharges is read as text: parse it as a number, turning unparseable
# entries into NaN, then replace those NaN values with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)


## Training the Model ##

# Separate the Churn target from the feature columns.
y = df['Churn']
X = df.drop(columns=['Churn'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 200-tree random forest on the training rows (fit returns the estimator).
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score the predictions against the true labels.
y_pred = rf.predict(X_test)

accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

## Visualizations ##

# Histogram of Monthly Charges: 30 bins with a KDE curve drawn over the bars.
hist_fig, hist_ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['MonthlyCharges'], bins=30, kde=True, color='blue', ax=hist_ax)
hist_ax.set_title('Monthly Charges Distribution')
hist_ax.set_xlabel('Monthly Charges')
hist_ax.set_ylabel('Number of Customers')
plt.show()

# Bar Chart of Attributes Affecting Churn
# Pair each importance score from the fitted forest with its training column name.
feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
plt.figure(figsize=(8,5))
# Draw only the ten highest-scoring features as horizontal bars.
feature_importances.nlargest(10).plot(kind='barh', color='green')
plt.title('Top 10 Attributes Affecting Churn')
# Axis label spelling corrected ("Prediction").
plt.xlabel('Importance for Churn Prediction')
plt.ylabel('Customer Attributes')
plt.show()

# Confusion Matrix for Churn Status: counts of actual vs. predicted labels on the test set.
churn_confusion = confusion_matrix(y_test, y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    churn_confusion,
    annot=True,   # write the count inside each cell
    fmt='d',      # counts are shown as integers
    cmap='Blues',
    cbar=False,
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Churn Status')
cm_ax.set_ylabel('Actual Churn Status')
plt.show()

# Classification Report: text summary of the same test-set predictions.
print(classification_report(y_test, y_pred))

## Save the Model to use for UI ##

# Pickle the fitted forest to churn_model.pkl so it can be loaded again from disk.
with open('churn_model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [None]:
# Write the file to Google Colab to run with streamlit command
%%writefile app.py
import streamlit as st
import pandas as pd
import pickle

# Load the trained model
with open('churn_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Feature names from the model
expected_columns = model.feature_names_in_.tolist()

## Web UI ##

st.title('Customer Churn Prediction')

st.sidebar.header('Customer Details')

# Customer Inputs


def _sidebar_choice(label, options):
    """Show a sidebar radio group that starts with no option selected."""
    return st.sidebar.radio(label, options, index=None)


yes_no_options = ['Yes', 'No']
internet_addon_options = ['No', 'Yes', 'No internet service']

# Widgets are created in the order they should appear in the sidebar.
gender = _sidebar_choice('Gender', ['Male', 'Female'])
SeniorCitizen = _sidebar_choice('Senior Citizen (0 = No | 1 = Yes)', [0, 1])
Partner = _sidebar_choice('Partner', yes_no_options)
Dependents = _sidebar_choice('Dependents', yes_no_options)
tenure = st.sidebar.number_input('Tenure (months)', min_value=0, max_value=100)
PhoneService = _sidebar_choice('Phone Service', yes_no_options)
MultipleLines = _sidebar_choice('Multiple Lines', ['No', 'Yes', 'No phone service'])
InternetService = _sidebar_choice('Internet Service', ['DSL', 'Fiber optic', 'No'])
OnlineSecurity = _sidebar_choice('Online Security', internet_addon_options)
OnlineBackup = _sidebar_choice('Online Backup', internet_addon_options)
DeviceProtection = _sidebar_choice('Device Protection', internet_addon_options)
TechSupport = _sidebar_choice('Tech Support', internet_addon_options)
StreamingTV = _sidebar_choice('Streaming TV', internet_addon_options)
StreamingMovies = _sidebar_choice('Streaming Movies', internet_addon_options)
Contract = _sidebar_choice('Contract', ['Month-to-month', 'One year', 'Two year'])
PaperlessBilling = _sidebar_choice('Paperless Billing', yes_no_options)
PaymentMethod = st.sidebar.selectbox(
    'Payment Method',
    ['Electronic check', 'Mailed check', 'Bank transfer (automatic)', 'Credit card (automatic)'],
    index=None,
)
MonthlyCharges = st.sidebar.number_input('Monthly Charges', min_value=0.0)
TotalCharges = st.sidebar.number_input('Total Charges', min_value=0.0)

# Stop the script with a warning until every input has a value (none is None).
collected_inputs = [
    gender, SeniorCitizen, Partner, Dependents, tenure, PhoneService,
    MultipleLines, InternetService, OnlineSecurity, OnlineBackup,
    DeviceProtection, TechSupport, StreamingTV, StreamingMovies,
    Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges,
]
if any(value is None for value in collected_inputs):
    st.sidebar.warning("Please fill in all the fields.")
    st.stop()

# Categorical inputs to numerical values
binary_map = {'Yes': 1, 'No': 0}
gender = 1 if gender == 'Male' else 0
Partner, Dependents, PhoneService, PaperlessBilling = map(
    binary_map.get, [Partner, Dependents, PhoneService, PaperlessBilling]
)

# Input DataFrame: a single row with every feature that is passed to the model
# as a plain number. PaperlessBilling must be part of this row; if it is left
# out, the alignment step below fills it with 0 and the user's selection never
# reaches the model.
input_data = pd.DataFrame([{
    'gender': gender,
    'SeniorCitizen': SeniorCitizen,
    'Partner': Partner,
    'Dependents': Dependents,
    'tenure': tenure,
    'PhoneService': PhoneService,
    'PaperlessBilling': PaperlessBilling,
    'MonthlyCharges': MonthlyCharges,
    'TotalCharges': TotalCharges,
}])

# One-hot encode categorical features
categorical_features = {
    'MultipleLines': MultipleLines,
    'InternetService': InternetService,
    'OnlineSecurity': OnlineSecurity,
    'OnlineBackup': OnlineBackup,
    'DeviceProtection': DeviceProtection,
    'TechSupport': TechSupport,
    'StreamingTV': StreamingTV,
    'StreamingMovies': StreamingMovies,
    'Contract': Contract,
    'PaymentMethod': PaymentMethod
}

# With a single row, get_dummies produces exactly one "<feature>_<value>"
# indicator per feature (the selected value), encoded here as the integer 1.
categorical_df = pd.get_dummies(
    pd.DataFrame([categorical_features]), prefix_sep='_', dtype=int
)

# Merge numerical and categorical data
final_input = pd.concat([input_data, categorical_df], axis=1)

# Only one-hot indicator columns may legitimately be absent (they stand for
# categories the user did not select). Any other missing model feature means
# this form does not collect it, so stop instead of silently predicting with 0.
dummy_prefixes = tuple(f'{name}_' for name in categorical_features)
uncollected = [
    col for col in expected_columns
    if col not in final_input.columns and not col.startswith(dummy_prefixes)
]
if uncollected:
    st.error(f"The model expects inputs this form does not collect: {', '.join(uncollected)}")
    st.stop()

# Align with the trained model's columns: put them in the model's order, add
# absent indicator columns as 0, and drop indicators the model does not use.
final_input = final_input.reindex(columns=expected_columns, fill_value=0)

# Prediction Button: the model is only queried on the run where the button was clicked.
if st.sidebar.button('Predict Churn'):
    # Predicted label for the single input row, and the probability taken from
    # the second class column of predict_proba.
    predicted_class = model.predict(final_input)[0]
    churn_probability = model.predict_proba(final_input)[0][1]

    outcome = "Likely to Churn" if predicted_class == 1 else "Unlikely to Churn"
    st.write(f'Prediction: {outcome}')
    st.write(f'Probability of Churn: {churn_probability:.2f}')

In [None]:
# Install Streamlit into the notebook runtime; -q keeps pip's output quiet.
!pip install streamlit -q

In [None]:
# Quietly fetch ipv4.icanhazip.com and print the response to stdout
# (expected to be this runtime's public IPv4 address).
# NOTE(review): presumably this is the value localtunnel asks for before
# showing the tunnelled page - confirm.
!wget -q -O - ipv4.icanhazip.com

In [None]:
# Start the Streamlit app in the background (&), then open a localtunnel to port 8501.
# NOTE(review): 8501 is assumed to be the port Streamlit serves app.py on - confirm if the port is configured differently.
!streamlit run app.py & npx localtunnel --port 8501