<a href="https://colab.research.google.com/github/notArealdevv/birajpoudel/blob/main/Customer_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Customer Churn Prediction Model
# This script replicates the project described in the resume, demonstrating how to build a
# classification model to predict customer churn for a telecom company.

#
# Technical Stack: Python, Pandas, Scikit-learn
#

# --- 1. Import Necessary Libraries ---
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
# To handle imbalanced classes, we can use a library like imbalanced-learn
# You may need to install it: pip install -U imbalanced-learn
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import resample

# --- 2. Load and Prepare Sample Data ---
# A production version would read this table from a CSV file or a database.
# Here a tiny, representative extract of a telecom dataset is embedded so the
# script is self-contained. Each row carries customer demographics, account
# details, and the churn label.

csv_data = """customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
6713-OKOMC,Female,0,No,No,10,Yes,No,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8779-SDGTV,Male,0,No,No,2,Yes,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,50.8,102.45,Yes
"""

df = pd.read_csv(StringIO(csv_data))

# Data cleaning: coerce TotalCharges to a numeric dtype (entries that cannot be
# parsed become NaN), then discard every row that contains a missing value.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.assign(TotalCharges=total_charges).dropna()

# Features (X): every column except the identifier and the label.
# Target (y): 1 when the customer churned ('Yes'), otherwise 0.
X = df.drop(columns=['customerID', 'Churn'])
y = df['Churn'].map(lambda label: 1 if label == 'Yes' else 0)

# --- 3. Feature Engineering and Preprocessing ---
# Numeric and non-numeric columns need different treatment:
# - numeric columns go through StandardScaler;
# - all remaining columns go through OneHotEncoder.

# Partition the feature columns by dtype
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(exclude=np.number).columns)

# One (name, transformer, columns) entry per column group.
# handle_unknown='ignore' lets the encoder accept categories at transform time
# that were not present when it was fitted.
column_transformers = [
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# --- 4. Handling Unbalanced Classes ---
# Churn datasets are often imbalanced. RandomOverSampler balances the classes by
# randomly duplicating existing minority-class rows (it does not synthesize new
# samples the way SMOTE does).
#
# The hold-out split is made BEFORE oversampling. Resampling first would let
# copies of the same customer land in both the training and the test set, which
# leaks test information into training and inflates the reported scores.
# stratify=y keeps the churn ratio the same in both partitions.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the training partition only; the test set keeps the real class mix.
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# NOTE: duplicated rows can still straddle cross-validation folds during
# hyperparameter tuning; placing the sampler inside an imblearn Pipeline would
# remove that remaining optimism from the CV scores.



# --- 5. Model Training and Hyperparameter Tuning ---
# As per the resume, we will build and compare Logistic Regression and Random Forest models.
# We'll use GridSearchCV to find the best hyperparameters.

# --- Model 1: Logistic Regression ---
print("--- Training Logistic Regression Model ---")
# Full pipeline: preprocessing followed by the classifier, so the scaler and
# encoder are re-fitted on each cross-validation training fold.
# liblinear is used because it supports both the 'l1' and 'l2' penalties searched below.
lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', LogisticRegression(solver='liblinear', random_state=42))])

# Hyperparameters to search: regularization strength and penalty type
lr_param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2']
}

# An integer cv with a classifier means stratified k-fold, and scikit-learn
# rejects the split when no class has at least n_splits members. Cap the fold
# count by the smallest class so the tiny embedded sample still runs; on
# realistically sized data this stays at 5.
lr_smallest_class = int(np.unique(y_train, return_counts=True)[1].min())
lr_cv_folds = max(2, min(5, lr_smallest_class))

# Use GridSearchCV for hyperparameter tuning
lr_grid_search = GridSearchCV(lr_pipeline, lr_param_grid, cv=lr_cv_folds, scoring='accuracy', n_jobs=-1)
lr_grid_search.fit(X_train, y_train)

# Best pipeline found by the search (refitted on the full training set)
best_lr = lr_grid_search.best_estimator_
print(f"Best Logistic Regression Parameters: {lr_grid_search.best_params_}")


# --- Model 2: Random Forest ---
print("\n--- Training Random Forest Model ---")
# Full pipeline: preprocessing followed by the classifier
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', RandomForestClassifier(random_state=42))])

# Hyperparameters to search: forest size, tree depth, and minimum leaf size
rf_param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# An integer cv with a classifier means stratified k-fold, and scikit-learn
# rejects the split when no class has at least n_splits members. Cap the fold
# count by the smallest class so the tiny embedded sample still runs; on
# realistically sized data this stays at 5.
rf_smallest_class = int(np.unique(y_train, return_counts=True)[1].min())
rf_cv_folds = max(2, min(5, rf_smallest_class))

# Use GridSearchCV for hyperparameter tuning
rf_grid_search = GridSearchCV(rf_pipeline, rf_param_grid, cv=rf_cv_folds, scoring='accuracy', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

# Best pipeline found by the search (refitted on the full training set)
best_rf = rf_grid_search.best_estimator_
print(f"Best Random Forest Parameters: {rf_grid_search.best_params_}")


# --- 6. Model Evaluation ---
# Score both tuned pipelines on the held-out test split and print a per-class
# report plus overall accuracy for each.
print("\n--- Evaluating Models on Test Data ---")

# Logistic Regression on the test split
lr_predictions = best_lr.predict(X_test)
print("\nLogistic Regression Classification Report:")
lr_report = classification_report(y_test, lr_predictions)
print(lr_report)
lr_accuracy = accuracy_score(y_test, lr_predictions)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")

# Random Forest on the test split
rf_predictions = best_rf.predict(X_test)
print("\nRandom Forest Classification Report:")
rf_report = classification_report(y_test, rf_predictions)
print(rf_report)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

# Closing summary, printed one line at a time. The text is static and is not
# derived from the scores measured above.
conclusion_lines = (
    "\n--- Project Conclusion ---",
    "Both models were trained and evaluated successfully.",
    "The Random Forest Classifier generally provides better performance due to its ability to capture non-linear relationships.",
    "The process demonstrated data preprocessing, handling class imbalance, hyperparameter tuning, and model evaluation.",
)
for conclusion_line in conclusion_lines:
    print(conclusion_line)