<a href="https://colab.research.google.com/github/maniappa/DDG-Assignment1/blob/main/Python_Code_for_Regression_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import r2_score, mean_squared_error

# --- Load and Clean Data ---
# Load the dataset from the CSV file.
# If the file is missing, report it and stop: every later step depends on `df`.
try:
    df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
except FileNotFoundError:
    print("Error: The dataset file 'WA_Fn-UseC_-Telco-Customer-Churn.csv' was not found.")
    # Exit if the file doesn't exist, as the rest of the script depends on it.
    exit()

# --- Initial Data Cleaning for 'TotalCharges' ---
# The 'TotalCharges' column may contain non-numeric entries (e.g. blank strings).
# 1. Convert the column to a numeric type. `errors='coerce'` turns any value that
#    cannot be parsed as a number into NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# 2. Replace the resulting NaN values with 0 (presumably these rows are brand-new
#    customers who have not been billed yet -- verify against the data).
#    The result is assigned back to the column on purpose:
#    `df['TotalCharges'].fillna(0, inplace=True)` acts on an intermediate object,
#    triggers a pandas FutureWarning, and stops updating `df` in pandas 3.0.
df['TotalCharges'] = df['TotalCharges'].fillna(0)

# Drop the customerID column as it is just an identifier and has no predictive value.
df = df.drop('customerID', axis=1)


# ==============================================================================
# PART 1: PREDICTING CUSTOMER CHURN (LOGISTIC REGRESSION)
# ==============================================================================
print("--- Part 1: Predicting Customer Churn with Logistic Regression ---")

# --- 1. Data Preparation for Churn Prediction ---

# Features are every column except the label. The label is encoded as
# 1 where 'Churn' equals 'Yes' and 0 for any other value.
X_churn = df.drop(columns='Churn')
y_churn = df['Churn'].eq('Yes').astype('int64')

# Column names grouped by dtype: numeric columns get scaled, object columns get one-hot encoded.
numerical_features_churn = X_churn.select_dtypes(include=np.number).columns.to_list()
categorical_features_churn = X_churn.select_dtypes(include=['object']).columns.to_list()

# Preprocessing step shared by fit and predict, so train and test data go
# through exactly the same transformations.
churn_transformers = [
    # 'num': standardize the numeric columns.
    ('num', StandardScaler(), numerical_features_churn),
    # 'cat': one-hot encode the object columns; categories that were not seen
    # during fit are ignored instead of raising an error.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_churn),
]
preprocessor_churn = ColumnTransformer(transformers=churn_transformers)

# 80/20 train/test split, stratified on the label so both parts keep the
# same churn proportion.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_churn,
    y_churn,
    test_size=0.2,
    random_state=42,
    stratify=y_churn,
)

# --- 2. Build and Train the Logistic Regression Model ---

# Preprocessing and classifier chained into a single estimator.
log_reg_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor_churn),
        ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
    ]
)

# Fit on the training portion only.
log_reg_pipeline.fit(X_train_c, y_train_c)

# --- 3. Evaluate the Churn Model ---
# Hard class predictions plus the predicted probability of class 1 (needed for ROC AUC).
y_pred_c = log_reg_pipeline.predict(X_test_c)
y_pred_proba_c = log_reg_pipeline.predict_proba(X_test_c)[:, 1]

# Report the label-based metrics in a fixed order, then ROC AUC and the confusion matrix.
print("\nLogistic Regression Model Evaluation:")
for label, scorer in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
):
    print(f"{label}: {scorer(y_test_c, y_pred_c):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test_c, y_pred_proba_c):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_c, y_pred_c))


# ==============================================================================
# PART 2: PREDICTING TOTAL CHARGES (LINEAR REGRESSION)
# ==============================================================================
print("\n\n--- Part 2: Predicting Total Charges with Linear Regression ---")

# --- 1. Data Preparation for Total Charges Prediction ---

# 'TotalCharges' is the regression target; both it and 'Churn' are left out
# of the input features.
y_charges = df['TotalCharges']
X_charges = df.drop(columns=['TotalCharges', 'Churn'])

# Column names grouped by dtype for this feature set.
numerical_features_charges = X_charges.select_dtypes(include=np.number).columns.to_list()
categorical_features_charges = X_charges.select_dtypes(include=['object']).columns.to_list()

# Preprocessor for the regression task: scale numeric columns, one-hot encode object columns.
charges_transformers = [
    ('num', StandardScaler(), numerical_features_charges),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_charges),
]
preprocessor_charges = ColumnTransformer(transformers=charges_transformers)

# 80/20 train/test split (no stratification for a continuous target).
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_charges,
    y_charges,
    test_size=0.2,
    random_state=42,
)

# --- 2. Build and Train the Linear Regression Model ---

# Preprocessing and regressor chained into a single estimator.
lin_reg_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor_charges),
        ('regressor', LinearRegression()),
    ]
)

# Fit on the training portion only.
lin_reg_pipeline.fit(X_train_r, y_train_r)

# --- 3. Evaluate the Total Charges Model ---
# Predictions for the held-out rows.
y_pred_r = lin_reg_pipeline.predict(X_test_r)

# R², MSE, and RMSE (the square root of MSE, so it is in the target's own units).
mse = mean_squared_error(y_test_r, y_pred_r)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_r, y_pred_r)

print("\nLinear Regression Model Evaluation:")
print(f"R-squared (R²): {r2:.4f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): ${rmse:.2f}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(0, inplace=True)


--- Part 1: Predicting Customer Churn with Logistic Regression ---

Logistic Regression Model Evaluation:
Accuracy: 0.8055
Precision: 0.6572
Recall: 0.5588
F1-Score: 0.6040
ROC AUC Score: 0.8421

Confusion Matrix:
[[926 109]
 [165 209]]


--- Part 2: Predicting Total Charges with Linear Regression ---

Linear Regression Model Evaluation:
R-squared (R²): 0.9050
Mean Squared Error (MSE): 494452.12
Root Mean Squared Error (RMSE): $703.17


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# --- Load and Clean Data ---
# Load the dataset; if the file is missing, report it and stop, because every
# later step in this cell depends on `df`.
try:
    df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
except FileNotFoundError:
    print("Error: The dataset file 'WA_Fn-UseC_-Telco-Customer-Churn.csv' was not found.")
    exit()

# Clean 'TotalCharges' column:
# coerce unparseable entries to NaN, then replace those NaN values with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# The filled column is assigned back on purpose:
# `df['TotalCharges'].fillna(0, inplace=True)` acts on an intermediate object,
# triggers a pandas FutureWarning, and stops updating `df` in pandas 3.0.
df['TotalCharges'] = df['TotalCharges'].fillna(0)
# 'customerID' is dropped so it is not used as a feature.
df = df.drop('customerID', axis=1)

# --- 1. Data Preparation for Churn Prediction ---
# Same feature/label setup as the baseline: every column except 'Churn' is a
# feature, and the label is 1 where 'Churn' equals 'Yes', otherwise 0.
X = df.drop(columns='Churn')
y = df['Churn'].eq('Yes').astype('int64')

# Column names grouped by dtype.
numerical_features = X.select_dtypes(include=np.number).columns.to_list()
categorical_features = X.select_dtypes(include=['object']).columns.to_list()

# Scale numeric columns and one-hot encode object columns (unseen categories are ignored).
feature_transformers = [
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=feature_transformers)

# 80/20 split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- 2. Define the Model Pipeline ---
# The estimator handed to the grid search: preprocessing followed by the classifier.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
    ]
)

# --- 3. Define the Hyperparameter Grid for Tuning ---
# The 'classifier__' prefix routes each parameter to the pipeline's 'classifier' step.
param_grid = {
    # Regularization type.
    'classifier__penalty': ['l1', 'l2'],
    # Inverse of regularization strength.
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    # 'liblinear' supports both the l1 and l2 penalties.
    'classifier__solver': ['liblinear'],
}

# --- 4. Perform Grid Search with Cross-Validation ---
# Candidates are ranked by F1 under 5-fold cross-validation; n_jobs=-1 runs
# the fits on all available cores and verbose=1 prints progress.
print("Starting hyperparameter tuning with GridSearchCV...")
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)

# Evaluate every parameter combination on the training data.
grid_search.fit(X_train, y_train)

# --- 5. Display Tuning Results ---
print("\nHyperparameter tuning complete.")
print(f"Best F1-score found during tuning: {grid_search.best_score_:.4f}")
print("Best parameters found:")
print(grid_search.best_params_)

# --- 6. Evaluate the Tuned Model vs. Baseline ---
# The winning pipeline from the grid search.
best_model = grid_search.best_estimator_

# Test-set predictions from the tuned model.
y_pred_tuned = best_model.predict(X_test)

# Baseline for comparison: same preprocessing, LogisticRegression with the
# same settings as the untuned pipeline.
baseline_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
    ]
)
baseline_model.fit(X_train, y_train)
y_pred_baseline = baseline_model.predict(X_test)

# --- 7. Final Comparison ---
print("\n--- Model Performance Comparison ---")

# Metric name -> scoring function; insertion order fixes the row order of the table.
metric_functions = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}

baseline_metrics = {
    metric_name: score_fn(y_test, y_pred_baseline)
    for metric_name, score_fn in metric_functions.items()
}

tuned_metrics = {
    metric_name: score_fn(y_test, y_pred_tuned)
    for metric_name, score_fn in metric_functions.items()
}

# One column per model, one row per metric.
comparison_df = pd.DataFrame(
    {
        'Baseline Model': baseline_metrics,
        'Tuned Model': tuned_metrics,
    }
)

print(comparison_df)

print("\n--- Detailed Report for Tuned Model ---")
print(classification_report(y_test, y_pred_tuned))


Starting hyperparameter tuning with GridSearchCV...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(0, inplace=True)



Hyperparameter tuning complete.
Best F1-score found during tuning: 0.5994
Best parameters found:
{'classifier__C': 100, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}

--- Model Performance Comparison ---
           Baseline Model  Tuned Model
Accuracy         0.805536     0.801278
Precision        0.657233     0.647799
Recall           0.558824     0.550802
F1-Score         0.604046     0.595376

--- Detailed Report for Tuned Model ---
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.65      0.55      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.80      1409

