In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
# pandas reads the CSV into a DataFrame. The path is relative, so 'ecommerce.csv'
# must be in the notebook's working directory.
df = pd.read_csv('ecommerce.csv')

# Inspect the data
print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called on its own -- wrapping it in print() would append a stray "None" line.
df.info()

print("\nFirst 5 rows:")
display(df.head())

In [None]:
# 1. Target Variable Analysis
# A bar chart of the class counts shows whether 'Churn' is balanced or imbalanced.
# NOTE(review): recent seaborn releases warn when `palette` is passed without `hue` --
# confirm against the seaborn version this notebook is run with.
target_fig, target_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Churn', palette='viridis', ax=target_ax)
target_ax.set_title('Target Distribution: Churn (1) vs No Churn (0)')
plt.show()

# 2. Numerical Feature Analysis
# Summary statistics (count, mean, spread, quartiles) for the numerical columns
numeric_summary = df.describe()
display(numeric_summary)

# 3. Correlation Heatmap
# Pairwise correlations between numeric columns only (text columns are left out)
corr_matrix = df.corr(numeric_only=True)
corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Feature Correlation')
plt.show()

In [None]:
# --- STEP 3.0a: INSPECTION BEFORE CLEANING ---
# Listing each column's distinct values exposes inconsistent labels
# (e.g., "Mobile" vs "Mobile Phone") before any cleaning is attempted.
print("--- Unique Value Inspection ---\n")

separator = "-" * 40

for column_name in df.columns:
    column = df[column_name]
    distinct_count = column.nunique()
    distinct_values = column.unique()

    print(f"Feature: {column_name}")
    print(f"Count of unique values: {distinct_count}")

    # Columns with many distinct values (like CashbackAmount) are truncated to the
    # first 10 so the output stays readable; short lists are printed in full.
    if distinct_count <= 20:
        print(f"Values: {distinct_values}")
    else:
        print(f"Values (First 10 sample): {distinct_values[:10]} ...")

    print(separator)

print("\n" + "=" * 50 + "\n")

# --- STEP 3.0b: OUTLIER DETECTION ---
# For every numeric feature, count the values that fall outside the IQR fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] and report the features that have any.

print("--- Outlier Detection (IQR Method) ---\n")

# Select every numeric dtype rather than only float64/int64, so columns stored as
# e.g. int32 or float32 are not silently skipped.
# The row identifier and the target are left out: the IQR rule says nothing useful
# about an ID, and a binary target has IQR == 0 whenever one class holds fewer than
# 25% of the rows, which would report that entire class as "outliers".
numerical_cols = (
    df.select_dtypes(include='number')
      .columns
      .drop(['CustomerID', 'Churn'], errors='ignore')  # tolerate either column being absent
)

for col in numerical_cols:
    # 1. Calculate Q1 (25th %), Q3 (75th %), and IQR
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    # 2. Define Bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # 3. Count Outliers (missing values compare False on both sides, so they are not counted)
    outliers_count = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()

    # Only print if outliers exist
    if outliers_count > 0:
        print(f"Feature '{col}': {outliers_count} outliers ({outliers_count/len(df):.2%})")

print("\n" + "="*50 + "\n")

In [None]:
# --- STEP 3.1: DATA CLEANING ---

# Work on a copy so the originally loaded `df` stays untouched and this cell can be
# re-run from a clean starting point.
df_clean = df.copy()

# Drop the ID column (it's unique for every user and has no predictive power).
# errors='ignore' keeps the cell from failing when the column is not present.
df_clean = df_clean.drop(columns=['CustomerID'], errors='ignore')

# Harmonise labels that name the same category in different ways
df_clean['PreferredLoginDevice'] = df_clean['PreferredLoginDevice'].replace('Mobile Phone', 'Phone')
df_clean['PreferedOrderCat'] = df_clean['PreferedOrderCat'].replace(
    {'Mobile Phone': 'Phone', 'Mobile': 'Phone'}
)

# Handle Missing Values
# Strategy: fill numerical gaps with the median (robust to outliers).
# Columns are chosen with select_dtypes instead of `dtype != 'object'`: that test
# also matches non-numeric dtypes (e.g. category or a dedicated string dtype), and
# calling .median() on those raises.
# The target is excluded so a missing label is never replaced by an invented one.
numeric_feature_cols = (
    df_clean.select_dtypes(include='number')
            .columns
            .drop('Churn', errors='ignore')
)
for col in numeric_feature_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# NOTE(review): the medians come from the full dataset, i.e. before the train/test
# split, so test rows influence the imputed values. Imputing with statistics from
# the training split only would avoid that.

# Non-numeric columns (and the target) are not imputed; any gaps left in them are
# included in this count.
print("Missing values after cleaning:", df_clean.isnull().sum().sum())


# --- STEP 3.2: ENCODING CATEGORICAL VARIABLES ---

# The models need numeric inputs, so every text column is expanded into 0/1
# indicator columns (one-hot encoding). drop_first=True omits the first level of
# each category; dtype=int stores the indicators as integers rather than booleans.
one_hot_options = {'drop_first': True, 'dtype': int}
df_final = pd.get_dummies(df_clean, **one_hot_options)

encoded_shape = df_final.shape
print("Final Data Shape:", encoded_shape)
display(df_final.head())

In [None]:
# Exercise cell: hold out a test set, then standardise the features.
# Every `___` below is a blank to complete; the cell will not run until all are filled in.

# Import the specific tools for this step
from sklearn.model_selection import ____________
from sklearn.preprocessing import StandardScaler

# 1. Separate Features (X) and Target (y)
# TODO: Drop the target column 'Churn' from X
X = df_final.drop(___, axis=1)
y = df_final['Churn']

# 2. Split into Training (80%) and Testing (20%)
# TODO: Complete the function to split the data
# random_state=42 ensures we get the same split every time
# NOTE(review): no stratification is requested, so the Churn ratio may differ
# between the training and test parts -- consider whether that matters here.
X_train, X_test, y_train, y_test = ___(X, y, test_size=0.2, random_state=42)

# 3. Scale the Data
# Models like KNN and Logistic Regression struggle if values aren't on the same scale
scaler = StandardScaler()

# TODO: Fit the scaler on training data ONLY and transform it
# Hint: Use .fit_transform()
# (Fitting on the training rows only keeps test-set statistics out of the scaler.)
X_train_scaled = scaler.___(X_train)

# TODO: Transform the test data using the parameters learned from training
# Hint: Use .transform() ONLY (Do NOT fit again!)
X_test_scaled = scaler.___(X_test)

print("Data successfully split and scaled.")

In [None]:
# Exercise cell: fit five baseline classifiers with fixed hyperparameters.
# All five are trained on the same scaled training matrix (X_train_scaled, y_train).
# Every `___` is a blank to complete with the class named in the TODO above it.

# --- TODO: IMPORT MODELS ---
# Import LogisticRegression, KNeighborsClassifier, DecisionTreeClassifier,
# RandomForestClassifier, and SVC from sklearn
from sklearn.linear_model import ___
from sklearn.neighbors import ___
from sklearn.tree import ___
from sklearn.ensemble import ___
from sklearn.svm import ___

print("--- Training Baseline Models ---")

# 1. Logistic Regression
# TODO: Initialize with C=0.01 and random_state=42
clf_log = ___(C=0.01, random_state=42)
# TODO: Fit on training data
clf_log.fit(X_train_scaled, y_train)
print("--> Logistic Regression trained.")


# 2. K-Nearest Neighbors (KNN)
# TODO: Initialize with n_neighbors=25
clf_knn = ___(n_neighbors=25)
clf_knn.fit(X_train_scaled, y_train)
print("--> KNN trained.")


# 3. Decision Tree
# TODO: Initialize with max_depth=4 and random_state=42
clf_tree = ___(max_depth=4, random_state=42)
clf_tree.fit(X_train_scaled, y_train)
print("--> Decision Tree trained.")


# 4. Random Forest
# TODO: Initialize with n_estimators=10, max_depth=5, and random_state=42
clf_rf = ___(n_estimators=10, max_depth=5, random_state=42)
clf_rf.fit(X_train_scaled, y_train)
print("--> Random Forest trained.")


# 5. Support Vector Machine (SVM)
# TODO: Initialize with kernel='sigmoid', C=0.1, and random_state=42
clf_svm = ___(kernel='sigmoid', C=0.1, random_state=42)
clf_svm.fit(X_train_scaled, y_train)
print("--> SVM trained.")

In [None]:
# Exercise cell: score each baseline model on the held-out test set and show one
# table of Accuracy / Precision / Recall / F1, sorted by F1.
# Every `___` is a blank to complete.

# --- TODO: IMPORT METRICS ---
# Import accuracy, precision, recall, and f1
from sklearn.metrics import ___, ___, ___, ___
# (pandas is already imported as `pd` in the first cell; this re-import is harmless.)
import pandas as pd

print("--- Baseline Evaluation Metrics ---\n")

# 1. Create a dictionary of our trained models
# (We use the models you just trained above)
# Keys are the display names used in the "Model" column of the results table.
trained_models = {
    "Logistic Regression": ___,
    "KNN": ___,
    "Decision Tree": ___,
    "Random Forest": ___,
    "SVM": ___
}

# 2. Initialize a list to store results
# One dict per model is collected and turned into a DataFrame in a single step below.
results_list = []

# 3. Calculate metrics for each model
for name, model in trained_models.items():
    # TODO: Generate predictions on the scaled TEST set
    y_pred = model.___(X_test_scaled)
    
    # TODO: Calculate Scores (Compare y_test with y_pred)
    # Argument order is (true labels, predicted labels) for every metric.
    acc = ___(y_test, y_pred)
    # zero_division=0 reports precision as 0 when a model predicts no 'Churn' at all
    prec = ___(y_test, y_pred, zero_division=0)
    rec = ___(y_test, y_pred)
    f1 = ___(y_test, y_pred)
    
    # Append to list
    results_list.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-Score": f1
    })

# 4. Convert to DataFrame
results_df = pd.DataFrame(results_list)

# 5. Display sorted by F1-Score (best model first)
display(results_df.sort_values(by='F1-Score', ascending=False))

print("\nDescription of Metrics:")
print("- Accuracy:  Overall correctness")
print("- Precision: When it predicts 'Churn', how often is it right?")
print("- Recall:    Out of all actual 'Churners', how many did we catch?")
print("- F1-Score:  The harmonic mean of Precision and Recall (Balance).")

In [None]:
# Exercise cell: 5-fold cross-validation of each baseline on the training data, to
# see how much its accuracy varies from fold to fold.
# Every `___` is a blank to complete.

# --- TODO: IMPORT CROSS-VALIDATION TOOL ---
from sklearn.model_selection import ___

print("--- Cross-Validation Stability Check ---\n")

# Group the models you trained in Step 5 into a dictionary
models = {
    "Logistic Regression": clf_log,
    "KNN": clf_knn,
    "Decision Tree": clf_tree,
    "Random Forest": clf_rf,
    "SVM": clf_svm
}

# Loop through the models to test each one
for name, model in models.items():
    # TODO: Run Cross-Validation
    # - model: The model object
    # - X, y: The TRAINING data (X_train_scaled, y_train)
    # - cv: Number of folds (use 5)
    # - scoring: The metric we want (use 'accuracy')
    # NOTE(review): X_train_scaled was scaled using all training rows, so each
    # fold's validation part has already influenced the scaler -- a mild leak;
    # wrapping scaler + model in a Pipeline would remove it.
    scores = ___(model, X_train_scaled, y_train, cv=5, scoring='___')
    
    # We print the Mean score and the Standard Deviation (Stability)
    # `scores` holds one value per fold; a small std means the folds agree.
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")

In [None]:
# Exercise cell (part 1 of 5): grid-search the Logistic Regression hyperparameters
# with 5-fold cross-validation on the training data, keeping the best refit model.
# Every `___` is a blank to complete.

# --- TODO: IMPORT GRID SEARCH ---
# The later searches in this cell call GridSearchCV by name, so that is the import needed here.
from sklearn.model_selection import ___

print("--- Starting Exhaustive Grid Search ---\n")
print("Note: This may take a few minutes because we are testing hundreds of combinations.\n")

# ==========================================
# 1. Tune Logistic Regression
# ==========================================
print("1. Tuning Logistic Regression...")

# TODO: Define the parameter grid
# Each key is a LogisticRegression constructor argument; each list holds the values to try.
lr_grid = {
    # Try C values: [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    'C': [___], 
    # Try solvers: ['liblinear', 'lbfgs']
    'solver': [___],
    # Single value: raises the iteration cap for every candidate rather than tuning it
    'max_iter': [1000]
}

# TODO: Initialize GridSearchCV
lr_search = ___(
    estimator=LogisticRegression(random_state=42),
    param_grid=___,
    cv=5, 
    n_jobs=-1, 
    scoring='accuracy'
)

# TODO: Fit on the Training data
lr_search.fit(X_train_scaled, y_train)

# TODO: Get the best model
# (the other searches in this cell read `.best_estimator_` for the same purpose)
best_log = lr_search.___

print(f"   Best LogReg Params: {lr_search.best_params_}")
print(f"   Best Accuracy: {lr_search.best_score_:.4f}\n")


# ==========================================
# 2. Tune K-Nearest Neighbors (KNN)
# ==========================================
print("2. Tuning KNN...")

knn_grid = {
    # TODO: Try odd neighbors from 3 to 21 (e.g. [3, 5, 7, ...])
    'n_neighbors': [___], 
    'weights': ['uniform', 'distance'],
    # 'minkowski' is intentionally not listed: with KNeighborsClassifier's default
    # p=2 it is the Euclidean distance, so including it re-fits a third of the grid
    # only to reproduce the 'euclidean' scores.
    'metric': ['euclidean', 'manhattan']
}

knn_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_grid,
    cv=5, 
    n_jobs=-1,
    # Stated explicitly so it matches the Logistic Regression search and the
    # "Best Accuracy" label printed below (accuracy is also the classifier default).
    scoring='accuracy'
)

knn_search.fit(X_train_scaled, y_train)
# best_estimator_ is the model refit on all of the training data with the winning parameters
best_knn = knn_search.best_estimator_
print(f"   Best KNN Params: {knn_search.best_params_}")
print(f"   Best Accuracy: {knn_search.best_score_:.4f}\n")


# ==========================================
# 3. Tune Decision Tree
# ==========================================
# Grid-searches split criterion, depth and minimum-sample limits with 5-fold CV.
# The `___` is a blank to complete.
print("3. Tuning Decision Tree...")

dt_grid = {
    'criterion': ['gini', 'entropy'],
    # TODO: Try max_depths: [None, 5, 10, 20, 30]
    'max_depth': [___],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10]
}

# No `scoring` is passed, so the estimator's default score is used
# (reported as "Best Accuracy" below).
dt_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=dt_grid,
    cv=5, 
    n_jobs=-1
)

dt_search.fit(X_train_scaled, y_train)
best_tree = dt_search.best_estimator_
print(f"   Best Tree Params: {dt_search.best_params_}")
print(f"   Best Accuracy: {dt_search.best_score_:.4f}\n")


# ==========================================
# 4. Tune Random Forest
# ==========================================
# Grid-searches forest size, depth, split threshold and bootstrapping with 3-fold CV.
# The `___` is a blank to complete.
print("4. Tuning Random Forest...")

rf_grid = {
    # TODO: Try n_estimators: [50, 100, 200]
    'n_estimators': [___],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False]
}

# No `scoring` is passed, so the estimator's default score is used
# (reported as "Best Accuracy" below).
rf_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_grid,
    cv=3,       # Reduced CV to 3 for RF to save time
    n_jobs=-1
)

rf_search.fit(X_train_scaled, y_train)
best_rf = rf_search.best_estimator_
print(f"   Best RF Params: {rf_search.best_params_}")
print(f"   Best Accuracy: {rf_search.best_score_:.4f}\n")


# ==========================================
# 5. Tune Support Vector Machine (SVM)
# ==========================================
# Grid-searches C, kernel and gamma with 3-fold CV. Every `___` is a blank to complete.
print("5. Tuning SVM (WARNING: This is slow)...")

svm_grid = {
    # TODO: Try C values: [0.1, 1, 10, 100]
    'C': [___],
    # TODO: Try kernels: ['linear', 'rbf', 'poly']
    'kernel': [___],
    # NOTE(review): gamma is a coefficient of the rbf/poly/sigmoid kernels, so for
    # 'linear' both gamma settings presumably fit the same model twice -- confirm
    # and consider a list-of-dicts grid if the run time matters.
    'gamma': ['scale', 'auto']
}

# NOTE(review): probability=True adds extra fitting cost to every candidate, and
# no predict_proba call is visible in the cells reviewed here -- confirm it is
# needed before keeping it.
svm_search = GridSearchCV(
    estimator=SVC(probability=True, random_state=42),
    param_grid=svm_grid,
    cv=3,       # Reduced CV to 3 for SVM
    n_jobs=-1
)

svm_search.fit(X_train_scaled, y_train)
best_svm = svm_search.best_estimator_
print(f"   Best SVM Params: {svm_search.best_params_}")
print(f"   Best Accuracy: {svm_search.best_score_:.4f}\n")

print("--- Tuning Complete! ---")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# --- STEP 9: Final Comparison ---

print("--- Re-establishing Baseline Models (Weak Params) ---")
# We re-train these quickly to ensure the variables exist in this cell
# (No need to change anything here, just run it to set up the comparison)
# The hyperparameters are the same ones the Step 5 baseline cell specifies, so the
# "Base" columns below describe the models already scored in the baseline
# evaluation. Using weaker settings here would overstate the gain from tuning.
clf_log = LogisticRegression(C=0.01, random_state=42).fit(X_train_scaled, y_train)
clf_knn = KNeighborsClassifier(n_neighbors=25).fit(X_train_scaled, y_train)
clf_tree = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_train_scaled, y_train)
clf_rf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=42).fit(X_train_scaled, y_train)
clf_svm = SVC(kernel='sigmoid', C=0.1, random_state=42).fit(X_train_scaled, y_train)
print("Baselines ready.\n")


print("--- Final Evaluation: Baseline vs. Tuned ---\n")

# 1. Define Helper Function to calculate all 4 metrics
def get_metrics(model, X, y):
    """Score a fitted classifier on one dataset.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True labels that the predictions are compared against.

    Returns:
        dict: The four scores under the keys 'Accuracy', 'Precision',
        'Recall' and 'F1'.
    """
    # TODO: Generate predictions
    y_pred = model.predict(X)
    
    # TODO: Return a dictionary with the 4 metrics
    # Each `___` is a blank for the matching metric function imported at the top
    # of this cell; arguments are (true labels, predicted labels).
    return {
        'Accuracy': ___(y, y_pred),
        'Precision': ___(y, y_pred, zero_division=0),
        'Recall': ___(y, y_pred),
        'F1': ___(y, y_pred)
    }

# 2. Calculate Metrics for All Models
# One row per algorithm, holding baseline and tuned scores side by side.
comparison_list = []

# Dictionary of pairs: (Baseline, Tuned)
# Keys are the display names used in the 'Model' column.
models_dict = {
    "Logistic Regression": (clf_log, best_log),
    "KNN":                 (clf_knn, best_knn),
    "Decision Tree":       (clf_tree, best_tree),
    "Random Forest":       (clf_rf, best_rf),
    "SVM":                 (clf_svm, best_svm)
}

for name, (base_model, tuned_model) in models_dict.items():
    # TODO: Get metrics for Baseline model using the TEST set
    # Hint: Use X_test_scaled and y_test
    base_metrics = get_metrics(base_model, ___, ___)
    
    # TODO: Get metrics for Tuned model using the TEST set
    # (same two arguments as above, so both models are scored on identical rows)
    tuned_metrics = get_metrics(tuned_model, ___, ___)
    
    # Add to list
    # Precision is computed by get_metrics but not carried into this table.
    comparison_list.append({
        'Model': name,
        'Base Acc': base_metrics['Accuracy'],
        'Tuned Acc': tuned_metrics['Accuracy'],
        'Base F1': base_metrics['F1'],
        'Tuned F1': tuned_metrics['F1'],
        'Base Recall': base_metrics['Recall'],
        'Tuned Recall': tuned_metrics['Recall']
    })

# 3. Create DataFrame
# NOTE: this rebinds `results_df`, replacing the baseline-only table built in the
# earlier evaluation cell.
results_df = pd.DataFrame(comparison_list)

# 4. Display the detailed table (Sorted by Tuned F1 Score)
# Rounding applies to the displayed copy only; `results_df` keeps full precision.
print("Final Detailed Scoreboard:")
display(results_df.round(4).sort_values(by='Tuned F1', ascending=False))


# 5. Visualize the Improvement
# Reshape data for plotting
# melt() turns the two F1 columns into long format: one row per (Model, Type)
# with the score in 'F1 Score', which is the layout a grouped barplot expects.
results_melted = results_df.melt(id_vars='Model', 
                                 value_vars=['Base F1', 'Tuned F1'], 
                                 var_name='Type', 
                                 value_name='F1 Score')

plt.figure(figsize=(10, 6))
# TODO: Create a barplot
# x='Model', y='F1 Score', hue='Type'
# (each `___` is a blank for one of the column names of results_melted)
sns.barplot(data=results_melted, x='___', y='___', hue='___', palette='viridis')

plt.title("Impact of Tuning on F1-Score (The Real Value)")
# Fixed 0-1 axis so bar heights are comparable across runs
plt.ylim(0.0, 1.0) 
plt.ylabel("F1 Score")
plt.xticks(rotation=45)
plt.legend(loc='lower right')
plt.grid(axis='y', alpha=0.3)
plt.show()

# 6. Conclusion
# Pick the row with the highest 'Tuned F1' and summarise how its recall changed.
best_model = results_df.loc[results_df['Tuned F1'].idxmax()]
# The trophy emoji had been corrupted into "üèÜ" by a wrong text encoding; restored.
print(f"🏆 Best Model: {best_model['Model']}")

# Tuning can raise F1 while lowering recall, so state the direction that was
# actually measured instead of always announcing an improvement.
base_recall = best_model['Base Recall']
tuned_recall = best_model['Tuned Recall']
if tuned_recall > base_recall:
    print(f"   Recall improved from {base_recall:.2f} to {tuned_recall:.2f}")
    print("   (This means we catch more churners!)")
elif tuned_recall < base_recall:
    print(f"   Recall dropped from {base_recall:.2f} to {tuned_recall:.2f}")
    print("   (This means we catch fewer churners; the F1 gain comes from precision.)")
else:
    print(f"   Recall unchanged at {tuned_recall:.2f}")