# **Our Datasets**

In [None]:
# Kaggle Hub dataset handle in "owner/dataset-name" form; it is passed to
# get_kaggle_data() when the DataFrame is loaded.
Telco_Customer_churn_data_path = "blastchar/telco-customer-churn"

# **Install and Import Libraries**

In [None]:
!pip install h2o

Collecting h2o
  Downloading h2o-3.46.0.7-py2.py3-none-any.whl.metadata (2.1 kB)
Downloading h2o-3.46.0.7-py2.py3-none-any.whl (265.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.9/265.9 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: h2o
Successfully installed h2o-3.46.0.7


In [None]:
# Core Data Handling & System Libraries
import os
import kagglehub
import numpy as np
import pandas as pd

# Statistical Library For Our Analysis
from scipy.stats import chi2_contingency

# Machine Learning Models
from sklearn.tree import DecisionTreeClassifier
import h2o
from h2o.automl import H2OAutoML

# --- Data Visualization ---
import matplotlib.pyplot as plt
import seaborn as sns

# Bonus part
from google.colab import files
import json




# **Our Functions**

In [None]:
def get_kaggle_data(data_path):
  """
    Downloads a dataset from Kaggle Hub and returns the path to its CSV file.

    Only the top level of the downloaded folder is scanned. Files are matched
    on a case-insensitive ".csv" extension; when several CSV files are present
    the alphabetically first one is used, so the result does not depend on the
    arbitrary ordering of os.listdir().

    Args:
        data_path (str): The Kaggle Hub path for the dataset,
                         e.g., "blastchar/telco-customer-churn".

    Returns:
        str: The path to the downloaded CSV file (the download folder reported
             by kagglehub joined with the file name). Returns an empty string
             if the folder contains no CSV file.
  """

  # Download the dataset and get the path to the downloaded folder
  path = kagglehub.dataset_download(data_path)

  # Keep only CSV files; sorting makes the choice deterministic.
  csv_files = sorted(
      name for name in os.listdir(path) if name.lower().endswith(".csv")
  )

  # Honour the documented contract instead of raising IndexError.
  if not csv_files:
    print("No CSV file found in:", path)
    return ""

  # Construct the full path to the CSV file
  csv_path = os.path.join(path, csv_files[0])
  print("Path to CSV file:", csv_path)
  return csv_path

In [None]:
def Getting_continuous_cols(df, cols_list, num_of_unique_values):
  """
    Selects the continuous numerical columns among the given column names.

    A column counts as continuous when both of the following hold:
    1. It has at least `num_of_unique_values` distinct values.
    2. Its dtype is numeric according to pandas.

    The distinct-value threshold separates genuinely continuous measurements
    from numeric columns that merely encode a handful of categories.

    Args:
        df (pd.DataFrame): The DataFrame holding the columns.
        cols_list (List[str]): Names of the columns to evaluate.
        num_of_unique_values (int): Minimum number of distinct values required
                                    (inclusive).

    Returns:
        List[str]: The qualifying column names, in the order they were given.
  """

  def _is_continuous(col):
    # Cardinality is checked first; the dtype test only runs when it passes.
    enough_distinct = df[col].nunique() >= num_of_unique_values
    return enough_distinct and pd.api.types.is_numeric_dtype(df[col])

  return [col for col in cols_list if _is_continuous(col)]

In [None]:
def churn_category_counts(dataframe, churn_column, comparing_column):
  """
    Ranks the categories of one column by their churned-to-retained ratio.

    Builds a contingency table of `comparing_column` against `churn_column`,
    prints it, adds a 'Churn_Percentage' column holding the 'Yes' count divided
    by the 'No' count for every category, sorts the rows from the highest ratio
    to the lowest, prints the sorted table and returns it.

    Note that 'Churn_Percentage' is a Yes/No ratio, not a share of the
    category total.
  """

  # Raw 'Yes'/'No' counts per category, shown before any derived column exists.
  table = pd.crosstab(dataframe[comparing_column], dataframe[churn_column])
  print(table)

  # Churners per non-churner in each category.
  yes_per_no = table['Yes'] / table['No']
  table['Churn_Percentage'] = yes_per_no

  # Highest-risk categories first.
  ranked = table.sort_values(by='Churn_Percentage', ascending=False)
  print(ranked)

  return ranked

In [None]:
def Getting_statistics_information(contingency_table, alpha=0.05):
  """
    Performs a Chi-squared test for independence on a contingency table.

    The table must contain observed counts only. Any derived column (such as a
    ratio) would be treated as another category and would distort the
    statistic and the degrees of freedom.

    Args:
        contingency_table (pd.DataFrame): A table with counts of two categorical variables.
        alpha (float): Significance level; the result is significant when the
                       p-value is strictly below it. Defaults to 0.05.

    Returns:
        Optional[str]: The name of the index (the variable tested) if the result
        is statistically significant (p < alpha), otherwise None.
  """

  # Perform the Chi-squared test for independence.
  chi2, p_value, dof, _expected = chi2_contingency(contingency_table)

  # Print the key results of the statistical test.
  print("\nChi-squared statistic:", chi2)
  print("P-value:", p_value)
  print("Degrees of Freedom:", dof)

  # A p-value below alpha indicates the result is statistically significant.
  if p_value < alpha:
    print("statistically Significant")
    return contingency_table.index.name

  # Make the "not significant" outcome explicit for callers.
  return None

In [None]:
def Affecting_factor(contingency_table):
  """
    Prints the per-cell residuals of a contingency table.

    Each residual is (observed - expected) / sqrt(expected), where the expected
    counts come from a Chi-squared test of independence on the same table.
    Cells with a magnitude above roughly 2 are usually read as the main
    contributors to the chi-squared statistic.

    Args:
        contingency_table (pd.DataFrame): A table with counts of two categorical variables.

    Returns:
        None: The residual table is printed, not returned.
  """

  # Only the expected counts of the test are needed here.
  _, _, _, expected = chi2_contingency(contingency_table)

  # Deviation from independence, scaled by the expected count of each cell.
  deviation = contingency_table - expected
  residuals = deviation / np.sqrt(expected)

  print("Standardized Residuals:")
  print(residuals)

In [None]:
def get_tree_thresholds(tree, feature_names):
    """
    Collects the split thresholds used by a fitted decision tree.

    Args:
        tree (DecisionTreeClassifier): The trained scikit-learn Decision Tree model.
        feature_names (List[str]): The feature names, in training column order.

    Returns:
        Dict[str, List[float]]: One entry per feature name, mapping it to the
        ascending list of distinct thresholds the tree splits that feature on
        (an empty list when the feature is never used).
    """

    fitted = tree.tree_

    # A set per feature removes duplicate thresholds as they are collected.
    split_points = {name: set() for name in feature_names}

    for node in range(fitted.node_count):
        # Leaves have identical left/right child markers; skip them.
        if fitted.children_left[node] == fitted.children_right[node]:
            continue

        split_feature = feature_names[fitted.feature[node]]
        split_points[split_feature].add(fitted.threshold[node])

    # Ascending order for every feature.
    return {name: sorted(values) for name, values in split_points.items()}

In [None]:
def create_tree_bins(df_input, feature_name, target_name, max_depth=3):
    """
    Trains a Decision Tree to find data-driven bins for a feature and adds
    them to a copy of the DataFrame as a new "<feature_name>_bins" column.

    The input DataFrame is never modified. The target is encoded only for the
    tree fit: a numeric target is used as-is, while a non-numeric target is
    mapped to 1 where it equals 'Yes' and 0 otherwise. The target column in
    the returned DataFrame keeps its original values, so the function can be
    called repeatedly on its own output.

    Args:
        df_input (pd.DataFrame): The input DataFrame.
        feature_name (str): The name of the continuous feature to bin.
        target_name (str): The name of the target variable (e.g., 'Churn').
        max_depth (int): Maximum depth of the helper tree; it bounds the number
                         of split points. Defaults to 3.

    Returns:
        pd.DataFrame: A copy of the input with the bins column added. If the
        tree finds no split, the copy is returned without a bins column.
    """

    # Creating the bins column by adding "_bins" to the column name
    bins_col_name = f"{feature_name}_bins"

    # Work on a copy so the caller's DataFrame is left untouched.
    df_output = df_input.copy(deep=True)

    # Encode the target locally. Writing the encoding back into the frame
    # would make a later call see only 0/1 values, map them all to 0 and
    # leave the tree with nothing to split on.
    y = df_output[target_name]
    if not pd.api.types.is_numeric_dtype(y):
        y = (y == 'Yes').astype(int)
    X = df_output[[feature_name]]

    # Train a simple tree to find the most predictive split points.
    tree_model = DecisionTreeClassifier(max_depth=max_depth).fit(X, y)

    # Extract the split points (thresholds) from the trained tree.
    splits = get_tree_thresholds(tree_model, [feature_name])[feature_name]

    # If the tree found no useful splits, return the data without a bins column.
    if not splits:
        print(f"The tree did not create any splits for '{feature_name}'.")
        return df_output

    # Open-ended outer edges guarantee that every value falls into a bin.
    bin_edges = [-np.inf] + splits + [np.inf]
    data_min = df_output[feature_name].min()
    data_max = df_output[feature_name].max()

    # Labels show the observed min/max instead of +/- infinity at the ends.
    last_bin = len(bin_edges) - 2
    labels = []
    for i, (lower_bound, upper_bound) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
        shown_lower = data_min if i == 0 else lower_bound
        shown_upper = data_max if i == last_bin else upper_bound
        labels.append(f"{shown_lower:.2f} - {shown_upper:.2f}")

    # Create the new column with the data-driven bins.
    df_output[bins_col_name] = pd.cut(df_output[feature_name], bins=bin_edges, labels=labels)

    # Showing the results in the terminal
    print(f"Created bins for '{feature_name}': {splits}")

    # Return the new dataframe with the new bins column
    return df_output

# **Load the DataFrame and fix any datatype issues**

## **Load the DataFrame**

In [None]:
# Resolve the location of the downloaded CSV first, then load it
csv_file_path = get_kaggle_data(Telco_Customer_churn_data_path)
df = pd.read_csv(csv_file_path)
df.head()

Using Colab cache for faster access to the 'telco-customer-churn' dataset.
Path to CSV file: /kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## **Assess and fix any datatype issues**

In [None]:
df.info()

# Correcting the 'TotalCharges' column datatype:
# values that cannot be parsed as numbers become NaN ...
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# ... and are then replaced by the column mean. The result is assigned back
# instead of using fillna(inplace=True) on df['TotalCharges'], because that
# chained in-place call operates on an intermediate object and will no longer
# update df in pandas 3.0.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace = True)


# **Statistical Analysis Part**

## **Creating Data-Driven Binning for Continuous Features**

In [None]:
# Candidate feature columns: everything except the very first column
# (e.g., a customer ID) and the very last one (e.g., the 'Churn' target).
# Note: This positional selection is simplistic; manual selection is often more reliable.
cat_columns = list(df.columns[1:-1])

# Filter the list to find columns that are truly continuous:
# numeric columns with at least 5 unique values (the helper compares with >=).
continuous_cols = Getting_continuous_cols(df, cat_columns, 5)

# Assign the identified continuous columns to a 'features' list for processing.
features = continuous_cols


# --- Data-Driven Binning for Continuous Features ---

# Create a deep copy of the original DataFrame so that `df` itself is not modified.
# All transformations will be applied to this temporary DataFrame.
df_temp = df.copy(deep=True)

# Loop through each identified continuous feature to create tree-based bins.
for col in features:
    # Print progress for the current feature being processed.
    print(f"--- Processing feature: {col} ---")

    # create_tree_bins returns a DataFrame that carries a new '<col>_bins'
    # column (e.g., 'tenure_bins') when the tree finds split points; the result
    # replaces df_temp so the next iteration builds on it.
    df_temp = create_tree_bins(df_temp, col, 'Churn')

# Create a final 'cleaned' DataFrame that contains the data
# plus whatever binned columns were created above.
df_cleaned = df_temp.copy(deep=True)

--- Processing feature: tenure ---
Created bins for 'tenure': [np.float64(1.5), np.float64(5.5), np.float64(10.5), np.float64(16.5), np.float64(22.5), np.float64(49.5), np.float64(70.5)]
--- Processing feature: MonthlyCharges ---
The tree did not create any splits for 'MonthlyCharges'.
--- Processing feature: TotalCharges ---
The tree did not create any splits for 'TotalCharges'.


# **Statistical Analysis for Categorical Features**

In [None]:
# Make sure 'Churn' holds the original 'Yes'/'No' labels (the binning step
# may have re-encoded it).
df_cleaned['Churn'] = df['Churn']

# Identify the remaining categorical columns after continuous ones were processed.
# The first column (the customer ID) and the target itself are excluded.
cat_columns = list(df_cleaned.columns[1:])
cat_columns.remove('Churn')
continuous_cols = Getting_continuous_cols(df_cleaned, cat_columns, 5)
remaining_cols = [col for col in cat_columns if col not in continuous_cols]

# Initialize a list to store the names of statistically significant columns.
list_of_statistically_significant_cols = []


# Loop through each remaining categorical column to perform a Chi-squared test.
for col in remaining_cols:
  # Create (and print) the contingency table for the current column against 'Churn'.
  temp = churn_category_counts(dataframe=df_cleaned, churn_column='Churn', comparing_column=col)
  print("")

  # The statistical tests must see observed counts only. The derived ratio
  # column would otherwise be treated as a third outcome category and
  # distort the statistic, the p-value and the degrees of freedom.
  observed_counts = temp.drop(columns='Churn_Percentage', errors='ignore')

  # Run the test exactly once per column and reuse its result.
  significant_col = Getting_statistics_information(observed_counts)

  if significant_col is not None:
    list_of_statistically_significant_cols.append(significant_col)
    print("")

    # The relationship is significant: show which cells drive it.
    Affecting_factor(observed_counts)
  print("--------------------------------------------------------------------\n\n")



Churn     No  Yes
gender           
Female  2549  939
Male    2625  930
Churn     No  Yes  Churn_Percentage
gender                             
Female  2549  939          0.368380
Male    2625  930          0.354286


Chi-squared statistic: 0.5229748925747255
P-value: 0.7699055405365881
Degrees of Freedom: 2
--------------------------------------------------------------------


Churn            No   Yes
SeniorCitizen            
0              4508  1393
1               666   476
Churn            No   Yes  Churn_Percentage
SeniorCitizen                              
1               666   476          0.714715
0              4508  1393          0.309006


Chi-squared statistic: 162.45382398257178
P-value: 5.29176244875921e-36
Degrees of Freedom: 2
statistically Significant

Chi-squared statistic: 162.45382398257178
P-value: 5.29176244875921e-36
Degrees of Freedom: 2
statistically Significant

Standardized Residuals:
Churn                No       Yes  Churn_Percentage
SeniorCitizen      

## **Analysis Explanation**  
To identify which factors most influence customer churn, we applied a **Chi-squared test of independence**.  
This test measures whether there is a statistically significant relationship between each categorical variable (e.g., contract type, payment method, tech support) and churn outcomes.  

- A **p-value < 0.05** indicates that the variable is significantly associated with churn, meaning differences in churn rates across categories are unlikely due to random chance.  
- Variables with **higher differences in churn rates** across their categories are stronger predictors.  
- Variables with **similar churn rates** across categories (p ≥ 0.05) are considered weak or not significant.  

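The sections below summarize the predictors of churn, ordered by their **level of impact** from strongest to weakest.  

> **Note:** The percentages quoted for the categorical variables are the **churned-to-retained ratio** (`Yes ÷ No`, the `Churn_Percentage` column printed above), not the share of customers in a category who churned. For example, senior citizens show 476 churned vs. 666 retained: a ratio of 476 ÷ 666 ≈ 71.5%, whereas the share of seniors who churned is 476 ÷ 1142 ≈ 41.7%.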

---

## **Summary of Findings**

### **Primary Drivers of Churn (Highest Risk Segments)**  

#### **Strongest Predictors of Churn**  

**Contract Type**  
- Month-to-month contracts show extremely high churn (**74.5%**).  
- One-year (**12.7%**) and two-year (**2.9%**) contracts are much more stable.  
  -  *This is the single most critical churn driver.*  

**Payment Method**  
- Electronic check users have the highest churn (**82.8%**).  
- Other payment methods are far safer: mailed check (**23.6%**), bank transfer (**20.1%**), credit card (**18.0%**).  

**Internet Service Type**  
- Fiber optic customers churn heavily (**72.1%**).  
- DSL churns far less (**23.4%**).  
- Customers with no internet service are very low risk (**8.0%**).  

**Online Security**  
- No online security = **71.7%** churn.  
- With online security = **17.1%**.  
- No internet service = **8.0%**.  

**Tech Support**  
- No tech support = **71.3%** churn.  
- With tech support = **17.9%**.  
- No internet service = **8.0%**.  

**Senior Citizen Status**  
- Seniors churn at **71.5%**.  
- Non-seniors churn at only **30.9%**.  

---

### **Medium-Level Predictors**  

**Device Protection**  
- No protection = **64.3%** churn.  
- With protection = **29.0%**.  
- No internet service = **8.0%**.  

**Online Backup**  
- No backup = **66.5%** churn.  
- With backup = **27.4%**.  
- No internet service = **8.0%**.  

**Paperless Billing**  
- Paperless billing = **50.5%** churn.  
- Non-paperless = **19.5%**.  

**Partner Status**  
- Without partner = **49.2%**.  
- With partner = **24.5%**.  

**Dependents**  
- Without dependents = **45.5%**.  
- With dependents = **18.3%**.  

---

### **Lower-Level Predictors**  

**Streaming TV**  
- Without streaming = **50.4%**.  
- With streaming = **43.0%**.  
- No internet = **8.0%**.  

**Streaming Movies**  
- Without streaming = **50.8%**.  
- With streaming = **42.7%**.  
- No internet = **8.0%**.  

**Multiple Lines**  
- With multiple lines = **40.1%**.  
- Without = **33.4%**.  
- No phone service = **33.2%**.  

---

### **Weak or Non-Significant Predictors**  

**Phone Service**  
- With phone service = **36.4%**.  
- Without = **33.2%**.  
- *Not statistically significant.*  

**Gender**  
- Female = **36.8%**.  
- Male = **35.4%**.  
- *No meaningful difference.*  
---
---
### **Continuous Data**
**Tenure**
- New customers have the highest churn risk. Those with a tenure of **0 - 1.5** months are the most likely to leave (churn ratio of **1.87**, meaning nearly two churn for every one who stays).
- Loyalty grows significantly over time. **Customers become much safer after 22 months**, with very low churn risk for those with tenures **over 4 years**.
---
**Monthly Charges**
- A specific mid-to-high price range has the highest churn (**43.4%**) for customers paying between **€68.83 - €70.33** per month.
- Customers on the cheapest and most expensive plans are far safer. Churn is lowest for those paying **€25.32 - €27.68 (4.6%)** and **€113.62 - €118.75 (8.6%)**.
---
**Total Charges**
- Customers with the lowest lifetime spending have **an extremely high churn rate (70.8%)** for the group that has paid between **€68.47 - €96.62** in total.
- Commitment increases with spending. The risk drops significantly for customers who have spent more, **with the safest group being those who have paid over €7805 (4.2% churn)**.

# **ML Model to Predict Customer Churn**

## **Model Initialization**

In [None]:
# Display the columns whose Chi-squared test against 'Churn' came out
# statistically significant; these are the features used for modelling.
list_of_statistically_significant_cols

['SeniorCitizen',
 'Partner',
 'Dependents',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'tenure_bins']

In [None]:
# Start the H2O cluster. H2O is an in-memory platform, so this initializes it.
h2o.init()

# Ensure the 'Churn' column is included for model training.
# Guarded so that re-running this cell does not add 'Churn' a second time
# (which would select the column twice below).
if 'Churn' not in list_of_statistically_significant_cols:
    list_of_statistically_significant_cols.append('Churn')

# Convert the pandas DataFrame to an H2OFrame, using only the significant columns.
h2o_df = h2o.H2OFrame(df_cleaned[list_of_statistically_significant_cols])

# Identify the target variable ('y') and the feature variables ('x').
# A new list is built rather than removing 'y' from the frame's own column list.
y = 'Churn'
x = [name for name in h2o_df.columns if name != y]

# Tell H2O that 'Churn' is a categorical variable for classification.
h2o_df[y] = h2o_df[y].asfactor()

# Split the data into training (80%) and testing (20%) sets.
train, test = h2o_df.split_frame(ratios=[0.8], seed=42)


# Configure and initialize the AutoML process.
# It will train multiple models within the defined constraints.
# NOTE(review): the seed alone may not make runs repeatable while a
# wall-clock limit (max_runtime_secs) is also set, since the number of models
# finished in that time can differ between machines. Confirm against the H2O
# AutoML documentation if exact reproducibility matters.
aml = H2OAutoML(max_models=30,          # Run for a maximum of 30 models
                max_runtime_secs=480,   # Run for a maximum of 8 minutes
                seed=42,                # Seed for the random number generators
                project_name="churn_prediction")

# Train the AutoML models
aml.train(x=x, y=y, training_frame=train)


# Get the leaderboard of all trained models, ranked by performance.
lb = aml.leaderboard

# Print the full leaderboard to see the results of the AutoML run.
print("H2O AutoML Leaderboard:")
print(lb.head(rows=lb.nrows))

# Get a direct reference to the best performing model.
best_model = aml.leader


# Use the best model to make predictions on the unseen test data.
predictions = best_model.predict(test)

# Print the first few predictions to see the model's output.
print("\nSample predictions:")
print(predictions.head())

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.28" 2025-07-15; OpenJDK Runtime Environment (build 11.0.28+6-post-Ubuntu-1ubuntu122.04.1); OpenJDK 64-Bit Server VM (build 11.0.28+6-post-Ubuntu-1ubuntu122.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.12/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpnycsgakj
  JVM stdout: /tmp/tmpnycsgakj/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpnycsgakj/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,07 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,5 months and 29 days
H2O_cluster_name:,H2O_from_python_unknownUser_9z5yu7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.168 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
H2O AutoML Leaderboard:
model_id                                                   auc    logloss     aucpr    mean_per_class_error      rmse       mse
GLM_1_AutoML_1_20250925_182340                        0.854692   0.406269  0.678006                0.23045   0.363416  0.132071
XGBoost_grid_1_AutoML_1_20250925_182340_model_4       0.851764   0.410709  0.669283                0.223793  0.36552   0.133605
GBM_1_AutoML_1_20250925_182340                        0.851338   0.410803  0.667415                0.234075  0.365795  0.133806
XGBoost_grid_1_AutoML_1_20250925_182340_model_6       0.850385   0.412498  0.662006                0.225766  0.366562  0.134368
GBM_5_AutoML_1_20250925_182340                        0.847054   0.416057  0.662834                0.243518  0.368035  0.13545
GBM_grid_1_AutoML_1

## **Model Evaluation**

In [None]:
# Generate model performance metrics for the leader model on the held-out test set
performance = best_model.model_performance(test_data=test)

# Extract the confusion matrix and AUC from those test-set metrics.
# NOTE(review): per the recorded output, the matrix is reported at the
# threshold that maximises F1, not at 0.5 -- keep that in mind when reading it.
cm = performance.confusion_matrix()
auc_roc = performance.auc()

# Print the matrix
print(cm)

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.34234068807420603
       No    Yes    Error    Rate
-----  ----  -----  -------  --------------
No     826   216    0.2073   (216.0/1042.0)
Yes    107   251    0.2989   (107.0/358.0)
Total  933   467    0.2307   (323.0/1400.0)


In [None]:
# Get the leader model from the AutoML run
best_model = aml.leader

# Get performance metrics on the TEST set
# The parameter to pass a new dataset is always 'test_data'
test_performance = best_model.model_performance(test_data=test)
test_auc = test_performance.auc()

# Get performance metrics on the TRAINING set
# Use the special flag 'train=True' to get the metrics on the original training data
train_performance = best_model.model_performance(train=True)
train_auc = train_performance.auc()

# Print the comparison to check for overfitting
print("--- Overfitting Check ---")
print(f"AUC on Training Set: {train_auc:.4f}")
print(f"AUC on Test Set:     {test_auc:.4f}")
print("-------------------------")

# Calculate the difference
auc_difference = train_auc - test_auc
print(f"Difference (Train - Test): {auc_difference:.4f}")

# Overfitting shows up as a training AUC far above the test AUC. The model is
# therefore considered healthy when the gap stays within a small tolerance
# (this also covers a test AUC above the training AUC). The tolerance is a
# rule-of-thumb value; adjust it to the project's requirements.
max_acceptable_auc_gap = 0.05
if auc_difference <= max_acceptable_auc_gap:
  print("health model")
else:
  print('unhealth model')

--- Overfitting Check ---
AUC on Training Set: 0.8593
AUC on Test Set:     0.8294
-------------------------
Difference (Train - Test): 0.0299
health model


# **Bonus: Download the file**

In [None]:
def generate_and_download_infographic(df_analysis: pd.DataFrame, h2o_performance) -> None:
    """
    Calculates key metrics from the analysis and generates a dynamic HTML infographic.

    The function computes churn statistics from ``df_analysis``, reads the model
    metrics from ``h2o_performance``, renders everything into a self-contained
    HTML page (Tailwind and Chart.js are loaded from CDNs), writes the page to
    ``churn_infographic_dynamic.html`` in the current working directory and then
    triggers a browser download through ``google.colab.files``.

    The input DataFrame is NOT modified: the numeric churn flag is added to an
    internal copy only.

    Args:
        df_analysis (pd.DataFrame): The final, cleaned DataFrame with binned columns.
            The columns read here are 'Churn' (values 'Yes'/'No'), 'Contract',
            'PaymentMethod', 'tenure_bins', 'InternetService', 'OnlineSecurity'
            and 'TechSupport'.
        h2o_performance: The H2O model performance object from best_model.model_performance().
            Its confusion_matrix(), auc() and F1() methods are used.

    Returns:
        None. The result is the HTML file written to disk and the download.
    """

    # Calculate all necessary stats from the input data ---

    # Overall Churn Rate
    # assign() returns a new DataFrame, so the caller's frame does not silently
    # gain a 'Churn_numeric' column.
    df = df_analysis.assign(Churn_numeric=df_analysis['Churn'].map({'Yes': 1, 'No': 0}))
    overall_churn_rate = df['Churn_numeric'].mean() * 100

    # Churn by Contract Type
    contract_churn = df.groupby('Contract')['Churn_numeric'].mean().sort_values(ascending=False) * 100

    # Churn by Payment Method
    payment_churn = df.groupby('PaymentMethod')['Churn_numeric'].mean().sort_values(ascending=False) * 100

    # Churn by Tenure (using the binned column)
    # The value is the ratio churned / not-churned per bin (0 when a bin has no
    # non-churned customers). 'observed' is passed explicitly because pandas
    # warns when a categorical grouper relies on the implicit default
    # (presumably 'tenure_bins' is categorical); False keeps the previous
    # behaviour of listing every bin.
    tenure_churn_ratio = df.groupby('tenure_bins', observed=False)['Churn_numeric'].agg(
        lambda x: x.sum() / (len(x) - x.sum()) if (len(x) - x.sum()) > 0 else 0
    ).sort_index()

    # Churn by Internet Service
    internet_churn = df.groupby('InternetService')['Churn_numeric'].mean().sort_values(ascending=False) * 100

    # Churn for Add-ons
    no_security_churn = df[df['OnlineSecurity'] == 'No']['Churn_numeric'].mean() * 100
    with_security_churn = df[df['OnlineSecurity'] == 'Yes']['Churn_numeric'].mean() * 100
    no_tech_churn = df[df['TechSupport'] == 'No']['Churn_numeric'].mean() * 100
    with_tech_churn = df[df['TechSupport'] == 'Yes']['Churn_numeric'].mean() * 100

    # Model Performance Metrics
    # The confusion matrix table is indexed by its first column (the actual
    # class), so cells are addressed as cm_df.loc[actual, predicted].
    cm_object = h2o_performance.confusion_matrix()
    cm_df = cm_object.table.as_data_frame()
    cm_df.set_index(cm_df.columns[0], inplace=True)

    true_positives = cm_df.loc['Yes', 'Yes']
    false_positives = cm_df.loc['No', 'Yes']
    false_negatives = cm_df.loc['Yes', 'No']

    auc_score = h2o_performance.auc() * 100
    precision = (true_positives / (true_positives + false_positives)) * 100
    recall = (true_positives / (true_positives + false_negatives)) * 100
    # NOTE(review): F1() is expected to return [[threshold, value]] - confirm
    # against the H2O version in use.
    f1_score = h2o_performance.F1()[0][1] * 100

    # Convert data to JSON for embedding in the HTML script tag
    chart_data_json = json.dumps({
        "contract": {"labels": contract_churn.index.tolist(), "data": contract_churn.values.tolist()},
        "payment": {"labels": payment_churn.index.tolist(), "data": payment_churn.values.tolist()},
        "tenure": {"labels": tenure_churn_ratio.index.astype(str).tolist(), "data": tenure_churn_ratio.values.tolist()},
        "internet": {"labels": internet_churn.index.tolist(), "data": internet_churn.values.tolist()},
        "addOns": {
            "noService": [no_security_churn, no_tech_churn],
            "withService": [with_security_churn, with_tech_churn]
        }
    })

    # Define the JavaScript template with placeholders ---
    # Literal {} braces are escaped as {{}} for the .format() method
    javascript_template = """
    <script type="text/javascript">
        const chartData = {chart_data_json};

        document.addEventListener('DOMContentLoaded', () => {{

            const brilliantBlues = ['#5386C3', '#2565AE', '#00449E'];
            const churnColor = '#FF6B6B';
            const noChurnColor = '#5386C3';
            const defaultFontColor = '#6b7280';

            Chart.defaults.font.family = "'Inter', sans-serif";
            Chart.defaults.color = defaultFontColor;

            const tooltipTitleCallback = {{
                plugins: {{
                    tooltip: {{
                        callbacks: {{
                            title: function(tooltipItems) {{
                                const item = tooltipItems[0];
                                let label = item.chart.data.labels[item.dataIndex];
                                if (Array.isArray(label)) {{
                                  return label.join(' ');
                                }} else {{
                                  return label;
                                }}
                            }}
                        }}
                    }}
                }}
            }};

            const processLabel = (label) => {{
                if (typeof label !== 'string') return label;
                if (label.length > 16) {{
                    const words = label.split(' ');
                    const lines = [];
                    let currentLine = '';
                    words.forEach(word => {{
                        if ((currentLine + word).length > 16) {{
                            lines.push(currentLine.trim());
                            currentLine = '';
                        }}
                        currentLine += word + ' ';
                    }});
                    lines.push(currentLine.trim());
                    return lines;
                }}
                return label;
            }};

            new Chart(document.getElementById('contractChurnChart'), {{
                type: 'bar',
                data: {{
                    labels: chartData.contract.labels,
                    datasets: [{{
                        label: 'Churn Rate (%)',
                        data: chartData.contract.data,
                        backgroundColor: (context) => {{
                            // Compare case-insensitively so labels such as
                            // 'Month-to-month' / 'One year' are recognised too.
                            const label = String(context.chart.data.labels[context.dataIndex]).toLowerCase();
                            if (label.includes('month-to-month')) return churnColor;
                            if (label.includes('one year')) return noChurnColor;
                            return '#00449E';
                        }},
                        borderColor: 'rgba(255, 255, 255, 0.5)',
                        borderWidth: 1,
                        borderRadius: 5
                    }}]
                }},
                options: {{
                    ...tooltipTitleCallback,
                    maintainAspectRatio: false,
                    scales: {{
                        y: {{
                            beginAtZero: true,
                            title: {{ display: true, text: 'Churn Rate (%)' }}
                        }}
                    }},
                    plugins: {{
                        ...tooltipTitleCallback.plugins,
                        legend: {{ display: false }}
                    }}
                }}
            }});

            new Chart(document.getElementById('paymentMethodChart'), {{
                type: 'bar',
                data: {{
                    labels: chartData.payment.labels.map(processLabel),
                    datasets: [{{
                        label: 'Churn Rate (%)',
                        data: chartData.payment.data,
                        backgroundColor: (context) => {{
                             const label = context.chart.data.labels[context.dataIndex];
                             const originalLabel = Array.isArray(label) ? label.join(' ') : label;
                             return originalLabel.includes('Electronic') ? churnColor : brilliantBlues[context.dataIndex % brilliantBlues.length];
                        }},
                         borderRadius: 5
                    }}]
                }},
                options: {{
                    ...tooltipTitleCallback,
                    indexAxis: 'y',
                    maintainAspectRatio: false,
                    scales: {{
                        x: {{
                            beginAtZero: true,
                            title: {{ display: true, text: 'Churn Rate (%)' }}
                        }}
                    }},
                    plugins: {{
                         ...tooltipTitleCallback.plugins,
                        legend: {{ display: false }}
                    }}
                }}
            }});

             new Chart(document.getElementById('tenureChurnChart'), {{
                type: 'line',
                data: {{
                    labels: chartData.tenure.labels,
                    datasets: [{{
                        label: 'Churn Risk Ratio (Yes/No)',
                        data: chartData.tenure.data,
                        fill: true,
                        backgroundColor: 'rgba(255, 107, 107, 0.2)',
                        borderColor: churnColor,
                        tension: 0.3
                    }}]
                }},
                options: {{
                    ...tooltipTitleCallback,
                    maintainAspectRatio: false,
                    scales: {{
                         y: {{
                            beginAtZero: true,
                            title: {{ display: true, text: 'Churn Risk (Higher is Worse)' }}
                        }}
                    }},
                    plugins: {{
                        ...tooltipTitleCallback.plugins,
                        legend: {{ display: false }}
                    }}
                }}
            }});

             new Chart(document.getElementById('internetServiceChart'), {{
                type: 'bar',
                data: {{
                    labels: chartData.internet.labels,
                    datasets: [{{
                        label: 'Churn Rate (%)',
                        data: chartData.internet.data,
                         backgroundColor: (context) => {{
                            const label = context.chart.data.labels[context.dataIndex];
                            return label.includes('Fiber') ? churnColor : brilliantBlues[context.dataIndex % brilliantBlues.length];
                        }},
                         borderRadius: 5
                    }}]
                }},
                options: {{
                     ...tooltipTitleCallback,
                    maintainAspectRatio: false,
                    indexAxis: 'y',
                     scales: {{
                        x: {{
                            beginAtZero: true,
                            title: {{ display: true, text: 'Churn Rate (%)' }}
                        }}
                    }},
                    plugins: {{
                         ...tooltipTitleCallback.plugins,
                        legend: {{ display: false }}
                    }}
                }}
            }});

            new Chart(document.getElementById('addOnsChart'), {{
                type: 'bar',
                data: {{
                    labels: ['No Online Security', 'No Tech Support'],
                    datasets: [{{
                        label: 'Churn Rate without Service (%)',
                        data: chartData.addOns.noService,
                        backgroundColor: churnColor,
                        borderRadius: 5
                    }},
                    {{
                        label: 'Churn Rate with Service (%)',
                        data: chartData.addOns.withService,
                        backgroundColor: noChurnColor,
                        borderRadius: 5
                    }}]
                }},
                 options: {{
                     ...tooltipTitleCallback,
                    maintainAspectRatio: false,
                    scales: {{
                        y: {{
                            beginAtZero: true,
                            title: {{ display: true, text: 'Churn Rate (%)' }}
                        }}
                    }},
                    plugins: {{
                        ...tooltipTitleCallback.plugins,
                        legend: {{ position: 'bottom' }}
                    }}
                }}
            }});

        }});
    </script>
    """.format(chart_data_json=chart_data_json)

    # Store the main HTML content in a multi-line f-string
    # (literal CSS braces are doubled; single braces are Python placeholders)
    html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Telco Customer Churn Infographic</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap" rel="stylesheet">
    <style>
        body {{
            font-family: 'Inter', sans-serif;
        }}
        .chart-container {{
            position: relative;
            width: 100%;
            max-width: 600px;
            margin-left: auto;
            margin-right: auto;
            height: 350px;
            max-height: 400px;
        }}
        @media (min-width: 768px) {{
            .chart-container {{
                height: 400px;
            }}
        }}
        .kpi-card {{
            background: linear-gradient(135deg, #2565AE, #00449E);
            color: white;
            border-radius: 0.75rem;
            padding: 1.5rem;
            box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
            transition: transform 0.3s ease, box-shadow 0.3s ease;
        }}
        .kpi-card:hover {{
            transform: translateY(-5px);
            box-shadow: 0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);
        }}
    </style>
</head>
<body class="bg-gray-100 text-gray-800">

    <div class="container mx-auto p-4 md:p-8">

        <header class="text-center mb-12">
            <h1 class="text-4xl md:text-5xl font-bold text-[#00449E] mb-2">Understanding Telco Customer Churn</h1>
            <p class="text-lg md:text-xl text-[#5386C3]">A Data-Driven Approach to Predicting and Preventing Customer Attrition</p>
        </header>

        <section id="overview" class="mb-12">
            <div class="grid grid-cols-1 md:grid-cols-3 gap-8">
                <div class="bg-white rounded-lg shadow-md p-6 flex flex-col items-center justify-center text-center">
                    <h3 class="text-2xl font-semibold text-[#00449E] mb-2">Overall Churn Rate</h3>
                    <div class="text-6xl font-bold text-[#FF6B6B]">{overall_churn_rate:.1f}%</div>
                    <p class="text-gray-500 mt-2">of customers churned, highlighting a critical need for a predictive retention strategy.</p>
                </div>
                <div class="md:col-span-2 bg-white rounded-lg shadow-md p-6">
                    <h3 class="text-2xl font-semibold text-center text-[#00449E] mb-4">The Challenge: Customer Attrition</h3>
                    <p class="text-gray-600 leading-relaxed">Customer churn represents a significant loss of revenue and opportunity. By identifying the key factors that drive customers to leave, we can build a proactive strategy to improve satisfaction and retention. This project leverages historical customer data to create a powerful predictive model, turning insights into actionable business intelligence.</p>
                </div>
            </div>
        </section>

        <section id="predictors" class="mb-12">
            <h2 class="text-3xl font-bold text-center text-[#00449E] mb-8">The Anatomy of a Churning Customer: Key Predictors</h2>
            <div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-8">

                <div class="bg-white rounded-lg shadow-md p-6 lg:col-span-2">
                    <h3 class="text-xl font-semibold text-center text-[#00449E] mb-4">Contract Type is the Strongest Churn Driver</h3>
                    <div class="chart-container">
                        <canvas id="contractChurnChart"></canvas>
                    </div>
                     <p class="text-center text-sm text-gray-500 mt-4">Customers on month-to-month contracts are significantly more likely to churn, indicating a lack of long-term commitment.</p>
                </div>

                <div class="bg-white rounded-lg shadow-md p-6">
                    <h3 class="text-xl font-semibold text-center text-[#00449E] mb-4">Risk by Payment Method</h3>
                     <div class="chart-container">
                        <canvas id="paymentMethodChart"></canvas>
                    </div>
                    <p class="text-center text-sm text-gray-500 mt-4">Electronic check payments are strongly correlated with higher churn rates compared to automated or traditional methods.</p>
                </div>

                <div class="bg-white rounded-lg shadow-md p-6 lg:col-span-3">
                    <h3 class="text-xl font-semibold text-center text-[#00449E] mb-4">Churn Risk Decreases with Customer Tenure</h3>
                     <div class="chart-container h-80">
                        <canvas id="tenureChurnChart"></canvas>
                    </div>
                    <p class="text-center text-sm text-gray-500 mt-4">New customers are at the highest risk. As tenure increases, loyalty grows and churn rates drop dramatically.</p>
                </div>

                <div class="bg-white rounded-lg shadow-md p-6">
                    <h3 class="text-xl font-semibold text-center text-[#00449E] mb-4">Fiber Optic Service & Churn</h3>
                     <div class="chart-container">
                        <canvas id="internetServiceChart"></canvas>
                    </div>
                     <p class="text-center text-sm text-gray-500 mt-4">While a premium service, Fiber Optic customers show a much higher churn rate than DSL users, suggesting potential issues with service, pricing, or value perception.</p>
                </div>

                <div class="bg-white rounded-lg shadow-md p-6">
                    <h3 class="text-xl font-semibold text-center text-[#00449E] mb-4">Impact of Key Add-ons on Churn</h3>
                    <div class="chart-container">
                        <canvas id="addOnsChart"></canvas>
                    </div>
                    <p class="text-center text-sm text-gray-500 mt-4">Customers without key support services like Online Security and Tech Support show a significantly higher churn rate.</p>
                </div>

            </div>
        </section>

        <section id="model" class="mb-12">
             <h2 class="text-3xl font-bold text-center text-[#00449E] mb-8">Building a Predictive Solution</h2>
             <div class="grid grid-cols-1 md:grid-cols-2 gap-8 items-center">
                <div class="bg-white rounded-lg shadow-md p-6">
                    <h3 class="text-xl font-semibold text-center text-[#00449E] mb-6">Our Methodology</h3>
                    <div class="space-y-4">
                        <div class="flex items-center">
                            <div class="bg-[#C6D5E8] text-[#00449E] rounded-full h-10 w-10 flex items-center justify-center font-bold">1</div>
                            <p class="ml-4 text-gray-700"><strong>Data Preparation & Cleaning:</strong> Corrected data types and handled missing values.</p>
                        </div>
                        <div class="flex items-center">
                             <div class="bg-[#89A9D3] text-white rounded-full h-10 w-10 flex items-center justify-center font-bold">2</div>
                            <p class="ml-4 text-gray-700"><strong>Feature Engineering:</strong> Used Decision Trees to create optimal, data-driven bins for continuous variables.</p>
                        </div>
                        <div class="flex items-center">
                            <div class="bg-[#5386C3] text-white rounded-full h-10 w-10 flex items-center justify-center font-bold">3</div>
                            <p class="ml-4 text-gray-700"><strong>Feature Selection:</strong> Performed Chi-squared tests to select only statistically significant churn predictors.</p>
                        </div>
                         <div class="flex items-center">
                            <div class="bg-[#2565AE] text-white rounded-full h-10 w-10 flex items-center justify-center font-bold">4</div>
                            <p class="ml-4 text-gray-700"><strong>AutoML with H2O:</strong> Trained, tuned, and compared dozens of models to find the top performer.</p>
                        </div>
                    </div>
                </div>
                <div class="grid grid-cols-2 gap-4">
                    <div class="kpi-card text-center">
                        <div class="text-5xl font-bold">{auc_score:.1f}%</div>
                        <div class="text-lg font-semibold mt-2">AUC Score</div>
                        <p class="text-sm opacity-80 mt-1">Excellent at distinguishing churners from non-churners.</p>
                    </div>
                     <div class="kpi-card text-center">
                        <div class="text-5xl font-bold">{recall:.1f}%</div>
                        <div class="text-lg font-semibold mt-2">Recall</div>
                         <p class="text-sm opacity-80 mt-1">Identifies {recall:.0f}% of customers who will actually churn.</p>
                    </div>
                     <div class="kpi-card text-center">
                        <div class="text-5xl font-bold">{precision:.1f}%</div>
                        <div class="text-lg font-semibold mt-2">Precision</div>
                        <p class="text-sm opacity-80 mt-1">When predicting churn, the model is correct {precision:.0f}% of the time.</p>
                    </div>
                     <div class="kpi-card text-center">
                        <div class="text-5xl font-bold">{f1_score:.1f}%</div>
                        <div class="text-lg font-semibold mt-2">F1-Score</div>
                         <p class="text-sm opacity-80 mt-1">A strong balanced score between Precision and Recall.</p>
                    </div>
                </div>
             </div>
        </section>

        <section id="recommendations">
            <h2 class="text-3xl font-bold text-center text-[#00449E] mb-8">Actionable Insights & Recommendations</h2>
            <div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-6">
                <div class="bg-white rounded-lg shadow-md p-6 text-center">
                    <div class="text-4xl mb-4">🎯</div>
                    <h3 class="text-lg font-semibold text-[#00449E] mb-2">Focus on Contracts</h3>
                    <p class="text-gray-600 text-sm">Proactively offer incentives for customers on month-to-month plans to upgrade to more stable one or two-year contracts.</p>
                </div>
                <div class="bg-white rounded-lg shadow-md p-6 text-center">
                    <div class="text-4xl mb-4">🛡️</div>
                    <h3 class="text-lg font-semibold text-[#00449E] mb-2">Support Fiber Customers</h3>
                    <p class="text-gray-600 text-sm">Bundle Online Security and Tech Support with Fiber Optic plans to mitigate the high churn rate in this premium segment.</p>
                </div>
                <div class="bg-white rounded-lg shadow-md p-6 text-center">
                     <div class="text-4xl mb-4">💳</div>
                    <h3 class="text-lg font-semibold text-[#00449E] mb-2">Incentivize Payments</h3>
                    <p class="text-gray-600 text-sm">Offer a small discount or bonus for customers who switch from Electronic Check to more secure automated payment methods.</p>
                </div>
                <div class="bg-white rounded-lg shadow-md p-6 text-center">
                    <div class="text-4xl mb-4">🤝</div>
                    <h3 class="text-lg font-semibold text-[#00449E] mb-2">Nurture Newcomers</h3>
                    <p class="text-gray-600 text-sm">Implement a robust onboarding program for customers in their first 3 months to build loyalty and reduce early churn.</p>
                </div>
            </div>
        </section>

        <footer class="text-center mt-12 pt-8 border-t border-gray-300">
            <p class="text-gray-500">This infographic was generated based on a comprehensive churn prediction analysis. By leveraging data, we can move from reactive problem-solving to proactive, intelligent customer retention.</p>
        </footer>

    </div>

    {javascript_template}
</body>
</html>
"""

    # Write the dynamic HTML to a file
    # The page declares charset UTF-8 and contains emoji, so the file must be
    # written as UTF-8 regardless of the platform's default encoding.
    file_name = "churn_infographic_dynamic.html"
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(html_content)

    print(f"File '{file_name}' has been created successfully.")
    print("Starting download...")

    # Trigger the browser download
    files.download(file_name)

# --- How to use this function ---
# Once the analysis has produced the 'df_cleaned' DataFrame and the H2O
# 'performance' object, call the function like this:
#
# generate_and_download_infographic(df_cleaned, performance)
#
# Below, the lookup of the two notebook variables is kept separate from the
# call itself: only a missing variable triggers the hint, while a NameError
# raised inside the function propagates with its real traceback instead of
# being reported as "variables not defined".

try:
    # Both names must already exist in the notebook's memory
    infographic_inputs = (df_cleaned, performance)
except NameError:
    print("Please make sure you have run your full analysis notebook and that")
    print("the 'df_cleaned' DataFrame and H2O 'performance' object are defined.")
else:
    generate_and_download_infographic(*infographic_inputs)



File 'churn_infographic_dynamic.html' has been created successfully.
Starting download...


  tenure_churn_ratio = df_analysis.groupby('tenure_bins')['Churn_numeric'].agg(


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>