# Telco Customer Churn

Documentation: https://www.ibm.com/docs/en/cognos-analytics/12.1.x?topic=samples-telco-customer-churn

Local path: /content/WA_Fn-UseC_-Telco-Customer-Churn.csv
* The dataset must be uploaded to Google Colab again every time a new session is started.
* The dataset is also available in the GitHub repository (https://github.com/lydsleepy/machine-learning).

Path if cloned GitHub repo: /content/machine-learning/I310D_Project.ipynb

In [None]:
'''IMPORTS AND LOADING'''
# test

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from matplotlib import pyplot as plt
from IPython.core.interactiveshell import InteractiveShell

# loading data
LOCAL_PATH = "/content/WA_Fn-UseC_-Telco-Customer-Churn.csv"
# NOTE(review): REPO_PATH points at a notebook (.ipynb) rather than the CSV,
# so it cannot be passed to read_csv as-is — confirm the intended path.
REPO_PATH = "/content/machine-learning/I310D_Project.ipynb"
# we'll use local path for now since not everyone might be in github
try:
  df = pd.read_csv(LOCAL_PATH)
except FileNotFoundError as err:
  # Same exception type as before, but the message says how to fix it.
  raise FileNotFoundError(
      f"Dataset not found at {LOCAL_PATH!r}. Upload "
      "WA_Fn-UseC_-Telco-Customer-Churn.csv to the Colab session "
      "(or point LOCAL_PATH at your copy) before running this cell."
  ) from err

FileNotFoundError: [Errno 2] No such file or directory: '/content/WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [None]:
# what does our dataset look like?
# Show the first few rows of the raw data as the cell output.
df.head()

In [None]:
# what are our values?
# List every column name of the raw data as the cell output.
df.columns.values

In [None]:
'''DATA CLEANING AND SANITIZATION'''
# check if there are any duplicates
# duplicated() marks rows that repeat an earlier row; summing the boolean
# flags gives the number of duplicate rows.
number_of_duplicates = df.duplicated().sum()
print(f"Number of duplicates: {number_of_duplicates}")
# There are no duplicates to remove

# Check if there are any null values
# info() prints the dtype and non-null count of every column.
df.info()
# There are no null cells/values

In [None]:
# Check for blank values
# Text placeholders that mark a cell as blank even though it is not NaN.
blank_markers = [' ', '', 'NA', 'NaN']
for column in df:
  values = df[column]
  # Count placeholder strings and real NaN/None cells in one vectorized pass.
  # The counter is deliberately NOT called `sum`: at cell level that name
  # would rebind the builtin sum() for every later cell in the notebook.
  blank_count = int((values.isin(blank_markers) | values.isna()).sum())

  print(f"{column} blank values: {blank_count}")

# It looks like there are 11 blank values in TotalCharges


In [None]:
# drop the rows that have a blank value for TotalCharges

# errors = 'coerce' turns every value that cannot be parsed as a number
# (such as the blank strings found above) into NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors = 'coerce')
# dropna() without arguments removes each row that has a NaN in ANY column.
# The dropped rows leave gaps in df's index (it is not reset here).
df = df.dropna()

In [None]:
# check that the blanks were dropped successfully
# Text placeholders that mark a cell as blank even though it is not NaN.
blank_markers = [' ', '', 'NA', 'NaN']
for column in df:
  values = df[column]
  # Count placeholder strings and real NaN/None cells in one vectorized pass.
  # The counter is deliberately NOT called `sum`: at cell level that name
  # would rebind the builtin sum() for every later cell in the notebook.
  blank_count = int((values.isin(blank_markers) | values.isna()).sum())

  print(f"{column} blank values: {blank_count}")

In [None]:
# we want to exclude protected attributes to minimize bias
# Only the columns named here are carried forward. "Churn" is the label; it
# stays in the list so the features and the label come from the same rows.
feature_columns = ["Partner", "Dependents",
                   "tenure", "PhoneService", "MultipleLines",
                   "InternetService", "OnlineSecurity",
                   "OnlineBackup", "DeviceProtection", "TechSupport",
                   "StreamingTV", "StreamingMovies", "Contract",
                   "PaperlessBilling", "PaymentMethod", "MonthlyCharges",
                   "TotalCharges", "Churn"]

# Selecting with a list returns the columns in the order given above.
churn_data = df[feature_columns]
churn_data.head()

In [None]:
# A lot of the columns are in string format.
# We will convert them into integers
# info() lists each column's dtype, which shows which columns still hold text.
churn_data.info()

In [None]:
# to assist with converting, see all the unique values for each column
# Iterating over a DataFrame yields its column names.
for column in churn_data:
  print(f"{column} values: {churn_data[column].unique()}")
  print()  # blank line between columns

In [None]:
'''FEATURE ENGINEERING'''

def featurize(df):
  """Encode the raw churn columns as numeric model features.

  Args:
    df: DataFrame containing the numeric columns 'tenure', 'MonthlyCharges'
      and 'TotalCharges' plus every categorical column listed in `encodings`.

  Returns:
    A new DataFrame with the three numeric columns followed by one integer
    column per categorical feature, in the order of `encodings`. Rows keep
    the index of `df`, and `df` itself is not modified.
  """
  yes_only = {'Yes': 1}
  yes_no = {'Yes': 1, 'No': 0}
  # (column, {raw value: code}, code used for every other value)
  encodings = [
      ('Partner', yes_only, 0),
      ('Dependents', yes_only, 0),
      ('PhoneService', yes_only, 0),
      ('MultipleLines', yes_no, 2),
      ('InternetService', {'DSL': 1, 'Fiber optic': 0}, 2),
      ('OnlineSecurity', yes_no, 2),
      ('OnlineBackup', yes_no, 2),
      ('DeviceProtection', yes_no, 2),
      ('TechSupport', yes_no, 2),
      ('StreamingTV', yes_no, 2),
      ('StreamingMovies', yes_no, 2),
      ('Contract', {'Month-to-month': 1, 'One year': 0}, 2),
      ('PaperlessBilling', yes_only, 0),
      ('PaymentMethod',
       {'Electronic check': 1,
        'Mailed check': 0,
        'Bank transfer (automatic)': 2},
       3),
  ]

  # 'X' is the df that will hold our converted data.
  # .copy() makes X an independent frame, so adding the columns below does
  # not trigger pandas' SettingWithCopyWarning.
  X = df[['tenure', 'MonthlyCharges', 'TotalCharges']].copy()
  for column, codes, default in encodings:
    X[column] = [codes.get(value, default) for value in df[column]]

  return X


# Also convert 'Churn' into 0/1 format
# NOTE: despite its name, x_test_churn holds the features of EVERY row, not
# just a test split.
x_test_churn = featurize(churn_data)
y_actual_churn = [1 if y == 'Yes' else 0 for y in churn_data['Churn']]

# Give the labels the same index as the features. Rows were dropped during
# cleaning, so a default 0..n-1 index would no longer match x_test_churn's
# index labels, and any index-based join of the two would misalign rows.
churn_labels = pd.DataFrame(
    y_actual_churn, columns=["Churn"], index=x_test_churn.index
)

display(x_test_churn.head())
display(churn_labels.head())




In [None]:
'''EXPLORATORY DATA ANALYSIS'''
from matplotlib import pyplot as plt
from IPython.core.interactiveshell import InteractiveShell


In [None]:
#GRAPH COMPARING THE MONTHLY PAYMENTS FOR CUSTOMERS WHO DID AND DID NOT CHURN

#Setting up Datasets
monthlycharge_column = x_test_churn["MonthlyCharges"]

#creating a list of the monthly charges values
monthly_charge = monthlycharge_column.tolist()

#separating the monthly charges into 2 new lists - customers that did not churn and customers that did
# zip pairs every charge with the churn label at the same position
no_churn = [charge for charge, churned in zip(monthly_charge, y_actual_churn) if churned == 0]
yes_churn = [charge for charge, churned in zip(monthly_charge, y_actual_churn) if churned == 1]

#Creating the boxplot to compare these two groups
plt.boxplot([no_churn, yes_churn])
# Name the boxes through xticks (boxes are drawn at positions 1 and 2 by
# default); boxplot's own `labels` keyword is deprecated since Matplotlib 3.9.
plt.xticks([1, 2], ['No', 'Yes'])
plt.title("Monthly Charges Depending on Churn Status")
plt.xlabel("Churned?")
plt.ylabel("Monthly Charge")

In [None]:
#GRAPH COMPARING THE TOTAL PAYMENTS FOR CUSTOMERS WHO DID AND DID NOT CHURN

#Setting up Datasets
Totalcharge_column = x_test_churn["TotalCharges"]

#creating a list of the total charges values
total_charge = Totalcharge_column.tolist()

#separating the total charges into 2 new lists - customers that did not churn and customers that did
# zip pairs every charge with the churn label at the same position
no_churn = [charge for charge, churned in zip(total_charge, y_actual_churn) if churned == 0]
yes_churn = [charge for charge, churned in zip(total_charge, y_actual_churn) if churned == 1]

#Creating the boxplot to compare these two groups
plt.boxplot([no_churn, yes_churn])
# Name the boxes through xticks (boxes are drawn at positions 1 and 2 by
# default); boxplot's own `labels` keyword is deprecated since Matplotlib 3.9.
plt.xticks([1, 2], ['No', 'Yes'])
plt.title("Total Charges Depending on Churn Status")
plt.xlabel("Churned?")
plt.ylabel("Total Charge")

In [None]:
# GRAPH COMPARING THE TENURE FOR CUSTOMERS WHO DID AND DID NOT CHURN

# Setting up Datasets
Tenure_column = x_test_churn["tenure"]

# creating a list of the tenure values
tenure = Tenure_column.tolist()

# separating the tenure values into 2 new lists - customers that did not churn and customers that did
# zip pairs every tenure with the churn label at the same position
no_churn = [months for months, churned in zip(tenure, y_actual_churn) if churned == 0]
yes_churn = [months for months, churned in zip(tenure, y_actual_churn) if churned == 1]

# Creating the boxplot to compare these two groups
plt.boxplot([no_churn, yes_churn])
# Name the boxes through xticks (boxes are drawn at positions 1 and 2 by
# default); boxplot's own `labels` keyword is deprecated since Matplotlib 3.9.
plt.xticks([1, 2], ['No', 'Yes'])
plt.title("Tenure Depending on Churn Status")
plt.xlabel("Churned?")
plt.ylabel("Tenure")

In [None]:
#GRAPH COMPARING THE CHURNED STATUS ON INTERNET SERVICE
#Setting up Datasets
categories = ["DSL", "Fiber optic", "No"]
internet_column = x_test_churn["InternetService"]

#Creating a list of the internet service codes
# (featurize encodes 'DSL' as 1, 'Fiber optic' as 0 and anything else as 2)
internet = internet_column.tolist()

#Separating the codes into 2 groups: no_churn - customers who did not churn, yes_churn - customers who churned
# zip pairs every code with the churn label at the same position
no_churn = [code for code, churned in zip(internet, y_actual_churn) if churned == 0]
yes_churn = [code for code, churned in zip(internet, y_actual_churn) if churned == 1]

#Separating the non-churn group by internet service
no_churn_dsl = [code for code in no_churn if code == 1]
no_churn_fiberoptic = [code for code in no_churn if code == 0]
no_churn_no = [code for code in no_churn if code == 2]

#Separating the churning group by internet service
yes_churn_dsl = [code for code in yes_churn if code == 1]
yes_churn_fiberoptic = [code for code in yes_churn if code == 0]
yes_churn_no = [code for code in yes_churn if code == 2]

#Setting up for plotting
w=.4
cat = np.arange(len(categories))

#Bar plot comparing the internet service and who decided to churn or not
# Bar heights follow the order of `categories`: DSL, Fiber optic, No
plt.bar(cat - w/2, [len(no_churn_dsl), len(no_churn_fiberoptic), len(no_churn_no)], width=w, label="Not Churning")
plt.bar(cat + w/2, [len(yes_churn_dsl), len(yes_churn_fiberoptic), len(yes_churn_no)], width=w, label="Churning")


#Labeling the bar plot
plt.title("Internet Service Depending on Churn Status")
plt.xlabel("Internet Service Churned?")
plt.ylabel("Number of Customers")
plt.xticks(cat, categories)
plt.legend()

In [None]:
#online security and churned data

#Setting up Datasets
categories = ["Not use Online Security","Use Online Security", "No internet service"]
online_column = x_test_churn["OnlineSecurity"]

#Creating a list of the online security codes
# (featurize encodes 'Yes' as 1, 'No' as 0 and anything else as 2)
security = online_column.tolist()

#Separating the codes into 2 groups: no_churn - customers who did not churn, yes_churn - customers who churned
# zip pairs every code with the churn label at the same position
no_churn = [code for code, churned in zip(security, y_actual_churn) if churned == 0]
yes_churn = [code for code, churned in zip(security, y_actual_churn) if churned == 1]

#Separating the non-churn group by online security status
no_churn_n_security = [code for code in no_churn if code == 0]
no_churn_y_security = [code for code in no_churn if code == 1]
no_churn_no = [code for code in no_churn if code == 2]

#Separating the churning group by online security status
yes_churn_n_security = [code for code in yes_churn if code == 0]
yes_churn_y_security = [code for code in yes_churn if code == 1]
yes_churn_no = [code for code in yes_churn if code == 2]

#Setting up for plotting
w=.4
cat = np.arange(len(categories))

#Bar plot comparing the online security status and who decided to churn or not
# Bar heights follow the order of `categories`: not used, used, no internet service
plt.bar(cat - w/2, [len(no_churn_n_security), len(no_churn_y_security), len(no_churn_no)], width=w, label="Not Churning")
plt.bar(cat + w/2, [len(yes_churn_n_security), len(yes_churn_y_security), len(yes_churn_no)], width=w, label="Churning")

#Labeling the bar plot
plt.title("Online Security Depending on Churn Status")
plt.xlabel("Online Security Churned?")
plt.ylabel("Number of Customers")
plt.xticks(cat, categories)
plt.legend()


In [None]:
'''MODEL TRAINING'''
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows; the remaining 70% becomes the training set.
# Stratifying on the labels keeps the churn / no-churn ratio equal in both.
x_train, x_temp, y_train, y_temp = train_test_split(
    x_test_churn,
    churn_labels,
    test_size = 0.30,
    stratify = churn_labels,
    random_state = 104,
)

# Halve the held-out rows so that validation and testing each receive
# 15% of the original data.
x_val, x_test, y_val, y_test = train_test_split(
    x_temp,
    y_temp,
    test_size = 0.50,
    stratify = y_temp,
    random_state = 104,
)

# Preview the first rows of the features and labels of every split.
for split_title, split_features, split_labels in (
    ("New Training Datasets", x_train, y_train),
    ("New Validation Datasets", x_val, y_val),
    ("New Testing Datasets", x_test, y_test),
):
  print(split_title)
  display(split_features.head())
  display(split_labels.head())


In [None]:
# Model selection for Logistic Regression
# This block will take ~1 min to run

from sklearn.metrics import accuracy_score

Cs = [0.01, 0.1, 1, 10]
solvers = ["lbfgs", "liblinear"]
weights = [None, "balanced"]

# Convert once, outside the loops. The label column is flattened from
# (n, 1) to (n,): fit() expects 1-D labels and otherwise emits a
# DataConversionWarning on every call.
x_train_np = x_train.to_numpy()
y_train_np = y_train.to_numpy().ravel()
x_val_np = x_val.to_numpy()

# one (C, solver, class_weight, validation accuracy) tuple per configuration
lr_results = []

for c in Cs:
  for solver in solvers:
    for weight in weights:

      lr_classifier = LogisticRegression(
          solver = solver,
          max_iter = 10000,
          C = c,
          class_weight = weight,
          random_state = 45
      )

      lr_classifier.fit(x_train_np, y_train_np)

      # score every configuration on the validation split only
      y_predicted_lr = lr_classifier.predict(x_val_np)
      lr_accuracy_score = accuracy_score(y_val, y_predicted_lr)

      lr_results.append((c, solver, weight, lr_accuracy_score))




In [None]:
# Model selection for MLPClassifier
# This block will take like 6 mins to run (sorry)

alphas = [0.001, 0.0001, 0.00001]
layers = [(8,2), (20,), (50,), (50,50)]

# Convert once, outside the loops. The label column is flattened from
# (n, 1) to (n,): fit() expects 1-D labels and otherwise emits a
# DataConversionWarning on every call.
x_train_np = x_train.to_numpy()
y_train_np = y_train.to_numpy().ravel()
x_val_np = x_val.to_numpy()

# one (alpha, hidden_layer_sizes, validation accuracy) tuple per configuration
mlp_results = []

for alpha in alphas:
  for layer in layers:

    mlp_classifier = MLPClassifier(
        solver = 'lbfgs',
        alpha = alpha,
        hidden_layer_sizes = layer,
        random_state = 11,
        max_iter = 10000
    )

    mlp_classifier.fit(x_train_np, y_train_np)

    # score every configuration on the validation split only
    y_predicted_mlp = mlp_classifier.predict(x_val_np)
    mlp_accuracy_score = accuracy_score(y_val, y_predicted_mlp)

    mlp_results.append((alpha, layer, mlp_accuracy_score))


In [None]:
'''MODEL EVALUATION'''

# Each lr_results entry is (C, solver, class_weight, validation accuracy).
# The list is sorted in place on the accuracy, so the best configuration is
# printed last.
print("Logistic Regression results (ascending order)")
lr_results.sort(key = lambda x : x[3])
for result in lr_results:
  print(result)

print()
# Each mlp_results entry is (alpha, hidden_layer_sizes, validation accuracy),
# sorted in place the same way.
print("MLPClassifier results (ascending order)")
mlp_results.sort(key = lambda x : x[2])
for result in mlp_results:
  print(result)

In [None]:
# Now we will use the highest performing models
#            for LR/MLP on the actual test data

# LR: (0.1, 'liblinear', None, 0.7962085308056872)
# MLP: (0.001, (20,), 0.8075829383886256)

# NOTE(review): the Logistic Regression grid search used random_state = 45,
# while 42 is used here — confirm the difference is intended.
final_lr = LogisticRegression(
    C = 0.1,
    solver = 'liblinear',
    max_iter = 10000,
    class_weight = None,
    random_state = 42
)

final_mlp = MLPClassifier(
    solver = 'lbfgs',
    alpha = 0.001,
    hidden_layer_sizes = (20,),
    random_state = 11,
    max_iter = 10000
)

# Flatten the (n, 1) label column to (n,): fit() expects 1-D labels and
# otherwise emits a DataConversionWarning.
x_train_np = x_train.to_numpy()
y_train_np = y_train.to_numpy().ravel()

final_lr.fit(x_train_np, y_train_np)
final_mlp.fit(x_train_np, y_train_np)

In [None]:
# Score both final models on the held-out test split, which was not used
# for fitting or for choosing the hyperparameters.
final_lr_predicted = final_lr.predict(x_test.to_numpy())
final_lr_accuracy = accuracy_score(y_test, final_lr_predicted)

final_mlp_predicted = final_mlp.predict(x_test.to_numpy())
final_mlp_accuracy = accuracy_score(y_test, final_mlp_predicted)

print(f"Logistic Regression Model Accuracy = {final_lr_accuracy}")
print(f"MLPClassifier Model Accuracy = {final_mlp_accuracy}")

# The MLPClassifier did marginally better

In [None]:
# Identify which features the MLPClassifier model relies on the most

from sklearn.inspection import permutation_importance

# The result gets its own name: storing it in `mlp_results` would replace the
# grid-search list, and re-running the evaluation cell would then fail.
# Arrays are passed (with 1-D labels) because final_mlp was fitted on arrays;
# handing it a DataFrame triggers sklearn's feature-name warning.
mlp_importances = permutation_importance(
    final_mlp, x_val.to_numpy(), y_val.to_numpy().ravel(),
    n_repeats = 10, random_state = 99
)

# importances_mean holds one value per column, in the column order of x_val
mlp_permutation = pd.DataFrame({
    "feature": x_val.columns,
    "importance": mlp_importances.importances_mean
}).sort_values("importance", ascending = False)

print(mlp_permutation)

In [None]:
# redo featurizing keeping the churn column this time
def new_featurize(df):
  """Encode the raw churn columns as numbers, keeping the 'Churn' label.

  Args:
    df: DataFrame containing the numeric columns 'tenure', 'MonthlyCharges'
      and 'TotalCharges' plus every categorical column listed in `encodings`
      (including 'Churn').

  Returns:
    A new DataFrame with the three numeric columns followed by one integer
    column per categorical column, in the order of `encodings`; 'Churn' is
    the last column. Rows keep the index of `df`, and `df` itself is not
    modified.
  """
  yes_only = {'Yes': 1}
  yes_no = {'Yes': 1, 'No': 0}
  # (column, {raw value: code}, code used for every other value)
  encodings = [
      ('Partner', yes_only, 0),
      ('Dependents', yes_only, 0),
      ('PhoneService', yes_only, 0),
      ('MultipleLines', yes_no, 2),
      ('InternetService', {'DSL': 1, 'Fiber optic': 0}, 2),
      ('OnlineSecurity', yes_no, 2),
      ('OnlineBackup', yes_no, 2),
      ('DeviceProtection', yes_no, 2),
      ('TechSupport', yes_no, 2),
      ('StreamingTV', yes_no, 2),
      ('StreamingMovies', yes_no, 2),
      ('Contract', {'Month-to-month': 1, 'One year': 0}, 2),
      ('PaperlessBilling', yes_only, 0),
      ('PaymentMethod',
       {'Electronic check': 1,
        'Mailed check': 0,
        'Bank transfer (automatic)': 2},
       3),
      ('Churn', yes_only, 0),
  ]

  # 'X' is the df that will hold our converted data.
  # .copy() makes X an independent frame, so adding the columns below does
  # not trigger pandas' SettingWithCopyWarning.
  X = df[['tenure', 'MonthlyCharges', 'TotalCharges']].copy()
  for column, codes, default in encodings:
    X[column] = [codes.get(value, default) for value in df[column]]

  return X


# Encoded copy of the cleaned data that still carries the 0/1 'Churn' column,
# used by the churn-rate breakdowns in the following cells.
churn_analysis = new_featurize(churn_data)

display(churn_analysis.head())




In [None]:
# MonthlyCharges had a high effect on churn rate

# Split the customers into quantile bins of MonthlyCharges (q = 10 asks for
# deciles); duplicates = "drop" discards repeated bin edges instead of raising.
churn_analysis["MonthlyCharges_bin"] = pd.qcut(
    churn_analysis["MonthlyCharges"],
    q = 10,
    duplicates = "drop"
)

# The mean of the 0/1 Churn column is the share of customers in each bin who
# churned. observed = False is spelled out because the bin column is
# categorical: it keeps the existing result and avoids pandas' FutureWarning
# about the default value changing.
bin_stats = churn_analysis.groupby("MonthlyCharges_bin", observed = False)["Churn"].mean()

plt.figure(figsize = (10,5))
bin_stats.plot(kind = "bar")
plt.ylabel("Churn Rate")
plt.xlabel("MonthlyCharges (Binned)")
plt.title("Churn Rate by MonthlyCharges Decile")
plt.tight_layout()
plt.show()

In [None]:
# InternetService churn rate analysis
# Codes come from new_featurize: 1 = 'DSL', 0 = 'Fiber optic', 2 = any other value.
# The mean of the 0/1 Churn column is the share of customers who churned.

churn_analysis.groupby("InternetService")["Churn"].mean()

# 0 means Fiber Optic, they churn the most

In [None]:
# Churn rate per TechSupport code (new_featurize: 1 = 'Yes', 0 = 'No', 2 = any other value).
churn_analysis.groupby("TechSupport")["Churn"].mean()


In [None]:
# Churn rate per MultipleLines code (new_featurize: 1 = 'Yes', 0 = 'No', 2 = any other value).
churn_analysis.groupby("MultipleLines")["Churn"].mean()


In [None]:
# Churn rate per OnlineSecurity code (new_featurize: 1 = 'Yes', 0 = 'No', 2 = any other value).
churn_analysis.groupby("OnlineSecurity")["Churn"].mean()


In [None]:
# Churn rate per StreamingMovies code (new_featurize: 1 = 'Yes', 0 = 'No', 2 = any other value).
churn_analysis.groupby("StreamingMovies")["Churn"].mean()


In [None]:
# Churn rate per Partner code (new_featurize: 1 = 'Yes', 0 = any other value).
churn_analysis.groupby("Partner")["Churn"].mean()

In [None]:
# Churn rate per Dependents code (new_featurize: 1 = 'Yes', 0 = any other value).
churn_analysis.groupby("Dependents")["Churn"].mean()


In [None]:
# Churn rate per PhoneService code (new_featurize: 1 = 'Yes', 0 = any other value).
churn_analysis.groupby("PhoneService")["Churn"].mean()


In [None]:
# Churn rate per OnlineBackup code (new_featurize: 1 = 'Yes', 0 = 'No', 2 = any other value).
churn_analysis.groupby("OnlineBackup")["Churn"].mean()


In [None]:
# Churn rate per DeviceProtection code (new_featurize: 1 = 'Yes', 0 = 'No', 2 = any other value).
churn_analysis.groupby("DeviceProtection")["Churn"].mean()


In [None]:
# Churn rate per StreamingTV code (new_featurize: 1 = 'Yes', 0 = 'No', 2 = any other value).
churn_analysis.groupby("StreamingTV")["Churn"].mean()


In [None]:
# Churn rate per Contract code (new_featurize: 1 = 'Month-to-month', 0 = 'One year', 2 = any other value).
churn_analysis.groupby("Contract")["Churn"].mean()


In [None]:
# Churn rate per PaperlessBilling code (new_featurize: 1 = 'Yes', 0 = any other value).
churn_analysis.groupby("PaperlessBilling")["Churn"].mean()


In [None]:
# Churn rate per PaymentMethod code (new_featurize: 1 = 'Electronic check',
# 0 = 'Mailed check', 2 = 'Bank transfer (automatic)', 3 = any other value).
churn_analysis.groupby("PaymentMethod")["Churn"].mean()


In [None]:
'''DATA VISUALIZATION - LR Model Accuracy'''

#Setting up Datasets
y_test_set = y_test["Churn"].tolist()

# Tally every prediction against the true label at the same position.
TP = 0
TN = 0
FP = 0
FN = 0
for predicted, actual in zip(final_lr_predicted, y_test_set):
  if predicted == 1:
    if actual == 1:
      TP += 1
    else:
      FP += 1
  elif predicted == 0:
    if actual == 0:
      TN += 1
    else:
      FN += 1

print(TP, TN, FP, FN)

#Graphing
# One pair of bars per predicted class: correct ("True") next to incorrect
# ("False") predictions of that class.
dvcategories = ["Negative","Positive"]
cat = np.arange(len(dvcategories))
w=0.4

plt.bar(cat - w/2, [TN, TP], width=w, label="True")
plt.bar(cat + w/2, [FN, FP], width=w, label="False")

plt.title("LR Model Accuracy")
plt.xlabel("Positive or Negative?")
plt.ylabel("Number of Customers")
plt.xticks(cat, dvcategories)
plt.legend()

In [None]:
'''DATA VISUALIZATION - MLP Model Accuracy'''

#Setting up Datasets
y_test_set = y_test["Churn"].tolist()

# Tally every prediction against the true label at the same position.
TP = 0
TN = 0
FP = 0
FN = 0
for predicted, actual in zip(final_mlp_predicted, y_test_set):
  if predicted == 1:
    if actual == 1:
      TP += 1
    else:
      FP += 1
  elif predicted == 0:
    if actual == 0:
      TN += 1
    else:
      FN += 1

print(TP, TN, FP, FN)

#Graphing
# One pair of bars per predicted class: correct ("True") next to incorrect
# ("False") predictions of that class.
dvcategories = ["Negative","Positive"]
cat = np.arange(len(dvcategories))
w=0.4

plt.bar(cat - w/2, [TN, TP], width=w, label="True")
plt.bar(cat + w/2, [FN, FP], width=w, label="False")

plt.title("MLP Model Accuracy")
plt.xlabel("Positive or Negative?")
plt.ylabel("Number of Customers")
plt.xticks(cat, dvcategories)
plt.legend()

In [None]:
'''BIAS AND FAIRNESS EVALUATION'''
# Lydia

# imports
from sklearn.metrics import accuracy_score, precision_score, recall_score

# y_test is currently shape (1055, 1) => fix to (1055,)
# Re-index with x_test's index so every label lines up with its feature row.
y_true = pd.Series(np.array(y_test).ravel(), index=x_test.index)

# encode yes/no strings as 1/0 if the labels are still text
if y_true.dtype == object:
  y_true = y_true.map({"Yes": 1, "No": 0, "yes": 1, "no": 0})

# 1: build test_df with correct alignment
test_df = x_test.copy()
test_df["y_true"] = y_true

# bring the sensitive attribute back if x_test does not contain it
# (it was excluded from the model features)
sensitive_attr = "gender"
if sensitive_attr not in test_df.columns:
  test_df[sensitive_attr] = df.loc[test_df.index, sensitive_attr]

# 2: model predictions
# Both models were fitted on NumPy arrays, so predict on an array as well;
# passing the DataFrame triggers sklearn's feature-name warning.
x_test_np = x_test.to_numpy()
test_df["pred_lr"] = final_lr.predict(x_test_np)
test_df["pred_mlp"] = final_mlp.predict(x_test_np)

# (display name, prediction column) of every model under evaluation
model_columns = [("Logistic Regression", "pred_lr"),
                 ("MLP Classifier", "pred_mlp")]

# 3: fairness metrics by group
groups = test_df[sensitive_attr].unique()
results = []

for model_name, pred_col in model_columns:

  print(f"\nFairness Metrics for {model_name}: ")

  for g in groups:
    subset = test_df[test_df[sensitive_attr] == g]

    if len(subset) == 0:
      print(f"Skipping group '{g}' (0 rows).")
      continue

    # labels must be complete, otherwise the metrics below are meaningless
    if subset["y_true"].isna().any():
      raise ValueError(f"NaNs detected in y_true for group '{g}' after alignment.")

    acc = accuracy_score(subset["y_true"], subset[pred_col])
    prec = precision_score(subset["y_true"], subset[pred_col], zero_division=0)
    rec = recall_score(subset["y_true"], subset[pred_col], zero_division=0)
    # share of the group that the model predicts will churn
    positive_rate = subset[pred_col].mean()

    results.append({
        "model": model_name,
        sensitive_attr: g,
        "count": len(subset),
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "positive_prediction_rate": positive_rate
    })

fairness_df = pd.DataFrame(results)
display(fairness_df)

# 4: disparate impact / 4/5ths rule for each model
print("\nDisparate Impact Analysis: ")

for model_name, pred_col in model_columns:

  print(f"\n{model_name}: ")
  # positive prediction rate (PPR) of every group
  ppr = test_df.groupby(sensitive_attr)[pred_col].mean()

  # if only a single group exists, skip
  if len(ppr) < 2:
    print("Not enough groups to compare.")
    continue

  min_group = ppr.idxmin()
  max_group = ppr.idxmax()
  # lowest rate divided by highest rate; undefined when the highest rate is 0
  ratio = ppr[min_group] / ppr[max_group] if ppr[max_group] != 0 else np.nan

  print(f"Lowest PPR group: {min_group} ({ppr[min_group]:.3f})")
  print(f"Highest PPR group: {max_group} ({ppr[max_group]:.3f})")
  print(f"Disparate Impact Ratio (low/high): {ratio:.3f}")

  if pd.isna(ratio):
    print("Ratio is undefined (division by 0).")
  elif ratio < 0.8:
    print("Potential disparate impact detected (ratio < 0.8).")
  else:
    print("No disparate impact detected.")

In [None]:
'''DEMO'''

import pickle

# Save the final Logistic Regression model. The context manager closes the
# file even if pickling raises.
with open("lr_model.saved", "wb") as lr_file:
  pickle.dump(final_lr, lr_file)


In [None]:
import numpy as np

# Reload the model from "lr_model.saved"; the context manager closes the
# file even if unpickling fails.
# NOTE: pickle can run arbitrary code while loading — only open files that
# this notebook wrote itself.
with open("lr_model.saved", "rb") as open_lr:
  model = pickle.load(open_lr)

# sample inputs of someone who is likely to churn
# NOTE(review): monthlycharges and totalcharges are negative here, while
# charges would normally be non-negative — confirm these values are intended.
tenure = 5
monthlycharges = -90
totalcharges = -4500
partner = 0
dependents = 0
phoneservice = 1
multiplelines = 1
internetservice = 0
onlinesecurity = 0
onlinebackup = 0
deviceprot= 0
techsupport = 0
streamingtv= 0
streamingmovies= 0
contract= 1
paperless = 1
payment = 3

# A single row whose values follow the column order produced by featurize().
input_data = np.array([[tenure,monthlycharges,totalcharges,
                           partner,dependents,phoneservice,
                           multiplelines,internetservice,
                           onlinesecurity,onlinebackup,
                           deviceprot,techsupport,streamingtv,
                           streamingmovies,contract,paperless,
                           payment]])

# Predict with the reloaded model so the demo really exercises the saved
# file (the loaded `model` used to be ignored in favour of final_lr).
demo_predicted = model.predict(input_data)

if demo_predicted[0] == 1:
  print("This person is likely to churn")
elif demo_predicted[0] == 0:
  print("This person is not likely to churn")

In [None]:
# sample inputs of someone who is not likely to churn
tenure = 12
monthlycharges = 20
totalcharges = 1020
partner = 1
dependents = 1
phoneservice = 1
multiplelines = 1
internetservice = 0
onlinesecurity = 1
onlinebackup = 1
deviceprot = 1
techsupport = 1
streamingtv = 1
streamingmovies = 1
contract = 1
paperless = 0
payment = 3

# A single row whose values follow the column order produced by featurize().
input_data2 = np.array([[
    tenure, monthlycharges, totalcharges,
    partner, dependents, phoneservice,
    multiplelines, internetservice,
    onlinesecurity, onlinebackup,
    deviceprot, techsupport, streamingtv,
    streamingmovies, contract, paperless,
    payment,
]])

demo_predicted2 = final_lr.predict(input_data2)

# predict() returns one label per input row; there is only one row here.
predicted_label2 = demo_predicted2[0]

if predicted_label2 == 1:
  print("This person is likely to churn")

if predicted_label2 == 0:
  print("This person is not likely to churn")