In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the data
df = pd.read_csv('shared/complaints_25Nov21.csv')

In [3]:
# Select predictor variables
X = df[['Product', 'Sub-product', 'Issue', 'State', 'Tags', 'Submitted via', 'Company response to consumer', 'Timely response?']]

# Convert categorical variables to numerical
X = pd.get_dummies(X)

# Set target variable
le = LabelEncoder()
y = le.fit_transform(df['Consumer disputed?'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [4]:
# Check proportion of disputed complaints in training set and perform undersampling if necessary
if (y_train == 1).sum() / len(y_train) < 0.3:
    undersampler = RandomUnderSampler(random_state=123)
    X_train, y_train = undersampler.fit_resample(X_train, y_train)

In [5]:
# Train the model
model_xgb = XGBClassifier(random_state=123)
model_xgb.fit(X_train, y_train)

In [6]:
# Evaluate the model
y_pred = model_xgb.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.53      0.65     32504
           1       0.27      0.63      0.38      8948

    accuracy                           0.55     41452
   macro avg       0.55      0.58      0.51     41452
weighted avg       0.71      0.55      0.59     41452

[[17157 15347]
 [ 3323  5625]]


In [7]:
# Calculate the proportion of consumers who raised a dispute in the test set
dispute_proportion_test = y_test.mean()
print(f"Proportion of consumers who raised a dispute in the test set: {dispute_proportion_test}")


Proportion of consumers who raised a dispute in the test set: 0.21586413200810575


In [8]:
# Calculate the proportion of consumers who raised a dispute in the training set after undersampling
dispute_proportion_train = y_train.mean()
print(f"Proportion of consumers who raised a dispute in the training set after undersampling: {dispute_proportion_train}")


Proportion of consumers who raised a dispute in the training set after undersampling: 0.5


In [9]:
# Calculate the recall for the category 'Consumer disputed?' = 'Yes' on the test set
from sklearn.metrics import recall_score
recall = recall_score(y_test, y_pred)
print(f"Recall for the category 'Consumer disputed?' = 'Yes' on the test set: {recall}")


Recall for the category 'Consumer disputed?' = 'Yes' on the test set: 0.62863209655789


In [10]:
# Calculate the total cost to the banks of dealing with the complaints in the test set if there were no model
base_case_cost = y_test.sum() * 600 + (len(y_test) - y_test.sum()) * 100
print(f"Total cost to the banks of dealing with the complaints in the test set if there were no model: {base_case_cost}")


Total cost to the banks of dealing with the complaints in the test set if there were no model: 8619200


In [11]:
# Calculate the total cost to the banks of dealing with the complaints in the test set based on the model results
extra_diligence_cost = y_pred.sum() * 90
dispute_cost = (y_pred & y_test).sum() * 500
non_dispute_cost = (len(y_test) - y_pred.sum()) * 100
total_cost = extra_diligence_cost + dispute_cost + non_dispute_cost
print(f"Total cost to the banks of dealing with the complaints in the test set based on the model results: {total_cost}")


Total cost to the banks of dealing with the complaints in the test set based on the model results: 6747980


In [12]:
# Change the value of the threshold and determine the lowest total cost to the banks based on the observations in the test set
import numpy as np
thresholds = np.linspace(0, 1, 100)
costs = []
for threshold in thresholds:
    y_pred = (model_xgb.predict_proba(X_test)[:, 1] > threshold).astype(int)
    extra_diligence_cost = y_pred.sum() * 90
    dispute_cost = (y_pred & y_test).sum() * 500
    non_dispute_cost = (len(y_test) - y_pred.sum()) * 100
    total_cost = extra_diligence_cost + dispute_cost + non_dispute_cost
    costs.append(total_cost)
min_cost = min(costs)
print(f"Lowest total cost to the banks based on the observations in the test set: {min_cost}")


Lowest total cost to the banks based on the observations in the test set: 4145190


In [13]:
# Determine the value of the threshold at which the lowest dollar cost is achieved
optimal_threshold = thresholds[costs.index(min_cost)]
print(f"Value of the threshold at which the lowest dollar cost is achieved: {optimal_threshold}")


Value of the threshold at which the lowest dollar cost is achieved: 0.9090909090909092
