In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer



In [2]:
complaints = pd.read_csv('shared/complaints_25Nov21.csv')  # Replace 'complaints.csv' with your dataset filename


In [3]:
selected_features = ['Product', 'Sub-product', 'Issue', 'State', 'Tags', 'Submitted via',
                     'Company response to consumer', 'Timely response?']
X = complaints[selected_features]
y = complaints['Consumer disputed?']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)



In [6]:
proportion_disputed = np.sum(y_train) / len(y_train)
if proportion_disputed < 0.30:
    undersampler = RandomUnderSampler(random_state=123)
    X_train, y_train = undersampler.fit_resample(X_train, y_train)



In [7]:
model_xgb = XGBClassifier(random_state=123)
model_xgb.fit(X_train, y_train)



In [8]:
y_pred = model_xgb.predict(X_test)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)



In [13]:
print(report)
print(conf_matrix)

              precision    recall  f1-score   support

           0       0.84      0.53      0.65     32504
           1       0.27      0.63      0.38      8948

    accuracy                           0.55     41452
   macro avg       0.55      0.58      0.51     41452
weighted avg       0.72      0.55      0.59     41452

[[17128 15376]
 [ 3302  5646]]


In [15]:
proportion_disputed_after_undersampling = np.sum(y_train) / len(y_train)
print(proportion_disputed_after_undersampling)

0.5


In [14]:
proportion_disputed_in_test_set = np.sum(y_test) / len(y_test)
print(proportion_disputed_in_test_set)

0.21586413200810575


In [17]:
proportion_disputed_after_undersampling = np.sum(y_train) / len(y_train)

print("Proportion of consumers who raised a dispute in the training dataset after undersampling:", proportion_disputed_after_undersampling)

Proportion of consumers who raised a dispute in the training dataset after undersampling: 0.5


In [19]:
from sklearn.metrics import recall_score

# Encode 'Yes' as '1' using the label encoder
y_test_labels = label_encoder.transform(['Yes'])

# Predict on the test set
y_pred = model_xgb.predict(X_test)

# Calculate the recall for 'Consumer disputed?' = 'Yes'
recall = recall_score(y_test, y_pred, pos_label=y_test_labels[0])  # Extract the label (1) from the array

print("Recall for 'Consumer disputed?' = 'Yes' on the test set:", recall)

Recall for 'Consumer disputed?' = 'Yes' on the test set: 0.6309789897183729


In [22]:
def calculate_total_cost(y_true, y_pred):
    # Define your cost values
    cost_false_positive = 100  # Cost for false positives
    cost_false_negative = 500  # Cost for false negatives

    # Calculate the number of false positives and false negatives
    false_positives = np.sum((y_true == 0) & (y_pred == 1))
    false_negatives = np.sum((y_true == 1) & (y_pred == 0))

    # Calculate the total cost
    total_cost = (false_positives * cost_false_positive) + (false_negatives * cost_false_negative)

    return total_cost

In [23]:
y_pred = model_xgb.predict(X_test)

# Calculate the total cost based on model predictions
extra_diligence_cost = 90  # Cost of extra diligence per complaint
dispute_cost = 600  # Cost of a dispute

# Find the indices of positive predictions (model predicts 'Yes')
positive_indices = np.where(y_pred == 1)[0]

# Calculate the total cost based on the number of positive predictions and the extra diligence cost
total_cost = len(positive_indices) * extra_diligence_cost

print("Total cost to the banks based on model results:", total_cost)

Total cost to the banks based on model results: 1891980


In [25]:
y_pred = model_xgb.predict(X_test)

# Define the costs
extra_diligence_cost = 90  # Cost of extra diligence per complaint
dispute_cost = 600  # Cost of a dispute

# Initialize the total cost
total_cost = 0

# Loop through the test set and calculate the total cost
for i in range(len(y_pred)):
    if y_pred[i] == 1:
        # Model predicts 'Yes': Perform extra diligence
        total_cost += extra_diligence_cost
    else:
        # Model predicts 'No': No extra cost incurred
        pass

print("Total cost to the banks based on model results:", total_cost)

Total cost to the banks based on model results: 1891980


In [28]:
import numpy as np

# Create an array of threshold values
thresholds = np.arange(0, 1.01, 0.01)

# Initialize variables to keep track of the best threshold and lowest cost
best_threshold = 0
lowest_cost = float('inf')

# Define the costs
extra_diligence_cost = 90  # Cost of extra diligence per complaint
dispute_cost = 600  # Cost of a dispute

# Loop through thresholds and calculate costs
for threshold in thresholds:
    # Classify based on the current threshold
    y_pred_threshold = (model_xgb.predict_proba(X_test)[:, 1] >= threshold).astype(int)
    
    # Calculate the number of false positives and false negatives
    false_positives = np.sum((y_test == 0) & (y_pred_threshold == 1))
    false_negatives = np.sum((y_test == 1) & (y_pred_threshold == 0))
    
    # Calculate the total cost for this threshold
    total_cost = (false_positives * extra_diligence_cost) + (false_negatives * dispute_cost)
    
    # Update if the cost is lower
    if total_cost < lowest_cost:
        lowest_cost = total_cost
        best_threshold = threshold

print("Best Threshold:", best_threshold)
print("Lowest Total Cost:", lowest_cost)

Best Threshold: 0.34
Lowest Total Cost: 2795430


In [29]:
import numpy as np

# Create an array of threshold values
thresholds = np.arange(0, 1.01, 0.01)

# Initialize variables to keep track of the best threshold and lowest cost
best_threshold = 0
lowest_cost = float('inf')

# Define the costs
extra_diligence_cost = 90  # Cost of extra diligence per complaint
dispute_cost = 600  # Cost of a dispute

# Loop through thresholds and calculate costs
for threshold in thresholds:
    # Classify based on the current threshold
    y_pred_threshold = (model_xgb.predict_proba(X_test)[:, 1] >= threshold).astype(int)
    
    # Calculate the number of false positives and false negatives
    false_positives = np.sum((y_test == 0) & (y_pred_threshold == 1))
    false_negatives = np.sum((y_test == 1) & (y_pred_threshold == 0))
    
    # Calculate the total cost for this threshold
    total_cost = (false_positives * extra_diligence_cost) + (false_negatives * dispute_cost)
    
    # Update if the cost is lower
    if total_cost < lowest_cost:
        lowest_cost = total_cost
        best_threshold = threshold

print("Best Threshold:", best_threshold)
print("Lowest Total Cost:", lowest_cost)

Best Threshold: 0.34
Lowest Total Cost: 2795430


In [30]:
import numpy as np

# Create an array of threshold values
thresholds = np.arange(0, 1.01, 0.01)

# Define the costs
extra_diligence_cost = 90  # Cost of extra diligence per complaint
dispute_cost = 600  # Cost of a dispute

# Initialize variables to keep track of the best threshold and lowest cost
best_threshold = 0
lowest_cost = float('inf')

# Loop through thresholds and calculate costs
for threshold in thresholds:
    # Classify based on the current threshold
    y_pred_threshold = (model_xgb.predict_proba(X_test)[:, 1] >= threshold).astype(int)
    
    # Calculate the number of false positives and false negatives
    false_positives = np.sum((y_test == 0) & (y_pred_threshold == 1))
    false_negatives = np.sum((y_test == 1) & (y_pred_threshold == 0))
    
    # Calculate the total cost for this threshold
    total_cost = (false_positives * extra_diligence_cost) + (false_negatives * dispute_cost)
    
    # Update if the cost is lower
    if total_cost < lowest_cost:
        lowest_cost = total_cost
        best_threshold = threshold

print("Best Threshold:", best_threshold)
print("Lowest Total Cost:", lowest_cost)

Best Threshold: 0.34
Lowest Total Cost: 2795430
