#### Importing the necessary libraries

In [1]:
import torch
import pickle
import pandas as pd
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_score

#### Loading 5 product models

In [2]:
def _load_pickled_model(path):
    """Deserialize and return the pickled object stored at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code while deserializing;
    # only point this at model files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# One pickled sub-product model per product category
# (select_subproduct_model picks between these five by predicted product label).
trained_model_cr = _load_pickled_model('models/Credit_Reporting_model.pkl')
trained_model_cp = _load_pickled_model('models/Credit_Prepaid_Card_model.pkl')
trained_model_cs = _load_pickled_model('models/Checking_saving_model.pkl')
trained_model_l = _load_pickled_model('models/loan_model.pkl')
trained_model_d = _load_pickled_model('models/Debt_model.pkl')

#### Loading 17 issue models

In [3]:
# (model name, pickle path) pairs for the sub-issue models; dict() keeps this order.
issue_model_files = dict([
    ('trained_model_account_operations', 'issue_models/account_operations_and_unauthorized_transaction_issues.pkl'),
    ('trained_model_collect_debt', 'issue_models/attempts_to_collect_debt_not_owed.pkl'),
    ('trained_model_closing_account', 'issue_models/closing_an_account.pkl'),
    ('trained_model_closing_your_account', 'issue_models/closing_your_account.pkl'),
    ('trained_model_credit_report', 'issue_models/credit_report_and_monitoring_issues.pkl'),
    ('trained_model_lender', 'issue_models/dealing_with_your_lender_or_servicer.pkl'),
    ('trained_model_disputes', 'issue_models/disputes_and_misrepresentations.pkl'),
    ('trained_model_improper_use_report', 'issue_models/improper_use_of_your_report.pkl'),
    ('trained_model_incorrect_info', 'issue_models/incorrect_information_on_your_report.pkl'),
    ('trained_model_legal_and_threat', 'issue_models/legal_and_threat_actions.pkl'),
    ('trained_model_managing_account', 'issue_models/managing_an_account.pkl'),
    ('trained_model_payment_funds', 'issue_models/payment_and_funds_management.pkl'),
    ('trained_model_investigation_wrt_issue', 'issue_models/problem_with_a_company\'s_investigation_into_an_existing_issue.pkl'),
    ('trained_model_investigation_wrt_problem', 'issue_models/problem_with_a_company\'s_investigation_into_an_existing_problem.pkl'),
    ('trained_model_credit_investigation_wrt_problem', 'issue_models/problem_with_a_credit_reporting_company\'s_investigation_into_an_existing_problem.pkl'),
    ('trained_model_purchase_shown', 'issue_models/problem_with_a_purchase_shown_on_your_statement.pkl'),
    ('trained_model_notification_about_debt', 'issue_models/written_notification_about_debt.pkl'),
])

# Loaded models, keyed by the same names as issue_model_files.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load model files from a trusted source.
issue_models = {}

for model_name in issue_model_files:
    file_path = issue_model_files[model_name]
    with open(file_path, 'rb') as handle:
        issue_models[model_name] = pickle.load(handle)

#### LLM to classify the product based on the narrative

In [4]:
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Text-classification pipeline that predicts the product label of a narrative.
# truncation=True together with max_length=512 caps the length of each input.
product_classifier = pipeline(
    "text-classification",
    model="Mahesh9/distil-bert-fintuned-product-cfpb-complaints",
    device=device,
    truncation=True,
    max_length=512,
)

#### Function to choose the appropriate product model to classify the sub-product

In [5]:
def select_subproduct_model(predicted_product):
    """Return the sub-product model that belongs to a predicted product label.

    Args:
        predicted_product: Product label to look up (compared with ``==``
            against the five known product names).

    Returns:
        The module-level trained model associated with that product.

    Raises:
        ValueError: If the label matches none of the known product categories.
    """
    # Ordered (label, thunk) table. The thunks defer the global lookup so that
    # only the model for the matching label is ever touched.
    product_table = (
        ('Credit Reporting', lambda: trained_model_cr),
        ('Credit/Prepaid Card', lambda: trained_model_cp),
        ('Checking or savings account', lambda: trained_model_cs),
        ('Loans / Mortgage', lambda: trained_model_l),
        ('Debt collection', lambda: trained_model_d),
    )
    for known_label, fetch_model in product_table:
        if predicted_product == known_label:
            return fetch_model()
    raise ValueError("Invalid predicted product category")

#### LLM to classify the issue based on the narrative

In [6]:
# Text-classification pipeline that predicts the issue label of a narrative.
# It runs on `device`; truncation=True with max_length=512 caps the input length.
issue_classifier = pipeline(
    "text-classification",
    model="Mahesh9/distil-bert-fintuned-issues-cfpb-complaints",
    device=device,
    truncation=True,
    max_length=512,
)

#### Function to choose the appropriate issue model to classify the sub-issue

In [7]:
def select_subissue_model(predicted_issue):
    """Return the sub-issue model that belongs to a predicted issue label.

    Args:
        predicted_issue: Issue label to look up (compared with ``==`` against
            the known issue names, in the order listed below).

    Returns:
        The entry of the module-level ``issue_models`` dict for that issue.

    Raises:
        ValueError: If the label matches none of the known issue categories.
    """
    # Ordered (issue label, key into issue_models) table.
    issue_table = (
        ("Problem with a company's investigation into an existing problem",
         'trained_model_investigation_wrt_problem'),
        ("Problem with a credit reporting company's investigation into an existing problem",
         'trained_model_credit_investigation_wrt_problem'),
        ("Problem with a company's investigation into an existing issue",
         'trained_model_investigation_wrt_issue'),
        ("Problem with a purchase shown on your statement",
         'trained_model_purchase_shown'),
        ("Incorrect information on your report",
         'trained_model_incorrect_info'),
        ("Improper use of your report",
         'trained_model_improper_use_report'),
        ("Account Operations and Unauthorized Transaction Issues",
         'trained_model_account_operations'),
        ("Payment and Funds Management",
         'trained_model_payment_funds'),
        ("Managing an account",
         'trained_model_managing_account'),
        ("Attempts to collect debt not owed",
         'trained_model_collect_debt'),
        ("Written notification about debt",
         'trained_model_notification_about_debt'),
        ("Dealing with your lender or servicer",
         'trained_model_lender'),
        ("Disputes and Misrepresentations",
         'trained_model_disputes'),
        ("Closing your account",
         'trained_model_closing_your_account'),
        ("Closing an account",
         'trained_model_closing_account'),
        ("Credit Report and Monitoring Issues",
         'trained_model_credit_report'),
        ("Legal and Threat Actions",
         'trained_model_legal_and_threat'),
    )
    for known_label, model_key in issue_table:
        if predicted_issue == known_label:
            return issue_models[model_key]
    raise ValueError("Invalid predicted issue category")

#### Driver code to classify the complaint into various categories

In [8]:
def classify_complaint(narrative):
    """Classify one complaint narrative into product, sub-product, issue and sub-issue.

    The product and issue labels come from the two text-classification
    pipelines; each label then selects the model whose ``predict`` supplies
    the matching sub-level label.

    Args:
        narrative: Complaint text passed to both pipelines and, wrapped in a
            one-element list, to the selected sub-level models.

    Returns:
        dict with the keys "Product", "Sub-product", "Issue" and "Sub-issue".

    Raises:
        ValueError: Propagated from the model selectors when a predicted
            label is not one they know.
    """
    labels = {}

    # Product first, because its label decides which sub-product model runs.
    labels["Product"] = product_classifier(narrative)[0]['label']
    product_submodel = select_subproduct_model(labels["Product"])
    labels["Sub-product"] = product_submodel.predict([narrative])[0]

    # Same two-step scheme for the issue / sub-issue pair.
    labels["Issue"] = issue_classifier(narrative)[0]['label']
    issue_submodel = select_subissue_model(labels["Issue"])
    labels["Sub-issue"] = issue_submodel.predict([narrative])[0]

    return labels

In [9]:
# Smoke test: run the full classification chain on one hand-written narrative.
# The string literal spans two lines, so it contains the newline and the indentation as written.
narrative = """It is absurd that I have consistently made timely payments for this account and have never been
             overdue. I kindly request that you promptly update my account to reflect this accurately."""

# Bare last expression: the returned dict is displayed as the cell output.
classify_complaint(narrative)

{'Product': 'Credit/Prepaid Card',
 'Sub-product': 'General-purpose credit card or charge card',
 'Issue': "Problem with a company's investigation into an existing problem",
 'Sub-issue': 'Was not notified of investigation status or results'}

### Evaluating on the test set

In [17]:
from datasets import load_dataset
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, precision_score

# Read the test CSV; the loaded rows are taken from the 'train' split of the result.
dataset = load_dataset('csv', data_files='/content/drive/MyDrive/Customer_Complaints data/test-data-split.csv')
test_data = dataset['train'].to_pandas()

# Ground-truth label columns.
actual_products = test_data['Product']
actual_subproducts = test_data['Sub-product']
actual_issues = test_data['Issue']
actual_subissues = test_data['Sub-issue']

# Prediction buffers, filled in test-set row order.
predicted_products = []
predicted_subproducts = []
predicted_issues = []
predicted_subissues = []

# (key of the classify_complaint result, buffer that collects it)
prediction_buckets = (
    ('Product', predicted_products),
    ('Sub-product', predicted_subproducts),
    ('Issue', predicted_issues),
    ('Sub-issue', predicted_subissues),
)

# Classify every narrative, one classify_complaint call per row.
for narrative in tqdm(test_data['Consumer complaint narrative']):
    prediction = classify_complaint(narrative)
    for field, bucket in prediction_buckets:
        bucket.append(prediction[field])


def _macro_precision(actual, predicted):
    """Macro-averaged precision of ``predicted`` against ``actual``."""
    # NOTE(review): zero_division=1 appears to score a class that is never
    # predicted as precision 1.0, which would raise the macro average relative
    # to classification_report's default; confirm this is intended.
    return precision_score(actual, predicted, average='macro', zero_division=1)


# Accuracy and macro precision for each of the four label levels.
accuracy_product = accuracy_score(actual_products, predicted_products)
precision_product = _macro_precision(actual_products, predicted_products)
accuracy_subproduct = accuracy_score(actual_subproducts, predicted_subproducts)
precision_subproduct = _macro_precision(actual_subproducts, predicted_subproducts)

accuracy_issue = accuracy_score(actual_issues, predicted_issues)
precision_issue = _macro_precision(actual_issues, predicted_issues)
accuracy_subissue = accuracy_score(actual_subissues, predicted_subissues)
precision_subissue = _macro_precision(actual_subissues, predicted_subissues)

# Report the eight metrics, one per line.
for caption, value in (
    ("Product Prediction Accuracy:", accuracy_product),
    ("Product Prediction Precision:", precision_product),
    ("Subproduct Prediction Accuracy:", accuracy_subproduct),
    ("Subproduct Prediction Precision:", precision_subproduct),
    ("Issue Prediction Accuracy:", accuracy_issue),
    ("Issue Prediction Precision:", precision_issue),
    ("Sub-issue Prediction Accuracy:", accuracy_subissue),
    ("Sub-issue Prediction Precision:", precision_subissue),
):
    print(caption, value)


Generating train split: 0 examples [00:00, ? examples/s]

  0%|          | 0/61880 [00:00<?, ?it/s]



Product Prediction Accuracy: 0.959502262443439
Product Prediction Precision: 0.8787916313109703
Subproduct Prediction Accuracy: 0.9265675500969619
Subproduct Prediction Precision: 0.6017085660720637
Issue Prediction Accuracy: 0.7651745313510019
Issue Prediction Precision: 0.7111296882245811
Sub-issue Prediction Accuracy: 0.7402876535229477
Sub-issue Prediction Precision: 0.6149852547976704


In [51]:
# Unweighted mean of the four per-level accuracies.
# The terms are added left to right explicitly so the float result does not depend on how sum() accumulates.
accuracy_total = accuracy_product + accuracy_subproduct + accuracy_issue + accuracy_subissue
overall_average_accuracy = accuracy_total / 4
print(f"Average Accuracy across all categories : {overall_average_accuracy}")

# Unweighted mean of the four per-level macro precisions.
precision_total = precision_product + precision_subproduct + precision_issue + precision_subissue
overall_average_precision = precision_total / 4
print(f"Average Precision across all categories : {overall_average_precision}")

Average Accuracy across all categories : 0.8478829993535876
Average Precision across all categories : 0.7016537851013214


### Classification reports of all 4 categories

In [42]:
from sklearn.metrics import classification_report
import warnings
# Installs an "ignore" filter with no category or module restriction, so it applies to every warning.
# NOTE(review): presumably meant to hide sklearn's undefined-metric warnings in
# the reports below; confirm, and consider narrowing the filter to that category.
warnings.filterwarnings("ignore")

In [43]:
# Per-class precision / recall / F1 / support for the product predictions.
product_report = classification_report(actual_products, predicted_products)
print(product_report)

                             precision    recall  f1-score   support

Checking or savings account       0.94      0.93      0.93      3071
           Credit Reporting       0.98      0.98      0.98     52924
        Credit/Prepaid Card       0.80      0.87      0.83      2994
            Debt collection       0.79      0.61      0.69      2345
           Loans / Mortgage       0.89      0.70      0.78       546

                   accuracy                           0.96     61880
                  macro avg       0.88      0.82      0.84     61880
               weighted avg       0.96      0.96      0.96     61880



In [44]:
# Per-class precision / recall / F1 / support for the sub-product predictions.
subproduct_report = classification_report(actual_subproducts, predicted_subproducts)
print(subproduct_report)

                                            precision    recall  f1-score   support

                                 Auto debt       0.35      0.17      0.23       140
               CD (Certificate of Deposit)       0.87      0.30      0.44        44
                          Checking account       0.80      0.93      0.86      2596
                Conventional home mortgage       0.50      0.03      0.06        29
                          Credit card debt       0.12      0.35      0.18       386
                          Credit reporting       0.97      0.98      0.98     52699
            Federal student loan servicing       0.97      0.93      0.95       328
General-purpose credit card or charge card       0.73      0.87      0.79      2668
                             I do not know       0.00      0.00      0.00       616
                                      Loan       0.63      0.39      0.48       189
                              Medical debt       0.25      0.55      0.34  

In [45]:
# Per-class precision / recall / F1 / support for the issue predictions.
issue_report = classification_report(actual_issues, predicted_issues)
print(issue_report)

                                                                                  precision    recall  f1-score   support

                          Account Operations and Unauthorized Transaction Issues       0.56      0.32      0.41       619
                                               Attempts to collect debt not owed       0.54      0.57      0.56      1381
                                                              Closing an account       0.68      0.64      0.66       521
                                                            Closing your account       0.58      0.64      0.61       191
                                             Credit Report and Monitoring Issues       0.67      0.53      0.59       412
                                            Dealing with your lender or servicer       0.96      0.97      0.97       314
                                                 Disputes and Misrepresentations       0.50      0.38      0.43       550
                       

In [46]:
# Per-class precision / recall / F1 / support for the sub-issue predictions.
subissue_report = classification_report(actual_subissues, predicted_subissues)
print(subissue_report)

                                                                                       precision    recall  f1-score   support

                                                        Account information incorrect       0.52      0.50      0.51      2281
                                                  Account opened as a result of fraud       0.66      0.67      0.66       135
                                                             Account status incorrect       0.68      0.63      0.66      2502
                                                    Attempted to collect wrong amount       0.19      0.08      0.11       202
                                                                       Banking errors       0.63      0.11      0.19       231
                                                                      Billing problem       0.69      0.37      0.48       126
                                                             Can't close your account       0.64      0.05    

### Saving the test results

In [24]:
# Create a DataFrame from the predicted and actual labels
results_df = pd.DataFrame({
    'Complaint Narrative' : test_data['Consumer complaint narrative'],
    'Actual Product': actual_products,
    'Predicted Product': predicted_products,
    'Actual Sub-product': actual_subproducts,
    'Predicted Sub-product': predicted_subproducts,
    'Actual Issue': actual_issues,
    'Predicted Issue': predicted_issues,
    'Actual Sub-issue': actual_subissues,
    'Predicted Sub-issue': predicted_subissues
})

# Save the DataFrame to a CSV file
results_df.to_csv('/content/drive/MyDrive/Customer_Complaints data/test_data_results.csv', index=False)