In [2]:
import pandas as pd

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

def train_product_prediction_model(training_df, validation_df, classifier_model, random_state=None):
    """Fit a TF-IDF + classifier pipeline on complaint text and print validation metrics.

    The 'Consumer complaint narrative' column is vectorized with a default
    ``TfidfVectorizer`` and passed to ``classifier_model``, which is trained to
    predict the 'Product' column.

    Parameters
    ----------
    training_df : pandas.DataFrame
        Must contain the 'Consumer complaint narrative' and 'Product' columns;
        used to fit the pipeline.
    validation_df : pandas.DataFrame
        Must contain the same two columns; used only for the printed metrics.
    classifier_model : estimator
        Estimator used as the final pipeline step.
    random_state : int or None, default None
        When not None, this value is written to every ``random_state``
        parameter the pipeline exposes (the classifier's own one and any
        nested ones, e.g. the members of a ``VotingClassifier``), overriding
        whatever was set on ``classifier_model``. ``None`` leaves the
        classifier's own seeding untouched. Estimators without a
        ``random_state`` parameter are unaffected.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (steps named 'tfidf' and 'classifier').
    """
    # Define the pipeline
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', classifier_model)
    ])

    # Honour the requested seed. Previously this argument was accepted but
    # silently ignored, so passing it had no effect on reproducibility.
    if random_state is not None:
        seed_overrides = {
            param_name: random_state
            for param_name in pipeline.get_params(deep=True)
            # Nested parameters are addressed as '<step>__<...>__random_state'.
            if param_name.split('__')[-1] == 'random_state'
        }
        if seed_overrides:
            pipeline.set_params(**seed_overrides)

    # Train the pipeline
    pipeline.fit(training_df['Consumer complaint narrative'], training_df['Product'])

    # Make predictions on the validation set
    y_true = validation_df['Product']
    y_pred = pipeline.predict(validation_df['Consumer complaint narrative'])

    # Evaluate the pipeline
    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy:", accuracy)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

    return pipeline




In [4]:
# Read the training split (train-data-balanced.csv) and the evaluation split
# (test-data-split_2023.csv) into DataFrames.
# NOTE(review): the directory name 'data_preprocessing_scriptsdata_splits' looks like it
# may be missing a '/' between 'scripts' and 'data_splits' — confirm against the repo layout.
product_training_df = pd.read_csv(
    '../data_preprocessing_scriptsdata_splits/train-data-balanced.csv'
)
product_test_df = pd.read_csv(
    '../data_preprocessing_scriptsdata_splits/test-data-split_2023.csv'
)

In [5]:
# Display the (rows, columns) tuple of the training frame as the cell output.
product_training_df.shape

(61295, 3)

In [6]:

from sklearn.ensemble import RandomForestClassifier

# Random forest candidate: 200 trees with a fixed seed.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=200)

# Fit the TF-IDF + random forest pipeline on the training split; accuracy and
# the classification report for the test split are printed by the helper.
trained_product_model = train_product_prediction_model(
    training_df=product_training_df,
    validation_df=product_test_df,
    classifier_model=rf_classifier,
    random_state=42,
)


Accuracy: 0.9092598577892695

Classification Report:
                             precision    recall  f1-score   support

Checking or savings account       0.73      0.96      0.83      3071
           Credit Reporting       0.99      0.92      0.95     52924
        Credit/Prepaid Card       0.54      0.86      0.67      2994
            Debt collection       0.51      0.70      0.59      2345
           Loans / Mortgage       0.62      0.64      0.63       546

                   accuracy                           0.91     61880
                  macro avg       0.68      0.81      0.73     61880
               weighted avg       0.93      0.91      0.92     61880



In [7]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes candidate with scikit-learn's default settings.
nb_classifier = MultinomialNB()

# Fit the TF-IDF + Naive Bayes pipeline on the training split; accuracy and
# the classification report for the test split are printed by the helper.
trained_nb_product_model = train_product_prediction_model(
    training_df=product_training_df,
    validation_df=product_test_df,
    classifier_model=nb_classifier,
    random_state=42,
)


Accuracy: 0.8646250808015514

Classification Report:
                             precision    recall  f1-score   support

Checking or savings account       0.63      0.96      0.76      3071
           Credit Reporting       0.98      0.89      0.93     52924
        Credit/Prepaid Card       0.32      0.81      0.46      2994
            Debt collection       0.56      0.36      0.44      2345
           Loans / Mortgage       0.84      0.36      0.51       546

                   accuracy                           0.86     61880
                  macro avg       0.67      0.68      0.62     61880
               weighted avg       0.92      0.86      0.88     61880



In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic regression candidate; the iteration cap is raised to 1000.
logreg_classifier = LogisticRegression(max_iter=1000)

# Fit the TF-IDF + logistic regression pipeline on the training split; accuracy
# and the classification report for the test split are printed by the helper.
trained_logreg_product_model = train_product_prediction_model(
    training_df=product_training_df,
    validation_df=product_test_df,
    classifier_model=logreg_classifier,
    random_state=42,
)


Accuracy: 0.9046541693600517

Classification Report:
                             precision    recall  f1-score   support

Checking or savings account       0.84      0.93      0.89      3071
           Credit Reporting       0.99      0.91      0.95     52924
        Credit/Prepaid Card       0.58      0.90      0.71      2994
            Debt collection       0.44      0.74      0.55      2345
           Loans / Mortgage       0.42      0.86      0.56       546

                   accuracy                           0.90     61880
                  macro avg       0.65      0.87      0.73     61880
               weighted avg       0.94      0.90      0.92     61880



In [9]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Base learners for the ensemble, all seeded with 42.
# Note: this rebinds `logreg_classifier` and `rf_classifier` to new instances.
logreg_classifier = LogisticRegression(random_state=42, max_iter=1000)
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
gb_classifier = GradientBoostingClassifier(random_state=42, n_estimators=100)

# Combine the three learners by majority ('hard') vote on the predicted labels.
voting_classifier = VotingClassifier(
    estimators=[
        ('logreg', logreg_classifier),
        ('rf', rf_classifier),
        ('gb', gb_classifier),
    ],
    voting='hard',
)

# Fit the TF-IDF + voting ensemble pipeline on the training split; accuracy and
# the classification report for the test split are printed by the helper.
trained_voting_classifier = train_product_prediction_model(
    training_df=product_training_df,
    validation_df=product_test_df,
    classifier_model=voting_classifier,
    random_state=42,
)


Accuracy: 0.9111344537815126

Classification Report:
                             precision    recall  f1-score   support

Checking or savings account       0.77      0.95      0.85      3071
           Credit Reporting       0.99      0.92      0.95     52924
        Credit/Prepaid Card       0.59      0.89      0.71      2994
            Debt collection       0.49      0.71      0.58      2345
           Loans / Mortgage       0.48      0.80      0.60       546

                   accuracy                           0.91     61880
                  macro avg       0.66      0.85      0.74     61880
               weighted avg       0.93      0.91      0.92     61880



In [9]:
import pickle

# Serialize the fitted voting pipeline to disk in binary mode.
with open('../subproduct_prediction/models/Product_model.pkl', mode='wb') as model_file:
    pickle.dump(trained_voting_classifier, model_file)

In [13]:
# Number of training rows per 'Product' label, most frequent first.
product_training_df['Product'].value_counts()

Checking or savings account    18028
Credit/Prepaid Card            17281
Credit Reporting               15735
Debt collection                 7035
Loans / Mortgage                3216
Name: Product, dtype: int64