In [1]:


pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset

# Load the consumer-finance complaints dataset by its repository id.
dataset = load_dataset("BEE-spoke-data/consumer-finance-complaints")

# Work with the "train" split as a pandas DataFrame from here on.
df = dataset["train"].to_pandas()

# Plain-text preview of the first five rows.
print(df.head(5))



  Date received                                            Product  \
0    2024-02-15  Credit reporting or other personal consumer re...   
1    2024-02-15  Credit reporting or other personal consumer re...   
2    2024-02-15  Credit reporting or other personal consumer re...   
3    2024-02-15  Credit reporting or other personal consumer re...   
4    2024-02-15  Credit reporting or other personal consumer re...   

        Sub-product                                              Issue  \
0  Credit reporting                        Improper use of your report   
1  Credit reporting  Problem with a company's investigation into an...   
2  Credit reporting               Incorrect information on your report   
3  Credit reporting                        Improper use of your report   
4  Credit reporting               Incorrect information on your report   

                                           Sub-issue  \
0  Credit inquiries on your report that you don't...   
1  Difficulty submitti

In [3]:
# Inspect the available column names before selecting the fields to keep.
print(df.columns)

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')


In [4]:
# Narrow the frame to the complaint narrative and its product category;
# every other column is dropped from `df`.
df = df.loc[:, ["Consumer complaint narrative", "Product"]]

# Preview the first five rows of the reduced frame.
df.head(5)

Unnamed: 0,Consumer complaint narrative,Product
0,,Credit reporting or other personal consumer re...
1,,Credit reporting or other personal consumer re...
2,,Credit reporting or other personal consumer re...
3,,Credit reporting or other personal consumer re...
4,,Credit reporting or other personal consumer re...


In [5]:
# Drop rows whose complaint narrative is missing.
df = df.dropna(subset=["Consumer complaint narrative"])

In [6]:
# Give the two kept columns short names: narrative -> "text", product -> "label".
# rename() returns a new frame that is assigned back to `df`, instead of
# mutating the existing object with inplace=True.
df = df.rename(columns={
    "Consumer complaint narrative": "text",
    "Product": "label",
})

df.head()

Unnamed: 0,text,label
24,This is my OTHER request that I have been a vi...,Credit reporting or other personal consumer re...
84,Subject : Dispute of Unauthorized Credit Inqui...,Credit reporting or other personal consumer re...
230,This account is reporting a late payment to th...,Credit reporting or other personal consumer re...
242,We applied for the Citi Advantage card for a p...,Credit card
267,Trans Union XXXX and XXXX have violated my con...,Credit reporting or other personal consumer re...


In [7]:
# Show the (rows, columns) size of the frame after the previous steps.
df.shape

(1689593, 2)

In [8]:
from sklearn.model_selection import train_test_split

# Features are the complaint narratives; targets are the product labels.
X, y = df["text"], df["label"]

# Hold out 20% of the rows for evaluation. The seed is fixed so the split is
# repeatable; no `stratify` argument is passed to the splitter.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with English stop words removed and at most 5000 features.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
)

# Fit on the training text only, then apply the fitted vectorizer to the test text.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with an iteration cap of 1000.
model = LogisticRegression(max_iter=1000)

# Train on the TF-IDF training matrix. As the cell's last expression, the
# value returned by fit() is also displayed below the cell.
model.fit(X_train_tfidf, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [11]:
from sklearn.metrics import accuracy_score, classification_report

# Predict product labels for the held-out TF-IDF matrix.
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 reports 0.0 for any metric whose denominator is zero (for
# example a label that is never predicted). That is the same value the default
# "warn" setting produces, but without emitting a warning for each metric.
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.7710930726002385


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.62      0.29      0.40      2990
                                                 Checking or savings account       0.71      0.82      0.76     17418
                                                               Consumer Loan       0.51      0.16      0.25      1932
                                                                 Credit card       0.57      0.24      0.33      6356
                                                 Credit card or prepaid card       0.66      0.73      0.69     21859
                                                            Credit reporting       0.61      0.19      0.28      6402
                         Credit reporting or other personal consumer reports       0.68      0.20      0.31     23958
Credit reporting, credit repair services, or other pers

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [12]:
# NOTE(review): this rebinds `model` to a new estimator configured with
# class_weight="balanced". This cell does not call fit() on it, so `model`
# refers to an unfitted estimator until it is assigned again.
model = LogisticRegression(max_iter=1000, class_weight="balanced")

In [13]:
# NOTE(review): the vectorizer built here (max 15000 features, unigrams and
# bigrams) is not assigned to any name, so it is only displayed; the existing
# `vectorizer` variable is left unchanged by this cell.
TfidfVectorizer(max_features=15000, ngram_range=(1,2))

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [14]:
# Keep only rows whose label occurs more than 100 times in `df`.
# NOTE(review): X, y and the train/test split were already derived from `df`
# in an earlier cell, and this cell does not rebuild them, so the filter
# changes `df` only. TODO confirm whether the split and TF-IDF features
# should be recomputed after this step.
counts = df['label'].value_counts()
df = df[df['label'].isin(counts[counts > 100].index)]

In [15]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Linear support-vector classifier with default settings.
svm_model = LinearSVC()

# Train on the TF-IDF training matrix, then predict the held-out set.
svm_model.fit(X_train_tfidf, y_train)

y_pred_svm = svm_model.predict(X_test_tfidf)

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
# zero_division=0 reports 0.0 for metrics with a zero denominator (the value
# the default "warn" setting also produces) without emitting warnings.
print(classification_report(y_test, y_pred_svm, zero_division=0))

SVM Accuracy: 0.7656894107759552


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.62      0.19      0.29      2990
                                                 Checking or savings account       0.70      0.82      0.76     17418
                                                               Consumer Loan       0.57      0.06      0.11      1932
                                                                 Credit card       0.61      0.14      0.23      6356
                                                 Credit card or prepaid card       0.65      0.72      0.68     21859
                                                            Credit reporting       0.62      0.08      0.14      6402
                         Credit reporting or other personal consumer reports       0.69      0.16      0.27     23958
Credit reporting, credit repair services, or other pers

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [16]:
from sklearn.linear_model import SGDClassifier

# hinge = SVM-style loss. random_state is fixed (same seed as the train/test
# split) so repeated runs of this cell train the same model.
sgd_model = SGDClassifier(loss="hinge", random_state=42)

# Train on the TF-IDF training matrix, then predict the held-out set.
sgd_model.fit(X_train_tfidf, y_train)

y_pred_sgd = sgd_model.predict(X_test_tfidf)

print("SGD Accuracy:", accuracy_score(y_test, y_pred_sgd))
# zero_division=0 reports 0.0 for metrics with a zero denominator (the value
# the default "warn" setting also produces) without emitting warnings.
print(classification_report(y_test, y_pred_sgd, zero_division=0))

SGD Accuracy: 0.739496743302389


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.34      0.01      0.02      2990
                                                 Checking or savings account       0.65      0.82      0.73     17418
                                                               Consumer Loan       0.17      0.01      0.03      1932
                                                                 Credit card       0.56      0.03      0.05      6356
                                                 Credit card or prepaid card       0.64      0.68      0.66     21859
                                                            Credit reporting       0.14      0.00      0.00      6402
                         Credit reporting or other personal consumer reports       0.74      0.02      0.04     23958
Credit reporting, credit repair services, or other pers

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [17]:
# NOTE(review): this rebinds `svm_model` to a new estimator configured with
# class_weight="balanced"; this cell does not call fit() on it, so
# `svm_model` is unfitted from this point on.
svm_model = LinearSVC(class_weight="balanced")

In [18]:
# NOTE(review): this rebinds `sgd_model` to a new hinge-loss estimator
# configured with class_weight="balanced"; this cell does not call fit() on
# it, so `sgd_model` is unfitted from this point on.
sgd_model = SGDClassifier(loss="hinge", class_weight="balanced")

In [19]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Accuracy and support-weighted F1 for each model, computed from the
# predictions stored by the earlier evaluation cells.

# Logistic regression
log_acc, log_f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average="weighted"),
)

# Linear SVM
svm_acc, svm_f1 = (
    accuracy_score(y_test, y_pred_svm),
    f1_score(y_test, y_pred_svm, average="weighted"),
)

# SGD with hinge loss
sgd_acc, sgd_f1 = (
    accuracy_score(y_test, y_pred_sgd),
    f1_score(y_test, y_pred_sgd, average="weighted"),
)

In [20]:
import pandas as pd

# One row per model: name, accuracy, weighted F1.
results = pd.DataFrame(
    [
        ("Logistic", log_acc, log_f1),
        ("Linear SVM", svm_acc, svm_f1),
        ("SGD", sgd_acc, sgd_f1),
    ],
    columns=["Model", "Accuracy", "Weighted F1"],
)

print(results)

        Model  Accuracy  Weighted F1
0    Logistic  0.771093     0.747530
1  Linear SVM  0.765689     0.734469
2         SGD  0.739497     0.685761


In [21]:
from sklearn.linear_model import LogisticRegression

# Final estimator: logistic regression with class_weight="balanced".
# NOTE(review): the logistic model evaluated earlier was built without
# class_weight, and this cell only fits the balanced variant without scoring
# it on the test set. TODO confirm its metrics before relying on it.
final_model = LogisticRegression(max_iter=1000, class_weight="balanced")

# Train on the TF-IDF training matrix; the value returned by fit() is displayed.
final_model.fit(X_train_tfidf, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [22]:
import pickle

# Serialize the fitted model and the TF-IDF vectorizer with pickle.
# Context managers make sure each file handle is flushed and closed as soon as
# its dump finishes, instead of leaving the handle open.
with open("complaint_classifier.pkl", "wb") as model_file:
    pickle.dump(final_model, model_file)

with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [23]:
import joblib

# Persist the trained classifier with joblib.
joblib.dump(value=final_model, filename="complaint_model.pkl")

# Persist the TF-IDF vectorizer alongside it.
joblib.dump(value=vectorizer, filename="tfidf_vectorizer.pkl")

print("Model and vectorizer saved successfully")

Model and vectorizer saved successfully


In [24]:
import joblib

# Reload the saved artifacts, rebinding `vectorizer` and `model`.
vectorizer = joblib.load("tfidf_vectorizer.pkl")
model = joblib.load("complaint_model.pkl")

# Smoke test: classify one hand-written complaint.
text = "Unauthorized transaction happened"

# The vectorizer takes a collection of documents, so wrap the single string in a list.
vec = vectorizer.transform([text])
prediction = model.predict(vec)

print(prediction)

['Money transfer, virtual currency, or money service']
