In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [None]:
# Download the zipped complaints CSV from consumerfinance.gov and load it.
# NOTE(review): the recorded output for this cell ends with a warning fragment;
# if that is pandas' mixed-dtype warning, consider passing an explicit dtype=.
complaints_url = 'https://files.consumerfinance.gov/ccdb/complaints.csv.zip'
df = pd.read_csv(complaints_url, compression='zip')
df.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(3371097, 18)

In [None]:
# Show the first two records transposed so every column fits on screen.
df.head(2).transpose()

Unnamed: 0,0,1
Date received,2023-02-10,2023-02-17
Product,"Credit reporting, credit repair services, or o...","Credit reporting, credit repair services, or o..."
Sub-product,Credit repair services,Credit reporting
Issue,Fraud or scam,Improper use of your report
Sub-issue,,Reporting company used your report improperly
Consumer complaint narrative,,
Company public response,,
Company,"CDS Debt Relief, LLC",Experian Information Solutions Inc.
State,IN,CA
ZIP code,47304.0,90001.0


In [None]:
# Keep only the label and the free-text narrative, drop rows that have no
# narrative (NaN), and give the text column a shorter name.
df1 = (
    df[['Product', 'Consumer complaint narrative']]
    .dropna(subset=['Consumer complaint narrative'])
    .rename(columns={'Consumer complaint narrative': 'Consumer_complaint'})
)

df1.shape

(1215787, 2)

In [None]:
# Peek at the first two complaints, transposed for readability.
df1.head(2).transpose()

Unnamed: 0,10,17
Product,"Credit reporting, credit repair services, or o...","Credit reporting, credit repair services, or o..."
Consumer_complaint,Im submitting a complaint to you today to info...,This is my NUMEROUS request that I have been a...


In [None]:
# List the distinct product labels present in the narrative-only frame.
pd.DataFrame(df1['Product'].unique())

Unnamed: 0,0
0,"Credit reporting, credit repair services, or o..."
1,Checking or savings account
2,Mortgage
3,Credit card or prepaid card
4,Debt collection
5,"Money transfer, virtual currency, or money ser..."
6,Student loan
7,Vehicle loan or lease
8,"Payday loan, title loan, or personal loan"
9,Payday loan


In [None]:
# Work on a fixed-seed random subset of 10,000 complaints, because the
# downstream computation is CPU-intensive on the full data.
df2 = df1.sample(n=10000, random_state=1).copy()

In [None]:
# Collapse product labels that name the same category under different wordings
# into a single canonical label (exact-match replacement on the Product column).
df2['Product'] = df2['Product'].replace({
    'Credit reporting, credit repair services, or other personal consumer reports': 'Credit reporting, repair, or other',
    'Payday loan': 'Payday loan, title loan, or personal loan',
    'Credit reporting': 'Credit reporting, repair, or other',
    'Credit card': 'Credit card or prepaid card',
    'Prepaid card': 'Credit card or prepaid card',
    'Money transfers': 'Money transfer, virtual currency, or money service',
    'Virtual currency': 'Money transfer, virtual currency, or money service',
})

In [None]:
# List the distinct product labels left in the sample.
pd.DataFrame(df2['Product'].unique())

Unnamed: 0,0
0,Bank account or service
1,"Money transfer, virtual currency, or money ser..."
2,"Credit reporting, repair, or other"
3,Checking or savings account
4,Debt collection
5,Student loan
6,Mortgage
7,Credit card or prepaid card
8,Vehicle loan or lease
9,Consumer Loan


In [None]:
# Encode each product label as an integer code, numbered in order of first
# appearance, then preview the result.
df2['category_id'] = pd.factorize(df2['Product'])[0]
df2.head()

Unnamed: 0,Product,Consumer_complaint,category_id
2850460,Bank account or service,We acquired a mortgage on a double wide traile...,0
454226,"Money transfer, virtual currency, or money ser...","XX/XX/2019 XXXX from JP Morgan XX/XX/XXXX, XX/...",1
122770,"Credit reporting, repair, or other","On XX/XX/2022, I sent a letter regarding inacc...",2
1137360,Checking or savings account,Due to the COVID-19 crisis our firm was forced...,3
2585396,"Credit reporting, repair, or other",Equifax experienced a security breach in XXXX ...,2


In [None]:
# One row per distinct (Product, category_id) pair: the label lookup table.
category_id_df = df2.loc[:, ['Product', 'category_id']].drop_duplicates()
print(category_id_df)

                                                   Product  category_id
2850460                            Bank account or service            0
454226   Money transfer, virtual currency, or money ser...            1
122770                  Credit reporting, repair, or other            2
1137360                        Checking or savings account            3
2658116                                    Debt collection            4
2480616                                       Student loan            5
2820318                                           Mortgage            6
255752                         Credit card or prepaid card            7
2552590                              Vehicle loan or lease            8
2782543                                      Consumer Loan            9
3331189          Payday loan, title loan, or personal loan           10


In [None]:
# Map product label -> integer category id.
category_to_id = dict(zip(category_id_df['Product'], category_id_df['category_id']))
print(category_to_id)

{'Bank account or service': 0, 'Money transfer, virtual currency, or money service': 1, 'Credit reporting, repair, or other': 2, 'Checking or savings account': 3, 'Debt collection': 4, 'Student loan': 5, 'Mortgage': 6, 'Credit card or prepaid card': 7, 'Vehicle loan or lease': 8, 'Consumer Loan': 9, 'Payday loan, title loan, or personal loan': 10}


In [None]:
# Map integer category id -> product label (the inverse lookup).
id_to_category = dict(zip(category_id_df['category_id'], category_id_df['Product']))
print(id_to_category)

{0: 'Bank account or service', 1: 'Money transfer, virtual currency, or money service', 2: 'Credit reporting, repair, or other', 3: 'Checking or savings account', 4: 'Debt collection', 5: 'Student loan', 6: 'Mortgage', 7: 'Credit card or prepaid card', 8: 'Vehicle loan or lease', 9: 'Consumer Loan', 10: 'Payday loan, title loan, or personal loan'}


In [None]:
# TF-IDF over unigrams and bigrams, with sublinear term-frequency scaling,
# English stop words removed, and terms kept only if they occur in at least
# 5 documents.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2),
                        stop_words='english')

# Transform each complaint into a TF-IDF vector. The result is kept as the
# sparse matrix returned by fit_transform: densifying it with .toarray() would
# allocate a full (n_documents x n_terms) float array, and both
# train_test_split and LinearSVC accept sparse input directly.
# NOTE: the vectorizer is fitted on every sampled complaint, i.e. before any
# train/test split, so held-out documents contribute to the vocabulary and
# IDF weights.
features = tfidf.fit_transform(df2.Consumer_complaint)

# Integer class ids aligned row-for-row with `features`.
labels = df2.category_id

In [None]:
# Hold out 20% of the rows for evaluation. The split and the model are both
# seeded so that Restart & Run All reproduces the same partition and fit
# (seed 1 matches the one used when sampling the data).
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=1
)

# Linear support-vector classifier trained on the TF-IDF features.
model = LinearSVC(random_state=1)
model.fit(x_train, y_train)

LinearSVC()

In [None]:
# Predict a class for every held-out feature row with the fitted model.
# NOTE(review): the recorded output for this cell is a NameError, presumably
# from running it before `model`/`x_test` were defined in the kernel -- confirm
# the notebook survives Restart & Run All.
y_pred = model.predict(x_test)

NameError: ignored

In [None]:
# Split the raw complaint text (X) and product label (Y) into train/test sets.
X = df2['Consumer_complaint']
Y = df2['Product']

# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
# The targets must be unpacked in the same order, otherwise y_train would
# receive the test texts and x_test the training labels.
# random_state makes the partition reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)