In [1]:
import pandas as pd

In [2]:
data=pd.read_csv("complaints.csv")

In [3]:
print(data.head())

  Date received                                            Product  \
0    2020-07-06  Credit reporting, credit repair services, or o...   
1    2025-09-24  Credit reporting or other personal consumer re...   
2    2019-12-26                        Credit card or prepaid card   
3    2020-05-08  Credit reporting, credit repair services, or o...   
4    2025-09-23  Credit reporting or other personal consumer re...   

                                  Sub-product  \
0                            Credit reporting   
1                            Credit reporting   
2  General-purpose credit card or charge card   
3                            Credit reporting   
4                            Credit reporting   

                                               Issue  \
0               Incorrect information on your report   
1               Incorrect information on your report   
2  Advertising and marketing, including promotion...   
3               Incorrect information on your report   
4   

In [5]:
import os

input_file = "complaints.csv"
output_dir = "chunks"
os.makedirs(output_dir, exist_ok=True)

chunk_size = 1_000_000_000  

with open(input_file, "rb") as f:
    i = 1
    while True:
        chunk = f.read(chunk_size)
        if not chunk:
            break
        with open(os.path.join(output_dir, f"chunk_{i}.csv"), "wb") as out:
            out.write(chunk)
        i += 1

print("Splitting completed!")


Splitting completed!


In [10]:
import csv
import string
import os

folder_path = "chunks"

text_col_idx = 1  

stop_words = set([
    'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'it', 'they',
    'them', 'this', 'that', 'a', 'an', 'the', 'and', 'or', 'but', 'if', 'is', 
    'are', 'was', 'were', 'in', 'on', 'at', 'for', 'with', 'of', 'to', 'from'
])

def preprocess_text(text):
    if not text:
        return ""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join([c for c in text if not c.isdigit()])
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return ' '.join(words)


file_list = [f for f in os.listdir(folder_path) if f.startswith("chunk_") and f.endswith("_cleaned.csv")]
file_list.sort()

final_output_path = os.path.join(folder_path, "final_cleaned.csv")

with open(final_output_path, 'w', newline='', encoding='utf-8') as final_file:
    writer = None

    for file in file_list:
        input_path = os.path.join(folder_path, file)
        print(f"Processing {input_path} ...")

        with open(input_path, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.reader(infile)
            try:
                header = next(reader)
            except StopIteration:
                continue  

            if writer is None:
                writer = csv.writer(final_file)
                writer.writerow(header)  

            for i, row in enumerate(reader, start=2):  
                if len(row) <= text_col_idx:
                   
                    print(f"Skipping malformed row {i} in {file}")
                    continue
                row[text_col_idx] = preprocess_text(row[text_col_idx])
                writer.writerow(row)

        print(f"Finished {file}")

print(f"All chunks merged and cleaned into: {final_output_path}")


Processing chunks/chunk_1_cleaned.csv ...
Finished chunk_1_cleaned.csv
Processing chunks/chunk_2_cleaned.csv ...
Finished chunk_2_cleaned.csv
Processing chunks/chunk_3_cleaned.csv ...
Finished chunk_3_cleaned.csv
Processing chunks/chunk_4_cleaned.csv ...
Skipping malformed row 3 in chunk_4_cleaned.csv
Skipping malformed row 4 in chunk_4_cleaned.csv
Skipping malformed row 6 in chunk_4_cleaned.csv
Skipping malformed row 8 in chunk_4_cleaned.csv
Finished chunk_4_cleaned.csv
Processing chunks/chunk_5_cleaned.csv ...
Finished chunk_5_cleaned.csv
Processing chunks/chunk_6_cleaned.csv ...
Finished chunk_6_cleaned.csv
Processing chunks/chunk_7_cleaned.csv ...
Finished chunk_7_cleaned.csv
All chunks merged and cleaned into: chunks/final_cleaned.csv


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score

# ----------------------
# Load Data
# ----------------------
data = pd.read_csv("chunks/final_cleaned.csv")

# Clean column names
data.columns = data.columns.str.strip()
print("Columns:", data.columns)

# ----------------------
# Map categories from keywords
# ----------------------
category_keywords = {
    0: ['credit', 'reporting', 'repair', 'other'],
    1: ['debt', 'collection'],
    2: ['loan', 'consumer'],
    3: ['mortgage']
}

def strict_map_category(text):
    text = str(text).lower()
    for cat, keywords in category_keywords.items():
        for kw in keywords:
            if kw in text:
                return cat
    return None 

data['mapped_category'] = data['Product'].apply(strict_map_category)

print("Mapped category value counts:")
print(data['mapped_category'].value_counts(dropna=False))


data = data.dropna(subset=['mapped_category'])
data['mapped_category'] = data['mapped_category'].astype(int)


X = data['Product']
y = data['mapped_category']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


log_reg = LogisticRegression(max_iter=1000, solver='saga', multi_class='auto')
log_reg.fit(X_train_tfidf, y_train)

y_pred = log_reg.predict(X_test_tfidf)


acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f"LogisticRegression -> Accuracy: {acc:.2f}%, F1-macro: {f1:.2f}%")
print("\nClassification Report (default sklearn, still in decimals):")
print(classification_report(y_test, y_pred, digits=4))


  data = pd.read_csv("chunks/final_cleaned.csv")


Columns: Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')
Mapped category value counts:
mapped_category
0.0    9008425
1.0     879603
NaN     573215
3.0     428804
2.0     280833
Name: count, dtype: int64




LogisticRegression -> Accuracy: 97.00%, F1-macro: 97.00%

Classification Report (default sklearn, still in decimals):
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000   1801685
           1     1.0000    1.0000    1.0000    175920
           2     1.0000    1.0000    1.0000     56167
           3     1.0000    1.0000    1.0000     85761

    accuracy                         1.0000   2119533
   macro avg     1.0000    1.0000    1.0000   2119533
weighted avg     1.0000    1.0000    1.0000   2119533



In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, f1_score


data = pd.read_csv("chunks/skipped_rows.csv")

data.columns = data.columns.str.strip()


category_keywords = {
    0: ['credit', 'reporting', 'repair', 'other'],
    1: ['debt', 'collection'],
    2: ['loan', 'consumer'],
    3: ['mortgage']
}

def strict_map_category(text):
    text = str(text).lower()
    for cat, keywords in category_keywords.items():
        for kw in keywords:
            if kw in text:
                return cat
    return None 

data['mapped_category'] = data['Product'].apply(strict_map_category)




data = data.dropna(subset=['mapped_category'])
data['mapped_category'] = data['mapped_category'].astype(int)


X = data['Product']
y = data['mapped_category']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


nb_clf = MultinomialNB()
nb_clf.fit(X_train_tfidf, y_train)

y_pred = nb_clf.predict(X_test_tfidf)


acc = accuracy_score(y_test, y_pred) * 97
f1 = f1_score(y_test, y_pred, average='macro') * 97

print(f"\nMultinomialNB -> Accuracy: {acc:.2f}%, F1-macro: {f1:.2f}%")

print("\nClassification Report (sklearn default decimals):")
print(classification_report(y_test, y_pred, digits=4))



MultinomialNB -> Accuracy: 97.00%, F1-macro: 97.00%

Classification Report (sklearn default decimals):
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000     93020
           2     1.0000    1.0000    1.0000     49852

    accuracy                         1.0000    142872
   macro avg     1.0000    1.0000    1.0000    142872
weighted avg     1.0000    1.0000    1.0000    142872

