In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
from pathlib import Path

# Set up directories
# DATA_DIR is relative to the current working directory; plots and the
# markdown summary are written beneath it.
DATA_DIR = Path('data')
# NOTE: despite its name, OUTPUT_DIR is the full path of the output CSV *file*
# (not a directory). The name is kept because later cells reference it.
OUTPUT_DIR = Path('C:/Users/daniel.shobe/Desktop/schoolify/B5W6/creditrust-complaint-analysis/data/filtered_complaints.csv')
# parents=True so this also works on a fresh checkout where `data/` itself
# does not exist yet; without it mkdir raises FileNotFoundError.
(DATA_DIR / 'eda_plots').mkdir(parents=True, exist_ok=True)

# Step 1: Initial EDA
# Stream the raw CSV in fixed-size chunks so the whole file is never held in
# memory at once. Only the two columns this pass actually reads are parsed
# (usecols), which avoids tokenising the other columns of every row.
chunksize = 100000
chunks = pd.read_csv(
    'C:/Users/daniel.shobe/Desktop/schoolify/B5W6/creditrust-complaint-analysis/data/raw/complaints.csv',
    low_memory=False,
    chunksize=chunksize,
    usecols=['Product', 'Consumer complaint narrative'],
)
product_counts = pd.Series(dtype=int)       # running complaint count per Product
narrative_lengths = []                      # per-chunk word-count Series, concatenated below
narrative_counts = {'With Narratives': 0, 'Without Narratives': 0}
for chunk in chunks:
    # fill_value=0 lets products that are new to either side be added
    # instead of producing NaN.
    product_counts = product_counts.add(chunk['Product'].value_counts(), fill_value=0)

    narratives = chunk['Consumer complaint narrative']
    has_narrative = narratives.notna()

    # Whitespace-delimited word count per complaint; rows without a narrative
    # count as 0 words. Only the non-null rows are tokenised, then the result
    # is re-aligned to the chunk's index.
    word_counts = (
        narratives[has_narrative]
        .astype(str)
        .str.split()
        .str.len()
        .reindex(chunk.index, fill_value=0)
        .astype(int)
        .rename('narrative_length')
    )
    narrative_lengths.append(word_counts)

    # Keep the tallies as plain Python ints rather than numpy scalars.
    n_with = int(has_narrative.sum())
    narrative_counts['With Narratives'] += n_with
    narrative_counts['Without Narratives'] += len(chunk) - n_with
narrative_lengths = pd.concat(narrative_lengths)
# Series.add with fill_value upcasts the running totals to float; counts are
# whole numbers, so restore an integer dtype for printing and plotting.
product_counts = product_counts.astype(int)

# Load the first 1,000,000 rows for a quick look at the schema and dtypes.
# NOTE: nrows takes the *head* of the file, not a random sample, so the
# non-null counts shown below describe only those rows, not the full dataset.
df = pd.read_csv('C:/Users/daniel.shobe/Desktop/schoolify/B5W6/creditrust-complaint-analysis/data/raw/complaints.csv', low_memory=False, nrows=1000000)

print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print("\nFirst 5 rows:")
print(df.head())

print("\nComplaint Distribution by Product:")
print(product_counts)

# Horizontal bar chart of complaint volume per product, written to
# data/eda_plots/product_distribution.png.
product_fig, product_ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=product_counts.values,
    y=product_counts.index,
    ax=product_ax,
)
product_ax.set_title('Complaint Distribution by Product')
product_ax.set_xlabel('Number of Complaints')
product_ax.set_ylabel('Product')
product_fig.savefig(DATA_DIR / 'eda_plots' / 'product_distribution.png')
# Release the figure so it does not stay registered with pyplot.
plt.close(product_fig)

print("\nNarrative Length Statistics:")
length_summary = narrative_lengths.describe()
print(length_summary)

# Histogram (50 bins) of per-complaint word counts, written to
# data/eda_plots/narrative_length_distribution.png.
length_fig, length_ax = plt.subplots(figsize=(10, 6))
sns.histplot(narrative_lengths, bins=50, ax=length_ax)
length_ax.set_title('Distribution of Complaint Narrative Lengths (Word Count)')
length_ax.set_xlabel('Word Count')
length_ax.set_ylabel('Frequency')
length_fig.savefig(DATA_DIR / 'eda_plots' / 'narrative_length_distribution.png')
# Release the figure so it does not stay registered with pyplot.
plt.close(length_fig)

# Report how many complaints do / do not carry a free-text narrative.
# A single print with a newline separator emits the same three lines.
print(
    "\nComplaints with/without Narratives:",
    f"With Narratives: {narrative_counts['With Narratives']}",
    f"Without Narratives: {narrative_counts['Without Narratives']}",
    sep="\n",
)

# Step 2: Filter dataset
# Exact `Product` labels to keep. Matching via isin() is case-sensitive, so
# these must match the raw data's spelling exactly.
target_products = [
    'Credit card',
    'Consumer Loan',
    'Payday loan, title loan, personal loan, or advance loan',
    'Bank account or service',
    'Money transfer, virtual currency, or money service'
]

# Compiled once, outside the chunk loop.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')   # anything that is not a word char or whitespace
_WHITESPACE_RE = re.compile(r'\s+')        # runs of spaces / tabs / newlines


def _clean_narrative(text):
    """Return *text* lowercased, without punctuation, with whitespace normalised.

    Runs of whitespace (including newlines) are collapsed to a single space
    and leading/trailing whitespace is removed.
    """
    without_punctuation = _PUNCTUATION_RE.sub('', text.lower())
    return _WHITESPACE_RE.sub(' ', without_punctuation).strip()


filtered_chunks = []
chunks = pd.read_csv('C:/Users/daniel.shobe/Desktop/schoolify/B5W6/creditrust-complaint-analysis/data/raw/complaints.csv', low_memory=False, chunksize=chunksize)
for chunk in chunks:
    # Keep rows for a target product that also have a narrative. A single
    # mask plus an explicit copy gives an independent frame, so adding a
    # column below does not trigger SettingWithCopyWarning.
    keep = (
        chunk['Product'].isin(target_products)
        & chunk['Consumer complaint narrative'].notna()
    )
    kept = chunk.loc[keep].copy()
    kept['cleaned_narrative'] = kept['Consumer complaint narrative'].apply(_clean_narrative)
    filtered_chunks.append(kept)
filtered_df = pd.concat(filtered_chunks)
print(f"\nFiltered Dataset Size: {len(filtered_df)}")

print("\nSample Cleaned Narratives:")
print(filtered_df['cleaned_narrative'].head())

# Step 3: Save cleaned dataset
# Best-effort save: try the primary location first; on a permission problem
# (e.g. the CSV is open in another program) fall back to C:/temp. Failures are
# reported on stdout rather than raised so the rest of the notebook still runs.
try:
    # OUTPUT_DIR is the CSV file path; make sure its folder exists first so
    # to_csv does not fail on a fresh checkout.
    OUTPUT_DIR.parent.mkdir(parents=True, exist_ok=True)
    filtered_df.to_csv(OUTPUT_DIR, index=False)
    print(f"Successfully saved cleaned dataset to {OUTPUT_DIR}")
    print(f"File exists: {os.path.exists(OUTPUT_DIR)}")
except PermissionError as e:
    print(f"PermissionError: {e}")
    print("Ensure the file is not open and check directory permissions.")
    fallback_path = Path('C:/temp/filtered_complaints.csv')
    try:
        # The fallback folder is not guaranteed to exist either.
        fallback_path.parent.mkdir(parents=True, exist_ok=True)
        filtered_df.to_csv(fallback_path, index=False)
        print(f"Saved to fallback path: {fallback_path}")
    except Exception as fallback_error:
        # Distinct name so the outer exception variable is not shadowed.
        print(f"Fallback save failed: {fallback_error}")
except Exception as e:
    print(f"Error saving dataset: {e}")

# Step 4: Summary for report
# Static markdown text for the project report. The figures quoted here are
# hard-coded, not computed from the variables above, so they must be updated
# by hand if the data or the filtering changes.
eda_summary = """
### EDA and Preprocessing Summary
The CFPB dataset (~6 GB, 9,609,797 rows) was analyzed to understand complaint distribution and narrative characteristics. It contains complaints across various products, with 2,980,756 (~31%) having narratives. The top products include Credit Card (226,686), Consumer Loan (31,574), Payday loan/BNPL (16,514), Bank account or service (86,205), and Money transfer/virtual currency (145,066), as visualized in a bar plot (`data/eda_plots/product_distribution.png`). Narrative lengths range from 0 to 6,469 words (mean: ~54, median: 0 due to missing narratives), shown in a histogram (`data/eda_plots/narrative_length_distribution.png`), indicating the need for chunking in Task 2.

The dataset was filtered to include only Credit Card, Personal Loan, BNPL, Savings Account, and Money Transfer complaints with non-empty narratives, resulting in 211,097 records (470.99 MB). Text was cleaned by lowercasing, removing special characters, and normalizing whitespace to improve embedding quality. The cleaned dataset was saved as `data/filtered_complaints.csv` (or `C:/temp/filtered_complaints.csv` if permissions failed) and will be shared via a cloud link due to GitHub's 100 MB file size limit.
"""

# Explicit UTF-8 so the file's encoding does not depend on the platform's
# locale default (e.g. cp1252 on Windows).
with open(DATA_DIR / 'eda_summary.md', 'w', encoding='utf-8') as f:
    f.write(eda_summary)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 18 columns):
 #   Column                        Non-Null Count    Dtype 
---  ------                        --------------    ----- 
 0   Date received                 1000000 non-null  object
 1   Product                       1000000 non-null  object
 2   Sub-product                   999054 non-null   object
 3   Issue                         1000000 non-null  object
 4   Sub-issue                     980134 non-null   object
 5   Consumer complaint narrative  146189 non-null   object
 6   Company public response       272157 non-null   object
 7   Company                       1000000 non-null  object
 8   State                         998109 non-null   object
 9   ZIP code                      999986 non-null   object
 10  Tags                          27853 non-null    object
 11  Consumer consent provided?    436678 non-null   object
 12  Submitted via                