In [None]:
# Import libraries 

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

#  Load the data 

# Read the raw complaints export into a DataFrame and preview the first rows.
# NOTE(review): pandas infers dtypes chunk-by-chunk by default; if it reports
# mixed-type columns for this file, consider passing dtype= or
# low_memory=False -- confirm against the actual data.
df = pd.read_csv(filepath_or_buffer="../data/complaints.csv")
print("Initial Data Sample:", df.head(), sep="\n")

#  Initial EDA 

# Structural overview; DataFrame.info() writes its own report to stdout.
print("\nDataset Info:")
df.info()

# Summary statistics for every column, not only the numeric ones.
summary_table = df.describe(include='all')
print("\nStatistical Summary:", summary_table, sep="\n")

# Number of complaints recorded under each product label.
product_counts = df['Product'].value_counts()
print("\nProduct Counts:", product_counts, sep="\n")

# Five narratives drawn with a fixed seed so reruns show the same rows.
narrative_sample = df['Consumer complaint narrative'].sample(n=5, random_state=42)
print("\nSample Narratives:", narrative_sample, sep="\n")

#  Distribution of Complaints by Product 

# Horizontal bar chart of complaint counts, most frequent product first.
product_order = df['Product'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(y='Product', data=df, order=product_order, ax=ax)
ax.set_title("Distribution of Complaints by Product")
ax.set_xlabel("Count")
ax.set_ylabel("Product")
fig.tight_layout()
plt.show()

#  Narrative Cleaning Function 

# Compiled once at import time so repeated calls (e.g. via Series.apply)
# reuse the same pattern objects.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text) -> str:
    """Normalize a complaint narrative for word-level analysis.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` or
    whitespace is replaced by a space (this includes punctuation and
    non-ASCII letters), runs of whitespace are collapsed to a single space,
    and leading/trailing whitespace is removed.

    Args:
        text: The raw narrative. Non-string values (``None``, float NaN,
            ``pd.NA`` -- i.e. how pandas represents a missing narrative)
            are treated as "no text".

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    # Missing values are not strings and have no .lower(); treat them as
    # empty instead of raising AttributeError in the middle of an apply().
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    # Replace (rather than delete) special characters so that words joined
    # only by punctuation, e.g. "fee/charge", stay separate tokens.
    alnum_only = _NON_ALNUM_RE.sub(" ", lowered)
    collapsed = _WHITESPACE_RE.sub(" ", alnum_only)
    return collapsed.strip()

#  Handle Narratives 

# Normalize every narrative; missing ones are replaced by empty strings
# first so that clean_text always receives a str.
raw_narratives = df['Consumer complaint narrative'].fillna('')
df['cleaned_narrative'] = raw_narratives.map(clean_text)

# Word count per cleaned narrative (whitespace-delimited tokens).
df['narrative_length'] = df['cleaned_narrative'].map(str.split).map(len)

# Histogram (50 bins, with a KDE overlay) of words per cleaned narrative.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['narrative_length'], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Complaint Narrative Lengths")
ax.set_xlabel("Number of Words")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

#  Count narratives with and without text 

# One boolean mask marks rows whose narrative is present; its negation
# marks the rows where it is missing.
narrative_present = df['Consumer complaint narrative'].notna()
with_narrative = narrative_present.sum()
without_narrative = (~narrative_present).sum()
print(f"\nWith narrative: {with_narrative}, Without narrative: {without_narrative}")

#  Filter the dataset 

# Product labels to keep. The entries are not consistently cased
# ("personal loan" vs. "Credit card"), so they are matched
# case-insensitively below instead of relying on exact string equality.
target_products = [
    "Credit card", "personal loan",
    "Buy Now, Pay Later (BNPL)", "Savings account",
    "Money transfer"
]

# Lower-case both sides so a casing difference between this list and the
# data cannot silently drop a whole product. Rows with a missing Product
# stay NaN after .str.lower() and therefore never match.
_target_keys = {name.lower() for name in target_products}
_is_target_product = df['Product'].str.lower().isin(_target_keys)
_has_narrative = df['Consumer complaint narrative'].notna()

# Apply both conditions in one selection and copy once at the end, so
# filtered_df is an independent frame. Filtering again after .copy() would
# yield a derived slice, and adding columns to it can raise
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
filtered_df = df[_is_target_product & _has_narrative].copy()

# Recompute the cleaned text and its word count on the filtered rows only.
filtered_narratives = filtered_df['Consumer complaint narrative']
filtered_df['cleaned_narrative'] = filtered_narratives.map(clean_text)
token_lists = filtered_df['cleaned_narrative'].map(str.split)
filtered_df['narrative_length'] = token_lists.map(len)

# Box plot comparing narrative word counts across the retained products.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=filtered_df, x='Product', y='narrative_length', ax=ax)
ax.set_title("Narrative Length by Product")
# Tilt the product names so long labels do not overlap.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

#  Save filtered dataset 

# Persist the filtered rows; index=False keeps the DataFrame index out of
# the written file.
filtered_df.to_csv(
    path_or_buf="../data/filtered_complaints.csv",
    index=False,
)
print("\nFiltered dataset saved to '../data/filtered_complaints.csv'")


  df = pd.read_csv("../data/complaints.csv")
