# Task 1: Exploratory Data Analysis and Data Preprocessing

Objective: Understand the structure, content, and quality of the complaint data and prepare it for the RAG pipeline.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src to path
sys.path.append(os.path.abspath('../src'))
from preprocessing import load_data, filter_data, clean_text, preprocess_pipeline

%matplotlib inline
sns.set_theme(style="whitegrid")

## 1. Load Data

In [None]:
# Load the raw complaints file through the project loader and report its
# dimensions. load_data lives in src/preprocessing; its result is used as a
# DataFrame-like object (.shape / .head()) throughout this notebook.
raw_csv_path = '../data/complaints.csv'
df = load_data(raw_csv_path)
print(f"Dataset shape: {df.shape}")

# Last expression of the cell: the notebook renders the first rows.
df.head()

## 2. Initial EDA

### Product Distribution

In [None]:
# Horizontal count plot of complaints per product. The category order comes
# from value_counts(), so bars follow the order of that index.
product_order = df['Product'].value_counts().index

plt.figure(figsize=(12, 6))
ax = sns.countplot(data=df, y='Product', order=product_order)
ax.set_title('Distribution of Complaints across Products')
plt.show()

### Narrative Presence

In [None]:
# Count how many complaints carry a free-text narrative versus how many do
# not, using a single null mask over the narrative column.
has_narrative = df['Consumer complaint narrative'].notnull()
narrative_count = has_narrative.sum()
missing_count = (~has_narrative).sum()

print(f"Complaints with narrative: {narrative_count}")
print(f"Complaints without narrative: {missing_count}")

# Two-bar chart comparing the counts computed above.
presence_labels = ['With Narrative', 'Without Narrative']
presence_counts = [narrative_count, missing_count]

plt.figure(figsize=(6, 4))
ax = sns.barplot(x=presence_labels, y=presence_counts)
ax.set_title('Narrative Presence')
plt.show()

### Narrative Length Analysis

In [None]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in *text*."""
    return len(str(text).split())


# Only rows that actually have a narrative contribute to the length analysis.
narratives = df['Consumer complaint narrative'].dropna()
word_counts = narratives.apply(_count_words)

# Histogram (with a KDE overlay) of narrative lengths measured in words.
plt.figure(figsize=(10, 6))
ax = sns.histplot(word_counts, bins=50, kde=True)
ax.set_title('Distribution of Complaint Narrative Length (Word Count)')
ax.set_xlabel('Word Count')
plt.show()

# Summary statistics of the word counts, printed after the plot.
mean_words = word_counts.mean()
median_words = word_counts.median()
longest = word_counts.max()
shortest = word_counts.min()

print(f"Average word count: {mean_words:.2f}")
print(f"Median word count: {median_words}")
print(f"Max word count: {longest}")
print(f"Min word count: {shortest}")

## 3. Filtering and Cleaning

In [None]:
# Run the project's preprocessing pipeline on the raw CSV.
# NOTE(review): preprocess_pipeline is defined in src/preprocessing; the second
# argument is presumably the path the filtered/cleaned data is written to --
# confirm against that module.
input_csv = '../data/complaints.csv'
output_csv = '../data/filtered_complaints.csv'
df_filtered = preprocess_pipeline(input_csv, output_csv)

# Last expression of the cell: the notebook renders the first rows.
df_filtered.head()

### Cleaned Narrative Sample

In [None]:
# Print a before/after preview of the first filtered complaint so the effect
# of the cleaning step can be inspected by eye.
PREVIEW_CHARS = 200


def _preview(text, limit=PREVIEW_CHARS):
    """Return *text* cut to *limit* characters, adding "..." only if truncated.

    The value is converted with str() first so a non-string cell (for example
    a missing value) is shown instead of raising on slicing.
    """
    text = str(text)
    if len(text) <= limit:
        return text
    return text[:limit] + "..."


if df_filtered.empty:
    # .iloc[0] would raise IndexError when filtering removed every row.
    print("No complaints left after filtering; nothing to preview.")
else:
    first_row = df_filtered.iloc[0]
    print("Original:")
    print(_preview(first_row['Consumer complaint narrative']))
    print("\nCleaned:")
    print(_preview(first_row['cleaned_narrative']))