# Task 1: EDA and Data Preprocessing
Major activities accomplished under this task:

 - Loaded the full CFPB complaint dataset.
 - Performed an initial EDA to understand the data.
 - Filtered the dataset to meet the project requirements.
 - Cleaned the text narratives to improve embedding quality.

In [4]:
# installed dependencies 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import sys
from pathlib import Path
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Make the directory one level above the current working directory importable
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(project_root)

In [None]:
import pandas as pd

# Read the raw complaints CSV chunk by chunk, keep only the ID, product and
# narrative columns (renamed to snake_case), and count how many rows do and
# do not carry a narrative.

# Initialize counters
total_raw = 0
total_with_narrative = 0
total_without_narrative = 0

# Trimmed chunks are collected here and concatenated once after the loop.
# Calling pd.concat inside the loop re-copies every previously loaded row on
# each iteration, which makes the load quadratic in the number of chunks.
chunks = []

# Process the CSV in chunks
for i, chunk in enumerate(pd.read_csv('F:/Intelligent_Complaint_Analysis/data/Complaints.csv',
                                      chunksize=100000, low_memory=False)):

    print(f'Processing chunk {i+1}...')

    # Clean column names (strip stray whitespace so the selection below matches)
    chunk.columns = chunk.columns.str.strip()

    # Update total row count
    total_raw += len(chunk)

    # Select and rename relevant columns
    chunk = chunk[['Complaint ID', 'Product', 'Consumer complaint narrative']].copy()
    chunk.columns = ['complaint_id', 'product', 'narrative']

    # Update narrative counts
    has_narrative = chunk['narrative'].notna()
    total_with_narrative += has_narrative.sum()
    total_without_narrative += (~has_narrative).sum()

    chunks.append(chunk)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when the reader yielded no chunks.
df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

# Display summary
print(f"\n✅ Total rows processed: {total_raw}")
print(f"🟢 Rows with narratives: {total_with_narrative}")
print(f"🔴 Rows without narratives: {total_without_narrative}")
print(f"📄 Final DataFrame shape: {df.shape}")


Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
Processing chunk 16...
Processing chunk 17...
Processing chunk 18...
Processing chunk 19...
Processing chunk 20...
Processing chunk 21...
Processing chunk 22...
Processing chunk 23...
Processing chunk 24...
Processing chunk 25...
Processing chunk 26...
Processing chunk 27...
Processing chunk 28...
Processing chunk 29...
Processing chunk 30...
Processing chunk 31...
Processing chunk 32...
Processing chunk 33...
Processing chunk 34...
Processing chunk 35...
Processing chunk 36...
Processing chunk 37...
Processing chunk 38...
Processing chunk 39...
Processing chunk 40...
Processing chunk 41...
Processing chunk 42...
Processing chunk 43...
Processing chunk 44.

## Data Loading 

In [None]:
# Load the complaints through the project helper. As used below, it returns a
# DataFrame together with a dict-like summary of the processing run.
from src.EDA_preprocessing import load_and_process_complaints
file_path = 'F:/Intelligent_Complaint_Analysis/data/Complaints.csv'
df_complaints, summary = load_and_process_complaints(file_path)

print("\n📊 Processing Summary:")
for key, value in summary.items():
    print(f"{key}: {value}")

# Optional: Save to CSV (index=False keeps the row index out of the file)
df_complaints.to_csv("F:/Intelligent_Complaint_Analysis/data/cleaned_complaints.csv", index=False)

In [None]:
# Preview the first rows of the loaded complaints DataFrame
df_complaints.head()

In [None]:
# Initial EDA
# NOTE: df is an alias of df_complaints, not a copy, so in-place changes made
# through df are also visible on df_complaints.
df=df_complaints
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

## Distribution of complaints across Products

In [None]:
# Count the number of complaints recorded under each product
product_counts = df['Product'].value_counts()
print("\nComplaint Distribution by Product:")
print(product_counts)

## Visualization of Complaints by Product

In [None]:
# Bar chart of complaint counts: products on the y-axis, counts on the x-axis
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=product_counts.values, y=product_counts.index, ax=ax)
ax.set_title('Distribution of Complaints by Product')
ax.set_xlabel('Number of Complaints')
ax.set_ylabel('Product')
fig.tight_layout()
plt.show()

## Narrative Length Calculation

In [None]:
def _narrative_token_count(text):
    """Return the number of tokens word_tokenize finds in *text*, or 0 if it is null."""
    if pd.isnull(text):
        return 0
    return len(word_tokenize(str(text)))


# Narrative length per complaint, measured in tokens
df['narrative_length'] = df['Consumer complaint narrative'].apply(_narrative_token_count)


In [None]:

# Histogram (50 bins) of the per-complaint narrative lengths
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['narrative_length'], bins=50, ax=ax)
ax.set_title('Distribution of Consumer Complaint Narrative Lengths')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

**Complaints with and without Narrative**

In [None]:
# Identify complaints with and without narratives
has_narrative = df['Consumer complaint narrative'].notnull()
narrative_counts = has_narrative.value_counts()
print("\nComplaints with and without narratives:")
# value_counts() only lists the values that actually occur, so if every row
# (or no row) has a narrative one of the True/False keys is absent.
# .get(..., 0) reports 0 in that case instead of raising KeyError.
print(f"With narratives: {narrative_counts.get(True, 0)}")
print(f"Without narratives: {narrative_counts.get(False, 0)}")

## Data Filtering and Cleaning 

In [None]:
# Keep only rows whose product is one of the target products and whose
# narrative is present.
target_products = ['Credit card', 'Personal loan', 'Buy Now, Pay Later (BNPL)',
                  'Savings account', 'Money transfers']
# NOTE(review): isin() matches whole values exactly — confirm these labels
# are spelled the same way as the values present in df['Product'].
is_target_product = df['Product'].isin(target_products)
has_text = df['Consumer complaint narrative'].notnull()
# .copy() so later column assignments do not write into a view of df
df_filtered = df[is_target_product & has_text].copy()

In [None]:
# Run the project's clean_narrative helper over every raw narrative and store
# the result in a new column next to the original text.
from src.EDA_preprocessing import clean_narrative
raw_narratives = df_filtered['Consumer complaint narrative']
df_filtered['cleaned_narrative'] = raw_narratives.apply(clean_narrative)

## Saving the Filtered Data to the `data` Folder

In [None]:
# Write the filtered, cleaned dataset to disk (without the row index) and
# report its dimensions.
filtered_csv_path = 'F:/Intelligent_Complaint_Analysis/data/filtered_data.csv'
df_filtered.to_csv(filtered_csv_path, index=False)
print("\nFiltered dataset saved to data")
print(f"Shape of filtered dataset: {df_filtered.shape}")


## Summary statistics of filtered dataset

In [None]:

# Describe every column of the filtered dataset, then show how the remaining
# complaints are spread across products.
print("\nFiltered Dataset Statistics:")
filtered_stats = df_filtered.describe(include='all')
print(filtered_stats)
print("\nProduct Distribution in Filtered Dataset:")
filtered_product_counts = df_filtered['Product'].value_counts()
print(filtered_product_counts)