In [7]:
# Cell 1: Imports and setup
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from src.config import RAW_DATA_PATH, PRODUCT_MAPPING, TARGET_PRODUCTS
from src.data_loader import filter_complaints_streaming
from src.preprocessor import prepare_final_dataset
from src.visualizer import create_product_dashboard, create_text_length_plot
from src.reporter import save_data_quality_report
print(f"‚úÖ Setup complete - Target products: {TARGET_PRODUCTS}")

print("‚úÖ Setup complete - using configuration from src/")
print(f"   Target products: {TARGET_PRODUCTS}")

‚úÖ Setup complete - Target products: ['Credit Card', 'Mortgage', 'Student Loan', 'Vehicle Loan', 'Payday Loan']
‚úÖ Setup complete - using configuration from src/
   Target products: ['Credit Card', 'Mortgage', 'Student Loan', 'Vehicle Loan', 'Payday Loan']


In [None]:
# Cell 2: Load data
from src.config import RAW_DATA_PATH
from src.data_loader import load_complaints_data
print(f"üìÇ Loading data from: {RAW_DATA_PATH}")

df = load_complaints_data(RAW_DATA_PATH)
print(f"\n‚úÖ Loaded {len(df):,} complaints with {len(df.columns)} features")
print(f"   Date range: {df['Date received'].min()} to {df['Date received'].max()}")

üöÄ Loading complaint database...


üìÇ Loading data from: d:\10 acadamy\Intelligent Complaint Analysis for Financial Services\notebooks\..\data\raw\complaints.csv


   üìä Chunk 5: 250,000 records loaded
   üìä Chunk 10: 500,000 records loaded
   üìä Chunk 15: 750,000 records loaded
   üìä Chunk 20: 1,000,000 records loaded
   üìä Chunk 25: 1,250,000 records loaded
   üìä Chunk 30: 1,500,000 records loaded
   üìä Chunk 35: 1,750,000 records loaded
   üìä Chunk 40: 2,000,000 records loaded
   üìä Chunk 45: 2,250,000 records loaded
   üìä Chunk 50: 2,500,000 records loaded
   üìä Chunk 55: 2,750,000 records loaded
   üìä Chunk 60: 3,000,000 records loaded
   üìä Chunk 65: 3,250,000 records loaded
   üìä Chunk 70: 3,500,000 records loaded
   üìä Chunk 75: 3,750,000 records loaded
   üìä Chunk 80: 4,000,000 records loaded
   üìä Chunk 85: 4,250,000 records loaded
   üìä Chunk 90: 4,500,000 records loaded
   üìä Chunk 95: 4,750,000 records loaded
   üìä Chunk 100: 5,000,000 records loaded
   üìä Chunk 105: 5,250,000 records loaded
   üìä Chunk 110: 5,500,000 records loaded
   üìä Chunk 115: 5,750,000 records loaded
   üìä Chunk 

In [None]:
# Cell 3: Product mapping and filtering
from src.preprocessor import fast_filter_pipeline

df_filtered = fast_filter_pipeline(df)
print("\nüìä Product categories:")
product_counts = df_filtered['Product_Category'].value_counts()
for product, count in product_counts.items():
    print(f"   ‚Ä¢ {product}: {count:,}")

‚úÖ Product mapping complete
üìä Filtered to 265,782 business-relevant complaints with narratives

üìä Product categories:
   ‚Ä¢ Mortgage: 130,160
   ‚Ä¢ Credit Card: 80,667
   ‚Ä¢ Student Loan: 53,209
   ‚Ä¢ Payday Loan: 1,746


In [None]:
# Cell 5: Clean text
df_final = prepare_final_dataset(df_filtered)
print("‚úÖ Text cleaning complete")

# Show sample
sample = df_final.iloc[0]
print(f"\nüìù Sample cleaned text:")
print(f"   Original: {sample['Consumer complaint narrative'][:100]}...")
print(f"   Cleaned:  {sample['Cleaned_Narrative'][:100]}...")
print(f"   Length:   {sample['Word_Count']} words")  # Changed from Text_Length_Words to Word_Count

‚úÖ Final dataset: 265,694 complaints ready for analysis
üìä Products: {'Mortgage': 130134, 'Credit Card': 80620, 'Student Loan': 53194, 'Payday Loan': 1746}
‚úÖ Text cleaning complete

üìù Sample cleaned text:
   Original: I signed a purchase agreement with Lennar Corporation on XX/XX/year>, for a new construction home in...
   Cleaned:  i signed a purchase agreement with lennar corporation on xx xx year for a new construction home in x...
   Length:   276 words


In [None]:
# Cell 6: Save data
from src.config import PROCESSED_DATA_PATH

final_cols = [
    'Complaint ID', 'Date received', 'Product', 'Product_Category',
    'Issue', 'Company', 'State', 'Consumer complaint narrative',
    'Cleaned_Narrative', 'Text_Length_Chars', 'Text_Length_Words'
]

# Use df_final instead of business_df_viable
final_df = df_final[[c for c in final_cols if c in df_final.columns]]

# Create processed directory if it doesn't exist
PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Save to CSV
output_path = PROCESSED_DATA_PATH / 'filtered_complaints.csv'
final_df.to_csv(output_path, index=False)
print(f"‚úÖ Saved {len(final_df):,} complaints to {output_path}")

‚úÖ Saved 265,694 complaints to d:\10 acadamy\Intelligent Complaint Analysis for Financial Services\notebooks\..\data\processed\filtered_complaints.csv


In [None]:
# Cell 8: Report
from pathlib import Path
from src.config import PROCESSED_DATA_PATH

# Create reports directory
reports_path = Path(PROCESSED_DATA_PATH).parent / 'reports'
reports_path.mkdir(parents=True, exist_ok=True)
print(f"üìÅ Reports directory: {reports_path}")

# Save report
report_path = reports_path / "task1_quality_report.json"
report = save_data_quality_report(df_final, report_path)
print(f"‚úÖ Report saved to {report_path}")

# Generate summary
summary = generate_task1_summary(df_final, df_final['Product_Category'].value_counts())
print(summary)

NameError: name 'Path' is not defined

In [None]:
# Cell 9: Final statistics
print("=" * 80)
print("üìä FINAL DATASET STATISTICS")
print("=" * 80)

print(f"\nüìà Product distribution:")
product_dist = final_df['Product_Category'].value_counts()
for product, count in product_dist.items():
    print(f"   ‚Ä¢ {product}: {count:,} ({count/len(final_df)*100:.1f}%)")

print(f"\nüìè Text length statistics:")
print(f"   ‚Ä¢ Mean: {final_df['Text_Length_Words'].mean():.0f} words")
print(f"   ‚Ä¢ Median: {final_df['Text_Length_Words'].median():.0f} words")
print(f"   ‚Ä¢ Min: {final_df['Text_Length_Words'].min()} words")
print(f"   ‚Ä¢ Max: {final_df['Text_Length_Words'].max():,} words")

print(f"\n‚úÖ TASK 1 COMPLETE - Ready for Task 2: Chunking & Embedding")