In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from pathlib import Path

In [57]:
# ============================================================================
# DATA LOADER MODULE
# ============================================================================
import pandas as pd
from pathlib import Path
class DataLoader:
    """Chunked CSV reader that can filter rows while it loads.

    The file at ``data_path`` is read with ``pandas.read_csv`` in pieces of
    ``chunksize`` rows. Only rows that pass the optional filters are kept, so
    the complete file never has to be held in memory at once.
    """

    def __init__(self, data_path, chunksize=50000):
        """Store the CSV location and the number of rows to read per chunk."""
        self.data_path = data_path
        self.chunksize = chunksize

    def load_in_chunks(self, columns=None, filters=None, dtype=None, parse_dates=None):
        """Read the CSV chunk by chunk and return the retained rows.

        Args:
            columns: Forwarded to ``read_csv`` as ``usecols``.
            filters: Optional mapping ``{column: allowed_values}``. A row is
                kept only if, for every listed column present in the data,
                its value is one of ``allowed_values``. Columns that are not
                present in the data are ignored.
            dtype: Forwarded to ``read_csv``.
            parse_dates: Forwarded to ``read_csv``.

        Returns:
            A DataFrame with a fresh 0..n-1 index. If no row passes the
            filters the frame is empty (but keeps the column layout).
            ``None`` is returned when reading fails; the error is printed
            rather than raised.
        """
        print(f"üìÇ Loading data in chunks of {self.chunksize:,}...")

        kept = []
        total_rows = 0
        chunks_processed = 0
        empty_template = None  # zero-row frame that remembers columns/dtypes

        try:
            reader = pd.read_csv(
                self.data_path,
                chunksize=self.chunksize,
                usecols=columns,
                dtype=dtype,
                parse_dates=parse_dates
            )
            for chunk in reader:
                # Count every chunk read, not only the ones that kept rows,
                # so the progress line matches its wording.
                chunks_processed += 1

                # Apply filters during loading
                if filters:
                    for col, values in filters.items():
                        if col in chunk.columns:
                            chunk = chunk[chunk[col].isin(values)]

                if empty_template is None:
                    empty_template = chunk.iloc[0:0]

                if len(chunk) > 0:
                    kept.append(chunk)
                    total_rows += len(chunk)

                if chunks_processed % 10 == 0:
                    print(f" Processed {chunks_processed} chunks: {total_rows:,} rows retained")

            if kept:
                df = pd.concat(kept, ignore_index=True)
            elif empty_template is not None:
                # pd.concat([]) raises; "nothing matched" is a valid result,
                # not a loading error.
                df = empty_template.reset_index(drop=True)
            else:
                df = pd.DataFrame(
                    columns=list(columns) if isinstance(columns, (list, tuple)) else None
                )

            # Filtering a categorical column keeps the discarded values as
            # unused categories; drop them so value_counts() downstream does
            # not report zero-count entries.
            for col in df.select_dtypes(include='category').columns:
                df[col] = df[col].cat.remove_unused_categories()

            print(f"‚úì Successfully loaded {len(df):,} rows")
            return df

        except Exception as e:
            # Deliberate best-effort: callers check for None.
            print(f"‚úó Error loading data: {str(e)}")
            return None

In [58]:
# ============================================================================
# TEXT PROCESSOR MODULE
# ============================================================================
import re
class TextProcessor:
    """Stateless helpers for cleaning complaint text and measuring it."""

    # Stock opening phrases and redaction/emphasis markers removed from
    # narratives. Compiled once because clean_text runs on every row.
    _BOILERPLATE_PATTERNS = tuple(
        re.compile(pattern, flags=re.IGNORECASE)
        for pattern in (
            r'i am writing to file a complaint',
            r'i am filing this complaint',
            r'i would like to file a complaint',
            r'this is a complaint about',
            r'xxxx+',
            r'\*\*+',
        )
    )
    # Anything outside lowercase letters, digits, whitespace and . , ! ? ' -
    _DISALLOWED_CHARS = re.compile(r'[^a-z0-9\s.,!?\'-]')
    _WHITESPACE_RUN = re.compile(r'\s+')

    @staticmethod
    def clean_text(text):
        """Return a lowercased, boilerplate-free, whitespace-normalized string.

        Missing values (``None``/NaN) and the empty string yield ``''``.
        Any other value is converted with ``str()`` first.
        """
        if pd.isna(text) or text == '':
            return ''

        text = str(text).lower()

        # Patterns are applied one after another, in declaration order.
        for pattern in TextProcessor._BOILERPLATE_PATTERNS:
            text = pattern.sub('', text)

        # Disallowed characters become spaces so neighbouring words stay
        # separated; runs of whitespace are then collapsed.
        text = TextProcessor._DISALLOWED_CHARS.sub(' ', text)
        text = TextProcessor._WHITESPACE_RUN.sub(' ', text).strip()

        return text

    @staticmethod
    def add_word_count(df, text_col):
        """Add a ``word_count`` column (whitespace-separated tokens).

        The column is written onto ``df`` itself, and the same frame is
        returned. Missing text counts as 0 words.
        """
        print("üìù Calculating word counts...")
        df['word_count'] = df[text_col].fillna('').apply(
            lambda value: len(str(value).split())
        )
        return df

    @staticmethod
    def get_cleaning_stats(df, original_col, cleaned_col):
        """Compare mean character length before and after cleaning.

        Returns:
            dict with ``original_avg``, ``cleaned_avg`` and ``reduction_pct``
            (percentage by which the mean length shrank). ``reduction_pct``
            is 0.0 when the mean original length is zero or undefined
            (e.g. an empty frame), instead of dividing by zero.
        """
        orig_len = df[original_col].str.len().mean()
        clean_len = df[cleaned_col].str.len().mean()

        # `> 0` is False for both 0 and NaN, covering empty input as well.
        if orig_len > 0:
            reduction = (1 - clean_len / orig_len) * 100
        else:
            reduction = 0.0

        return {
            'original_avg': orig_len,
            'cleaned_avg': clean_len,
            'reduction_pct': reduction
        }

In [60]:
# ============================================================================
# ANALYZER MODULE
# ============================================================================
import matplotlib.pyplot as plt
class DataAnalyzer:
    """Prints exploratory summaries of a complaints DataFrame and saves charts.

    Charts are written as PNG files into ``output_dir``.
    """

    def __init__(self, output_dir='output'):
        """Remember the output directory and create it if it is missing."""
        self.output_dir = Path(output_dir)
        # parents=True so a nested location (e.g. 'reports/figures') works too.
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def initial_exploration(self, df):
        """Print size, memory use, date range, columns, missing values and a sample."""
        print("\n" + "=" * 70)
        print("INITIAL DATA EXPLORATION")
        print("=" * 70)

        print(f"\nüìä Dataset Overview:")
        print(f" Total Records: {len(df):,}")
        print(f" Total Columns: {len(df.columns)}")
        print(f" Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

        if 'Date received' in df.columns:
            print(f" Date Range: {df['Date received'].min()} to {df['Date received'].max()}")

        print("\nüìã Columns:")
        for idx, col in enumerate(df.columns, 1):
            print(f" {idx:2d}. {col}")

        print("\nüîç Missing Data:")
        missing = df.isnull().sum()
        missing = missing[missing > 0].sort_values(ascending=False)
        if len(missing) > 0:
            for col, count in missing.items():
                pct = count / len(df) * 100
                print(f" {col}: {count:,} ({pct:.1f}%)")
        else:
            print(" No missing data!")

        print("\nüìÑ Sample Data (first 2 rows):")
        print(df.head(2).to_string())

    def analyze_products(self, df):
        """Print the ``Product`` distribution and save a bar + pie chart.

        Returns:
            The per-product counts (descending). Products with a count of
            zero - which a categorical column reports for unused categories -
            are left out. If there is nothing to plot, no file is written.
        """
        print("\n" + "=" * 70)
        print("PRODUCT DISTRIBUTION ANALYSIS")
        print("=" * 70)

        counts = df['Product'].value_counts()
        counts = counts[counts > 0]
        print(f"\nüìä Total Unique Products: {len(counts)}")
        print(f"\nProduct Distribution:")
        for product, count in counts.items():
            pct = count / len(df) * 100
            print(f" {product}: {count:,} ({pct:.1f}%)")

        if counts.empty:
            # Plotting an empty distribution fails inside matplotlib/pandas.
            print(" No products to plot.")
            return counts

        # Create visualization
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))

        # Bar chart
        ax = axes[0]
        counts.plot(kind='barh', ax=ax, color='steelblue', edgecolor='black')
        ax.set_xlabel('Number of Complaints', fontweight='bold', fontsize=12)
        ax.set_ylabel('Product', fontweight='bold', fontsize=12)
        ax.set_title('Complaints by Product', fontweight='bold', fontsize=14, pad=15)
        ax.invert_yaxis()

        # Value labels sit slightly right of each bar (1% of the longest bar).
        label_offset = counts.max() * 0.01
        for i, v in enumerate(counts.values):
            ax.text(v + label_offset, i, f'{v:,}', va='center', fontsize=10)

        # Pie chart
        ax = axes[1]
        colors = plt.cm.Set3.colors
        ax.pie(counts.values, labels=counts.index, autopct='%1.1f%%',
               startangle=90, colors=colors, textprops={'fontsize': 10})
        ax.set_title('Product Distribution (%)', fontweight='bold', fontsize=14, pad=15)

        fig.tight_layout()
        filename = self.output_dir / 'product_distribution.png'
        fig.savefig(filename, dpi=300, bbox_inches='tight')
        plt.close(fig)
        print(f"\n‚úì Saved: {filename}")

        return counts

    def analyze_narratives(self, df, narrative_col):
        """Print narrative coverage and word-count statistics.

        Word-count statistics and the chart are produced only when ``df`` has
        a ``word_count`` column and at least one non-null narrative. An empty
        ``df`` is reported and skipped.
        """
        print("\n" + "=" * 70)
        print("NARRATIVE ANALYSIS")
        print("=" * 70)

        if len(df) == 0:
            # The coverage percentages below divide by len(df).
            print(" No records to analyze.")
            return

        with_narrative = df[df[narrative_col].notna()]
        without_narrative = df[df[narrative_col].isna()]

        print(f"\nüìä Narrative Coverage:")
        print(f" WITH narratives: {len(with_narrative):,} ({len(with_narrative)/len(df)*100:.1f}%)")
        print(f" WITHOUT narratives: {len(without_narrative):,} ({len(without_narrative)/len(df)*100:.1f}%)")

        if 'word_count' in df.columns and len(with_narrative) > 0:
            stats = with_narrative['word_count'].describe()
            print(f"\nüìà Word Count Statistics:")
            print(f" Mean: {stats['mean']:.0f} words")
            print(f" Median: {stats['50%']:.0f} words")
            print(f" Std: {stats['std']:.0f} words")
            print(f" Min: {stats['min']:.0f} words")
            print(f" Max: {stats['max']:.0f} words")
            print(f" 25th: {stats['25%']:.0f} words")
            print(f" 75th: {stats['75%']:.0f} words")

            # Edge cases
            very_short = (with_narrative['word_count'] < 10).sum()
            very_long = (with_narrative['word_count'] > 1000).sum()
            print(f"\nüîç Edge Cases:")
            print(f" Very SHORT (<10 words): {very_short:,} ({very_short/len(with_narrative)*100:.1f}%)")
            print(f" Very LONG (>1000 words): {very_long:,} ({very_long/len(with_narrative)*100:.1f}%)")

            # Visualization
            self._plot_narrative_analysis(with_narrative, without_narrative, df)

    @staticmethod
    def _bucket_word_counts(word_counts):
        """Count narratives per length bucket, in bucket order."""
        bins = [0, 50, 100, 200, 500, float('inf')]
        labels = ['Very Short\n(0-50)', 'Short\n(51-100)', 'Medium\n(101-200)',
                  'Long\n(201-500)', 'Very Long\n(>500)']
        # include_lowest=True so a count of exactly 0 lands in the first
        # bucket (labelled 0-50) instead of being dropped as NaN.
        categories = pd.cut(word_counts, bins=bins, labels=labels, include_lowest=True)
        return categories.value_counts().sort_index()

    def _plot_narrative_analysis(self, with_narrative, without_narrative, df):
        """Save a 2x2 figure: histogram, box plot, coverage pie, length buckets.

        ``with_narrative`` must contain a ``word_count`` column. ``df`` is
        accepted for signature compatibility and is not used here.
        """
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        data = with_narrative['word_count']
        # Histogram and box plot show narratives of at most 500 words; the
        # mean/median reference lines are computed over all narratives.
        up_to_500 = data[data <= 500]

        # 1. Histogram of word counts (<=500)
        ax = axes[0, 0]
        ax.hist(up_to_500, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
        ax.axvline(data.mean(), color='red', linestyle='--', linewidth=2,
                   label=f'Mean: {data.mean():.0f}')
        ax.axvline(data.median(), color='green', linestyle='--', linewidth=2,
                   label=f'Median: {data.median():.0f}')
        ax.set_xlabel('Word Count', fontweight='bold', fontsize=11)
        ax.set_ylabel('Frequency', fontweight='bold', fontsize=11)
        ax.set_title('Distribution of Narrative Lengths (‚â§500 words)',
                     fontweight='bold', fontsize=12, pad=10)
        ax.legend()
        ax.grid(True, alpha=0.3)

        # 2. Box plot (vertical is matplotlib's default orientation)
        ax = axes[0, 1]
        ax.boxplot(up_to_500)
        ax.set_ylabel('Word Count', fontweight='bold', fontsize=11)
        ax.set_title('Box Plot of Narrative Lengths (‚â§500 words)',
                     fontweight='bold', fontsize=12, pad=10)
        ax.grid(True, alpha=0.3)

        # 3. Pie chart - with/without narratives
        ax = axes[1, 0]
        sizes = [len(with_narrative), len(without_narrative)]
        pie_labels = ['With Narrative', 'Without Narrative']
        pie_colors = ['#3498db', '#e74c3c']
        ax.pie(sizes, labels=pie_labels, autopct='%1.1f%%', startangle=90,
               colors=pie_colors, textprops={'fontsize': 11})
        ax.set_title('Complaints: With vs Without Narratives',
                     fontweight='bold', fontsize=12, pad=10)

        # 4. Length categories
        ax = axes[1, 1]
        counts = self._bucket_word_counts(data)
        positions = range(len(counts))
        ax.bar(positions, counts.values, color='teal',
               edgecolor='black', alpha=0.7)
        ax.set_xticks(positions)
        ax.set_xticklabels(counts.index, fontsize=10)
        ax.set_ylabel('Number of Complaints', fontweight='bold', fontsize=11)
        ax.set_title('Narrative Length Categories', fontweight='bold',
                     fontsize=12, pad=10)

        label_offset = counts.max() * 0.01
        for i, v in enumerate(counts.values):
            ax.text(i, v + label_offset, f'{v:,}',
                    ha='center', va='bottom', fontsize=9)

        fig.tight_layout()
        filename = self.output_dir / 'narrative_analysis.png'
        fig.savefig(filename, dpi=300, bbox_inches='tight')
        plt.close(fig)
        print(f"‚úì Saved: {filename}")

In [65]:
# ============================================================================
# MAIN PROCESSOR (Updated for Memory Efficiency)
# ============================================================================
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

class ComplaintProcessor:
    """Orchestrates the pipeline: load/filter -> EDA -> clean text -> save -> report.

    Only the filtered frame (``df_filtered``) is kept on the instance.
    """

    def __init__(self, data_path):
        """Wire up the loader, text processor and analyzer for ``data_path``."""
        self.data_path = data_path
        self.loader = DataLoader(data_path, chunksize=50000)
        self.text_proc = TextProcessor()
        self.analyzer = DataAnalyzer()

        # 5 specific products as per requirements
        self.target_products = [
            'Credit card',
            'Credit card or prepaid card',
            'Student loan',
            'Checking or savings account',
            'Money transfer, virtual currency, or money service'
        ]

        self.df_filtered = None  # Only this dataset is retained

    def load_and_filter_data(self):
        """Load target-product complaints and keep those with a narrative.

        Returns:
            True when at least one complaint with a non-empty narrative was
            loaded into ``self.df_filtered``; False when loading failed,
            matched nothing, or every narrative was empty.
        """
        print("\n" + "=" * 70)
        print("LOADING & FILTERING DATA (TARGET PRODUCTS ONLY)")
        print("=" * 70)

        columns = [
            'Date received', 'Product', 'Consumer complaint narrative', 'Complaint ID'
        ]

        dtypes = {
            'Product': 'category',
            'Consumer complaint narrative': 'object',
            'Complaint ID': 'int64'
        }

        # Apply product filter DURING loading
        filters = {'Product': self.target_products}

        print(f"üìå Target products:")
        for i, prod in enumerate(self.target_products, 1):
            print(f" {i}. {prod}")

        print(f"\nüìÇ Loading only target products in chunks...")
        df_initial = self.loader.load_in_chunks(
            columns=columns,
            filters=filters,
            dtype=dtypes,
            parse_dates=['Date received']
        )

        if df_initial is None or len(df_initial) == 0:
            print("‚úó No data loaded for target products.")
            return False

        print(f"‚úì Loaded {len(df_initial):,} complaints for target products.")

        # Remove empty/NaN narratives
        narrative_col = 'Consumer complaint narrative'
        before = len(df_initial)
        has_text = (
            df_initial[narrative_col].notna() &
            (df_initial[narrative_col].str.strip() != '')
        )
        self.df_filtered = df_initial[has_text].copy()

        print(f"‚úì After removing empty narratives: {len(self.df_filtered):,} complaints")
        print(f"‚úì Removed: {before - len(self.df_filtered):,} empty narratives")

        if len(self.df_filtered) == 0:
            # The later steps plot and compute ratios; they cannot run on an
            # empty frame, so stop the pipeline here.
            print("‚úó No complaints with a non-empty narrative remain.")
            return False

        return True

    def run_filtered_eda(self):
        """Run the overview, product, word-count and narrative analyses.

        Adds the ``word_count`` column to ``self.df_filtered``.
        """
        print("\n" + "=" * 70)
        print("EXPLORATORY DATA ANALYSIS - FILTERED DATASET")
        print("=" * 70)

        # Basic overview
        self.analyzer.initial_exploration(self.df_filtered)

        # Product distribution
        self.analyzer.analyze_products(self.df_filtered)

        # Word count analysis
        self.df_filtered = self.text_proc.add_word_count(
            self.df_filtered, 'Consumer complaint narrative'
        )

        # Narrative analysis
        self.analyzer.analyze_narratives(
            self.df_filtered, 'Consumer complaint narrative'
        )

    def preprocess_text(self):
        """Add ``cleaned_narrative`` and print examples plus length statistics."""
        print("\n" + "=" * 70)
        print("TEXT PREPROCESSING")
        print("=" * 70)

        narrative_col = 'Consumer complaint narrative'

        print("\nüßπ Cleaning complaint narratives...")
        self.df_filtered['cleaned_narrative'] = self.df_filtered[narrative_col].apply(
            self.text_proc.clean_text
        )

        # Show examples (fixed seed so the same rows appear on every run)
        print("\nüîç Before/After Examples:")
        print("-" * 70)
        samples = self.df_filtered.sample(min(3, len(self.df_filtered)), random_state=42)

        for idx, (_, row) in enumerate(samples.iterrows(), 1):
            original_text = str(row[narrative_col])
            cleaned_text = str(row['cleaned_narrative'])
            print(f"\nExample {idx}:")
            print(f"ORIGINAL ({len(original_text)} chars):")
            print(f" {original_text[:200]}...")
            print(f"CLEANED ({len(cleaned_text)} chars):")
            print(f" {cleaned_text[:200]}...")
            print("-" * 70)

        # Statistics
        stats = self.text_proc.get_cleaning_stats(
            self.df_filtered, narrative_col, 'cleaned_narrative'
        )

        print(f"\nüìä Cleaning Statistics:")
        print(f" Average original length: {stats['original_avg']:.0f} characters")
        print(f" Average cleaned length: {stats['cleaned_avg']:.0f} characters")
        print(f" Average reduction: {stats['reduction_pct']:.2f}%")

    def save_data(self, output_path='data/filtered_complaints.csv'):
        """Write the processed frame to ``output_path`` as CSV.

        Requires the ``cleaned_narrative`` and ``word_count`` columns, i.e.
        ``run_filtered_eda`` and ``preprocess_text`` must have run first.
        Missing parent directories are created.
        """
        print("\n" + "=" * 70)
        print("SAVING PROCESSED DATA")
        print("=" * 70)

        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        # Save only essential columns to reduce file size
        save_cols = [
            'Complaint ID', 'Date received', 'Product',
            'Consumer complaint narrative', 'cleaned_narrative', 'word_count'
        ]
        self.df_filtered[save_cols].to_csv(output_path, index=False)

        size_mb = Path(output_path).stat().st_size / 1024**2
        print(f"\n‚úì Data saved successfully!")
        print(f" File: {output_path}")
        print(f" Size: {size_mb:.2f} MB")
        print(f" Records: {len(self.df_filtered):,}")
        print(f" Columns: {len(save_cols)}")

    def generate_report(self):
        """Print the summary report and write it to output/eda_summary_report.txt.

        Requires the ``word_count`` and ``cleaned_narrative`` columns.
        """
        print("\n" + "=" * 70)
        print("GENERATING SUMMARY REPORT")
        print("=" * 70)

        narrative_col = 'Consumer complaint narrative'
        df = self.df_filtered
        word_counts = df['word_count']

        # Listed from the configured products so the report cannot drift from
        # what was actually filtered.
        target_lines = "\n".join(f"   ‚Ä¢ {name}" for name in self.target_products)

        # Same computation preprocess_text reports; reuse the shared helper.
        reduction_pct = self.text_proc.get_cleaning_stats(
            df, narrative_col, 'cleaned_narrative'
        )['reduction_pct']

        report = f"""
{'='*70}
EDA AND PREPROCESSING SUMMARY REPORT
CrediTrust Financial - Complaint Analysis System
{'='*70}
1. DATASET OVERVIEW
   ‚Ä¢ Complaints with target products: {len(df):,}
   ‚Ä¢ Date range: {df['Date received'].min()} to {df['Date received'].max()}
2. TARGET PRODUCTS ({len(self.target_products)} Products)
{target_lines}
3. PRODUCT DISTRIBUTION
"""
        for product, count in df['Product'].value_counts().items():
            pct = count / len(df) * 100
            report += f" ‚Ä¢ {product}: {count:,} ({pct:.1f}%)\n"

        report += f"""
4. NARRATIVE ANALYSIS
   ‚Ä¢ Complaints WITH narratives: {len(df):,} (100% ‚Äî filtered)
   ‚Ä¢ Average length: {word_counts.mean():.0f} words
   ‚Ä¢ Median length: {word_counts.median():.0f} words
   ‚Ä¢ Very short (<10 words): {(word_counts < 10).sum():,}
   ‚Ä¢ Very long (>1000 words): {(word_counts > 1000).sum():,}
5. TEXT PREPROCESSING APPLIED
   ‚Ä¢ Lowercasing: ‚úì
   ‚Ä¢ Boilerplate removal: ‚úì
   ‚Ä¢ Special character cleaning: ‚úì
   ‚Ä¢ Whitespace normalization: ‚úì
   ‚Ä¢ Average text reduction: {reduction_pct:.2f}%
6. OUTPUT FILES GENERATED
   ‚Ä¢ data/filtered_complaints.csv
   ‚Ä¢ output/product_distribution.png
   ‚Ä¢ output/narrative_analysis.png
   ‚Ä¢ output/eda_summary_report.txt
7. NEXT STEPS (Task 2)
   ‚Ä¢ Chunk cleaned narratives
   ‚Ä¢ Generate embeddings
   ‚Ä¢ Store in FAISS vector DB
   ‚Ä¢ Estimated chunks: ~{len(df) * 3:,}
{'='*70}
END OF REPORT
{'='*70}
        """

        print(report)

        report_path = Path('output/eda_summary_report.txt')
        report_path.parent.mkdir(parents=True, exist_ok=True)
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report)

        print(f"‚úì Report saved: {report_path}")

    def run(self):
        """Run every pipeline step in order.

        Returns:
            True when all steps completed; False when loading/filtering
            produced no usable data (later steps are then skipped).
        """
        print("\n" + "="*70)
        print("COMPLAINT ANALYSIS PIPELINE - TASK 1 (MEMORY OPTIMIZED)")
        print("="*70)

        Path('output').mkdir(exist_ok=True)

        # Step 1: Load and filter in one pass
        if not self.load_and_filter_data():
            print("\n‚úó Pipeline failed during data loading/filtering")
            return False

        # Step 2: EDA on filtered data
        self.run_filtered_eda()

        # Step 3: Preprocess text
        self.preprocess_text()

        # Step 4: Save data
        self.save_data()

        # Step 5: Generate report
        self.generate_report()

        print("\n" + "=" * 70)
        print("‚úì PIPELINE COMPLETED SUCCESSFULLY!")
        print("=" * 70)
        print("\nGenerated Files:")
        print(" ‚Ä¢ data/filtered_complaints.csv")
        print(" ‚Ä¢ output/product_distribution.png")
        print(" ‚Ä¢ output/narrative_analysis.png")
        print(" ‚Ä¢ output/eda_summary_report.txt")

        return True

In [66]:
# Build the pipeline for the raw complaints CSV (path is relative to the
# notebook's working directory). Construction only wires up the helper
# objects and creates the 'output' directory; no data is read yet.
processor = ComplaintProcessor('../data/raw/complaints.csv')

In [67]:
# Run load/filter -> EDA -> text cleaning -> save -> report. As the cell's last
# expression, the boolean returned by run() is displayed (False means the
# load/filter step produced no usable data).
processor.run()


COMPLAINT ANALYSIS PIPELINE - TASK 1 (MEMORY OPTIMIZED)

LOADING & FILTERING DATA (TARGET PRODUCTS ONLY)
üìå Target products:
 1. Credit card
 2. Credit card or prepaid card
 3. Student loan
 4. Checking or savings account
 5. Money transfer, virtual currency, or money service

üìÇ Loading only target products in chunks...
üìÇ Loading data in chunks of 50,000...
 Processed 10 chunks: 19,676 rows retained
 Processed 20 chunks: 47,602 rows retained
 Processed 30 chunks: 72,784 rows retained
 Processed 40 chunks: 115,870 rows retained
 Processed 50 chunks: 155,081 rows retained
 Processed 60 chunks: 182,656 rows retained
 Processed 70 chunks: 218,477 rows retained
 Processed 80 chunks: 263,404 rows retained
 Processed 90 chunks: 312,973 rows retained
 Processed 100 chunks: 369,679 rows retained
 Processed 110 chunks: 434,457 rows retained
 Processed 120 chunks: 508,918 rows retained
 Processed 130 chunks: 584,686 rows retained
 Processed 140 chunks: 656,516 rows retained
 Processed 15

True