 EDA FOR CREDITRUST FINANCIAL
 ML Engineer Analysis - Customer Complaint Intelligence


In [2]:
# ============================================================================
# üì¶ SECTION 1: EXECUTIVE SETUP & BUSINESS CONTEXT
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Professional imports for advanced NLP
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from wordcloud import WordCloud
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-eng')

# Set professional aesthetics
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 200)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\G5\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\G5\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\G5\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading omw-eng: Package 'omw-eng' not found in
[nltk_data]     index


In [3]:
import os

# Get the current notebook directory
current_dir = os.getcwd()  # This is 'd:/10 acadamy/Intelligent Complaint Analysis for Financial Services/notebokks'

# Go up one level to project root, then navigate to data/raw
project_root = os.path.dirname(current_dir)  # Goes up one level
data_path = os.path.join(project_root, 'data', 'raw', 'complaints.csv')

print(f"Loading data from: {data_path}")

Loading data from: d:\10 acadamy\Intelligent Complaint Analysis for Financial Services\data\raw\complaints.csv


DATA LOADING WITH MEMORY OPTIMIZATION

In [4]:
# ============================================================================
# üìà SECTION 2: DATA LOADING WITH MEMORY OPTIMIZATION
# ============================================================================

print("\n" + "=" * 100)
print("üì¶ PHASE 1: DATA ACQUISITION & INITIAL ASSESSMENT")
print("=" * 100)

# Get the correct path to your data
import os
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
data_path = os.path.join(project_root, 'data', 'raw', 'complaints.csv')

print(f"‚úÖ Correct data path calculated: {data_path}")

# Optimized data types for memory efficiency
dtype_strategy = {
    'Complaint ID': 'str',
    'Date received': 'str',
    'Product': 'category',
    'Sub-product': 'category',
    'Issue': 'category',
    'Sub-issue': 'category',
    'Company': 'category',
    'State': 'category',
    'ZIP code': 'str',
    'Tags': 'category',
    'Consumer consent provided?': 'category',
    'Submitted via': 'category',
    'Company response to consumer': 'category',
    'Timely response?': 'category',
    'Consumer disputed?': 'category',
    'Consumer complaint narrative': 'object'
}

# Load data in chunks
print("üöÄ Loading 464K+ complaint database...")
chunks = []
chunk_size = 50000

# CRITICAL: Use data_path variable here, not the hardcoded string
for i, chunk in enumerate(pd.read_csv(data_path,
                                       dtype=dtype_strategy,
                                       chunksize=chunk_size,
                                       parse_dates=['Date received'],
                                       infer_datetime_format=True)):
    chunks.append(chunk)
    if (i + 1) % 5 == 0:
        print(f"   üìä Chunk {i+1}: {len(chunk):,} records loaded")
    
df = pd.concat(chunks, ignore_index=True)


üì¶ PHASE 1: DATA ACQUISITION & INITIAL ASSESSMENT
‚úÖ Correct data path calculated: d:\10 acadamy\Intelligent Complaint Analysis for Financial Services\data\raw\complaints.csv
üöÄ Loading 464K+ complaint database...
   üìä Chunk 5: 50,000 records loaded
   üìä Chunk 10: 50,000 records loaded
   üìä Chunk 15: 50,000 records loaded
   üìä Chunk 20: 50,000 records loaded
   üìä Chunk 25: 50,000 records loaded
   üìä Chunk 30: 50,000 records loaded
   üìä Chunk 35: 50,000 records loaded
   üìä Chunk 40: 50,000 records loaded
   üìä Chunk 45: 50,000 records loaded
   üìä Chunk 50: 50,000 records loaded
   üìä Chunk 55: 50,000 records loaded
   üìä Chunk 60: 50,000 records loaded
   üìä Chunk 65: 50,000 records loaded
   üìä Chunk 70: 50,000 records loaded
   üìä Chunk 75: 50,000 records loaded
   üìä Chunk 80: 50,000 records loaded
   üìä Chunk 85: 50,000 records loaded
   üìä Chunk 90: 50,000 records loaded
   üìä Chunk 95: 50,000 records loaded
   üìä Chunk 100: 50

In [5]:
print(f"\n‚úÖ DATA LOADED SUCCESSFULLY")
print(f"   Total Records: {df.shape[0]:,}")
print(f"   Total Features: {df.shape[1]}")
print(f"   Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"   Time Range: {df['Date received'].min().date()} to {df['Date received'].max().date()}")


‚úÖ DATA LOADED SUCCESSFULLY
   Total Records: 9,609,797
   Total Features: 18
   Memory Usage: 12188.49 MB
   Time Range: 2011-12-01 to 2025-06-23


EXECUTIVE DATA QUALITY DASHBOARD

In [6]:
# ============================================================================
# üìä SECTION 3: EXECUTIVE DATA QUALITY DASHBOARD
# ============================================================================

print("\n" + "=" * 100)
print("üîç PHASE 2: DATA QUALITY ASSESSMENT")
print("=" * 100)

# First, display the DataFrame shape
print(f"üìä DATAFRAME SHAPE: {df.shape}")
print(f"   ‚Ä¢ Total Rows: {df.shape[0]:,}")
print(f"   ‚Ä¢ Total Columns: {df.shape[1]}")

# Create comprehensive data quality report
quality_metrics = {}

# 1. Missing Values Analysis
print("\n" + "-" * 80)
print("üîç MISSING VALUES ANALYSIS")
print("-" * 80)

missing_data = df.isnull().sum()
missing_percentage = (missing_data / len(df)) * 100

# Display missing values summary
print(f"\nüìã Total missing cells in dataset: {missing_data.sum():,}")

# Display top 10 columns with most missing values
print("\nüìä TOP 10 COLUMNS WITH MISSING VALUES:")
print("-" * 50)

# Create a DataFrame for better display
missing_df = pd.DataFrame({
    'Missing_Count': missing_data,
    'Missing_Percentage': missing_percentage
}).sort_values('Missing_Count', ascending=False)

# Display top 10
print(missing_df.head(10).to_string())

quality_metrics['missing_values'] = {
    'total_missing_cells': missing_data.sum(),
    'missing_percentage_overall': (missing_data.sum() / (df.shape[0] * df.shape[1]) * 100),
    'critical_missing_narratives': missing_data['Consumer complaint narrative'],
    'critical_missing_percentage': missing_percentage['Consumer complaint narrative']
}

# Display the critical narrative missing info
print(f"\n‚ö†Ô∏è  CRITICAL FIELD - Consumer Complaint Narrative:")
print(f"   ‚Ä¢ Missing narratives: {quality_metrics['missing_values']['critical_missing_narratives']:,}")
print(f"   ‚Ä¢ Percentage missing: {quality_metrics['missing_values']['critical_missing_percentage']:.1f}%")

# 2. Visualize missing values
print("\n" + "-" * 80)
print("üìà MISSING VALUES HEATMAP PREVIEW")
print("-" * 80)

# For large datasets, sample to create visualization
if len(df) > 10000:
    sample_size = min(10000, len(df))
    missing_sample = df.sample(sample_size).isnull()
    print(f"(Showing heatmap for {sample_size:,} sample rows)")
else:
    missing_sample = df.isnull()

# Calculate percentage of missing per column
missing_summary = missing_sample.sum().sort_values(ascending=False)
missing_pct = (missing_summary / len(missing_sample)) * 100

print("\nüìä COLUMNS WITH > 0% MISSING VALUES:")
for col in missing_pct[missing_pct > 0].index:
    print(f"   ‚Ä¢ {col}: {missing_pct[col]:.1f}% missing ({missing_summary[col]:,} rows)")


üîç PHASE 2: DATA QUALITY ASSESSMENT
üìä DATAFRAME SHAPE: (9609797, 18)
   ‚Ä¢ Total Rows: 9,609,797
   ‚Ä¢ Total Columns: 18

--------------------------------------------------------------------------------
üîç MISSING VALUES ANALYSIS
--------------------------------------------------------------------------------

üìã Total missing cells in dataset: 32,030,923

üìä TOP 10 COLUMNS WITH MISSING VALUES:
--------------------------------------------------
                              Missing_Count  Missing_Percentage
Tags                                8981029           93.457011
Consumer disputed?                  8841498           92.005044
Consumer complaint narrative        6629041           68.982113
Company public response             4770207           49.638999
Consumer consent provided?          1649561           17.165409
Sub-issue                            839522            8.736105
Sub-product                          235295            2.448491
State                    

In [7]:
print("\nüìã DATA QUALITY METRICS:")
print("-" * 80)

print(f"1Ô∏è‚É£  Completeness:")
print(f"   ‚Ä¢ Narratives Missing: {quality_metrics['missing_values']['critical_missing_narratives']:,} "
      f"({quality_metrics['missing_values']['critical_missing_percentage']:.1f}%)")
print(f"   ‚Ä¢ Overall Data Completeness: {(100 - quality_metrics['missing_values']['missing_percentage_overall']):.1f}%")

# 2. Duplicate Analysis
duplicate_count = df.duplicated(subset=['Complaint ID']).sum()
quality_metrics['duplicates'] = {
    'total_duplicates': duplicate_count,
    'duplicate_percentage': (duplicate_count / len(df)) * 100
}

print(f"\n2Ô∏è‚É£  Uniqueness:")
print(f"   ‚Ä¢ Duplicate Complaints: {duplicate_count:,} "
      f"({quality_metrics['duplicates']['duplicate_percentage']:.1f}%)")


üìã DATA QUALITY METRICS:
--------------------------------------------------------------------------------
1Ô∏è‚É£  Completeness:
   ‚Ä¢ Narratives Missing: 6,629,041 (69.0%)
   ‚Ä¢ Overall Data Completeness: 81.5%

2Ô∏è‚É£  Uniqueness:
   ‚Ä¢ Duplicate Complaints: 0 (0.0%)


In [8]:
# 3. Temporal Coverage
date_range_days = (df['Date received'].max() - df['Date received'].min()).days
quality_metrics['temporal'] = {
    'date_range_days': date_range_days,
    'complaints_per_day': len(df) / date_range_days,
    'start_date': df['Date received'].min().date(),
    'end_date': df['Date received'].max().date()
}

print(f"\n3Ô∏è‚É£  Temporal Coverage:")
print(f"   ‚Ä¢ Time Period: {quality_metrics['temporal']['start_date']} to {quality_metrics['temporal']['end_date']}")
print(f"   ‚Ä¢ Total Days: {date_range_days:,} days")
print(f"   ‚Ä¢ Average Complaints/Day: {quality_metrics['temporal']['complaints_per_day']:.1f}")


3Ô∏è‚É£  Temporal Coverage:
   ‚Ä¢ Time Period: 2011-12-01 to 2025-06-23
   ‚Ä¢ Total Days: 4,953 days
   ‚Ä¢ Average Complaints/Day: 1940.2


In [9]:
# 4. Cardinality Analysis
quality_metrics['cardinality'] = {
    'unique_products': df['Product'].nunique(),
    'unique_companies': df['Company'].nunique(),
    'unique_states': df['State'].nunique(),
    'unique_issues': df['Issue'].nunique()
}

print(f"\n4Ô∏è‚É£  Data Diversity:")
print(f"   ‚Ä¢ Unique Products: {quality_metrics['cardinality']['unique_products']}")
print(f"   ‚Ä¢ Unique Companies: {quality_metrics['cardinality']['unique_companies']:,}")
print(f"   ‚Ä¢ Unique Issues: {quality_metrics['cardinality']['unique_issues']}")
print(f"   ‚Ä¢ States Covered: {quality_metrics['cardinality']['unique_states']}")


4Ô∏è‚É£  Data Diversity:
   ‚Ä¢ Unique Products: 21
   ‚Ä¢ Unique Companies: 7,674
   ‚Ä¢ Unique Issues: 178
   ‚Ä¢ States Covered: 63


ADVANCED PRODUCT ANALYSIS - BUSINESS FOCUS

In [10]:
# ============================================================================
# üìà SECTION 4: ADVANCED PRODUCT ANALYSIS - BUSINESS FOCUS
# ============================================================================

print("\n" + "=" * 100)
print("üéØ PHASE 3: PRODUCT ANALYSIS - CREDITRUST BUSINESS MAPPING")
print("=" * 100)

# CRITICAL: First filter for NLP-viable data
print("‚ö†Ô∏è  APPLYING NLP-VIABILITY FILTER (69% of data lacks narratives)")
viable_df = df[df['Consumer complaint narrative'].notna()].copy()
print(f"   ‚Ä¢ Original dataset: {len(df):,} complaints")
print(f"   ‚Ä¢ NLP-viable dataset: {len(viable_df):,} complaints ({len(viable_df)/len(df)*100:.1f}%)")

# Create business-focused product mapping
product_mapping = {
    # Credit Cards (Our Core Product)
    'Credit card': 'Credit Card',
    'Credit card or prepaid card': 'Credit Card',
    'Prepaid card': 'Credit Card',
    
    # Personal Loans (Our Product)
    'Payday loan, title loan, or personal loan': 'Personal Loan',
    'Consumer Loan': 'Personal Loan',
    'Vehicle loan or lease': 'Personal Loan',
    
    # Savings Accounts (Our Product)
    'Bank account or service': 'Savings Account',
    'Checking or savings account': 'Savings Account',
    'Savings account': 'Savings Account',
    
    # Money Transfers (Our Product)
    'Money transfer, virtual currency, or money service': 'Money Transfer',
    'Virtual currency': 'Money Transfer',
    
    # Other categories for context
    'Mortgage': 'Mortgage',
    'Student loan': 'Student Loan',
    'Debt collection': 'Debt Collection',
    'Credit reporting, credit repair services, or other personal consumer reports': 'Credit Reporting'
}

# Apply mapping to BOTH datasets
df['Product_Category'] = df['Product'].map(product_mapping).fillna('Other')
viable_df['Product_Category'] = viable_df['Product'].map(product_mapping).fillna('Other')

# Business Impact Analysis
print("\nüìä BUSINESS-RELEVANT COMPLAINT DISTRIBUTION:")
print("-" * 80)

our_products = ['Credit Card', 'Personal Loan', 'Savings Account', 'Money Transfer']

# Analyze FULL dataset for overall trends
business_df_full = df[df['Product_Category'].isin(our_products)]
total_business_complaints_full = len(business_df_full)

# Analyze NLP-VIABLE dataset for text analysis
business_df_viable = viable_df[viable_df['Product_Category'].isin(our_products)]
total_business_complaints_viable = len(business_df_viable)

print(f"üìà OVERALL TRENDS (All 9.6M complaints):")
print(f"   ‚Ä¢ Total Complaints in Database: {len(df):,}")
print(f"   ‚Ä¢ Complaints Relevant to CrediTrust: {total_business_complaints_full:,} "
      f"({(total_business_complaints_full/len(df)*100):.1f}%)")

print(f"\nüéØ NLP-ANALYZABLE DATA (3.0M with narratives):")
print(f"   ‚Ä¢ NLP-viable Complaints: {len(viable_df):,}")
print(f"   ‚Ä¢ Business-relevant & NLP-viable: {total_business_complaints_viable:,} "
      f"({(total_business_complaints_viable/len(viable_df)*100):.1f}% of viable data)")

# Detailed product breakdown - SHOW BOTH PERSPECTIVES
print("\nüìä PRODUCT-WISE BREAKDOWN:")
print("-" * 80)
print(f"{'Product':<20} {'Total':>12} {'NLP-Viable':>12} {'Viable %':>10}")

for product in our_products:
    # Full dataset counts
    total_count = len(df[df['Product_Category'] == product])
    
    # NLP-viable counts
    viable_count = len(viable_df[viable_df['Product_Category'] == product])
    
    # Calculate percentage viable
    viable_pct = (viable_count / total_count * 100) if total_count > 0 else 0
    
    # Determine severity
    if viable_pct > 50:
        severity = "‚úÖ HIGH"
    elif viable_pct > 30:
        severity = "‚ö†Ô∏è MEDIUM"
    else:
        severity = "üö® LOW"
    
    print(f"   ‚Ä¢ {product:<20} {total_count:>12,} {viable_count:>12,} {viable_pct:>9.1f}% {severity}")

# Calculate overall viability percentage for business products
total_viable_pct = (total_business_complaints_viable / total_business_complaints_full * 100) if total_business_complaints_full > 0 else 0

print(f"\nüìà KEY BUSINESS INSIGHT:")
print(f"   ‚Ä¢ Only {total_viable_pct:.1f}% of business-relevant complaints have analyzable narratives")
print(f"   ‚Ä¢ For NLP/AI analysis, focus on {total_business_complaints_viable:,} complaints")
print(f"   ‚Ä¢ {total_business_complaints_full - total_business_complaints_viable:,} business complaints cannot be text-analyzed")

# Create a visualization-ready summary
product_summary = pd.DataFrame({
    'Product': our_products,
    'Total_Complaints': [len(df[df['Product_Category'] == p]) for p in our_products],
    'NLP_Viable': [len(viable_df[viable_df['Product_Category'] == p]) for p in our_products]
})

product_summary['Viable_Pct'] = (product_summary['NLP_Viable'] / product_summary['Total_Complaints'] * 100)
product_summary['Missing_Narratives'] = product_summary['Total_Complaints'] - product_summary['NLP_Viable']

print("\nüìã SUMMARY DATAFRAME:")
print(product_summary.to_string())


üéØ PHASE 3: PRODUCT ANALYSIS - CREDITRUST BUSINESS MAPPING
‚ö†Ô∏è  APPLYING NLP-VIABILITY FILTER (69% of data lacks narratives)
   ‚Ä¢ Original dataset: 9,609,797 complaints
   ‚Ä¢ NLP-viable dataset: 2,980,756 complaints (31.0%)

üìä BUSINESS-RELEVANT COMPLAINT DISTRIBUTION:
--------------------------------------------------------------------------------
üìà OVERALL TRENDS (All 9.6M complaints):
   ‚Ä¢ Total Complaints in Database: 9,609,797
   ‚Ä¢ Complaints Relevant to CrediTrust: 1,105,974 (11.5%)

üéØ NLP-ANALYZABLE DATA (3.0M with narratives):
   ‚Ä¢ NLP-viable Complaints: 2,980,756
   ‚Ä¢ Business-relevant & NLP-viable: 515,810 (17.3% of viable data)

üìä PRODUCT-WISE BREAKDOWN:
--------------------------------------------------------------------------------
Product                     Total   NLP-Viable   Viable %
   ‚Ä¢ Credit Card               448,335      197,126      44.0% ‚ö†Ô∏è MEDIUM
   ‚Ä¢ Personal Loan             135,172       66,276      49.0% ‚ö†Ô∏è MEDIUM
 

CLASS BALANCE & STATISTICAL ANALYSIS

In [11]:
# ============================================================================
# üìä SECTION 5: CLASS BALANCE & STATISTICAL ANALYSIS
# ============================================================================

print("\n" + "=" * 100)
print("‚öñÔ∏è PHASE 4: CLASS BALANCE & STATISTICAL ANALYSIS")
print("=" * 100)

# CRITICAL: Use the filtered business data from Section 4
print("üìä USING NLP-VIABLE BUSINESS DATA FROM SECTION 4")
print(f"   ‚Ä¢ Business-relevant complaints: {len(business_df_viable):,}")
print(f"   ‚Ä¢ Business complaints with narratives: {len(business_df_viable):,}")

# Calculate product distribution for NLP-VIABLE business data
product_distribution = business_df_viable['Product_Category'].value_counts()
product_percentage = (product_distribution / len(business_df_viable) * 100)

print("\nüìä PRODUCT DISTRIBUTION (NLP-Viable Business Data):")
print("-" * 80)

for product, count, percent in zip(product_distribution.index, 
                                   product_distribution.values, 
                                   product_percentage.values):
    severity = "üö® HIGH" if percent > 25 else "‚ö†Ô∏è MEDIUM" if percent > 15 else "‚úÖ LOW"
    print(f"   ‚Ä¢ {product:<20} {count:>8,} complaints ({percent:>5.1f}%) {severity}")

# 1. Class Balance Visualization - DUAL PERSPECTIVE
fig1 = make_subplots(
    rows=1, cols=3,
    subplot_titles=('All Products (Full Dataset)', 
                    'Our Products (Full Dataset)',
                    'Our Products (NLP-Viable)'),
    specs=[[{'type': 'pie'}, {'type': 'pie'}, {'type': 'pie'}]],
    column_widths=[0.33, 0.33, 0.34]
)

# Chart 1: All products in FULL dataset (top 10)
all_counts_full = df['Product'].value_counts().head(10)
fig1.add_trace(
    go.Pie(
        labels=all_counts_full.index,
        values=all_counts_full.values,
        hole=0.3,
        name='All Products (Full)',
        marker=dict(colors=px.colors.qualitative.Set3),
        textinfo='label+percent',
        textposition='inside'
    ),
    row=1, col=1
)

# Chart 2: Our products in FULL dataset
our_products = ['Credit Card', 'Personal Loan', 'Savings Account', 'Money Transfer']
business_df_full = df[df['Product_Category'].isin(our_products)]
our_counts_full = business_df_full['Product_Category'].value_counts()

fig1.add_trace(
    go.Pie(
        labels=our_counts_full.index,
        values=our_counts_full.values,
        hole=0.3,
        name='Our Products (Full)',
        marker=dict(colors=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']),
        textinfo='label+percent',
        textposition='inside'
    ),
    row=1, col=2
)

# Chart 3: Our products in NLP-VIABLE dataset (FOR AI ANALYSIS)
fig1.add_trace(
    go.Pie(
        labels=product_distribution.index,
        values=product_distribution.values,
        hole=0.3,
        name='Our Products (NLP-Viable)',
        marker=dict(colors=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']),
        textinfo='label+percent',
        textposition='inside'
    ),
    row=1, col=3
)

fig1.update_layout(
    title_text="<b>Class Balance Analysis</b><br><i>Comparing Full Dataset vs NLP-Viable Data</i>",
    title_font_size=16,
    showlegend=True,
    height=500,
    annotations=[
        dict(text="9.6M Total", x=0.12, y=1.05, xref="paper", yref="paper", showarrow=False, font=dict(size=12)),
        dict(text=f"{len(business_df_full):,} Business", x=0.5, y=1.05, xref="paper", yref="paper", showarrow=False, font=dict(size=12)),
        dict(text=f"{len(business_df_viable):,} NLP-Viable", x=0.88, y=1.05, xref="paper", yref="paper", showarrow=False, font=dict(size=12))
    ]
)

# Create reports directory if it doesn't exist
import os
os.makedirs('reports', exist_ok=True)

fig1.write_html("reports/class_balance_analysis.html")
print("\n‚úÖ Saved class balance visualization: reports/class_balance_analysis.html")

# 2. Statistical Imbalance Metrics - FOR NLP-VIABLE DATA
print("\nüìä STATISTICAL IMBALANCE ANALYSIS (NLP-Viable Business Data):")
print("-" * 80)

if len(product_distribution) > 1:
    imbalance_ratio = product_distribution.max() / product_distribution.min()
    gini_coefficient = 1 - sum((product_distribution / product_distribution.sum())**2)
    
    print(f"   ‚Ä¢ Max/Min Ratio: {imbalance_ratio:.2f}x (Higher = More Imbalanced)")
    print(f"   ‚Ä¢ Gini Coefficient: {gini_coefficient:.3f} (0=Perfect Balance, 1=Maximum Imbalance)")
    print(f"   ‚Ä¢ Entropy Score: {(-sum((product_distribution/product_distribution.sum()) * np.log2(product_distribution/product_distribution.sum()))):.3f}")
    
    if imbalance_ratio > 10:
        print(f"   ‚ö†Ô∏è  WARNING: Severe class imbalance detected (>10x ratio)")
        print(f"   üí° RECOMMENDATION: Consider stratified sampling or weighted loss in AI model")
    elif imbalance_ratio > 5:
        print(f"   ‚ö†Ô∏è  NOTICE: Moderate class imbalance detected")
        print(f"   üí° RECOMMENDATION: Monitor performance across all classes")
    else:
        print(f"   ‚úÖ GOOD: Class balance is acceptable for AI modeling")
else:
    print("   ‚ö†Ô∏è  Not enough product categories for imbalance analysis")

# 3. Narrative Viability by Product
print("\nüìà NARRATIVE VIABILITY BY PRODUCT CATEGORY:")
print("-" * 80)

for product in our_products:
    total = len(df[df['Product_Category'] == product])
    viable = len(viable_df[viable_df['Product_Category'] == product])
    pct = (viable / total * 100) if total > 0 else 0
    
    print(f"   ‚Ä¢ {product:<20} {viable:>8,}/{total:>8,} ({pct:>5.1f}%) have narratives")


‚öñÔ∏è PHASE 4: CLASS BALANCE & STATISTICAL ANALYSIS
üìä USING NLP-VIABLE BUSINESS DATA FROM SECTION 4
   ‚Ä¢ Business-relevant complaints: 515,810
   ‚Ä¢ Business complaints with narratives: 515,810

üìä PRODUCT DISTRIBUTION (NLP-Viable Business Data):
--------------------------------------------------------------------------------
   ‚Ä¢ Credit Card           197,126 complaints ( 38.2%) üö® HIGH
   ‚Ä¢ Savings Account       155,204 complaints ( 30.1%) üö® HIGH
   ‚Ä¢ Money Transfer         97,204 complaints ( 18.8%) ‚ö†Ô∏è MEDIUM
   ‚Ä¢ Personal Loan          66,276 complaints ( 12.8%) ‚úÖ LOW

‚úÖ Saved class balance visualization: reports/class_balance_analysis.html

üìä STATISTICAL IMBALANCE ANALYSIS (NLP-Viable Business Data):
--------------------------------------------------------------------------------
   ‚Ä¢ Max/Min Ratio: 2.97x (Higher = More Imbalanced)
   ‚Ä¢ Gini Coefficient: 0.711 (0=Perfect Balance, 1=Maximum Imbalance)
   ‚Ä¢ Entropy Score: 1.886
   ‚úÖ GOOD: Cl

ADVANCED TEXT ANALYSIS - NLP DEPTH

In [13]:
# ============================================================================
# üéØ CRITICAL: CREATE NLP-VIABLE DATASET BEFORE SECTION 6
# ============================================================================

print("\n" + "=" * 100)
print("üéØ CREATING NLP-VIABLE DATASET FOR TEXT ANALYSIS")
print("=" * 100)

# 1. Filter for complaints WITH narratives (31% of data)
viable_df = df[df['Consumer complaint narrative'].notna()].copy()
print(f"‚úÖ Created viable_df: {len(viable_df):,} complaints with narratives")
print(f"   ‚Ä¢ From total dataset of: {len(df):,} complaints")
print(f"   ‚Ä¢ Percentage with narratives: {len(viable_df)/len(df)*100:.1f}%")

# 2. Apply product mapping to viable_df
print("\nüìä Applying product mapping to NLP-viable data...")
product_mapping = {
    'Credit card': 'Credit Card',
    'Credit card or prepaid card': 'Credit Card',
    'Prepaid card': 'Credit Card',
    'Payday loan, title loan, or personal loan': 'Personal Loan',
    'Consumer Loan': 'Personal Loan',
    'Vehicle loan or lease': 'Personal Loan',
    'Bank account or service': 'Savings Account',
    'Checking or savings account': 'Savings Account',
    'Savings account': 'Savings Account',
    'Money transfer, virtual currency, or money service': 'Money Transfer',
    'Virtual currency': 'Money Transfer',
    'Mortgage': 'Mortgage',
    'Student loan': 'Student Loan',
    'Debt collection': 'Debt Collection',
    'Credit reporting, credit repair services, or other personal consumer reports': 'Credit Reporting'
}

viable_df['Product_Category'] = viable_df['Product'].map(product_mapping).fillna('Other')

# 3. Create business_df_viable (NLP-viable AND business-relevant)
our_products = ['Credit Card', 'Personal Loan', 'Savings Account', 'Money Transfer']
business_df_viable = viable_df[viable_df['Product_Category'].isin(our_products)]

print(f"\n‚úÖ Created business_df_viable: {len(business_df_viable):,} complaints")
print(f"   ‚Ä¢ NLP-viable AND business-relevant")
print(f"   ‚Ä¢ Products: {', '.join(our_products)}")

print("\n" + "=" * 100)
print("üéØ READY FOR TEXT ANALYSIS SECTIONS 6-10")
print("=" * 100)


üéØ CREATING NLP-VIABLE DATASET FOR TEXT ANALYSIS
‚úÖ Created viable_df: 2,980,756 complaints with narratives
   ‚Ä¢ From total dataset of: 9,609,797 complaints
   ‚Ä¢ Percentage with narratives: 31.0%

üìä Applying product mapping to NLP-viable data...

‚úÖ Created business_df_viable: 515,810 complaints
   ‚Ä¢ NLP-viable AND business-relevant
   ‚Ä¢ Products: Credit Card, Personal Loan, Savings Account, Money Transfer

üéØ READY FOR TEXT ANALYSIS SECTIONS 6-10


In [None]:
# ============================================================================
# üìù SECTION 6: ADVANCED TEXT ANALYSIS - NLP DEPTH
# ============================================================================

print("\n" + "=" * 100)
print("üìù PHASE 5: ADVANCED TEXT ANALYSIS - NLP INSIGHTS")
print("=" * 100)

print(f"üìä Analyzing NLP-Viable Dataset: {len(viable_df):,} complaints with narrative text")
print(f"   (This is {len(viable_df)/len(df)*100:.1f}% of the total {len(df):,} complaints)")

# 1. Document Length Analysis
print("\nüìè DOCUMENT LENGTH ANALYSIS:")
print("-" * 80)

# Calculate comprehensive text statistics ON THE VIABLE DATA
viable_df['Narrative_Length_Chars'] = viable_df['Consumer complaint narrative'].str.len()
viable_df['Narrative_Length_Words'] = viable_df['Consumer complaint narrative'].str.split().str.len()
viable_df['Narrative_Length_Sentences'] = viable_df['Consumer complaint narrative'].apply(
    lambda x: len(sent_tokenize(str(x))) if pd.notna(x) else 0
)

# Use viable_df for statistics
text_stats = viable_df[['Narrative_Length_Chars', 'Narrative_Length_Words', 'Narrative_Length_Sentences']].describe()

print("üìà Summary Statistics (for complaints WITH narratives):")
print(text_stats.round(1))

# Identify outliers IN THE VIABLE DATA
Q1 = viable_df['Narrative_Length_Words'].quantile(0.25)
Q3 = viable_df['Narrative_Length_Words'].quantile(0.75)
IQR = Q3 - Q1

outliers = viable_df[(viable_df['Narrative_Length_Words'] < (Q1 - 1.5 * IQR)) | 
                     (viable_df['Narrative_Length_Words'] > (Q3 + 1.5 * IQR))]

print(f"\nüìä Outlier Detection (within narratives):")
print(f"   ‚Ä¢ Short Outliers (< {Q1 - 1.5 * IQR:.0f} words): {len(outliers[outliers['Narrative_Length_Words'] < (Q1 - 1.5 * IQR)])}")
print(f"   ‚Ä¢ Long Outliers (> {Q3 + 1.5 * IQR:.0f} words): {len(outliers[outliers['Narrative_Length_Words'] > (Q3 + 1.5 * IQR)])}")
print(f"   ‚Ä¢ Total Outliers: {len(outliers):,} ({len(outliers)/len(viable_df)*100:.1f}% of viable data)")

# 2. Length Distribution Visualization
fig2 = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Character Length Distribution', 
                    'Word Length Distribution',
                    'Sentence Length Distribution',
                    'Length vs Product Category'),
    specs=[[{'type': 'histogram'}, {'type': 'histogram'}],
           [{'type': 'histogram'}, {'type': 'box'}]]
)

# Character length - USE viable_df
fig2.add_trace(
    go.Histogram(
        x=viable_df['Narrative_Length_Chars'].dropna(),
        nbinsx=50,
        name='Characters',
        marker_color='#FF6B6B'
    ),
    row=1, col=1
)

# Word length - USE viable_df
fig2.add_trace(
    go.Histogram(
        x=viable_df['Narrative_Length_Words'].dropna(),
        nbinsx=50,
        name='Words',
        marker_color='#4ECDC4'
    ),
    row=1, col=2
)

# Sentence length - USE viable_df
fig2.add_trace(
    go.Histogram(
        x=viable_df['Narrative_Length_Sentences'].dropna(),
        nbinsx=30,
        name='Sentences',
        marker_color='#45B7D1'
    ),
    row=2, col=1
)

# Box plot by product - USE business_df_viable (which is a subset of viable_df)
for product in our_products:
    subset = business_df_viable[business_df_viable['Product_Category'] == product]
    fig2.add_trace(
        go.Box(
            y=subset['Narrative_Length_Words'],
            name=product,
            boxpoints='outliers',
            marker_color={'Credit Card': '#FF6B6B', 
                         'Personal Loan': '#4ECDC4',
                         'Savings Account': '#45B7D1',
                         'Money Transfer': '#96CEB4'}[product]
        ),
        row=2, col=2
    )

fig2.update_layout(
    title_text="<b>Text Length Analysis</b><br><i>Statistical Distribution of NLP-Viable Complaint Narratives</i>",
    title_font_size=18,
    height=700,
    showlegend=False
)

# Create reports directory if it doesn't exist
import os
os.makedirs('reports', exist_ok=True)

fig2.write_html("reports/text_length_analysis.html")
print("\n‚úÖ Saved text length analysis visualization: reports/text_length_analysis.html")


üìù PHASE 5: ADVANCED TEXT ANALYSIS - NLP INSIGHTS
üìä Analyzing NLP-Viable Dataset: 2,980,756 complaints with narrative text
   (This is 31.0% of the total 9,609,797 complaints)

üìè DOCUMENT LENGTH ANALYSIS:
--------------------------------------------------------------------------------


In [None]:
# ============================================================================
# üî§ SECTION 7: VOCABULARY & LINGUISTIC ANALYSIS
# ============================================================================

print("\n" + "=" * 100)
print("üî§ PHASE 6: VOCABULARY & LINGUISTIC ANALYSIS")
print("=" * 100)

# Note: We are analyzing ONLY the viable complaints (with narratives)
print(f"üìä Analyzing vocabulary for {len(business_df_viable):,} business-relevant, NLP-viable complaints")

# Sample for vocabulary analysis (for performance)
sample_size = min(10000, len(business_df_viable))
sample_df = business_df_viable.sample(sample_size, random_state=42)
print(f"   ‚Ä¢ Using sample of {sample_size:,} complaints for vocabulary analysis")

def analyze_vocabulary(text_series):
    """Advanced vocabulary analysis"""
    all_words = []
    for text in text_series.dropna():
        tokens = word_tokenize(str(text).lower())
        all_words.extend(tokens)
    
    word_counts = Counter(all_words)
    total_words = len(all_words)
    unique_words = len(word_counts)
    
    return {
        'total_words': total_words,
        'unique_words': unique_words,
        'vocabulary_richness': unique_words / total_words if total_words > 0 else 0,
        'top_words': word_counts.most_common(20)
    }

print("\nüìä VOCABULARY ANALYSIS ACROSS PRODUCTS (NLP-Viable Data):")
print("-" * 80)

vocab_results = {}
for product in our_products:
    product_texts = business_df_viable[business_df_viable['Product_Category'] == product]['Consumer complaint narrative']
    if len(product_texts) > 0:
        vocab_results[product] = analyze_vocabulary(product_texts)
        
        print(f"\n{product}:")
        print(f"   ‚Ä¢ Total Words: {vocab_results[product]['total_words']:,}")
        print(f"   ‚Ä¢ Unique Words: {vocab_results[product]['unique_words']:,}")
        print(f"   ‚Ä¢ Vocabulary Richness: {vocab_results[product]['vocabulary_richness']:.4f}")
        print(f"   ‚Ä¢ Top 5 Words: {[word for word, count in vocab_results[product]['top_words'][:5]]}")
    else:
        print(f"\n{product}: No narrative data available")

# Calculate vocabulary overlap
print("\nüìä VOCABULARY OVERLAP ANALYSIS (NLP-Viable Products):")
print("-" * 80)

# Get unique words per product from NLP-viable data
product_vocabs = {}
for product in our_products:
    all_words = []
    product_data = business_df_viable[business_df_viable['Product_Category'] == product]
    for text in product_data['Consumer complaint narrative'].dropna():
        tokens = word_tokenize(str(text).lower())
        all_words.extend(tokens)
    product_vocabs[product] = set(all_words)
    print(f"   ‚Ä¢ {product}: {len(product_vocabs[product]):,} unique words")

# Calculate Jaccard similarity between product vocabularies
from itertools import combinations

overlap_matrix = pd.DataFrame(index=our_products, columns=our_products)

for prod1, prod2 in combinations(our_products, 2):
    if len(product_vocabs[prod1]) > 0 and len(product_vocabs[prod2]) > 0:
        intersection = len(product_vocabs[prod1].intersection(product_vocabs[prod2]))
        union = len(product_vocabs[prod1].union(product_vocabs[prod2]))
        jaccard_similarity = intersection / union if union > 0 else 0
        
        overlap_matrix.loc[prod1, prod2] = jaccard_similarity
        overlap_matrix.loc[prod2, prod1] = jaccard_similarity
    else:
        overlap_matrix.loc[prod1, prod2] = 0
        overlap_matrix.loc[prod2, prod1] = 0

# Fill diagonal
for product in our_products:
    overlap_matrix.loc[product, product] = 1.0

print("\nJaccard Similarity Matrix (Vocabulary Overlap in NLP-Viable Data):")
print(overlap_matrix.round(3))

In [None]:
# ============================================================================
# üßπ SECTION 8: ADVANCED TEXT CLEANING PIPELINE
# ============================================================================

print("\n" + "=" * 100)
print("üßπ PHASE 7: ADVANCED TEXT CLEANING PIPELINE")
print("=" * 100)

print(f"üîß Applying text cleaning to {len(business_df_viable):,} NLP-viable business complaints")

class AdvancedTextCleaner:
    """Production-grade text cleaner with NLP techniques"""
    
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        # Add domain-specific stopwords
        self.domain_stopwords = {
            'bank', 'account', 'card', 'loan', 'company', 
            'service', 'customer', 'please', 'thank', 'would',
            'could', 'should', 'also', 'however', 'therefore'
        }
        self.stop_words.update(self.domain_stopwords)
        
        # Regex patterns for noise removal
        self.patterns = {
            'email': r'\S+@\S+',
            'phone': r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
            'ssn': r'\d{3}-\d{2}-\d{4}',
            'url': r'https?://\S+|www\.\S+',
            'account_number': r'account\s*(?:no|number|#)?\s*:?\s*\d+',
            'date': r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
            'currency': r'\$\d+(?:\.\d{2})?',
            'special_chars': r'[^\w\s.,!?;:\-\'"]',
            'extra_spaces': r'\s+'
        }
    
    def clean_text(self, text):
        """Complete text cleaning pipeline"""
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Remove boilerplate patterns
        boilerplate_phrases = [
            r'dear\s+(?:sir|madam|team|customer\s+service)',
            r'to\s+whom\s+it\s+may\s+concern',
            r'i\s+am\s+writing\s+(?:to|because|regarding)',
            r'this\s+is\s+(?:a|to)\s+(?:file|submit|report)',
            r'please\s+be\s+(?:advised|informed|noted)',
            r'thank\s+you\s+(?:in\s+advance|for\s+your\s+(?:time|help|attention))',
            r'sincerely\s*yours?',
            r'best\s+regards',
            r'kind\s+regards',
            r'regards',
            r'respectfully',
            r'yours\s+truly'
        ]
        
        for phrase in boilerplate_phrases:
            text = re.sub(phrase, '', text, flags=re.IGNORECASE)
        
        # Remove structured patterns
        for pattern_name, pattern in self.patterns.items():
            if pattern_name in ['email', 'phone', 'ssn', 'url', 'account_number']:
                text = re.sub(pattern, '[REDACTED]', text)
            elif pattern_name == 'special_chars':
                text = re.sub(pattern, ' ', text)
            elif pattern_name == 'extra_spaces':
                text = re.sub(pattern, ' ', text)
        
        # Tokenize and process
        tokens = word_tokenize(text)
        
        # Remove stopwords (but keep negation words)
        negation_words = {'not', 'no', 'never', 'none', 'nothing', 'nowhere'}
        filtered_tokens = [
            token for token in tokens 
            if token not in self.stop_words or token in negation_words
        ]
        
        # Apply lemmatization
        lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in filtered_tokens]
        
        # Reconstruct text
        cleaned_text = ' '.join(lemmatized_tokens)
        
        return cleaned_text.strip()
    
    def analyze_cleaning_impact(self, original_text, cleaned_text):
        """Analyze cleaning impact"""
        original_words = len(word_tokenize(str(original_text)))
        cleaned_words = len(word_tokenize(str(cleaned_text)))
        
        return {
            'original_length': original_words,
            'cleaned_length': cleaned_words,
            'reduction_percentage': ((original_words - cleaned_words) / original_words * 100) if original_words > 0 else 0,
            'stopwords_removed': original_words - cleaned_words
        }

print("üîß Initializing advanced text cleaner...")
cleaner = AdvancedTextCleaner()

# Test cleaning pipeline
test_cases = [
    "Dear Sir, I am writing to file a complaint about my credit card billing. My account number is 123456789. Please contact me at john@email.com or 555-123-4567. Thank you in advance.",
    "This is to REPORT a serious issue with my LOAN. The bank charged me $500 extra fees!!! I want this resolved ASAP.",
    "Not happy with the service. The representative was not helpful at all. Never using this bank again."
]

print("\nüß™ TESTING CLEANING PIPELINE:")
print("-" * 80)

for i, test_case in enumerate(test_cases[:3]):
    cleaned = cleaner.clean_text(test_case)
    impact = cleaner.analyze_cleaning_impact(test_case, cleaned)
    
    print(f"\nTest Case {i+1}:")
    print(f"   Original: {test_case[:100]}...")
    print(f"   Cleaned: {cleaned[:100]}...")
    print(f"   Impact: {impact['reduction_percentage']:.1f}% reduction "
          f"({impact['original_length']} ‚Üí {impact['cleaned_length']} words)")

# Apply cleaning to NLP-viable business data
print(f"\nüöÄ Applying cleaning to {len(business_df_viable):,} NLP-viable business complaints...")
business_df_viable['Cleaned_Narrative'] = business_df_viable['Consumer complaint narrative'].apply(cleaner.clean_text)

# Analyze cleaning impact on NLP-viable data
original_lengths = business_df_viable['Consumer complaint narrative'].str.split().str.len()
cleaned_lengths = business_df_viable['Cleaned_Narrative'].str.split().str.len()

cleaning_summary = {
    'avg_original_length': original_lengths.mean(),
    'avg_cleaned_length': cleaned_lengths.mean(),
    'avg_reduction': ((original_lengths - cleaned_lengths) / original_lengths * 100).mean(),
    'total_words_removed': (original_lengths - cleaned_lengths).sum()
}

print("\nüìä CLEANING IMPACT SUMMARY (NLP-Viable Business Data):")
print("-" * 80)
print(f"   ‚Ä¢ Average Original Length: {cleaning_summary['avg_original_length']:.1f} words")
print(f"   ‚Ä¢ Average Cleaned Length: {cleaning_summary['avg_cleaned_length']:.1f} words")
print(f"   ‚Ä¢ Average Reduction: {cleaning_summary['avg_reduction']:.1f}%")
print(f"   ‚Ä¢ Total Words Removed: {cleaning_summary['total_words_removed']:,}")

In [None]:
# ============================================================================
# üìä SECTION 9: SENTIMENT & TOPIC ANALYSIS
# ============================================================================

print("\n" + "=" * 100)
print("üé≠ PHASE 8: SENTIMENT & TOPIC ANALYSIS")
print("=" * 100)

# 1. Sentiment Analysis on CLEANED NLP-viable narratives
def analyze_sentiment(text):
    """Basic sentiment analysis"""
    if pd.isna(text) or len(str(text).strip()) < 10:
        return 0.0
    
    analysis = TextBlob(str(text))
    return analysis.sentiment.polarity  # -1 to 1

print("üìà Calculating sentiment scores for cleaned NLP-viable narratives...")
business_df_viable['Sentiment_Score'] = business_df_viable['Cleaned_Narrative'].apply(analyze_sentiment)

# Sentiment distribution by product
sentiment_by_product = business_df_viable.groupby('Product_Category')['Sentiment_Score'].agg(['mean', 'std', 'count'])

print("\nüìä SENTIMENT ANALYSIS BY PRODUCT (NLP-Viable Business Data):")
print("-" * 80)
for product in our_products:
    if product in sentiment_by_product.index:
        mean_sentiment = sentiment_by_product.loc[product, 'mean']
        if mean_sentiment < -0.1:
            sentiment_label = "üò† NEGATIVE"
        elif mean_sentiment < 0.1:
            sentiment_label = "üòê NEUTRAL"
        else:
            sentiment_label = "üòä POSITIVE"
        print(f"   ‚Ä¢ {product:<20} {mean_sentiment:>6.3f} {sentiment_label} (n={sentiment_by_product.loc[product, 'count']:,})")
    else:
        print(f"   ‚Ä¢ {product:<20} No data available")

# 2. Topic/Issue Analysis
print("\nüìä TOP ISSUES BY PRODUCT CATEGORY (NLP-Viable Data):")
print("-" * 80)

# Get top issues for each product
for product in our_products:
    product_data = business_df_viable[business_df_viable['Product_Category'] == product]
    if len(product_data) > 0:
        top_issues = product_data['Issue'].value_counts().head(3)
        
        print(f"\n{product} (n={len(product_data):,}):")
        for issue, count in top_issues.items():
            percentage = (count / len(product_data)) * 100
            print(f"   ‚Ä¢ {issue}: {count:,} complaints ({percentage:.1f}%)")
    else:
        print(f"\n{product}: No NLP-viable data available")

# 3. Sentiment distribution visualization
print("\nüìà SENTIMENT DISTRIBUTION SUMMARY:")
print("-" * 80)

# Categorize sentiments
def categorize_sentiment(score):
    if score < -0.1:
        return "Negative"
    elif score < 0.1:
        return "Neutral"
    else:
        return "Positive"

business_df_viable['Sentiment_Category'] = business_df_viable['Sentiment_Score'].apply(categorize_sentiment)
sentiment_dist = business_df_viable['Sentiment_Category'].value_counts()

for sentiment, count in sentiment_dist.items():
    percentage = (count / len(business_df_viable)) * 100
    print(f"   ‚Ä¢ {sentiment:<10} {count:>8,} complaints ({percentage:>5.1f}%)")

# 4. Issue-Sentiment correlation
print("\nüìä MOST NEGATIVE ISSUES (Top 5 by Average Sentiment):")
print("-" * 80)

if 'Issue' in business_df_viable.columns:
    issue_sentiment = business_df_viable.groupby('Issue')['Sentiment_Score'].agg(['mean', 'count'])
    # Filter for issues with at least 100 complaints
    issue_sentiment = issue_sentiment[issue_sentiment['count'] >= 100]
    most_negative = issue_sentiment.sort_values('mean').head(5)
    
    for issue, row in most_negative.iterrows():
        print(f"   ‚Ä¢ {issue}: {row['mean']:.3f} sentiment (n={row['count']:,})")

In [None]:
# ============================================================================
# üìà SECTION 10: TF-IDF & KEYWORD ANALYSIS
# ============================================================================

print("\n" + "=" * 100)
print("üîë PHASE 9: TF-IDF & KEYWORD ANALYSIS")
print("=" * 100)

print(f"üìä Performing TF-IDF analysis on {len(business_df_viable):,} cleaned NLP-viable narratives")

# Sample data for TF-IDF (for performance)
sample_size_tfidf = min(5000, len(business_df_viable))
tfidf_sample = business_df_viable.sample(sample_size_tfidf, random_state=42)
print(f"   ‚Ä¢ Using sample of {sample_size_tfidf:,} complaints for TF-IDF analysis")

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),  # Include bigrams
    min_df=5,  # Minimum document frequency
    max_df=0.8  # Maximum document frequency
)

# Fit and transform on CLEANED narratives
try:
    tfidf_matrix = tfidf.fit_transform(tfidf_sample['Cleaned_Narrative'])
    feature_names = tfidf.get_feature_names_out()
    
    print(f"‚úÖ TF-IDF matrix created: {tfidf_matrix.shape[0]} documents √ó {tfidf_matrix.shape[1]} features")
    
    # Get top keywords for each product
    print("\nüîç TOP KEYWORDS BY PRODUCT (TF-IDF on Cleaned NLP-Viable Data):")
    print("-" * 80)
    
    for product in our_products:
        product_mask = tfidf_sample['Product_Category'] == product
        
        if product_mask.sum() > 0:
            # Calculate average TF-IDF for this product
            product_tfidf = tfidf_matrix[product_mask].mean(axis=0).A1
            top_indices = product_tfidf.argsort()[-10:][::-1]
            top_keywords = [feature_names[i] for i in top_indices]
            
            print(f"\n{product} (n={product_mask.sum():,}):")
            print(f"   ‚Ä¢ Top Keywords: {', '.join(top_keywords[:5])}")
            print(f"   ‚Ä¢ All Top 10: {', '.join(top_keywords)}")
        else:
            print(f"\n{product}: No data in sample")
    
    # Get overall top keywords
    print("\nüîç OVERALL TOP KEYWORDS (All NLP-Viable Business Data):")
    print("-" * 80)
    
    overall_tfidf = tfidf_matrix.mean(axis=0).A1
    top_indices = overall_tfidf.argsort()[-20:][::-1]
    top_keywords = [feature_names[i] for i in top_indices]
    
    print("Top 20 Keywords by TF-IDF Score:")
    for i in range(0, len(top_keywords), 5):
        print(f"   ‚Ä¢ {', '.join(top_keywords[i:i+5])}")
    
    # Analyze keyword uniqueness by product
    print("\nüìä KEYWORD UNIQUENESS ANALYSIS:")
    print("-" * 80)
    
    product_keywords = {}
    for product in our_products:
        product_mask = tfidf_sample['Product_Category'] == product
        if product_mask.sum() > 10:  # Need enough documents
            product_tfidf = tfidf_matrix[product_mask].mean(axis=0).A1
            # Get keywords where this product has score > 0.1 and others < 0.05
            other_products_mask = tfidf_sample['Product_Category'] != product
            other_tfidf = tfidf_matrix[other_products_mask].mean(axis=0).A1
            
            unique_indices = np.where((product_tfidf > 0.1) & (other_tfidf < 0.05))[0]
            unique_keywords = [feature_names[i] for i in unique_indices[:5]]  # Top 5 unique
            
            if len(unique_keywords) > 0:
                print(f"   ‚Ä¢ {product}: {', '.join(unique_keywords)}")
            else:
                print(f"   ‚Ä¢ {product}: No strongly unique keywords")
    
except Exception as e:
    print(f"‚ö†Ô∏è  Error in TF-IDF analysis: {e}")
    print("   This can happen if there's insufficient text data after cleaning.")
    print("   Try reducing min_df parameter or checking cleaned text quality.")

# Additional keyword analysis using frequency
print("\nüìä FREQUENCY-BASED KEYWORD ANALYSIS:")
print("-" * 80)

from collections import Counter

# Analyze most common words in cleaned narratives
all_words = []
for text in business_df_viable['Cleaned_Narrative'].dropna():
    tokens = word_tokenize(str(text))
    all_words.extend(tokens)

word_freq = Counter(all_words)
print(f"Total words in cleaned narratives: {len(all_words):,}")
print(f"Unique words: {len(word_freq):,}")

print("\nMost Common Words (excluding stopwords):")
common_words = [(word, count) for word, count in word_freq.most_common(30) 
                if word not in cleaner.stop_words and len(word) > 2]
for i in range(0, len(common_words), 5):
    words_batch = common_words[i:i+5]
    print(f"   ‚Ä¢ {', '.join([f'{w}({c:,})' for w, c in words_batch])}")