## Converting pdf to csv

In [47]:
"""
STAGE 1: PDF TO CSV CONVERSION
Extracts M-Pesa statement from PDF and saves as CSV
"""

import tabula
import pandas as pd

def convert_mpesa_pdf_to_csv(pdf_path: str, password: str, output_csv: str):
    """
    Convert M-Pesa PDF statement to CSV
    
    Args:
        pdf_path: Path to M-Pesa PDF statement
        password: PDF password
        output_csv: Output CSV file path
    """
    
    print("=" * 80)
    print("STAGE 1: PDF TO CSV CONVERSION")
    print("=" * 80)
    print()
    
    print(f"üìÑ Reading PDF: {pdf_path}")
    print(f"üîê Using password: {'*' * len(password)}")
    print()
    
    # Extract ALL tables from ALL pages with 'latin-1' encoding
    tables = tabula.read_pdf(
        pdf_path, 
        password=password,
        encoding='latin-1',
        pages='all',  # Extract from all pages
        multiple_tables=True  # Get all tables on each page
    )
    
    print(f"‚úì Extracted {len(tables)} tables from PDF")
    
    # Combine all tables
    df = pd.concat(tables, ignore_index=True)
    
    print(f"‚úì Combined into {len(df)} rows and {len(df.columns)} columns")
    print()
    
    # Save to CSV
    df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    
    print(f"‚úÖ CSV saved successfully: {output_csv}")
    print()
    
    # Show preview
    print("=" * 80)
    print("PREVIEW - First 5 rows:")
    print("=" * 80)
    print(df.head())
    print()
    
    print("=" * 80)
    print("PREVIEW - Last 5 rows:")
    print("=" * 80)
    print(df.tail())
    print()
    
    print(f"Total rows: {len(df):,}")
    print(f"Total columns: {len(df.columns)}")
    print()
    
    return df


if __name__ == "__main__":
    # CONFIGURE THESE PATHS
    PDF_FILE = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\mpesa_statement_john.pdf"
    PDF_PASSWORD = "335419"  # Your PDF password
    OUTPUT_CSV = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\stage1_mpesa_raw.csv"
    
    # Run conversion
    df = convert_mpesa_pdf_to_csv(PDF_FILE, PDF_PASSWORD, OUTPUT_CSV)
    
    print("‚úÖ STAGE 1 COMPLETE!")
    print(f"Output: {OUTPUT_CSV}")

STAGE 1: PDF TO CSV CONVERSION

üìÑ Reading PDF: C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\mpesa_statement_john.pdf
üîê Using password: ******



Got stderr: Feb 17, 2026 1:18:53 AM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>



‚úì Extracted 148 tables from PDF
‚úì Combined into 2869 rows and 14 columns

‚úÖ CSV saved successfully: C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\stage1_mpesa_raw.csv

PREVIEW - First 5 rows:
   Unnamed: 0  Unnamed: 1           TRANSACTION TYPE       PAID IN  \
0         NaN         NaN                SEND MONEY:          0.00   
1         NaN         NaN            RECEIVED MONEY:    581,070.00   
2         NaN         NaN             AGENT DEPOSIT:  2,028,550.00   
3         NaN         NaN          AGENT WITHDRAWAL:          0.00   
4         NaN         NaN  LIPA NA M-PESA (PAYBILL):          0.00   

       PAID OUT Receipt No. Completion Time Details Transaction Status  \
0    781,669.00         NaN             NaN     NaN                NaN   
1          0.00         NaN             NaN     NaN                NaN   
2          0.00         NaN             NaN     NaN                NaN   
3    146,179.00         NaN             NaN     NaN                NaN   
4  1,99

## Key Categorization

In [48]:
"""
STAGE 2: TRANSACTION TYPE IDENTIFICATION (COMPLETE FIX)
‚úÖ Separates Fuliza LOAN (OverDraft of Credit Party) from Fuliza PAYMENTS
‚úÖ Loan Repayment as separate category
‚úÖ All other fixes included
"""

import pandas as pd
import re
from typing import Dict


class TransactionTypeIdentifier:
    """Enhanced transaction type identification with all fixes"""
    
    def __init__(self):
        """Define patterns in strict priority order"""
        # Format: (type_name, [patterns], priority)
        self.type_patterns = [
            # PRIORITY 1: Fees (check first - often confused with other types)
            ('M-Pesa Fee', [
                r'transfer\s+of\s+funds\s+charge',
                r'pay\s+bill\s+charge',
                r'pay\s+merchant\s+charge',
                r'withdraw(al)?\s+charge',
                r'\bcharge\b$',
            ], 1),
            
            # PRIORITY 2: Fuliza/Overdraft LOAN (the credit itself - OverDraft of Credit Party)
            ('Fuliza', [
                r'overdraft\s+of\s+credit\s+party',  # This is the LOAN
            ], 2),
            
            # PRIORITY 3: Loan Repayment (paying back loans, including Fuliza payments)
            ('Loan Repayment', [
                r'od\s+loan\s+repayment',
                r'loan\s+repayment',
                r'fuliza\s+repayment',
                r'overdraw',
            ], 3),
            
            # PRIORITY 4: LOOP Payment (Income from LOOP)
            ('LOOP Payment', [
                r'promotion\s+payment\s+from.*loop\s+b2c',
                r'loop\s+b2c',
            ], 4),
            
            # PRIORITY 5: Received Money (Income)
            ('Received Money', [
                r'funds\s+received\s+from',
                r'business\s+payment\s+from',
                r'received\s+from',
            ], 5),
            
            # PRIORITY 6: Cash Deposit (at agent)
            ('Cash Deposit', [
                r'deposit\s+of\s+funds\s+at\s+agent',
            ], 6),
            
            # PRIORITY 7: Cash Withdrawal (at agent)
            ('Cash Withdrawal', [
                r'customer\s+withdrawal\s+at\s+agent',
                r'withdrawal\s+at\s+agent',
            ], 7),
            
            # PRIORITY 8: Data Bundles (separated from Airtime)
            # Including Fuliza-powered data bundles
            ('Data Bundles', [
                r'safaricom\s+data',
                r'safaricom\s+data\s+bundles',
                r'customer\s+bundle\s+purchase\s+with\s+fuliza.*4093441',
                r'(?i)buy\s+bundle',
                r'(?i)customer\s+bundle\s+purchase',
                 r'customer\s+bundle\s+purchase\s+with\s+fuliza',
            
            ], 8),
            
            # PRIORITY 9: Airtime (separated from Data, includes Direct Pay)
            # Including Fuliza-powered airtime
            ('Airtime', [
                r'(?i)safaricom\s+offers',  # Safaricom Offers = Airtime
                r'airtime\s+purchase',
                r'pay\s+bill.*direct\s+pay.*atl\d+',  # Direct Pay airtime
                r'4187661.*direct\s+pay',  # Direct Pay paybill
                r'4093275.*direct\s+pay',  # Another Direct Pay paybill
                r'recharge\s+for\s+customer',
            ], 9),
            
            # PRIORITY 10: Send Money (including Fuliza-powered transfers)
            ('Send Money', [
                r'(?i)customer\s+transfer\s+to\s+-\s+(2547|07|01)[\d\*]+',
                r'customer\s+transfer\s+to\s+-\s+',
                r'(?i)customer\stransfer',
                r'customer\s+send\s+money.*fuliza.*to\s+-\s+(2547|07|01)[\d\*]+',
                r'(?i)customer\s+transfer\s+fuliza\s+mpesa\s*to\s+-\s+(2547|07|01)[\d\*]+',
            ], 10),
            
            # PRIORITY 11: Pochi la Biashara
            ('Pochi la Biashara', [
                r'customer\s+payment\s+to\s+small\s+business',
            ], 11),
            
            # PRIORITY 12: Till Payment (including Fuliza-powered)
            ('Till Payment', [
                r'merchant\s+payment\s+(online\s+)?to\s+\d+',
                r'merchant\s+payment\s+fuliza\s+m-?pesa\s*to\s+\d+',
                r'till\s+\d+',
            ], 12),
            
            # PRIORITY 13: PayBill (including Fuliza-powered)
            ('PayBill', [
                r'pay\s+bill\s+(online\s+)?to\s+\d+',
                r'pay\s+bill\s+fuliza\s+m-?pesa\s+to\s+\d+',
                r'pay\s+bill\s+online\s+fuliza\s+m-pesa\s+to\s+(\d+)\s+-\s+([\w\s]+?)\s+acc\.?\s+([\w\s]+)',
            ], 13),
            
            # PRIORITY 14: M-Shwari
            ('M-Shwari', [
                r'm-?\s*shwari',
            ], 14),
            
            # PRIORITY 15: Unit Trust
            ('Unit Trust', [
                r'unit\s+trust',
                r'ziidi',
            ], 15),
            
            # PRIORITY 16: Reversal
            ('Reversal', [
                r'reversal',
                r'reversed',
            ], 16),
        ]
    
    def identify_type(self, description: str) -> str:
        """Identify transaction type"""
        if pd.isna(description) or description == '':
            return 'Other'
        
        desc_lower = str(description).lower().strip()
        
        # Check in priority order
        for trans_type, patterns, _ in self.type_patterns:
            for pattern in patterns:
                if re.search(pattern, desc_lower, re.IGNORECASE):
                    return trans_type
        
        return 'Other'
    
    def extract_fields(self, description: str, txn_type: str) -> Dict:
        """Extract key fields from description"""
        if pd.isna(description):
            return {}
        
        fields = {}
        desc = str(description)
        
        if txn_type == "Send Money":
            # Regular transfer
            match = re.search(
                r'(?i)customer\s+transfer\s+(?:fuliza\s+mpesa\s*)?to\s+-\s+((2547|07|01)[\d\*]+)\s+(.*)',
                desc
            )
            if match:
                fields["recipient_number"] = match.group(1)
                fields["recipient_name"] = match.group(3).strip()
        
        elif txn_type == "Pochi la Biashara":
            match = re.search(
                r'(?i)small\s+business\s+to\s+-\s+((2547|07|01)[\d\*]+)\s+(.*)',
                desc
            )
            if match:
                fields["recipient_number"] = match.group(1)
                fields["recipient_name"] = match.group(3).strip()
        
        elif txn_type == "Till Payment":
            # Regular or Fuliza merchant payment
            match = re.search(
                r'(?i)merchant\s+payment\s+(?:fuliza\s+m-?pesa\s*)?(?:online\s+)?to\s+(\d+)\s+-\s+(.*)',
                desc
            )
            if match:
                fields["till_number"] = match.group(1)
                raw_merchant = match.group(2).strip()
                raw_merchant = re.sub(
                    r'(?i)\s+via\s+(coop|equity|kcb|ncba|family)\s+bank\.?$',
                    '', raw_merchant
                ).strip()
                fields["merchant_name"] = raw_merchant
        
        elif txn_type == "PayBill":
            # Regular or Fuliza paybill
            match = re.search(
                r'(?i)pay\s+bill\s+(?:fuliza\s+m-?pesa\s*)?(?:online\s+)?to\s+(\d+)\s+[-‚Äì]\s+([\w\s]+?)\s+[Aa]cc\.?\s+([\w#]+)',
                desc
            )
            if match:
                fields["paybill_number"] = match.group(1)
                fields["merchant_name"] = match.group(2).strip()
                fields["account_number"] = match.group(3).strip()
            else:
                match2 = re.search(
                    r'(?i)pay\s+bill\s+(?:fuliza\s+m-?pesa\s*)?(?:online\s+)?to\s+(\d+)\s+[-‚Äì]?\s+(.*)',
                    desc
                )
                if match2:
                    fields["paybill_number"] = match2.group(1)
                    fields["merchant_name"] = match2.group(2).strip()
        
        elif txn_type in ["Cash Withdrawal", "Cash Deposit"]:
            match = re.search(
                r'(?i)agent\s+till\s+(\d+)\s+[-‚Äì]\s+(.*)',
                desc
            )
            if match:
                fields["agent_till"] = match.group(1)
                fields["agent_name"] = match.group(2).strip()
        
        elif txn_type in ["Received Money", "LOOP Payment"]:
            match = re.search(
                r'(?i)(?:funds\s+received|payment)\s+from\s+[-‚Äì]?\s+(\d+)\s+[-‚Äì]\s+(.*)',
                desc
            )
            if match:
                fields["sender_number"] = match.group(1)
                fields["sender_name"] = match.group(2).strip()
        
        return fields
    
    def process_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add transaction_type and extracted_fields"""
        print("üîç Identifying transaction types...")
        
        # Clean description
        df['description_clean'] = df['Details'].apply(self._clean_text)
        
        # Identify types
        df['transaction_type'] = df['description_clean'].apply(self.identify_type)
        
        # Extract fields
        print("üìã Extracting details...")
        df['extracted_fields'] = df.apply(
            lambda row: self.extract_fields(row['description_clean'], row['transaction_type']),
            axis=1
        )
        
        print(f"‚úì Identified {df['transaction_type'].nunique()} transaction types")
        
        return df
    
    def _clean_text(self, text: str) -> str:
        """Clean multiline PDF text"""
        if pd.isna(text):
            return ''
        text = str(text).replace('\\r', ' ').replace('\\n', ' ').replace('\r', ' ').replace('\n', ' ')
        text = re.sub(r'\s+', ' ', text)
        return text.strip()


def run_stage2(input_csv: str, output_csv: str):
    """Run Stage 2"""
    
    print("=" * 80)
    print("STAGE 2: TRANSACTION TYPE IDENTIFICATION (COMPLETE FIX)")
    print("=" * 80)
    print()
    print("‚úÖ OverDraft of Credit Party ‚Üí Fuliza (LOAN)")
    print("‚úÖ Fuliza payments ‚Üí Categorized by what was paid (Airtime, Till, etc.)")
    print("‚úÖ Loan Repayment separate category")
    print("‚úÖ LOOP B2C ‚Üí Income")
    print("‚úÖ Data Bundles (4093441) ‚â† Airtime (826915)")
    print("‚úÖ Direct Pay ‚Üí Airtime")
    print("‚úÖ Deposit/Withdrawal at Agent ‚Üí separate")
    print()
    
    # Load
    print(f"üìÇ Loading: {input_csv}")
    df = pd.read_csv(input_csv, low_memory=False)
    
    # Filter to transactions
    mask = df['Receipt No.'].notna() & (df['Receipt No.'] != '')
    df = df[mask].copy()
    
    print(f"‚úì Loaded {len(df)} transactions")
    print()
    
    # Process
    identifier = TransactionTypeIdentifier()
    df = identifier.process_dataframe(df)
    
    print()
    print("=" * 80)
    print("TRANSACTION TYPE BREAKDOWN")
    print("=" * 80)
    for trans_type, count in df['transaction_type'].value_counts().items():
        pct = (count / len(df)) * 100
        print(f"{trans_type:25s}: {count:5d} ({pct:5.1f}%)")
    
    print()
    print("=" * 80)
    print("VERIFICATION - KEY TYPES")
    print("=" * 80)
    
    # Verify critical fixes
    key_types = ['Fuliza', 'Loan Repayment', 'LOOP Payment', 'Data Bundles', 
                 'Airtime', 'Cash Deposit', 'Cash Withdrawal']
    
    for trans_type in key_types:
        type_df = df[df['transaction_type'] == trans_type]
        if len(type_df) > 0:
            print(f"\n{trans_type} ({len(type_df)} transactions):")
            for _, row in type_df.head(2).iterrows():
                print(f"  {row['description_clean'][:70]}")
    
    print()
    
    # Show Fuliza-powered transactions
    fuliza_powered = df[df['description_clean'].str.contains('fuliza', case=False, na=False)]
    print("=" * 80)
    print(f"FULIZA-POWERED TRANSACTIONS: {len(fuliza_powered)} total")
    print("=" * 80)
    fuliza_breakdown = fuliza_powered['transaction_type'].value_counts()
    for txn_type, count in fuliza_breakdown.items():
        print(f"  {txn_type:25s}: {count:5d}")
    print()
    print("‚úì Fuliza LOAN (OverDraft) vs Fuliza PAYMENTS properly separated!")
    print()
    
    # Save
    df['extracted_fields_str'] = df['extracted_fields'].apply(str)
    df.to_csv(output_csv, index=False)
    print(f"‚úÖ Saved: {output_csv}")
    print()
    
    return df


if __name__ == "__main__":
    INPUT = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\stage1_mpesa_raw.csv"
    OUTPUT = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\stage2_with_types.csv"
    
    df = run_stage2(INPUT, OUTPUT)
    print("‚úÖ Ready for Stage 3")

STAGE 2: TRANSACTION TYPE IDENTIFICATION (COMPLETE FIX)

‚úÖ OverDraft of Credit Party ‚Üí Fuliza (LOAN)
‚úÖ Fuliza payments ‚Üí Categorized by what was paid (Airtime, Till, etc.)
‚úÖ Loan Repayment separate category
‚úÖ LOOP B2C ‚Üí Income
‚úÖ Data Bundles (4093441) ‚â† Airtime (826915)
‚úÖ Direct Pay ‚Üí Airtime
‚úÖ Deposit/Withdrawal at Agent ‚Üí separate

üìÇ Loading: C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\stage1_mpesa_raw.csv
‚úì Loaded 2715 transactions

üîç Identifying transaction types...
üìã Extracting details...
‚úì Identified 13 transaction types

TRANSACTION TYPE BREAKDOWN
M-Pesa Fee               :   779 ( 28.7%)
Send Money               :   625 ( 23.0%)
PayBill                  :   404 ( 14.9%)
Cash Deposit             :   297 ( 10.9%)
Received Money           :   241 (  8.9%)
Pochi la Biashara        :   139 (  5.1%)
M-Shwari                 :    85 (  3.1%)
Till Payment             :    81 (  3.0%)
Airtime                  :    32 (  1.2%)
Cash Withdrawal 

## Transansaction Type Identification

In [49]:
"""
STAGE 3: KEYWORD-BASED CATEGORIZATION (COMPLETE FIX)
All issues addressed:
‚úÖ Removed "online" and "online purchase" from Online Shopping
‚úÖ Removed "fuliza" from Loans (handled in Stage 2)
‚úÖ Fuliza ‚Üí Loans (via transaction type)
‚úÖ LOOP Payment ‚Üí Income
‚úÖ Data Bundles and Airtime now separate
‚úÖ Direct Pay airtime not miscategorized as Online Shopping
"""

import pandas as pd
import re
from typing import Dict


class KeywordCategorizer:
    """Enhanced keyword categorizer with all fixes"""
    
    def __init__(self):
        self.category_keywords = {
            # HIGH PRIORITY
            'Health Care': {
                'keywords': [
                    'hospital', 'clinic', 'pharmacy', 'medical', 'nhif',
                    'chemist', 'doctor', 'laboratory', 'lab', 'diagnostic',
                    'aga khan', 'nairobi hospital', 'mater', 'kenyatta hospital',
                    'mp shah', 'gertrudes', 'lancet', 'dental', 'optical',
                ],
                'priority': 1,
            },
            
            'Betting': {
                'keywords': [
                    'sportpesa', 'sportybet', 'betika', '1xbet', 'stake', 
                    'bangbet', '22bet', 'mozzart bet', 'betway', 'odibets',
                    'kareco holdings', 'melbet', 'betin', 'betpawa', 'shabiki',
                    'bet', 'betting', 'lotto', 'lottery', 'casino',
                ],
                'priority': 1,
            },
            
            # Loans - REMOVED "fuliza" (now handled via transaction type)
            'Loans': {
                'keywords': [
                    'm-shwari loan', 'kcb m-pesa loan', 'hustler fund',
                    'okash', 'zenka', 'timiza', 'Overdraft',
                ],
                'priority': 1,
            },

            'Loan Repayment': {
                'keywords': ['repayment', 'overdraw'],
                'priority': 1,
            },
            
            # Online Shopping - REMOVED "online" and "online purchase"
            'Online Shopping': {
                'keywords': [
                    'jumia', 'kilimall', 'masoko', 'glovo', 'jiji',
                    'aliexpress', 'amazon', 'alibaba', 'uber eats', 'bolt food',
                    'sky garden', 'food delivery', 'home delivery',
                ],
                'priority': 1,
            },
            
            'Bills': {
                'keywords': [
                    'kplc', 'water', 'rent', 'insurance', 'gas refill',
                    'internet', 'home wifi', 'land rates', 'security',
                    'parking', 'electricity', 'prepaid', 'postpaid',
                ],
                'priority': 1,
            },
            
            'Subscriptions': {
                'keywords': [
                    'netflix', 'spotify', 'youtube', 'prime', 'hbo',
                    'gotv', 'dstv', 'showmax', 'apple music', 'startimes',
                    'zuku', 'subscription', 'microsoft 365', 'office 365',
                ],
                'priority': 1,
            },
            
            'Education': {
                'keywords': [
                    'university', 'school', 'college', 'helb', 'kuccps',
                    'knec', 'tvet', 'kmtc', 'fees', 'tuition', 'catering',
                    'kabarak', 'student', 'academy', 'exam fee', 'hostel',
                ],
                'priority': 1,
            },
            
            'Savings': {
                'keywords': [
                    'mshwari deposit', 'unit trust', 'mmf', 'fixed deposit',
                    'investment', 'koala', 'ndovu', 'etica', 'chama',
                    'ziidi', 'savings', 'sacco deposit', 'Sacco'
                ],
                'priority': 1,
            },
            
            # MEDIUM PRIORITY
            'Shopping': {
                'keywords': [
                    'supermarket', 'naivas', 'quickmart', 'quick mart',
                    'carrefour', 'chandarana', 'foodplus', 'cleanshelf',
                    'eastmatt', 'tuskys', 'kabsmart', 'nakumatt', 'Store', 
                ],
                'priority': 2,
            },
            
            'Fast Foods': {
                'keywords': [
                    'kfc', 'chicken inn', 'java house', 'artcaffe',
                    'pizza', 'burger king', 'dominos', 'debonairs',
                    'pizza hut', 'pizza inn', 'subway', 'steers','inn',
                ],
                'priority': 2,
            },
            
            'Food & Dining': {
                'keywords': [
                    'restaurant', 'hotel', 'cafe', 'eatery', 'food court',
                    'dining', 'meat', 'vegetables', 'fruits', 'milk','food',
                ],
                'priority': 2,
            },
            
            'Personal Care': {
                'keywords': [
                    'beauty', 'cosmetics', 'skincare', 'makeup', 'barber',
                    'salon', 'spa', 'kinyozi', 'grooming', 'hair', 'nails',
                ],
                'priority': 2,
            },
            
            'Transport': {
                'keywords': [
                    'uber', 'bolt', 'taxi', 'little cab', 'transport',
                    'fuel', 'petrol', 'diesel', 'shell', 'total', 'parking',
                ],
                'priority': 2,
            },
            
            'Entertainment': {
                'keywords': [
                    'liquor', 'bar', 'wine', 'beer', 'club', 'lounge',
                    'pub', 'cinema', 'bowling', 'arcade', 'entertainment',
                ],
                'priority': 2,
            },
            
            # LOW PRIORITY
            'Bank Transfer': {
                'keywords': [
                    'equity', 'kcb', 'family bank', 'co-op', 'ncba',
                    'stanbic', 'absa', 'bank transfer',
                ],
                'priority': 3,
            },
        }
    
    def categorize(self, description: str, transaction_type: str, extracted_fields: Dict = None) -> str:
        """Categorize transaction"""
        if pd.isna(description):
            return 'Uncategorized'
        
        desc_lower = str(description).lower()
        
        # Build search text
        search_text = desc_lower
        if extracted_fields:
            for key in ['merchant_name', 'recipient_name', 'sender_name', 'agent_name']:
                if key in extracted_fields:
                    search_text += ' ' + str(extracted_fields[key]).lower()
        
        # PRIORITY 1: Transaction type based (FIXED)
        
        # Income
        if transaction_type in ['Received Money', 'LOOP Payment']:
            return 'Income'
        
        # Cash operations
        if transaction_type == 'Cash Deposit':
            return 'Cash Deposit'
        
        if transaction_type == 'Cash Withdrawal':
            return 'Cash Withdrawal'
        
        # Loans (including Fuliza/OverDraft from Stage 2)
        if transaction_type == 'Overdraft':
            return 'Loans'
        
        # Data vs Airtime (now separated in Stage 2)
        if transaction_type == 'Data Bundles':
            return 'Data Bundles'
        
        if transaction_type == 'Airtime':
            return 'Airtime'
        
        # Fees
        if transaction_type == 'M-Pesa Fee':
            return 'M-Pesa Fees'
        
        # M-Shwari
        if transaction_type == 'M-Shwari':
            if 'withdraw' in desc_lower:
                return 'Cash Withdrawal'
            else:
                return 'Savings'
        
        # Unit Trust
        if transaction_type == 'Unit Trust':
            return 'Savings'
        
        # Reversal
        if transaction_type == 'Reversal':
            return 'Reversal'
        
        # PRIORITY 2: Send Money - LEAVE UNCATEGORIZED for Stage 4
        if transaction_type == 'Send Money':
            return 'Uncategorized'
        
        # PRIORITY 3: Till/PayBill/Pochi - Try keywords, fallback to Merchant
        if transaction_type in ['Till Payment', 'PayBill', 'Pochi la Biashara']:
            matched = self._match_keywords(search_text)
            return matched if matched else 'Merchant'
        
        # PRIORITY 4: Other - Try keywords
        matched = self._match_keywords(search_text)
        return matched if matched else 'Other'
    
    def _match_keywords(self, search_text: str) -> str:
        """Match keywords"""
        sorted_categories = sorted(
            self.category_keywords.items(),
            key=lambda x: x[1].get('priority', 99)
        )
        
        for category, rules in sorted_categories:
            for keyword in rules.get('keywords', []):
                pattern = r'\b' + re.escape(keyword) + r'\b'
                if re.search(pattern, search_text, re.IGNORECASE):
                    return category
        
        return ''
    
    def process_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add category column"""
        print("üè∑Ô∏è  Categorizing...")
        
        # Parse extracted_fields
        if 'extracted_fields' in df.columns:
            try:
                import ast
                df['extracted_fields_dict'] = df['extracted_fields'].apply(
                    lambda x: ast.literal_eval(x) if isinstance(x, str) and x.strip() else {}
                )
            except:
                df['extracted_fields_dict'] = df['extracted_fields']
        else:
            df['extracted_fields_dict'] = [{}] * len(df)
        
        # Categorize
        df['category'] = df.apply(
            lambda row: self.categorize(
                row['description_clean'], 
                row['transaction_type'],
                row.get('extracted_fields_dict', {})
            ),
            axis=1
        )
        
        print(f"‚úì Categorized into {df['category'].nunique()} categories")
        
        return df


def run_stage3(input_csv: str, output_csv: str):
    """Run Stage 3"""
    
    print("=" * 80)
    print("STAGE 3: KEYWORD-BASED CATEGORIZATION (COMPLETE FIX)")
    print("=" * 80)
    print()
    print("‚úÖ Removed 'online' from Online Shopping")
    print("‚úÖ Removed 'fuliza' from Loans")
    print("‚úÖ Fuliza ‚Üí Loans (via transaction type)")
    print("‚úÖ LOOP Payment ‚Üí Income")
    print("‚úÖ Data Bundles & Airtime separate")
    print()
    
    # Load
    print(f"üìÇ Loading: {input_csv}")
    df = pd.read_csv(input_csv, low_memory=False)
    print(f"‚úì Loaded {len(df)} transactions")
    print()
    
    # Categorize
    categorizer = KeywordCategorizer()
    df = categorizer.process_dataframe(df)
    
    print()
    print("=" * 80)
    print("CATEGORY BREAKDOWN")
    print("=" * 80)
    for category, count in df['category'].value_counts().items():
        pct = (count / len(df)) * 100
        print(f"{category:30s}: {count:5d} ({pct:5.1f}%)")
    
    print()
    
    # Verify key categories
    print("=" * 80)
    print("VERIFICATION - KEY CATEGORIES")
    print("=" * 80)
    
    key_cats = ['Loans', 'Data Bundles', 'Airtime', 'Income', 
                'Cash Deposit', 'Cash Withdrawal', 'Loan repayment',]
    
    for cat in key_cats:
        cat_df = df[df['category'] == cat]
        if len(cat_df) > 0:
            print(f"\n{cat} ({len(cat_df)} transactions):")
            for _, row in cat_df.head(2).iterrows():
                print(f"  {row['description_clean'][:70]}")
    
    print()
    
    # Save
    df.to_csv(output_csv, index=False)
    print(f"‚úÖ Saved: {output_csv}")
    print()
    
    return df


if __name__ == "__main__":
    INPUT = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\stage2_with_types.csv"
    OUTPUT = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\stage3_with_categories.csv"
    
    df = run_stage3(INPUT, OUTPUT)
    print("‚úÖ Ready for Stage 4")

STAGE 3: KEYWORD-BASED CATEGORIZATION (COMPLETE FIX)

‚úÖ Removed 'online' from Online Shopping
‚úÖ Removed 'fuliza' from Loans
‚úÖ Fuliza ‚Üí Loans (via transaction type)
‚úÖ LOOP Payment ‚Üí Income
‚úÖ Data Bundles & Airtime separate

üìÇ Loading: C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\stage2_with_types.csv
‚úì Loaded 2715 transactions

üè∑Ô∏è  Categorizing...
‚úì Categorized into 16 categories

CATEGORY BREAKDOWN
M-Pesa Fees                   :   779 ( 28.7%)
Uncategorized                 :   626 ( 23.1%)
Merchant                      :   420 ( 15.5%)
Cash Deposit                  :   297 ( 10.9%)
Income                        :   241 (  8.9%)
Bills                         :   104 (  3.8%)
Bank Transfer                 :    87 (  3.2%)
Cash Withdrawal               :    62 (  2.3%)
Savings                       :    46 (  1.7%)
Airtime                       :    32 (  1.2%)
Shopping                      :     6 (  0.2%)
Reversal                      :     4 (  0.1%)
Oth

## Smart Rule

In [None]:
"""
STAGE 4: SEND MONEY CATEGORIZATION
Categorizes uncategorized Send Money transactions based on amount and recurring patterns

RULES:
1. Send Money + Recurring (‚â•2 times) + Amount > 500 ‚Üí Friends & Family
2. Send Money + Recurring (‚â•2 times) + Amount ‚â§ 500 ‚Üí Merchant
3. Send Money + Non-recurring (any amount) ‚Üí Merchant

All other categories from Stage 3 remain unchanged.
All improvements from Stage 2 & 3 are preserved:
‚úÖ Fuliza (LOAN) vs Fuliza payments separated
‚úÖ Loan Repayment as separate category
‚úÖ LOOP Payment ‚Üí Income
‚úÖ Data Bundles ‚â† Airtime
‚úÖ Cash Deposit ‚â† Cash Withdrawal
"""

import pandas as pd
import re
from typing import Dict
from collections import Counter


class SendMoneyCategorizer:
    """Categorize Send Money transactions using recurring detection and amount thresholds"""
    
    def __init__(self, amount_threshold: float = 500.0, recurring_threshold: int = 2):
        """
        Initialize with thresholds
        
        Args:
            amount_threshold: Amount threshold (default 500 KES)
            recurring_threshold: Minimum occurrences to be recurring (default 2)
        """
        self.amount_threshold = amount_threshold
        self.recurring_threshold = recurring_threshold
    
    def extract_recipient_id(self, extracted_fields_str: str) -> str:
        """
        Extract recipient identifier from extracted_fields string
        
        Args:
            extracted_fields_str: String representation of extracted fields
            
        Returns:
            Recipient identifier (phone number or name)
        """
        if pd.isna(extracted_fields_str) or extracted_fields_str == '':
            return None
        
        try:
            import ast
            fields = ast.literal_eval(extracted_fields_str)
            
            # Use phone number as primary identifier
            if 'recipient_number' in fields:
                return fields['recipient_number']
            elif 'recipient_name' in fields:
                return fields['recipient_name']
        except:
            pass
        
        return None
    
    def detect_recurring_recipients(self, df: pd.DataFrame) -> Dict[str, int]:
        """
        Detect recurring recipients in Send Money transactions
        
        Args:
            df: DataFrame with Send Money transactions
            
        Returns:
            Dictionary mapping recipient IDs to occurrence count
        """
        print("üîç Detecting recurring recipients in Send Money transactions...")
        
        # Filter to uncategorized Send Money only
        send_money_df = df[
            (df['transaction_type'] == 'Send Money') & 
            (df['category'] == 'Uncategorized')
        ].copy()
        
        if len(send_money_df) == 0:
            print("  No uncategorized Send Money transactions found")
            return {}
        
        # Extract recipient IDs
        send_money_df['recipient_id'] = send_money_df['extracted_fields_str'].apply(
            self.extract_recipient_id
        )
        
        # Count occurrences
        recipient_counts = send_money_df['recipient_id'].value_counts().to_dict()
        
        # Remove None
        recipient_counts = {k: v for k, v in recipient_counts.items() if k is not None}
        
        # Filter to recurring only
        recurring_recipients = {
            k: v for k, v in recipient_counts.items() 
            if v >= self.recurring_threshold
        }
        
        print(f"‚úì Found {len(recipient_counts)} unique recipients")
        print(f"‚úì {len(recurring_recipients)} recurring recipients (‚â•{self.recurring_threshold} transactions)")
        
        return recurring_recipients
    
    def categorize_send_money(self, row: pd.Series, recurring_recipients: Dict) -> str:
        """
        Categorize a single Send Money transaction
        
        Args:
            row: DataFrame row
            recurring_recipients: Dict of recurring recipient IDs
            
        Returns:
            Category (Friends & Family or Merchant)
        """
        # Extract recipient ID
        recipient_id = self.extract_recipient_id(row['extracted_fields_str'])
        
        # Check if recurring
        is_recurring = recipient_id in recurring_recipients if recipient_id else False
        
        # Get amount from Withdrawn column
        amount = 0
        if pd.notna(row['Withdrawn']):
            try:
                amount = abs(float(row['Withdrawn']))
            except:
                amount = 0
        
        # Apply rules
        if is_recurring:
            if amount > self.amount_threshold:
                # RULE 1: Recurring + >500 ‚Üí Friends & Family
                return 'Friends & Family'
            else:
                # RULE 2: Recurring + ‚â§500 ‚Üí Merchant
                return 'Merchant'
        else:
            # RULE 3: Non-recurring (any amount) ‚Üí Merchant
            return 'Merchant'
    
    def process_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Process entire dataframe - categorize Send Money transactions only
        
        Args:
            df: DataFrame with category column
            
        Returns:
            DataFrame with updated categories
        """
        print("ü§ñ Categorizing Send Money transactions...")
        print()
        
        # Detect recurring recipients
        recurring_recipients = self.detect_recurring_recipients(df)
        
        if recurring_recipients:
            print()
            print("Top 15 recurring recipients:")
            sorted_recipients = sorted(
                recurring_recipients.items(), 
                key=lambda x: x[1], 
                reverse=True
            )[:15]
            for recipient, count in sorted_recipients:
                print(f"  {recipient[:45]:45s}: {count:3d} times")
        
        print()
        
        # Count before
        before_uncat = len(df[
            (df['transaction_type'] == 'Send Money') & 
            (df['category'] == 'Uncategorized')
        ])
        
        # Apply categorization only to uncategorized Send Money
        mask = (df['transaction_type'] == 'Send Money') & (df['category'] == 'Uncategorized')
        
        df.loc[mask, 'category'] = df[mask].apply(
            lambda row: self.categorize_send_money(row, recurring_recipients),
            axis=1
        )
        
        # Count after
        after_uncat = len(df[
            (df['transaction_type'] == 'Send Money') & 
            (df['category'] == 'Uncategorized')
        ])
        family_friends = len(df[
            (df['transaction_type'] == 'Send Money') & 
            (df['category'] == 'Friends & Family')
        ])
        merchant = len(df[
            (df['transaction_type'] == 'Send Money') & 
            (df['category'] == 'Merchant')
        ])
        
        print("=" * 80)
        print("SEND MONEY CATEGORIZATION RESULTS")
        print("=" * 80)
        print(f"Processed: {before_uncat:,} Send Money transactions")
        print(f"  ‚Üí Friends & Family: {family_friends:,}")
        print(f"  ‚Üí Merchant: {merchant:,}")
        print(f"  ‚Üí Still Uncategorized: {after_uncat:,}")
        print()
        
        return df


def run_stage4(input_csv: str, output_csv: str, 
               amount_threshold: float >= 500.0,  
               recurring_threshold: int = 5):
    """
    Run Stage 4: Send Money Categorization
    
    Args:
        input_csv: Path to Stage 3 output CSV
        output_csv: Path to save final categorized CSV
        amount_threshold: Amount threshold in KES (default 500)
        recurring_threshold: Min occurrences for recurring (default 2)
    """
    
    print("=" * 80)
    print("STAGE 4: SEND MONEY CATEGORIZATION")
    print("=" * 80)
    print()
    print("Preserving all Stage 2 & 3 improvements:")
    print("   Fuliza (LOAN) vs Fuliza payments")
    print("   Loan Repayment separate")
    print("   LOOP Payment ‚Üí Income")
    print("   Data Bundles ‚â† Airtime")
    print("   Cash Deposit ‚â† Cash Withdrawal")
    print()
    print("RULES:")
    print(f"  1. Recurring (‚â•{recurring_threshold}) + Amount > {amount_threshold} ‚Üí Friends & Family")
    print(f"  2. Recurring (‚â•{recurring_threshold}) + Amount ‚â§ {amount_threshold} ‚Üí Merchant")
    print(f"  3. Non-recurring (any amount) ‚Üí Merchant")
    print()
    
    # Load data
    print(f" Loading: {input_csv}")
    df = pd.read_csv(input_csv, low_memory=False)
    print(f"‚úì Loaded {len(df):,} transactions")
    print()
    
    # Show current state
    print("Current categorization:")
    category_counts = df['category'].value_counts()
    for category in list(category_counts.head(10).index):
        count = category_counts.get(category, 0)
        pct = (count / len(df)) * 100 if len(df) > 0 else 0
        print(f"  {category:25s}: {count:5,} ({pct:5.1f}%)")
    
    send_money_uncat = len(df[
        (df['transaction_type'] == 'Send Money') & 
        (df['category'] == 'Uncategorized')
    ])
    print(f"\n  Send Money (Uncategorized): {send_money_uncat:,}")
    print()
    
    # Process
    categorizer = SendMoneyCategorizer(
        amount_threshold=amount_threshold,
        recurring_threshold=recurring_threshold
    )
    df = categorizer.process_dataframe(df)
    
    # Final summary
    print("=" * 80)
    print("FINAL CATEGORY BREAKDOWN")
    print("=" * 80)
    
    category_counts = df['category'].value_counts().sort_values(ascending=False)
    for category, count in category_counts.items():
        pct = (count / len(df)) * 100
        print(f"{category:30s}: {count:6,} ({pct:5.1f}%)")
    
    print()
    
    # Uncategorized check
    final_uncat = len(df[df['category'] == 'Uncategorized'])
    final_uncat_pct = (final_uncat / len(df)) * 100
    
    print("=" * 80)
    print("CATEGORIZATION SUCCESS RATE")
    print("=" * 80)
    print(f"Total transactions: {len(df):,}")
    print(f"Categorized: {len(df) - final_uncat:,} ({100 - final_uncat_pct:.1f}%)")
    print(f"Uncategorized: {final_uncat:,} ({final_uncat_pct:.1f}%)")
    print()
    
    # Sample results
    print("=" * 80)
    print("SAMPLE CATEGORIZATIONS")
    print("=" * 80)
    
    # Friends & Family samples
    ff_df = df[
        (df['category'] == 'Friends & Family') & 
        (df['transaction_type'] == 'Send Money')
    ]
    if len(ff_df) > 0:
        print(f"\n--- FRIENDS & FAMILY ({len(ff_df):,} transactions) ---")
        for _, row in ff_df.head(5).iterrows():
            try:
                amount = float(row['Withdrawn']) if pd.notna(row['Withdrawn']) else 0.0
            except:
                amount = 0.0
            desc = row['description_clean'][:55] if pd.notna(row['description_clean']) else ''
            print(f"  KES {amount:>8,.0f} | {desc}")
    
    # Merchant samples from Send Money
    merchant_df = df[
        (df['category'] == 'Merchant') & 
        (df['transaction_type'] == 'Send Money')
    ]
    if len(merchant_df) > 0:
        print(f"\n--- MERCHANT - from Send Money ({len(merchant_df):,} transactions) ---")
        for _, row in merchant_df.head(5).iterrows():
            try:
                amount = float(row['Withdrawn']) if pd.notna(row['Withdrawn']) else 0.0
            except:
                amount = 0.0
            desc = row['description_clean'][:55] if pd.notna(row['description_clean']) else ''
            print(f"  KES {amount:>8,.0f} | {desc}")
    
    print()
    
    # Spending summary
    print("=" * 80)
    print("SPENDING SUMMARY BY CATEGORY")
    print("=" * 80)
    
    spending_categories = df[df['Withdrawn'].notna()].copy()
    spending_categories['Withdrawn'] = pd.to_numeric(spending_categories['Withdrawn'], errors='coerce')
    
    cat_spending = spending_categories.groupby('category')['Withdrawn'].agg([
        ('Total', 'sum'),
        ('Count', 'count'),
        ('Average', 'mean')
    ]).sort_values('Total', ascending=False)
    
    print(f"\n{'Category':<30s} {'Total (KES)':>15s} {'Count':>8s} {'Avg (KES)':>12s}")
    print("-" * 80)
    
    total_spent = 0
    for category, row in cat_spending.head(15).iterrows():
        if category not in ['Income', 'Reversal', 'Cash Deposit']:
            total_spent += row['Total']
            print(f"{category:<30s} {row['Total']:>15,.2f} {int(row['Count']):>8,} {row['Average']:>12,.2f}")
    
    print("-" * 80)
    print(f"{'TOTAL SPENDING':<30s} {total_spent:>15,.2f}")
    print()
    
    # Save
    df.to_csv(output_csv, index=False)
    print(f" Saved final categorized data: {output_csv}")
    print()
    
    print("=" * 80)
    print("STAGE 4 COMPLETE! üéâ")
    print("=" * 80)
    print()
    print("Your M-Pesa data is now fully categorized!")
    print(f"Categorization rate: {100 - final_uncat_pct:.1f}%")
    print(f"Total spending: KES {total_spent:,.2f}")
    print()
    
    return df


if __name__ == "__main__":
    # File paths
    INPUT_CSV = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\stage3_with_categories.csv"
    OUTPUT_CSV = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\stage4_final_categorized.csv"
    
    # Thresholds
    AMOUNT_THRESHOLD = 500.0  # KES
    RECURRING_THRESHOLD = 2   # Minimum occurrences
    
    # Run
    df = run_stage4(
        INPUT_CSV, 
        OUTPUT_CSV,
        amount_threshold=AMOUNT_THRESHOLD,
        recurring_threshold=RECURRING_THRESHOLD
    )
    
    print("Next steps:")
    print("  ‚Ä¢ Clean the CSV (remove empty/duplicate columns)")
    print("  ‚Ä¢ Analyze spending patterns")
    print("  ‚Ä¢ Create visualizations")

STAGE 4: SEND MONEY CATEGORIZATION

Preserving all Stage 2 & 3 improvements:
  ‚úÖ Fuliza (LOAN) vs Fuliza payments
  ‚úÖ Loan Repayment separate
  ‚úÖ LOOP Payment ‚Üí Income
  ‚úÖ Data Bundles ‚â† Airtime
  ‚úÖ Cash Deposit ‚â† Cash Withdrawal

RULES:
  1. Recurring (‚â•2) + Amount > 500.0 ‚Üí Friends & Family
  2. Recurring (‚â•2) + Amount ‚â§ 500.0 ‚Üí Merchant
  3. Non-recurring (any amount) ‚Üí Merchant

üìÇ Loading: C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\stage3_with_categories.csv
‚úì Loaded 2,715 transactions

Current categorization:
  M-Pesa Fees              :   779 ( 28.7%)
  Uncategorized            :   626 ( 23.1%)
  Merchant                 :   420 ( 15.5%)
  Cash Deposit             :   297 ( 10.9%)
  Income                   :   241 (  8.9%)
  Bills                    :   104 (  3.8%)
  Bank Transfer            :    87 (  3.2%)
  Cash Withdrawal          :    62 (  2.3%)
  Savings                  :    46 (  1.7%)
  Airtime                  :    32 (  1.2%)


## Merchant Sub-categories

In [None]:
"""
MERCHANT SUBCATEGORY PREDICTION - TRAINING
Customized for your mpesa_stage6_john.csv data

Your data has:
- 1,122 labeled merchant subcategories in 'merchant deepdive' column
- 37 subcategories (deposit, food, transport, family, construction, etc.)
- Ready to train a model!
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')


def clean_subcategories(df: pd.DataFrame, subcategory_col: str) -> pd.DataFrame:
    """
    Clean up subcategory labels (fix typos, consolidate similar categories)
    
    Args:
        df: DataFrame
        subcategory_col: Name of subcategory column
        
    Returns:
        DataFrame with cleaned subcategories
    """
    print("üßπ Cleaning subcategory labels...")
    
    # Create a copy to avoid modifying original
    df[subcategory_col] = df[subcategory_col].astype(str)
    
    # Fix typos and consolidate
    replacements = {
        'depoit': 'deposit',
        'familly': 'family',
        'family=500': 'family',
        'family above500': 'family',
        'aittime': 'airtime',
        'labor,farm': 'labor',
        'transport,delivery': 'transport',
        'sacco,savings': 'saving',
        'health,code': 'health',
        'shopping,code': 'shop',
        'food,code': 'food',
        'gotv,code': 'gotv',
        'saving, code': 'saving',
        'healthcare': 'health',
        'reversed': 'reversal'
    }
    
    df[subcategory_col] = df[subcategory_col].replace(replacements)
    
    # Remove 'nan' strings
    df.loc[df[subcategory_col] == 'nan', subcategory_col] = np.nan
    
    print(f"‚úì Cleaned subcategories")
    print(f"  Unique subcategories after cleaning: {df[subcategory_col].nunique()}")
    
    return df


def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """Extract features from transactions"""
    print(" Extracting features...")
    
    # Create combined text features
    df['text_features'] = ''
    
    # Add description
    if 'description' in df.columns:
        df['text_features'] += df['description'].fillna('') + ' '
    
    # Add details
    if 'details_original' in df.columns:
        df['text_features'] += df['details_original'].fillna('') + ' '
    
    # Extract merchant name from extracted_fields
    def extract_merchant_name(fields_str):
        if pd.isna(fields_str) or fields_str == '' or fields_str == '{}':
            return ''
        try:
            import ast
            fields = ast.literal_eval(fields_str)
            name = fields.get('merchant_name', '')
            if not name:
                name = fields.get('recipient_name', '')
            return name
        except:
            return ''
    
    if 'extracted_fields' in df.columns:
        df['merchant_name'] = df['extracted_fields'].apply(extract_merchant_name)
        df['text_features'] += df['merchant_name'].fillna('')
    
    # Clean text
    df['text_features'] = df['text_features'].str.lower()
    df['text_features'] = df['text_features'].str.replace(r'[^a-z0-9\s]', ' ', regex=True)
    df['text_features'] = df['text_features'].str.replace(r'\s+', ' ', regex=True)
    df['text_features'] = df['text_features'].str.strip()
    
    # Amount features
    if 'withdrawn' in df.columns:
        df['amount'] = pd.to_numeric(df['withdrawn'], errors='coerce').fillna(0).abs()
    else:
        df['amount'] = 0
    
    df['amount_log'] = np.log1p(df['amount'])
    
    print(f"‚úì Features extracted")
    
    return df


def train_model(input_csv: str, subcategory_col: str, output_model_path: str):
    """
    Train the merchant subcategory prediction model
    
    Args:
        input_csv: Path to your CSV file
        subcategory_col: Name of subcategory column ('merchant deepdive')
        output_model_path: Where to save the trained model
    """
    
    print("=" * 80)
    print("MERCHANT SUBCATEGORY PREDICTION - TRAINING")
    print("=" * 80)
    print()
    
    # Load data
    print(f"üìÇ Loading: {input_csv}")
    df = pd.read_csv(input_csv, low_memory=False)
    print(f"‚úì Loaded {len(df):,} transactions")
    print()
    
    # Clean subcategories
    df = clean_subcategories(df, subcategory_col)
    
    # Filter to MERCHANT category only
    if 'category' in df.columns:
        df = df[df['category'] == 'Merchant'].copy()
        print(f"‚úì Filtered to {len(df):,} Merchant transactions")
    
    # Filter to labeled data only
    df_labeled = df[df[subcategory_col].notna()].copy()
    print(f"‚úì Found {len(df_labeled):,} labeled Merchant transactions")
    print()
    
    # Show distribution
    print("=" * 80)
    print("SUBCATEGORY DISTRIBUTION")
    print("=" * 80)
    subcategory_counts = df_labeled[subcategory_col].value_counts()
    for subcat, count in subcategory_counts.head(15).items():
        pct = (count / len(df_labeled)) * 100
        print(f"{subcat:25s}: {count:5,} ({pct:5.1f}%)")
    
    if len(subcategory_counts) > 15:
        print(f"... and {len(subcategory_counts) - 15} more subcategories")
    
    print()
    
    # Filter out subcategories with only 1 example (can't be split for training/testing)
    print("üîç Filtering subcategories with too few examples...")
    min_examples = 2
    valid_subcategories = subcategory_counts[subcategory_counts >= min_examples].index
    
    before_filter = len(df_labeled)
    df_labeled = df_labeled[df_labeled[subcategory_col].isin(valid_subcategories)]
    removed = before_filter - len(df_labeled)
    
    if removed > 0:
        print(f"  ‚ö†Ô∏è  Removed {removed} transactions from rare subcategories (only 1 example)")
        print(f"  ‚úì Training with {len(df_labeled):,} transactions")
        print(f"  ‚úì Across {len(valid_subcategories)} subcategories")
    else:
        print(f"  ‚úì All subcategories have enough examples")
    
    print()
    
    # Extract features
    df_labeled = extract_features(df_labeled)
    
    # Create TF-IDF features
    print("üî§ Creating TF-IDF text features...")
    vectorizer = TfidfVectorizer(
        max_features=300,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.9
    )
    
    X_text = vectorizer.fit_transform(df_labeled['text_features'])
    tfidf_df = pd.DataFrame(
        X_text.toarray(),
        columns=[f'tfidf_{i}' for i in range(X_text.shape[1])],
        index=df_labeled.index
    )
    
    # Add amount feature
    amount_features = df_labeled[['amount_log']].copy()
    
    # Combine features
    X = pd.concat([tfidf_df, amount_features], axis=1)
    
    # Target variable
    y = df_labeled[subcategory_col]
    
    print(f"‚úì Created {X.shape[1]} features")
    print()
    
    # Check if we can use stratification (need at least 2 examples per class)
    class_counts = y.value_counts()
    can_stratify = (class_counts >= 2).all()
    
    if not can_stratify:
        print("‚ö†Ô∏è  Some subcategories have only 1 example - using random split instead of stratified")
    
    # Split data
    if can_stratify:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
    
    
    print(f"Training set: {len(X_train):,} samples")
    print(f"Test set: {len(X_test):,} samples")
    print()
    
    # Check if stratification was successful
    print(f"Training on {y_train.nunique()} subcategories")
    print()
    
    # Train models
    print("=" * 80)
    print("TRAINING MODELS")
    print("=" * 80)
    print()
    
    models = {
        'Logistic Regression': LogisticRegression(
            max_iter=1000,
            random_state=42,
            class_weight='balanced',
            C=1.0
        ),
        'Random Forest': RandomForestClassifier(
            n_estimators=100,
            max_depth=15,
            random_state=42,
            class_weight='balanced',
            min_samples_split=5
        ),
    }
    
    best_model = None
    best_accuracy = 0
    best_model_name = ''
    
    for model_name, model in models.items():
        print(f"ü§ñ Training {model_name}...")
        
        # Train
        model.fit(X_train, y_train)
        
        # Evaluate
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        # Cross-validation
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        
        print(f"  ‚úì Test Accuracy: {accuracy:.3f}")
        print(f"  ‚úì CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")
        print()
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model
            best_model_name = model_name
    
    print("=" * 80)
    print(f"üèÜ BEST MODEL: {best_model_name}")
    print(f"   Accuracy: {best_accuracy:.3f}")
    print("=" * 80)
    print()
    
    # Classification report for best model
    y_pred = best_model.predict(X_test)
    
    print("CLASSIFICATION REPORT (Top subcategories):")
    print()
    
    # Show report for top categories only (to avoid clutter)
    top_categories = subcategory_counts.head(10).index.tolist()
    
    # Filter test data to top categories
    mask = y_test.isin(top_categories)
    if mask.sum() > 0:
        print(classification_report(y_test[mask], y_pred[mask], zero_division=0))
    
    print()
    
    # Save model
    print(f"üíæ Saving model to: {output_model_path}")
    
    model_data = {
        'vectorizer': vectorizer,
        'model': best_model,
        'subcategories': list(y.unique()),
        'model_name': best_model_name,
        'accuracy': best_accuracy
    }
    
    joblib.dump(model_data, output_model_path)
    print("‚úì Model saved successfully!")
    print()
    
    print("=" * 80)
    print("TRAINING COMPLETE! üéâ")
    print("=" * 80)
    print()
    print(f"Model: {best_model_name}")
    print(f"Accuracy: {best_accuracy:.1%}")
    print(f"Trained on: {len(df_labeled):,} examples")
    print(f"Subcategories: {len(y.unique())}")
    print()
    print("Next step:")
    print("  Run the prediction script to categorize unlabeled transactions!")
    print()


if __name__ == "__main__":
    # Your file paths
    INPUT_CSV = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\Merchant sub-categories.csv"
    SUBCATEGORY_COL = "merchant deepdive"
    OUTPUT_MODEL = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\merchant_subcategory_model.pkl"
    
    # Train
    train_model(INPUT_CSV, SUBCATEGORY_COL, OUTPUT_MODEL)

MERCHANT SUBCATEGORY PREDICTION - TRAINING

üìÇ Loading: C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\Merchant sub-categories.csv
‚úì Loaded 2,715 transactions

üßπ Cleaning subcategory labels...
‚úì Cleaned subcategories
  Unique subcategories after cleaning: 23
‚úì Filtered to 1,125 Merchant transactions
‚úì Found 1,118 labeled Merchant transactions

SUBCATEGORY DISTRIBUTION
deposit                  :   283 ( 25.3%)
food                     :   220 ( 19.7%)
transport                :   160 ( 14.3%)
family                   :    95 (  8.5%)
construction             :    62 (  5.5%)
groceries                :    49 (  4.4%)
friend                   :    36 (  3.2%)
shop                     :    32 (  2.9%)
contribution             :    30 (  2.7%)
business                 :    30 (  2.7%)
kinyozi                  :    22 (  2.0%)
labor                    :    21 (  1.9%)
withdraw                 :    19 (  1.7%)
saving                   :    17 (  1.5%)
airtime                  

## Prediction

In [56]:
"""
MERCHANT SUBCATEGORY PREDICTION - INFERENCE
Predicts subcategories for unlabeled transactions using trained model

Your unlabeled data:
- 1,593 transactions without subcategories
- Mostly in: Transfers, Bills, Income, Savings, Online Shopping
"""

import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings('ignore')


def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """Extract same features used in training"""
    
    # Create combined text features
    df['text_features'] = ''
    
    if 'description' in df.columns:
        df['text_features'] += df['description'].fillna('') + ' '
    
    if 'details_original' in df.columns:
        df['text_features'] += df['details_original'].fillna('') + ' '
    
    # Extract merchant name
    def extract_merchant_name(fields_str):
        if pd.isna(fields_str) or fields_str == '' or fields_str == '{}':
            return ''
        try:
            import ast
            fields = ast.literal_eval(fields_str)
            name = fields.get('merchant_name', '')
            if not name:
                name = fields.get('recipient_name', '')
            return name
        except:
            return ''
    
    if 'extracted_fields' in df.columns:
        df['merchant_name'] = df['extracted_fields'].apply(extract_merchant_name)
        df['text_features'] += df['merchant_name'].fillna('')
    
    # Clean text
    df['text_features'] = df['text_features'].str.lower()
    df['text_features'] = df['text_features'].str.replace(r'[^a-z0-9\s]', ' ', regex=True)
    df['text_features'] = df['text_features'].str.replace(r'\s+', ' ', regex=True)
    df['text_features'] = df['text_features'].str.strip()
    
    # Amount features
    if 'withdrawn' in df.columns:
        df['amount'] = pd.to_numeric(df['withdrawn'], errors='coerce').fillna(0).abs()
    else:
        df['amount'] = 0
    
    df['amount_log'] = np.log1p(df['amount'])
    
    return df


def predict_subcategories(input_csv: str, 
                         model_path: str, 
                         subcategory_col: str,
                         output_csv: str,
                         confidence_threshold: float = 0.4):
    """
    Predict subcategories for unlabeled transactions
    
    Args:
        input_csv: Your data file
        model_path: Trained model file
        subcategory_col: Column name for subcategories
        output_csv: Where to save results
        confidence_threshold: Minimum confidence (0-1)
    """
    
    print("=" * 80)
    print("MERCHANT SUBCATEGORY PREDICTION - INFERENCE")
    print("=" * 80)
    print()
    
    # Load model
    print(f"üìÇ Loading model: {model_path}")
    model_data = joblib.load(model_path)
    
    vectorizer = model_data['vectorizer']
    model = model_data['model']
    model_name = model_data['model_name']
    accuracy = model_data['accuracy']
    
    print(f"‚úì Loaded {model_name} (Accuracy: {accuracy:.1%})")
    print(f"‚úì Trained on {len(model_data['subcategories'])} subcategories")
    print()
    
    # Load data
    print(f"üìÇ Loading data: {input_csv}")
    df = pd.read_csv(input_csv, low_memory=False)
    print(f"‚úì Loaded {len(df):,} transactions")
    print()
    
    # Check if subcategory column exists, if not create it
    if subcategory_col not in df.columns:
        print(f"‚ö†Ô∏è  Column '{subcategory_col}' not found - creating it")
        df[subcategory_col] = np.nan
    
    # Filter to MERCHANT category ONLY
    if 'category' in df.columns:
        merchant_mask = df['category'] == 'Merchant'
        print(f"‚úì Found {merchant_mask.sum():,} Merchant transactions")
        
        # Find unlabeled merchants
        df[subcategory_col] = df[subcategory_col].astype(str)
        df.loc[df[subcategory_col] == 'nan', subcategory_col] = np.nan
        
        unlabeled_mask = merchant_mask & df[subcategory_col].isna()
        df_unlabeled = df[unlabeled_mask].copy()
        
        print(f"‚úì Found {len(df_unlabeled):,} unlabeled Merchant transactions")
        print()
    else:
        # Fallback if no category column
        df[subcategory_col] = df[subcategory_col].astype(str)
        df.loc[df[subcategory_col] == 'nan', subcategory_col] = np.nan
        
        unlabeled_mask = df[subcategory_col].isna()
        df_unlabeled = df[unlabeled_mask].copy()
        
        print(f"‚úì Found {len(df_unlabeled):,} unlabeled transactions")
        print()
    
    if len(df_unlabeled) == 0:
        print("‚úÖ All Merchant transactions already labeled!")
        print()
        print("Your data is complete:")
        print(f"  Total transactions: {len(df):,}")
        print(f"  Merchant (labeled): {merchant_mask.sum():,}")
        print(f"  Other categories: {(~merchant_mask).sum():,}")
        return df
    
    print(f"Unlabeled Merchants to predict: {len(df_unlabeled):,}")
    print()
    
    # Extract features
    print("üîÆ Making predictions...")
    df_unlabeled = extract_features(df_unlabeled)
    
    # Transform text
    X_text = vectorizer.transform(df_unlabeled['text_features'])
    tfidf_df = pd.DataFrame(
        X_text.toarray(),
        columns=[f'tfidf_{i}' for i in range(X_text.shape[1])],
        index=df_unlabeled.index
    )
    
    # Add amount features
    amount_features = df_unlabeled[['amount_log']].copy()
    
    # Combine
    X = pd.concat([tfidf_df, amount_features], axis=1)
    
    # Predict
    predictions = model.predict(X)
    probabilities = model.predict_proba(X)
    max_probs = probabilities.max(axis=1)
    
    # Add predictions to unlabeled data
    df_unlabeled['predicted_subcategory'] = predictions
    df_unlabeled['prediction_confidence'] = max_probs
    df_unlabeled['needs_review'] = max_probs < confidence_threshold
    
    print(f"‚úì Predictions complete!")
    print()
    
    # Summary
    print("=" * 80)
    print("PREDICTION SUMMARY")
    print("=" * 80)
    print()
    
    print("Predicted subcategories:")
    for subcat, count in df_unlabeled['predicted_subcategory'].value_counts().head(15).items():
        pct = (count / len(df_unlabeled)) * 100
        avg_conf = df_unlabeled[df_unlabeled['predicted_subcategory'] == subcat]['prediction_confidence'].mean()
        print(f"  {subcat:25s}: {count:5,} ({pct:5.1f}%) - Avg Conf: {avg_conf:.2f}")
    
    print()
    print("Confidence distribution:")
    high = len(df_unlabeled[df_unlabeled['prediction_confidence'] >= 0.7])
    med = len(df_unlabeled[(df_unlabeled['prediction_confidence'] >= 0.4) & 
                          (df_unlabeled['prediction_confidence'] < 0.7)])
    low = len(df_unlabeled[df_unlabeled['prediction_confidence'] < 0.4])
    
    print(f"  High (‚â•0.7):     {high:5,} ({high/len(df_unlabeled)*100:5.1f}%)")
    print(f"  Medium (0.4-0.7): {med:5,} ({med/len(df_unlabeled)*100:5.1f}%)")
    print(f"  Low (<0.4):      {low:5,} ({low/len(df_unlabeled)*100:5.1f}%) ‚ö†Ô∏è  Review needed")
    print()
    
    # Merge predictions back to original dataframe
    # For unlabeled rows, use predictions; for labeled rows, keep original
    df.loc[unlabeled_mask, subcategory_col] = df_unlabeled['predicted_subcategory'].values
    
    # Add prediction metadata columns if they don't exist
    if 'prediction_confidence' not in df.columns:
        df['prediction_confidence'] = np.nan
    if 'needs_review' not in df.columns:
        df['needs_review'] = False
    if 'was_manually_labeled' not in df.columns:
        df['was_manually_labeled'] = False
    
    df.loc[unlabeled_mask, 'prediction_confidence'] = df_unlabeled['prediction_confidence'].values
    df.loc[unlabeled_mask, 'needs_review'] = df_unlabeled['needs_review'].values
    
    # Mark originally labeled rows (if any were labeled before)
    df.loc[~unlabeled_mask & merchant_mask, 'was_manually_labeled'] = True
    
    # Sample predictions
    print("=" * 80)
    print("SAMPLE PREDICTIONS")
    print("=" * 80)
    print()
    
    # High confidence
    high_conf = df_unlabeled[df_unlabeled['prediction_confidence'] >= 0.7]
    if len(high_conf) > 0:
        print("HIGH CONFIDENCE PREDICTIONS:")
        for _, row in high_conf.head(5).iterrows():
            desc = row['description'][:50] if pd.notna(row['description']) else ''
            subcat = row['predicted_subcategory']
            conf = row['prediction_confidence']
            print(f"  {subcat:20s} ({conf:.2f}) | {desc}")
    
    print()
    
    # Low confidence
    low_conf = df_unlabeled[df_unlabeled['prediction_confidence'] < 0.4]
    if len(low_conf) > 0:
        print("LOW CONFIDENCE PREDICTIONS (REVIEW THESE):")
        for _, row in low_conf.head(5).iterrows():
            desc = row['description'][:50] if pd.notna(row['description']) else ''
            subcat = row['predicted_subcategory']
            conf = row['prediction_confidence']
            print(f"  {subcat:20s} ({conf:.2f}) | {desc}")
    
    print()
    
    # Save
    print(f"üíæ Saving results to: {output_csv}")
    df.to_csv(output_csv, index=False)
    print("‚úì Saved!")
    print()
    
    # Save low confidence for review
    if low < len(df_unlabeled):
        review_file = output_csv.replace('.csv', '_needs_review.csv')
        df_review = df[df.get('needs_review', False) == True]
        df_review.to_csv(review_file, index=False)
        print(f"‚ö†Ô∏è  Saved {len(df_review):,} low-confidence predictions to:")
        print(f"   {review_file}")
        print("   Please review these manually!")
        print()
    
    print("=" * 80)
    print("PREDICTION COMPLETE! üéâ")
    print("=" * 80)
    print()
    print(f"Total labeled: {len(df[df[subcategory_col].notna()]):,}")
    print(f"  - Manual: {(~unlabeled_mask).sum():,}")
    print(f"  - Predicted: {len(df_unlabeled):,}")
    print()
    print("Final subcategory distribution:")
    for subcat, count in df[subcategory_col].value_counts().head(10).items():
        pct = (count / len(df)) * 100
        print(f"  {subcat:25s}: {count:5,} ({pct:5.1f}%)")
    print()
    
    return df


if __name__ == "__main__":
    # Your file paths
    INPUT_CSV = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\Merchant sub-categories.csv"
    MODEL_PATH = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\merchant_subcategory_model.pkl"
    SUBCATEGORY_COL = "merchant deepdive"
    OUTPUT_CSV = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\mpesa_with_all_subcategories.csv"
    
    # Confidence threshold (lower = more predictions, but less confident)
    CONFIDENCE_THRESHOLD = 0.4
    
    # Predict
    df_final = predict_subcategories(
        INPUT_CSV,
        MODEL_PATH,
        SUBCATEGORY_COL,
        OUTPUT_CSV,
        CONFIDENCE_THRESHOLD
    )
    
    print("‚úÖ All done! Check your output files.")

MERCHANT SUBCATEGORY PREDICTION - INFERENCE

üìÇ Loading model: C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\merchant_subcategory_model.pkl
‚úì Loaded Random Forest (Accuracy: 79.5%)
‚úì Trained on 21 subcategories

üìÇ Loading data: C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\Merchant sub-categories.csv
‚úì Loaded 2,715 transactions

‚úì Found 1,125 Merchant transactions
‚úì Found 7 unlabeled Merchant transactions

Unlabeled Merchants to predict: 7

üîÆ Making predictions...
‚úì Predictions complete!

PREDICTION SUMMARY

Predicted subcategories:
  deposit                  :     1 ( 14.3%) - Avg Conf: 1.00
  food                     :     1 ( 14.3%) - Avg Conf: 0.95
  contribution             :     1 ( 14.3%) - Avg Conf: 0.18
  clothing                 :     1 ( 14.3%) - Avg Conf: 0.25
  business                 :     1 ( 14.3%) - Avg Conf: 0.18
  groceries                :     1 ( 14.3%) - Avg Conf: 0.26
  airtime                  :     1 ( 14.3%) - Avg Conf: 0.93

Confi

## Final CSV

In [58]:
"""
FINAL CSV GENERATOR
Creates a clean, final CSV with:
- Original categories (Income, Bills, Transfers, etc.) remain as-is
- Merchant category replaced with specific subcategories (food, transport, deposit, etc.)
- Clean column names and structure
"""

import pandas as pd
import numpy as np


def generate_final_csv(input_csv: str, 
                      output_csv: str,
                      subcategory_col: str = 'merchant deepdive',
                      main_category_col: str = 'category'):
    """
    Generate final CSV with merchant subcategories replacing the generic Merchant category
    
    Args:
        input_csv: Path to CSV with predictions (mpesa_with_all_subcategories.csv)
        output_csv: Path to save final clean CSV
        subcategory_col: Name of merchant subcategory column
        main_category_col: Name of main category column
    """
    
    print("=" * 80)
    print("FINAL CSV GENERATOR - MERCHANT SUBCATEGORY INTEGRATION")
    print("=" * 80)
    print()
    
    # Load data
    print(f"üìÇ Loading: {input_csv}")
    df = pd.read_csv(input_csv, low_memory=False)
    print(f"‚úì Loaded {len(df):,} transactions")
    print()
    
    # Create final category column
    print("üîÑ Creating unified category column...")
    
    # Start with main category
    df['final_category'] = df[main_category_col].copy()
    
    # Replace "Merchant" with specific subcategories
    if subcategory_col in df.columns:
        merchant_mask = df[main_category_col] == 'Merchant'
        has_subcategory = df[subcategory_col].notna()
        
        # For Merchant transactions with subcategories, use the subcategory
        replace_mask = merchant_mask & has_subcategory
        df.loc[replace_mask, 'final_category'] = df.loc[replace_mask, subcategory_col]
        
        replaced_count = replace_mask.sum()
        print(f"‚úì Replaced {replaced_count:,} 'Merchant' entries with specific subcategories")
        
        # Check for any Merchants without subcategories
        unclassified_merchants = merchant_mask & ~has_subcategory
        if unclassified_merchants.sum() > 0:
            print(f"‚ö†Ô∏è  {unclassified_merchants.sum()} Merchant transactions still without subcategories")
            print("   These will remain as 'Merchant'")
    else:
        print(f"‚ö†Ô∏è  Column '{subcategory_col}' not found - keeping all categories as-is")
    
    print()
    
    # Show category distribution
    print("=" * 80)
    print("FINAL CATEGORY BREAKDOWN")
    print("=" * 80)
    print()
    
    category_counts = df['final_category'].value_counts()
    
    # Separate merchant subcategories from main categories
    main_categories = ['Income', 'M-Pesa Fees', 'Transfers', 'Bills', 'Savings', 
                      'Online Shopping', 'Friends & Family', 'Cash Withdrawal',
                      'Airtime & Data', 'Shopping', 'Education', 'Personal Care',
                      'Health Care', 'Fast Foods', 'Entertainment', 'Transport',
                      'Food & Dining', 'Reversal', 'Other', 'Merchant']
    
    # Show main categories first
    print("MAIN CATEGORIES:")
    for cat in main_categories:
        if cat in category_counts.index:
            count = category_counts[cat]
            pct = (count / len(df)) * 100
            print(f"  {cat:30s}: {count:6,} ({pct:5.1f}%)")
    
    print()
    print("MERCHANT SUBCATEGORIES:")
    merchant_subcats = [cat for cat in category_counts.index if cat not in main_categories]
    for cat in sorted(merchant_subcats):
        count = category_counts[cat]
        pct = (count / len(df)) * 100
        print(f"  {cat:30s}: {count:6,} ({pct:5.1f}%)")
    
    print()
    
    # Select essential columns for final CSV
    print("=" * 80)
    print("CREATING CLEAN FINAL CSV")
    print("=" * 80)
    print()
    
    # Define columns to keep
    essential_columns = [
        'receipt_no',
        'completion_time',
        'description',
        'type',
        'withdrawn',
        'paid_in',
        'balance',
        'final_category'
    ]
    
    # Check which columns exist
    available_columns = [col for col in essential_columns if col in df.columns]
    
    # Add optional useful columns if they exist
    optional_columns = ['extracted_fields', 'prediction_confidence', 'was_manually_labeled']
    for col in optional_columns:
        if col in df.columns:
            available_columns.append(col)
    
    df_final = df[available_columns].copy()
    
    print("Final columns included:")
    for i, col in enumerate(df_final.columns, 1):
        null_count = df_final[col].isna().sum()
        null_pct = (null_count / len(df_final)) * 100
        print(f"  {i:2d}. {col:25s} - {null_count:5,} nulls ({null_pct:4.1f}%)")
    
    print()
    
    # Save
    print(f"üíæ Saving final CSV to: {output_csv}")
    df_final.to_csv(output_csv, index=False)
    print("‚úì Saved successfully!")
    print()
    
    # Summary statistics
    print("=" * 80)
    print("SUMMARY STATISTICS")
    print("=" * 80)
    print()
    
    total_categories = df_final['final_category'].nunique()
    print(f"Total transactions: {len(df_final):,}")
    print(f"Total categories: {total_categories}")
    print(f"Merchant subcategories: {len(merchant_subcats)}")
    print(f"Main categories: {len([c for c in main_categories if c in category_counts.index])}")
    print()
    
    # Spending analysis (if withdrawn column exists)
    if 'withdrawn' in df_final.columns:
        print("=" * 80)
        print("SPENDING BY CATEGORY (Top 15)")
        print("=" * 80)
        print()
        
        spending_df = df_final[df_final['withdrawn'].notna()].copy()
        spending_df['withdrawn'] = pd.to_numeric(spending_df['withdrawn'], errors='coerce').abs()
        
        category_spending = spending_df.groupby('final_category')['withdrawn'].agg([
            ('Total', 'sum'),
            ('Count', 'count'),
            ('Average', 'mean')
        ]).sort_values('Total', ascending=False).head(15)
        
        print(f"{'Category':<30s} {'Total (KES)':>15s} {'Count':>8s} {'Avg (KES)':>12s}")
        print("-" * 80)
        
        for category, row in category_spending.iterrows():
            if category not in ['Income', 'Reversal']:
                print(f"{category:<30s} {row['Total']:>15,.2f} {int(row['Count']):>8,} {row['Average']:>12,.2f}")
        
        print()
    
    # Sample transactions
    print("=" * 80)
    print("SAMPLE TRANSACTIONS (Final Format)")
    print("=" * 80)
    print()
    
    # Show samples from different categories
    sample_categories = df_final['final_category'].value_counts().head(5).index
    for cat in sample_categories:
        cat_df = df_final[df_final['final_category'] == cat]
        print(f"\n{cat} ({len(cat_df):,} total):")
        for _, row in cat_df.head(2).iterrows():
            desc = str(row['description'])[:50] if pd.notna(row['description']) else ''
            amount = row['withdrawn'] if pd.notna(row.get('withdrawn', '')) else ''
            print(f"  {str(amount):>10s} | {desc}")
    
    print()
    
    print("=" * 80)
    print("FINAL CSV GENERATION COMPLETE! üéâ")
    print("=" * 80)
    print()
    print(f"Output file: {output_csv}")
    print()
    print("Your data is now ready for:")
    print("  ‚Ä¢ Analysis and visualization")
    print("  ‚Ä¢ Spending reports")
    print("  ‚Ä¢ Budget tracking")
    print("  ‚Ä¢ Financial insights")
    print()
    
    return df_final


def create_category_mapping_report(df: pd.DataFrame, output_file: str):
    """
    Create a report showing how categories were mapped
    
    Args:
        df: Final dataframe with 'category' and 'final_category' columns
        output_file: Path to save the mapping report
    """
    
    if 'category' not in df.columns or 'final_category' not in df.columns:
        print("‚ö†Ô∏è  Cannot create mapping report - missing required columns")
        return
    
    print("=" * 80)
    print("CREATING CATEGORY MAPPING REPORT")
    print("=" * 80)
    print()
    
    # Create mapping
    mapping_df = df.groupby(['category', 'final_category']).size().reset_index(name='count')
    mapping_df = mapping_df.sort_values(['category', 'count'], ascending=[True, False])
    
    # Save
    mapping_df.to_csv(output_file, index=False)
    
    print(f"‚úì Category mapping saved to: {output_file}")
    print()
    
    print("Mapping summary:")
    print(f"  Original categories: {df['category'].nunique()}")
    print(f"  Final categories: {df['final_category'].nunique()}")
    print()
    
    # Show how Merchant was broken down
    merchant_mapping = mapping_df[mapping_df['category'] == 'Merchant']
    if len(merchant_mapping) > 0:
        print("Merchant breakdown:")
        for _, row in merchant_mapping.head(10).iterrows():
            print(f"  Merchant ‚Üí {row['final_category']:25s}: {row['count']:5,}")
        if len(merchant_mapping) > 10:
            print(f"  ... and {len(merchant_mapping) - 10} more")
    
    print()


if __name__ == "__main__":
    # File paths
    INPUT_CSV = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\mpesa_with_all_subcategories.csv" # Output from prediction script
    OUTPUT_CSV = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\mpesa_final_categorized.csv"      # Final clean CSV
    MAPPING_REPORT = r"C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\category_mapping_report.csv"  # How categories were mapped
    
    # Generate final CSV
    df_final = generate_final_csv(
        INPUT_CSV,
        OUTPUT_CSV,
        subcategory_col='merchant deepdive',
        main_category_col='category'
    )
    
    # Create mapping report
    if 'category' in df_final.columns:
        # Need to reload full data with both columns for mapping
        df_full = pd.read_csv(INPUT_CSV, low_memory=False)
        
        # Add final_category to full dataframe
        df_full['final_category'] = df_full['category'].copy()
        merchant_mask = df_full['category'] == 'Merchant'
        has_subcategory = df_full['merchant deepdive'].notna()
        replace_mask = merchant_mask & has_subcategory
        df_full.loc[replace_mask, 'final_category'] = df_full.loc[replace_mask, 'merchant deepdive']
        
        create_category_mapping_report(df_full, MAPPING_REPORT)
    
    print("‚úÖ All files generated successfully!")
    print()
    print("Files created:")
    print(f"  1. {OUTPUT_CSV} - Your final clean data")
    print(f"  2. {MAPPING_REPORT} - Category mapping details")

FINAL CSV GENERATOR - MERCHANT SUBCATEGORY INTEGRATION

üìÇ Loading: C:\Users\setla\Documents\Flatiron\PHASE5\Capstone\mpesa_with_all_subcategories.csv
‚úì Loaded 2,715 transactions

üîÑ Creating unified category column...
‚úì Replaced 1,125 'Merchant' entries with specific subcategories

FINAL CATEGORY BREAKDOWN

MAIN CATEGORIES:
  Income                        :    241 (  8.9%)
  M-Pesa Fees                   :    779 ( 28.7%)
  Transfers                     :    148 (  5.5%)
  Bills                         :    104 (  3.8%)
  Savings                       :     70 (  2.6%)
  Online Shopping               :     72 (  2.7%)
  Friends & Family              :     60 (  2.2%)
  Cash Withdrawal               :     41 (  1.5%)
  Airtime & Data                :     30 (  1.1%)
  Shopping                      :     11 (  0.4%)
  Education                     :     11 (  0.4%)
  Personal Care                 :      7 (  0.3%)
  Health Care                   :      5 (  0.2%)
  Fast Foods   