# In [None]:
# notebooks/eda.ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Display configuration: show every column and allow wide console output
for option_name, option_value in (('display.max_columns', None), ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Load the data: candidate locations are tried in order, each paired with
# the message printed when that location succeeds.
load_attempts = (
    ('../data/raw/data.csv', "‚úÖ Data loaded successfully!"),
    ('data/raw/data.csv', "‚úÖ Data loaded successfully from alternative path!"),
)
for csv_path, success_message in load_attempts:
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        # This location does not exist; fall through to the next candidate
        continue
    print(success_message)
    break
else:
    # No candidate could be read: report it and dump the tree under the
    # current working directory to help locate the file.
    print("‚ùå File not found. Please ensure data.csv is in data/raw/ directory")
    print("Current directory structure:")
    import os
    for folder, _subfolders, filenames in os.walk('.'):
        depth = folder.replace('.', '').count(os.sep)
        print(f"{'  ' * depth}{os.path.basename(folder)}/")
        entry_indent = '  ' * (depth + 1)
        for filename in filenames:
            print(f'{entry_indent}{filename}')

# If data loaded successfully, proceed with EDA
if 'df' in locals() and not df.empty:
    # 1. Overview of the Data: shape, dtypes, a sample of rows and df.info()
    section_rule = "=" * 80
    row_count, column_count = df.shape
    print(section_rule, "DATASET OVERVIEW", section_rule, sep="\n")
    print(f"üìä Dataset Shape: {df.shape}")
    print(f"üìà Number of rows: {row_count:,}")
    print(f"üìâ Number of columns: {column_count}")

    # One "name  dtype" line per column
    dtype_listing = df.dtypes.to_string()
    print("\nüìã Column Names and Data Types:")
    print(dtype_listing)

    print("\nüîç First 5 rows:")
    first_rows = df.head()
    display(first_rows)

    # df.info() writes its report to stdout itself
    print("\nüìù Dataset Info:")
    df.info()
    
    # 2. Summary Statistics: describe() for numeric columns, value counts
    #    for object-typed columns
    print("\n" + "=" * 80, "SUMMARY STATISTICS", "=" * 80, sep="\n")

    print("\nüî¢ Numerical Columns Statistics:")
    numeric_summary = df.describe()
    display(numeric_summary)

    print("\nüè∑Ô∏è Categorical Columns Summary:")
    # Object-dtype columns are treated as categorical from here on
    categorical_cols = df.select_dtypes(include=['object']).columns
    for column_name in categorical_cols:
        distinct_count = df[column_name].nunique()
        frequencies = df[column_name].value_counts()
        print(f"\n{column_name}:")
        print(f"   Unique values: {distinct_count}")
        # High-cardinality columns are truncated to their ten most frequent values
        if distinct_count > 20:
            print(f"   Top 10 values:")
            frequencies = frequencies.head(10)
        display(frequencies)
    
    # 3. Distribution of Numerical Features
    #    One histogram per numeric, non-identifier column, with mean and
    #    median marked. Values outside the 1.5*IQR fences are dropped from
    #    the histogram only; the mean/median use all non-missing values.
    print("\n" + "="*80)
    print("NUMERICAL FEATURES DISTRIBUTION")
    print("="*80)

    # Identifier columns are excluded from the numeric analysis even when
    # they happen to be stored as numbers
    id_columns = {'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId'}
    numerical_cols = [
        col for col in df.columns
        if col not in id_columns and pd.api.types.is_numeric_dtype(df[col])
    ]

    if numerical_cols:
        # squeeze=False always yields a 2-D array of Axes, so the
        # single-column case needs no special handling
        fig, axes = plt.subplots(1, len(numerical_cols),
                                 figsize=(5*len(numerical_cols), 5), squeeze=False)
        axes = axes.flatten()

        for ax, col in zip(axes, numerical_cols):
            values = df[col].dropna()
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1

            if iqr > 0:
                # Remove outliers for better visualization
                lower_bound = q1 - 1.5 * iqr
                upper_bound = q3 + 1.5 * iqr
                data_to_plot = values[(values >= lower_bound) & (values <= upper_bound)]
                title = f'Distribution of {col} (outliers removed)'
            else:
                # A zero IQR collapses both fences onto one value, which
                # would hide every other observation; plot everything instead
                data_to_plot = values
                title = f'Distribution of {col}'

            ax.hist(data_to_plot, bins=50, edgecolor='black', alpha=0.7)
            ax.set_title(title)
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')

            # Add statistics (computed on all non-missing values)
            mean_val = values.mean()
            median_val = values.median()
            ax.axvline(mean_val, color='red', linestyle='--',
                       label=f'Mean: {mean_val:.2f}', alpha=0.7)
            ax.axvline(median_val, color='green', linestyle='--',
                       label=f'Median: {median_val:.2f}', alpha=0.7)
            ax.legend()

        plt.tight_layout()
        plt.show()
    else:
        print("No numerical columns found.")
    
    # 4. Distribution of Categorical Features
    #    Bar chart of the ten most frequent values for every categorical
    #    column that has between 2 and 30 distinct values, three per row.
    print("\n" + "="*80)
    print("CATEGORICAL FEATURES DISTRIBUTION")
    print("="*80)

    # Select categorical columns with reasonable number of categories
    categorical_to_plot = [
        col for col in categorical_cols if 1 < df[col].nunique() <= 30
    ]

    if categorical_to_plot:
        n_cols = 3
        n_rows = (len(categorical_to_plot) + n_cols - 1) // n_cols
        # squeeze=False keeps the result 2-D for any grid size, so flatten()
        # always gives a flat sequence of Axes. Wrapping a single-row result
        # in a list would instead produce a list holding one ndarray.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
        axes = axes.flatten()

        for ax, col in zip(axes, categorical_to_plot):
            top_categories = df[col].value_counts().head(10)
            positions = range(len(top_categories))
            ax.bar(positions, top_categories.values)
            ax.set_title(f'Top 10 {col}')
            ax.set_xlabel(col)
            ax.set_ylabel('Count')
            ax.set_xticks(positions)
            ax.set_xticklabels(top_categories.index, rotation=45, ha='right')

        # Hide the grid cells left over when the column count is not a
        # multiple of three
        for ax in axes[len(categorical_to_plot):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()
    else:
        print("No suitable categorical columns for plotting.")
    
    # 5. Correlation Analysis: pairwise correlation of the numeric columns,
    #    printed and drawn as an annotated heatmap
    print("\n" + "=" * 80, "CORRELATION ANALYSIS", "=" * 80, sep="\n")

    numeric_feature_count = len(numerical_cols)
    if numeric_feature_count <= 1:
        # A correlation matrix needs at least two features
        print(f"Only {numeric_feature_count} numerical column found. Need at least 2 for correlation analysis.")
    else:
        correlation_matrix = df[numerical_cols].corr()
        print("Correlation Matrix:")
        print(correlation_matrix)

        heatmap_options = {
            'annot': True,
            'cmap': 'coolwarm',
            'center': 0,
            'fmt': '.2f',
            'square': True,
            'cbar_kws': {"shrink": 0.8},
        }
        plt.figure(figsize=(8, 6))
        sns.heatmap(correlation_matrix, **heatmap_options)
        plt.title('Correlation Heatmap of Numerical Features')
        plt.tight_layout()
        plt.show()
    
    # 6. Missing Values Analysis: per-column null counts and percentages,
    #    listing and charting only the columns that have nulls
    print("\n" + "=" * 80, "MISSING VALUES ANALYSIS", "=" * 80, sep="\n")

    missing_values = df.isnull().sum()
    missing_percentage = missing_values / len(df) * 100
    missing_df = pd.DataFrame(
        {'Missing Values': missing_values, 'Percentage': missing_percentage}
    )

    # Keep only affected columns, worst first
    has_missing = missing_df['Missing Values'] > 0
    missing_summary = missing_df[has_missing].sort_values('Percentage', ascending=False)

    if missing_summary.empty:
        print("‚úÖ No missing values found in the dataset!")
    else:
        print("Missing values found:")
        display(missing_summary)

        plt.figure(figsize=(10, 6))
        percent_by_column = missing_summary['Percentage']
        percent_by_column.plot(kind='bar')
        plt.title('Percentage of Missing Values by Column')
        plt.ylabel('Percentage (%)')
        plt.xlabel('Column')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    
    # 7. Outlier Detection
    #    One box plot per numeric column plus a printed summary of the
    #    1.5*IQR fences and how many non-missing values fall outside them.
    print("\n" + "="*80)
    print("OUTLIER DETECTION")
    print("="*80)

    if numerical_cols:
        n_cols = len(numerical_cols)
        # squeeze=False always returns a 2-D array of Axes, so a single
        # column needs no special case
        fig, axes = plt.subplots(1, n_cols, figsize=(5*n_cols, 5), squeeze=False)
        axes = axes.flatten()

        for ax, col in zip(axes, numerical_cols):
            box_data = df[col].dropna()

            if box_data.empty:
                # Nothing to plot or measure for a column with no values;
                # also avoids dividing by zero in the percentage below
                ax.set_title(f'Box Plot of {col} (no data)')
                print(f"\nüìä {col}:")
                print("   No non-missing values; outlier statistics skipped")
                continue

            ax.boxplot(box_data)
            ax.set_title(f'Box Plot of {col}')
            ax.set_ylabel(col)

            # Calculate outliers using IQR method
            Q1 = box_data.quantile(0.25)
            Q3 = box_data.quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            outliers = box_data[(box_data < lower_bound) | (box_data > upper_bound)]
            print(f"\nüìä {col}:")
            print(f"   IQR: {IQR:.2f}")
            print(f"   Lower bound: {lower_bound:.2f}")
            print(f"   Upper bound: {upper_bound:.2f}")
            print(f"   Number of outliers: {len(outliers):,}")
            print(f"   Percentage of outliers: {len(outliers)/len(box_data)*100:.2f}%")

        plt.tight_layout()
        plt.show()
    
    # 8. Time Series Analysis: parse TransactionStartTime, add calendar
    #    component columns to df, and plot daily and hourly volumes
    print("\n" + "=" * 80, "TIME SERIES ANALYSIS", "=" * 80, sep="\n")

    if 'TransactionStartTime' in df.columns:
        try:
            # Convert to datetime and write the parsed values back to df
            timestamps = pd.to_datetime(df['TransactionStartTime'])
            df['TransactionStartTime'] = timestamps

            # New column name -> .dt accessor attribute, in insertion order
            derived_columns = (
                ('TransactionDate', 'date'),
                ('TransactionHour', 'hour'),
                ('TransactionDay', 'day'),
                ('TransactionMonth', 'month'),
                ('TransactionYear', 'year'),
                ('TransactionDayOfWeek', 'dayofweek'),
            )
            for new_column, accessor_name in derived_columns:
                df[new_column] = getattr(timestamps.dt, accessor_name)

            # Transactions per calendar day
            daily_transactions = df.groupby('TransactionDate').size()

            plt.figure(figsize=(15, 6))
            daily_transactions.plot()
            plt.title('Daily Transaction Volume Over Time')
            plt.xlabel('Date')
            plt.ylabel('Number of Transactions')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.show()

            # Transactions per hour of day, ordered by hour
            hourly_counts = df['TransactionHour'].value_counts().sort_index()
            plt.figure(figsize=(12, 5))
            hourly_counts.plot(kind='bar')
            plt.title('Transaction Distribution by Hour of Day')
            plt.xlabel('Hour of Day')
            plt.ylabel('Number of Transactions')
            plt.tight_layout()
            plt.show()

            print(f"\nüìÖ Time Range: {timestamps.min()} to {timestamps.max()}")

        except Exception as e:
            # Any failure in this section (parsing or plotting) is reported
            # here and the rest of the analysis continues
            print(f"‚ö†Ô∏è Could not parse TransactionStartTime: {e}")
    
    # 9. Fraud Analysis
    #    Class counts of FraudResult, the share of rows equal to 1, and a
    #    pie chart of the class split.
    print("\n" + "="*80)
    print("FRAUD ANALYSIS")
    print("="*80)

    if 'FraudResult' in df.columns:
        fraud_counts = df['FraudResult'].value_counts()
        print(f"Fraud distribution:\n{fraud_counts}")
        print(f"\nFraud percentage: {fraud_counts.get(1, 0)/len(df)*100:.4f}%")

        # value_counts() orders classes by frequency, not by value, so the
        # label and colour of each wedge are looked up from the class value
        # itself rather than assumed to be in [0, 1] order
        label_by_class = {0: 'Non-Fraud (0)', 1: 'Fraud (1)'}
        color_by_class = {0: 'lightblue', 1: 'lightcoral'}
        labels = [label_by_class.get(value, str(value)) for value in fraud_counts.index]
        colors = [color_by_class.get(value, 'lightgrey') for value in fraud_counts.index]

        plt.figure(figsize=(8, 6))
        plt.pie(fraud_counts.values, labels=labels, autopct='%1.2f%%',
                colors=colors, startangle=90)
        plt.title('Fraud vs Non-Fraud Transactions')
        plt.show()
    else:
        print("‚ö†Ô∏è 'FraudResult' column not found in dataset")
    
    # 10. Customer Analysis
    #     Unique customer/account counts, summary statistics of the number
    #     of rows per CustomerId, raw and log1p histograms of that count,
    #     and the five customers with the most rows.
    print("\n" + "="*80)
    print("CUSTOMER ANALYSIS")
    print("="*80)

    if 'CustomerId' in df.columns:
        print(f"üë• Number of unique customers: {df['CustomerId'].nunique():,}")
        # AccountId is checked separately: reading it unconditionally would
        # raise KeyError on a dataset that has CustomerId but no AccountId
        if 'AccountId' in df.columns:
            print(f"üè¶ Number of unique accounts: {df['AccountId'].nunique():,}")

        # Transactions per customer
        transactions_per_customer = df.groupby('CustomerId').size()
        print(f"\nüìä Transactions per customer statistics:")
        print(f"   Average: {transactions_per_customer.mean():.2f}")
        print(f"   Median: {transactions_per_customer.median():.2f}")
        print(f"   Standard deviation: {transactions_per_customer.std():.2f}")
        print(f"   Minimum: {transactions_per_customer.min():.2f}")
        print(f"   Maximum: {transactions_per_customer.max():.2f}")

        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        plt.hist(transactions_per_customer, bins=50, edgecolor='black', alpha=0.7)
        plt.title('Distribution of Transactions per Customer')
        plt.xlabel('Number of Transactions')
        plt.ylabel('Number of Customers')

        plt.subplot(1, 2, 2)
        # Log scale for better visualization
        plt.hist(np.log1p(transactions_per_customer), bins=50, edgecolor='black', alpha=0.7)
        plt.title('Log-transformed Transactions per Customer')
        plt.xlabel('log(Number of Transactions + 1)')
        plt.ylabel('Number of Customers')

        plt.tight_layout()
        plt.show()

        # Top customers by transaction count
        print(f"\nüèÜ Top 5 customers by transaction count:")
        display(transactions_per_customer.sort_values(ascending=False).head(5))
    # TOP 5 INSIGHTS SUMMARY
    # Five one-line findings built from the data; each falls back to a
    # placeholder when the column it relies on is absent.
    print("\n" + "="*80)
    print("TOP 5 INSIGHTS SUMMARY")
    print("="*80)

    has_start_time = 'TransactionStartTime' in df.columns
    period_start = df['TransactionStartTime'].min() if has_start_time else 'N/A'
    period_end = df['TransactionStartTime'].max() if has_start_time else 'N/A'

    # Format the count before choosing the fallback: the ',' format spec is
    # only valid for numbers and raises ValueError when applied to 'N/A'
    if 'CustomerId' in df.columns:
        customer_count = f"{df['CustomerId'].nunique():,}"
    else:
        customer_count = 'N/A'

    # fraud_counts is assigned by the fraud analysis section whenever the
    # FraudResult column exists, which is the same condition tested here
    if 'FraudResult' in df.columns:
        fraud_insight = f"4. FRAUD RATE: {fraud_counts.get(1, 0)/len(df)*100:.4f}% fraud rate detected"
    else:
        fraud_insight = "4. FRAUD DATA: FraudResult column not present"

    if not missing_summary.empty:
        missing_insight = f"5. MISSING VALUES: {missing_summary.shape[0]} columns have missing values"
    else:
        missing_insight = "5. DATA QUALITY: No missing values detected"

    # Generate insights based on actual data
    insights = [
        f"1. DATA SCALE: Dataset contains {df.shape[0]:,} transactions and {df.shape[1]} features",
        f"2. TIME PERIOD: Data spans from {period_start} to {period_end}",
        f"3. UNIQUE CUSTOMERS: {customer_count} unique customers identified",
        fraud_insight,
        missing_insight,
    ]

    for insight in insights:
        print(f"‚Ä¢ {insight}")
    
    # Static list of suggested feature-engineering follow-ups
    print("\n" + "="*80)
    print("RECOMMENDATIONS FOR FEATURE ENGINEERING")
    print("="*80)

    recommendations = [
        "1. Create customer-level RFM (Recency, Frequency, Monetary) features for segmentation",
        "2. Extract temporal features from TransactionStartTime (hour, day, month, weekday)",
        "3. Aggregate transaction statistics per customer (total amount, average amount, etc.)",
        "4. Encode categorical variables using frequency encoding or target encoding",
        "5. Create features based on transaction patterns (regularity, time between transactions)",
        "6. Consider creating features from ProductCategory and ProviderId interactions",
        "7. Handle any outliers in Amount and Value columns",
        "8. Create customer tenure features based on first and last transaction dates"
    ]

    # Each entry already carries its own number; adding an enumerate()
    # prefix would print it twice ("1. 1. Create ...")
    for rec in recommendations:
        print(rec)
    
    # Save processed data for next steps
    # Writes df (including any columns added during the analysis) to
    # <data root>/processed/eda_processed.csv. Failures are reported, not raised.
    print("\n" + "="*80)
    print("SAVING PROCESSED DATA")
    print("="*80)

    import os

    # Mirror the loader's fallback: use the notebook-relative layout when the
    # raw file is there, otherwise the layout relative to the current
    # directory, so the output lands next to the data that was read
    data_root = '../data' if os.path.exists('../data/raw/data.csv') else 'data'
    output_path = f"{data_root}/processed/eda_processed.csv"

    try:
        # to_csv does not create missing parent directories
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Save the dataframe with extracted features
        df.to_csv(output_path, index=False)
        print(f"‚úÖ Processed data saved to: {output_path}")
    except Exception as e:
        # Best effort: a failed save must not abort the notebook
        print(f"‚ö†Ô∏è Could not save processed data: {e}")

    print("\n" + "="*80)
    print("EDA COMPLETED SUCCESSFULLY! ‚úÖ")
    print("="*80)