# In [None]:
"""
FAO Statistical Data Analysis Pipeline
=====================================

A comprehensive data analysis pipeline for FAO statistical data including:
- Data cleaning and preprocessing
- Exploratory data analysis with visualizations
- Clustering analysis
- Regression modeling

Author: Data Analysis Pipeline
Version: 2.0
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import logging
import warnings
from pathlib import Path
from typing import Dict, Tuple, Optional, List
from dataclasses import dataclass

# ML imports
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_squared_error, 
    mean_absolute_error, 
    r2_score, 
    silhouette_score
)

# Install a blanket "ignore" filter so no warning of any category is shown.
warnings.filterwarnings(action='ignore')

# Root logging setup: INFO and above, timestamp - level - message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)


@dataclass
class Config:
    """Configuration class for analysis parameters.

    The three column-list fields default to ``None``. ``__post_init__``
    replaces a ``None`` with the pipeline's standard list, so ``Config()``
    yields the same values as before, while a list passed explicitly by the
    caller is now kept instead of being overwritten.
    """
    # Path of the CSV file to load.
    DATA_PATH: str = "./fao_cleaned_data_20250804_095749.csv"
    # Fraction of rows held out as the regression test set.
    TEST_SIZE: float = 0.2
    # Seed shared by the train/test split, KMeans and the random forest.
    RANDOM_STATE: int = 42
    # Number of clusters requested from KMeans.
    N_CLUSTERS: int = 3
    # Multiplier applied to the inter-quartile range for outlier bounds.
    IQR_MULTIPLIER: float = 1.5
    # Figure sizes (width, height) handed to matplotlib.
    FIGURE_SIZE: Tuple[int, int] = (10, 6)
    LARGE_FIGURE_SIZE: Tuple[int, int] = (12, 8)

    # Column configurations (None means "use the default list")
    REQUIRED_COLUMNS: Optional[List[str]] = None
    TEXT_COLUMNS: Optional[List[str]] = None
    LABEL_ENCODE_COLUMNS: Optional[List[str]] = None

    def __post_init__(self):
        """Fill in default column lists for fields the caller left unset.

        A fresh list is built per instance, so mutating one Config's list
        never affects another instance.
        """
        if self.REQUIRED_COLUMNS is None:
            self.REQUIRED_COLUMNS = ['Value', 'Area', 'Indicator', 'Year']
        if self.TEXT_COLUMNS is None:
            self.TEXT_COLUMNS = ['Domain', 'Area', 'Indicator', 'Sex', 'Element', 'Source', 'Unit']
        if self.LABEL_ENCODE_COLUMNS is None:
            self.LABEL_ENCODE_COLUMNS = ['Domain', 'Area', 'Indicator', 'Sex', 'Element', 'Source', 'Unit']


class FAODataAnalyzer:
    """Main class for FAO data analysis pipeline."""
    
    def __init__(self, config: Config):
        """Initialize the analyzer with configuration.

        Args:
            config: Analysis parameters read by every pipeline step.
        """
        self.config = config
        self.df = None                  # dataframe, assigned by load_data()
        self.encoders = {}              # column name -> fitted LabelEncoder
        self.scaler = StandardScaler()  # fitted on 'Value' in clean_data()
        self.model = None               # assigned by build_regression_model()

        # Set style for all plots. Reset matplotlib to its defaults FIRST and
        # apply the seaborn theme afterwards: plt.style.use('default') rewrites
        # the style rcParams, so calling it last would undo "whitegrid".
        plt.style.use('default')
        sns.set_style("whitegrid")
    
    def load_data(self) -> pd.DataFrame:
        """
        Load data from CSV file with error handling.
        
        Returns:
            pd.DataFrame: Loaded dataframe
            
        Raises:
            FileNotFoundError: If data file doesn't exist
            pd.errors.EmptyDataError: If file is empty
        """
        try:
            data_path = Path(self.config.DATA_PATH)
            if not data_path.exists():
                raise FileNotFoundError(f"Data file not found: {data_path}")
            
            logger.info(f"Loading data from {data_path}")
            self.df = pd.read_csv(data_path)
            logger.info(f"Data loaded successfully. Shape: {self.df.shape}")
            
            # Validate required columns
            missing_cols = set(self.config.REQUIRED_COLUMNS) - set(self.df.columns)
            if missing_cols:
                raise ValueError(f"Missing required columns: {missing_cols}")
            
            return self.df
            
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise
    
    def clean_data(self) -> pd.DataFrame:
        """
        Run every preprocessing step on the loaded dataframe.

        Steps, in order: drop rows missing any required column, replace
        missing note/flag entries with an empty string, normalise the text
        columns (string cast, strip, title case), drop IQR outliers of
        'Value', add the '<column>_Encoded' columns and add 'Value_Scaled'.

        Returns:
            pd.DataFrame: Cleaned dataframe (also kept on ``self.df``)

        Raises:
            ValueError: If no data has been loaded yet
        """
        if self.df is None:
            raise ValueError("Data not loaded. Call load_data() first.")

        logger.info("Starting data cleaning process")
        shape_before = self.df.shape

        # --- missing values -------------------------------------------------
        logger.info("Handling missing values...")
        null_counts = self.df.isnull().sum()
        logger.info(f"Missing values summary:\n{null_counts[null_counts > 0]}")

        # Rows without a critical field are unusable downstream.
        self.df.dropna(subset=self.config.REQUIRED_COLUMNS, inplace=True)

        # Free-text annotation fields: a blank is as good as a missing value.
        for annotation in ('Note', 'Flag', 'Flag Description'):
            if annotation not in self.df.columns:
                continue
            self.df[annotation] = self.df[annotation].fillna('')

        # --- text normalisation ---------------------------------------------
        logger.info("Standardizing text columns...")
        for text_col in self.config.TEXT_COLUMNS:
            if text_col not in self.df.columns:
                continue
            # NOTE: the string cast turns a missing entry into literal text
            # (e.g. NaN becomes 'Nan' after title-casing).
            as_text = self.df[text_col].astype(str)
            self.df[text_col] = as_text.str.strip().str.title()

        # --- outliers ---------------------------------------------------------
        logger.info("Removing outliers...")
        self.df = self._remove_outliers(self.df, 'Value')

        # --- categorical encoding ---------------------------------------------
        logger.info("Applying label encoding...")
        self._apply_label_encoding()

        # --- scaling ------------------------------------------------------------
        logger.info("Scaling numerical values...")
        value_matrix = self.df[['Value']]
        self.df['Value_Scaled'] = self.scaler.fit_transform(value_matrix)

        shape_after = self.df.shape
        logger.info(f"Data cleaning completed. Shape changed from {shape_before} to {shape_after}")

        return self.df
    
    def _remove_outliers(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
        """Remove outliers using IQR method."""
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        
        lower_bound = Q1 - self.config.IQR_MULTIPLIER * IQR
        upper_bound = Q3 + self.config.IQR_MULTIPLIER * IQR
        
        initial_count = len(df)
        df_clean = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
        final_count = len(df_clean)
        
        logger.info(f"Removed {initial_count - final_count} outliers from {column}")
        return df_clean
    
    def _apply_label_encoding(self) -> None:
        """Add a '<column>_Encoded' column for each configured categorical.

        Columns listed in ``config.LABEL_ENCODE_COLUMNS`` but absent from the
        dataframe are skipped. Each fitted encoder is kept in
        ``self.encoders`` under the source column name.
        """
        for column_name in self.config.LABEL_ENCODE_COLUMNS:
            if column_name not in self.df.columns:
                continue
            encoder = LabelEncoder()
            codes = encoder.fit_transform(self.df[column_name])
            self.df[f'{column_name}_Encoded'] = codes
            self.encoders[column_name] = encoder
    
    def generate_summary_statistics(self) -> None:
        """Generate and display summary statistics."""
        if self.df is None:
            raise ValueError("Data not available. Load and clean data first.")
        
        logger.info("Generating summary statistics")
        
        print("=" * 60)
        print("DATASET SUMMARY")
        print("=" * 60)
        
        print(f"Dataset shape: {self.df.shape}")
        print(f"Memory usage: {self.df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        
        print("\n" + "-" * 40)
        print("DATA TYPES")
        print("-" * 40)
        print(self.df.dtypes.value_counts())
        
        print("\n" + "-" * 40)
        print("NUMERICAL STATISTICS")
        print("-" * 40)
        print(self.df.describe())
        
        print("\n" + "-" * 40)
        print("CATEGORICAL STATISTICS")
        print("-" * 40)
        print(f"Unique Areas: {self.df['Area'].nunique()}")
        print(f"Unique Indicators: {self.df['Indicator'].nunique()}")
        
        if 'Sex' in self.df.columns:
            print(f"\nSex distribution:\n{self.df['Sex'].value_counts()}")
        
        if 'Element' in self.df.columns:
            print(f"\nElement distribution:\n{self.df['Element'].value_counts()}")
    
    def create_exploratory_plots(self) -> None:
        """Draw and show the exploratory figures, one after another.

        Figures: histogram of 'Value'; boxplot per 'Element' (only when the
        column exists with at most 20 distinct values); yearly mean with a
        +/- one standard deviation band; boxplot for the 'Male'/'Female'
        rows of 'Sex' (when any exist); correlation heatmap of the numeric
        columns (when there is more than one).

        Raises:
            ValueError: If no dataframe is available yet
        """
        if self.df is None:
            raise ValueError("Data not available. Load and clean data first.")

        logger.info("Creating exploratory plots")

        heading = {'fontsize': 14, 'fontweight': 'bold'}
        standard_size = self.config.FIGURE_SIZE
        wide_size = self.config.LARGE_FIGURE_SIZE

        # Figure 1: overall value distribution.
        plt.figure(figsize=standard_size)
        sns.histplot(self.df['Value'], bins=50, kde=True, alpha=0.7)
        plt.title('Distribution of Values', **heading)
        plt.xlabel('Value')
        plt.ylabel('Frequency')
        plt.tight_layout()
        plt.show()

        # Figure 2: values split by element, skipped when too many levels.
        has_element = 'Element' in self.df.columns
        if has_element and self.df['Element'].nunique() <= 20:
            plt.figure(figsize=wide_size)
            sns.boxplot(data=self.df, x='Element', y='Value')
            plt.xticks(rotation=45, ha='right')
            plt.title('Distribution of Values by Element', **heading)
            plt.tight_layout()
            plt.show()

        # Figure 3: yearly mean with a one-standard-deviation band.
        if 'Year' in self.df.columns:
            per_year = self.df.groupby('Year')['Value'].agg(['mean', 'std']).reset_index()
            years = per_year['Year']
            centre = per_year['mean']

            plt.figure(figsize=standard_size)
            plt.plot(years, centre, marker='o', linewidth=2)
            plt.fill_between(years,
                             centre - per_year['std'],
                             centre + per_year['std'],
                             alpha=0.3)
            plt.title('Average Value Over Time (with Standard Deviation)', **heading)
            plt.xlabel('Year')
            plt.ylabel('Average Value')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.show()

        # Figure 4: male vs female rows only.
        if 'Sex' in self.df.columns:
            is_binary_sex = self.df['Sex'].isin(['Male', 'Female'])
            by_sex = self.df[is_binary_sex]
            if not by_sex.empty:
                plt.figure(figsize=standard_size)
                sns.boxplot(data=by_sex, x='Sex', y='Value')
                plt.title('Value Distribution by Gender', **heading)
                plt.tight_layout()
                plt.show()

        # Figure 5: correlations between all numeric columns.
        numeric_names = self.df.select_dtypes(include=[np.number]).columns
        if len(numeric_names) > 1:
            plt.figure(figsize=(8, 6))
            correlations = self.df[numeric_names].corr()
            sns.heatmap(correlations, annot=True, cmap='coolwarm',
                        center=0, fmt='.2f', square=True)
            plt.title('Correlation Matrix', **heading)
            plt.tight_layout()
            plt.show()
    
    def perform_clustering_analysis(self) -> Dict:
        """
        Perform K-means clustering analysis on areas by average value.

        Each area is reduced to its mean 'Value', the means are standardised,
        and KMeans with ``config.N_CLUSTERS`` clusters is fitted on that single
        feature. A scatter plot of the areas coloured by cluster is shown.

        Returns:
            Dict: ``silhouette_score`` (rounded to 3 places), ``inertia``
            (rounded to 2 places), ``n_clusters`` and ``cluster_sizes``
            (cluster label -> number of areas), all as plain Python numbers.

        Raises:
            ValueError: If no dataframe is available, if ``N_CLUSTERS`` is
                below 2, or if there are not more areas than clusters
        """
        if self.df is None:
            raise ValueError("Data not available. Load and clean data first.")

        logger.info("Performing clustering analysis")

        # Group by area and calculate mean values
        df_grouped = self.df.groupby('Area')[['Value']].mean().reset_index()

        # The silhouette score is only defined for 2..n_samples-1 labels, and
        # KMeans needs at least as many samples as clusters. Fail early with a
        # message that names the actual counts instead of deep inside sklearn.
        n_clusters = self.config.N_CLUSTERS
        n_areas = len(df_grouped)
        if n_clusters < 2 or n_areas <= n_clusters:
            raise ValueError(
                "Clustering requires at least 2 clusters and more areas than "
                f"clusters; got {n_areas} areas and N_CLUSTERS={n_clusters}."
            )

        # Scale the values for clustering
        scaler = StandardScaler()
        df_grouped['Value_scaled'] = scaler.fit_transform(df_grouped[['Value']])

        # Perform K-means clustering
        kmeans = KMeans(n_clusters=n_clusters,
                        random_state=self.config.RANDOM_STATE)
        df_grouped['Cluster'] = kmeans.fit_predict(df_grouped[['Value_scaled']])

        # Calculate metrics
        silhouette_avg = silhouette_score(df_grouped[['Value_scaled']],
                                          df_grouped['Cluster'])

        # Visualization
        plt.figure(figsize=self.config.LARGE_FIGURE_SIZE)
        sns.scatterplot(data=df_grouped, x='Area', y='Value',
                        hue='Cluster', palette='Set2', s=100)
        plt.xticks(rotation=90, ha='right')
        plt.title(f'K-Means Clustering of Areas by Average Value (k={n_clusters})',
                  fontsize=14, fontweight='bold')
        plt.xlabel('Country/Area')
        plt.ylabel('Average Value')
        plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.show()

        # Results summary, converted to built-in types so the dict can be
        # printed or serialised without numpy scalars leaking out.
        cluster_counts = df_grouped['Cluster'].value_counts()
        results = {
            'silhouette_score': round(float(silhouette_avg), 3),
            'inertia': round(float(kmeans.inertia_), 2),
            'n_clusters': n_clusters,
            'cluster_sizes': {int(label): int(count)
                              for label, count in cluster_counts.items()}
        }

        logger.info(f"Clustering completed. Silhouette Score: {results['silhouette_score']}")
        return results
    
    def build_regression_model(self) -> Dict:
        """
        Fit a Random Forest on 'Year' plus low-cardinality categoricals.

        'Sex', 'Indicator' and 'Element' are used when present with at most
        50 distinct values; they are one-hot encoded with the first level
        dropped. The fitted estimator is stored on ``self.model`` and is
        scored on a held-out split of size ``config.TEST_SIZE``.

        Returns:
            Dict: ``rmse``, ``mae``, ``r2_score``, ``n_features`` and
            ``n_samples``

        Raises:
            ValueError: If no dataframe is available yet
        """
        if self.df is None:
            raise ValueError("Data not available. Load and clean data first.")

        logger.info("Building regression model")

        try:
            # Categorical predictors that exist and are not too wide.
            categorical_features = [
                name for name in ('Sex', 'Indicator', 'Element')
                if name in self.df.columns and self.df[name].nunique() <= 50
            ]
            feature_columns = ['Year', *categorical_features]

            # Work on a private copy of just the columns involved.
            df_model = self.df[feature_columns + ['Value']].copy()
            raw_features = df_model[feature_columns]

            # Design matrix: dummies when there are categoricals, else as-is.
            if categorical_features:
                X = pd.get_dummies(raw_features,
                                   columns=categorical_features,
                                   drop_first=True)
            else:
                X = raw_features.copy()
            y = df_model['Value']

            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=self.config.TEST_SIZE,
                random_state=self.config.RANDOM_STATE,
            )

            self.model = RandomForestRegressor(random_state=self.config.RANDOM_STATE)
            self.model.fit(X_train, y_train)

            y_pred = self.model.predict(X_test)

            # Hold-out performance.
            root_mse = np.sqrt(mean_squared_error(y_test, y_pred))
            abs_error = mean_absolute_error(y_test, y_pred)
            explained = r2_score(y_test, y_pred)
            metrics = {
                'rmse': round(root_mse, 2),
                'mae': round(abs_error, 2),
                'r2_score': round(explained, 3),
                'n_features': X.shape[1],
                'n_samples': len(X)
            }

            logger.info(f"Model training completed. R² Score: {metrics['r2_score']}")
            return metrics

        except Exception as e:
            logger.error(f"Error in model building: {e}")
            raise
    
    def print_model_results(self, clustering_results: Dict, model_metrics: Dict) -> None:
        """Print formatted results summary."""
        print("\n" + "=" * 60)
        print("ANALYSIS RESULTS SUMMARY")
        print("=" * 60)
        
        print("\nCLUSTERING ANALYSIS:")
        print("-" * 30)
        print(f"Number of clusters: {clustering_results['n_clusters']}")
        print(f"Silhouette Score: {clustering_results['silhouette_score']}")
        print(f"Inertia: {clustering_results['inertia']}")
        print("Cluster sizes:", clustering_results['cluster_sizes'])
        
        print("\nREGRESSION MODEL PERFORMANCE:")
        print("-" * 30)
        print(f"RMSE: {model_metrics['rmse']}")
        print(f"MAE: {model_metrics['mae']}")
        print(f"R² Score: {model_metrics['r2_score']}")
        print(f"Number of features: {model_metrics['n_features']}")
        print(f"Number of samples: {model_metrics['n_samples']}")
        
        print("\n" + "=" * 60)
    
    def run_complete_analysis(self) -> Tuple[Dict, Dict]:
        """
        Execute every pipeline stage in order and report the results.

        Stages: load, clean, summary statistics, exploratory plots,
        clustering, regression, then the printed results summary. Any
        exception is logged and re-raised unchanged.

        Returns:
            Tuple[Dict, Dict]: Clustering results and model metrics
        """
        try:
            # Stages that only act on the analyzer's own state.
            preparation = (
                self.load_data,
                self.clean_data,
                self.generate_summary_statistics,
                self.create_exploratory_plots,
            )
            for stage in preparation:
                stage()

            # Stages whose return values form the final report.
            cluster_summary = self.perform_clustering_analysis()
            regression_summary = self.build_regression_model()
            self.print_model_results(cluster_summary, regression_summary)

            logger.info("Complete analysis pipeline finished successfully")
            return cluster_summary, regression_summary

        except Exception as e:
            logger.error(f"Error in analysis pipeline: {e}")
            raise


def main():
    """Run the full analysis with the default configuration.

    Returns:
        int: 0 when the pipeline finished, 1 when it raised an exception
        (the exception is logged, not propagated).
    """
    analyzer = FAODataAnalyzer(Config())

    try:
        clustering_results, model_metrics = analyzer.run_complete_analysis()
        # The two result dicts are not used further here; they could be
        # written out (e.g. as JSON) at this point if a results file is wanted.
    except Exception as e:
        logger.error(f"Analysis failed: {e}")
        return 1
    else:
        return 0


# Script entry point: propagate main()'s return value as the process exit
# status. SystemExit is raised directly because the `exit` builtin is only
# injected by the `site` module for interactive use and may be absent.
if __name__ == "__main__":
    raise SystemExit(main())

# Captured notebook cell output:
# 2025-08-04 13:07:00,187 - ERROR - Error loading data: Data file not found: data\FAOSTAT_data_en_8-3-2025.csv
# 2025-08-04 13:07:00,188 - ERROR - Error in analysis pipeline: Data file not found: data\FAOSTAT_data_en_8-3-2025.csv
# 2025-08-04 13:07:00,189 - ERROR - Analysis failed: Data file not found: data\FAOSTAT_data_en_8-3-2025.csv
