# In [None]:
# Greenhouse Gas Analytics - Data Exploration
# Notebook 01: Comprehensive Data Exploration and Initial Analysis

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: apply the matplotlib style sheet first, then let seaborn
# install the "husl" palette on top of it (the order of these two calls matters).
plt.style.use("seaborn-v0_8")
sns.set_palette(palette="husl")

# Notebook banner: title line followed by a 50-character rule.
print("🌍 Greenhouse Gas Analytics - Data Exploration", "=" * 50, sep="\n")

# ## 1. Data Loading and Initial Inspection

# Load the dataset
# Prefer the real CSV. When it is absent, fabricate a reproducible stand-in
# with the columns used below so the rest of the notebook still runs.
try:
    df = pd.read_csv('../data/raw/Methane_final.csv')
except FileNotFoundError:
    print("❌ Dataset not found. Creating sample data for demonstration...")

    # Fixed seed: the synthetic rows are the same on every run.
    np.random.seed(42)

    # Country -> region lookup; its insertion order doubles as the country list.
    regions = {
        'China': 'Asia',
        'India': 'Asia',
        'USA': 'North America',
        'Indonesia': 'Asia',
        'Brazil': 'South America',
        'Nigeria': 'Africa',
        'Russia': 'Europe',
        'Mexico': 'North America',
        'Iran': 'Asia',
        'Germany': 'Europe',
        'Turkey': 'Europe',
        'Canada': 'North America',
        'Australia': 'Oceania',
        'Argentina': 'South America',
        'Algeria': 'Africa',
        'Kazakhstan': 'Asia',
        'Uzbekistan': 'Asia',
        'Thailand': 'Asia',
        'Malaysia': 'Asia',
        'Venezuela': 'South America',
        'Saudi Arabia': 'Asia',
        'Pakistan': 'Asia',
        'Egypt': 'Africa',
        'Ukraine': 'Europe',
        'Bangladesh': 'Asia',
        'Vietnam': 'Asia',
        'Philippines': 'Asia',
        'Myanmar': 'Asia',
        'Poland': 'Europe',
        'South Africa': 'Africa',
    }
    countries = list(regions)

    types = ['Agriculture', 'Energy', 'Waste', 'Other']
    segments = [
        'Livestock', 'Oil & Gas', 'Landfills', 'Rice Cultivation',
        'Coal Mining', 'Bioenergy', 'Gas pipelines', 'Onshore oil', 'Total',
    ]
    base_years = ['2019-2021', '2020-2021', '2022', '2021', '2019']

    data = []
    for country in countries:
        # Between 15 and 24 records per country (randint's upper bound is exclusive).
        rows_for_country = np.random.randint(15, 25)
        for _ in range(rows_for_country):
            # The draws below happen in a fixed order (exponential, normal,
            # type, segment, base year); reordering them would change the
            # seeded sequence and therefore the generated data.
            emission_base = np.random.exponential(50) + np.random.normal(20, 30)
            chosen_type = np.random.choice(types)
            chosen_segment = np.random.choice(segments)
            chosen_base_year = np.random.choice(base_years)
            record = {
                'region': regions[country],
                'country': country,
                # The normal component can push the sum below zero; clamp it.
                'emissions': max(0, emission_base),
                'type': chosen_type,
                'segment': chosen_segment,
                'reason': 'All',
                'baseYear': chosen_base_year,
            }
            data.append(record)

    df = pd.DataFrame(data)
    print(f"📊 Sample dataset created with shape: {df.shape}")
else:
    print("✅ Data loaded successfully!")
    print(f"📊 Dataset shape: {df.shape}")

# ## 2. Basic Data Information

# Structural overview: dimensions, column names and per-column dtypes.
print("\n📋 DATASET OVERVIEW")
print("=" * 30)
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nData Types:")
print(df.dtypes)

# Missing values per column, as a count and as a percentage of all rows,
# ordered from the most to the least affected column.
print("\n📊 MISSING VALUES:")
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Percentage': missing_percent
}).sort_values('Percentage', ascending=False)

columns_with_gaps = missing_df[missing_df['Missing Count'] > 0]
if columns_with_gaps.empty:
    # Printing the empty filter result would show pandas' "Empty DataFrame"
    # repr, which reads like a failure rather than a clean result.
    print("No missing values found.")
else:
    print(columns_with_gaps)

# ## 3. Statistical Summary

# describe() with default arguments, i.e. whatever pandas summarises by default
# for the current column dtypes.
print("\n📈 STATISTICAL SUMMARY:")
print("=" * 30)
print(df.describe())

# ## 4. Categorical Variables Analysis

print("\n🏷️ CATEGORICAL VARIABLES:")
print("=" * 30)

# For every object-dtype column: the number of distinct values and the five
# most frequent ones, each with its share of all rows.
row_total = len(df)
categorical_cols = df.select_dtypes(include=['object']).columns
for name in categorical_cols:
    column = df[name]
    most_common = column.value_counts().head()

    print(f"\n{name.upper()}:")
    print(f"  Unique values: {column.nunique()}")
    print("  Top 5 values:")
    for label, occurrences in most_common.items():
        share = (occurrences / row_total) * 100
        print(f"    {label}: {occurrences} ({share:.1f}%)")

# ## 5. Numerical Variables Analysis

print("\n🔢 NUMERICAL VARIABLES:")
print("=" * 30)

# Force emissions to a numeric dtype; entries that cannot be parsed become NaN.
df['emissions'] = pd.to_numeric(df['emissions'], errors='coerce')

# (label, reducer) pairs, in the order they are printed for each column.
summary_rows = (
    ("Mean", lambda s: s.mean()),
    ("Median", lambda s: s.median()),
    ("Std", lambda s: s.std()),
    ("Min", lambda s: s.min()),
    ("Max", lambda s: s.max()),
    ("25th percentile", lambda s: s.quantile(0.25)),
    ("75th percentile", lambda s: s.quantile(0.75)),
)

numerical_cols = df.select_dtypes(include=[np.number]).columns
for name in numerical_cols:
    series = df[name]
    print(f"\n{name.upper()}:")
    for label, reducer in summary_rows:
        print(f"  {label}: {reducer(series):.2f}")

# ## 6. Data Quality Assessment

print("\n🔍 DATA QUALITY ASSESSMENT:")
print("=" * 40)

# Rows that exactly repeat an earlier row.
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

if 'emissions' in df.columns:
    emission_values = df['emissions']

    # Sign and zero checks.
    negative_emissions = emission_values.lt(0).sum()
    print(f"Negative emissions: {negative_emissions}")

    zero_emissions = emission_values.eq(0).sum()
    print(f"Zero emissions: {zero_emissions}")

    # Tukey fences: anything further than 1.5 * IQR outside the quartiles is
    # counted as a potential outlier. NaN compares False on both sides, so
    # missing values are never counted.
    Q1, Q3 = (emission_values.quantile(q) for q in (0.25, 0.75))
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    beyond_fences = emission_values.lt(lower_bound) | emission_values.gt(upper_bound)
    outliers = beyond_fences.sum()
    print(f"Potential outliers (IQR method): {outliers}")

# ## 7. Visual Data Exploration

# Static overview figure: six matplotlib panels on a 3x2 grid, all drawn
# through pyplot's implicit "current axes". Each plt.subplot(...) call selects
# the panel that the plotting and labelling calls after it act on, so the
# statement order in this section is significant.
print(f"\n📊 CREATING VISUALIZATIONS...")
print("="*40)

# Set up matplotlib for better visualization
# (default size for figures created later; the figure below passes its own figsize)
plt.rcParams['figure.figsize'] = (15, 10)

# 1. Emissions Distribution
plt.figure(figsize=(15, 12))

# Panel 1: histogram of the raw per-record emissions values (50 bins).
# NOTE(review): the "Mt CO₂e" unit used in the labels below is not established
# anywhere in this file — confirm against the data source.
plt.subplot(3, 2, 1)
df['emissions'].hist(bins=50, alpha=0.7, color='skyblue', edgecolor='black')
plt.title('Distribution of Methane Emissions', fontsize=14, fontweight='bold')
plt.xlabel('Emissions (Mt CO₂e)')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)

# 2. Emissions by Region
# Panel 2: summed emissions per region as horizontal bars, sorted ascending.
plt.subplot(3, 2, 2)
regional_emissions = df.groupby('region')['emissions'].sum().sort_values(ascending=True)
regional_emissions.plot(kind='barh', color='lightcoral')
plt.title('Total Emissions by Region', fontsize=14, fontweight='bold')
plt.xlabel('Total Emissions (Mt CO₂e)')
# NOTE(review): this tight_layout() runs while only two of the six panels
# exist; layout is applied again after the last panel, just before plt.show().
plt.tight_layout()

# 3. Emissions by Type
# Panel 3: share of summed emissions per 'type' as a pie chart, one Set3
# colour per slice.
plt.subplot(3, 2, 3)
type_emissions = df.groupby('type')['emissions'].sum()
colors = plt.cm.Set3(np.linspace(0, 1, len(type_emissions)))
type_emissions.plot(kind='pie', autopct='%1.1f%%', colors=colors)
plt.title('Emissions Distribution by Type', fontsize=14, fontweight='bold')
plt.ylabel('')  # blank out the y-axis label for the pie

# 4. Top 10 Countries
# Panel 4: the ten countries with the largest summed emissions, largest first.
plt.subplot(3, 2, 4)
top_countries = df.groupby('country')['emissions'].sum().sort_values(ascending=False).head(10)
top_countries.plot(kind='bar', color='gold', alpha=0.8)
plt.title('Top 10 Emitting Countries', fontsize=14, fontweight='bold')
plt.xlabel('Country')
plt.ylabel('Emissions (Mt CO₂e)')
plt.xticks(rotation=45)

# 5. Emissions by Segment
# Panel 5: summed emissions per 'segment', largest first.
plt.subplot(3, 2, 5)
segment_emissions = df.groupby('segment')['emissions'].sum().sort_values(ascending=False)
segment_emissions.plot(kind='bar', color='lightgreen', alpha=0.8)
plt.title('Emissions by Segment', fontsize=14, fontweight='bold')
plt.xlabel('Segment')
plt.ylabel('Emissions (Mt CO₂e)')
plt.xticks(rotation=45)

# 6. Box plot for emissions by type
# Panel 6: per-record emissions grouped by 'type'. The current axes are passed
# explicitly so the box plot lands in this panel.
plt.subplot(3, 2, 6)
df.boxplot(column='emissions', by='type', ax=plt.gca())
plt.title('Emissions Distribution by Type (Box Plot)', fontsize=14, fontweight='bold')
plt.suptitle('')  # Remove automatic title
plt.xlabel('Emission Type')
plt.ylabel('Emissions (Mt CO₂e)')
plt.xticks(rotation=45)

# Final layout pass over all six panels, then render the figure.
plt.tight_layout()
plt.show()

# ## 8. Interactive Plotly Visualizations

print("\n🎨 CREATING INTERACTIVE VISUALIZATIONS...")
print("=" * 50)

# 1. Interactive Regional Analysis
# Treemap of summed emissions, nested region -> country; tile colour follows
# the same summed value.
emissions_by_region_country = (
    df.groupby(['region', 'country'])['emissions'].sum().reset_index()
)
fig_regional = px.treemap(
    emissions_by_region_country,
    path=['region', 'country'],
    values='emissions',
    title='Hierarchical View: Emissions by Region and Country',
    color='emissions',
    color_continuous_scale='Reds'
)
fig_regional.update_layout(height=600)
fig_regional.show()

# 2. Interactive Scatter Plot
# One marker per country: total emissions on the y-axis (and as marker size),
# coloured by the region recorded on the country's first row.
country_summary = df.groupby('country').agg({
    'emissions': 'sum',
    'region': 'first'
}).reset_index()

fig_scatter = px.scatter(
    country_summary,
    # Positional index only; the x value carries no meaning beyond spreading
    # the markers out. Being an unnamed array, it is referred to as 'x'.
    x=range(len(country_summary)),
    y='emissions',
    color='region',
    size='emissions',
    hover_name='country',
    title='Country Emissions Overview',
    # Plotly Express looks labels up by column name. The y values come from
    # the 'emissions' column, so the label must be keyed by 'emissions';
    # a 'y' key is ignored and leaves the axis titled "emissions".
    labels={'x': 'Country Index', 'emissions': 'Total Emissions (Mt CO₂e)'}
)
fig_scatter.update_layout(height=500)
fig_scatter.show()

# 3. Sunburst Chart
# Three-level breakdown of summed emissions: region -> type -> segment.
emissions_by_region_type_segment = (
    df.groupby(['region', 'type', 'segment'])['emissions'].sum().reset_index()
)
fig_sunburst = px.sunburst(
    emissions_by_region_type_segment,
    path=['region', 'type', 'segment'],
    values='emissions',
    title='Multi-level Emissions Breakdown'
)
fig_sunburst.update_layout(height=600)
fig_sunburst.show()

# ## 9. Correlation Analysis

print("\n🔗 CORRELATION ANALYSIS:")
print("=" * 30)

# One-hot encode the categorical columns and append the indicator columns to
# a copy of the data (the original columns are kept alongside them).
#
# dtype=int is required: on pandas >= 2.0 get_dummies returns bool columns by
# default, and the np.number selection below does not pick up bool columns,
# which would leave 'emissions' as the only column in the correlation matrix.
categorical_cols = ['region', 'type', 'segment']
indicator_frames = [
    pd.get_dummies(df[col], prefix=col, dtype=int) for col in categorical_cols
]
df_encoded = pd.concat([df, *indicator_frames], axis=1)

# Pairwise correlation over every numeric column (emissions + indicators).
numeric_cols = df_encoded.select_dtypes(include=[np.number]).columns
corr_matrix = df_encoded[numeric_cols].corr()

# Plot correlation heatmap. The boolean mask hides the upper triangle and the
# diagonal, so each pair is shown once.
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix,
            annot=True,
            cmap='RdYlBu_r',
            center=0,
            mask=mask,
            square=True,
            fmt='.2f')
plt.title('Correlation Matrix of Encoded Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# ## 10. Time Series Analysis (if applicable)

if 'baseYear' in df.columns:
    print("\n📅 TEMPORAL ANALYSIS:")
    print("=" * 25)

    # Derive a numeric 'year' from the first run of four digits in baseYear
    # (so a range such as '2019-2021' maps to 2019.0). Details:
    #   * astype(str) keeps the .str accessor usable even if the column was
    #     parsed as numbers; values without four digits end up as NaN.
    #   * the pattern is a raw string, since '\d' is an invalid escape in a
    #     normal string literal.
    #   * expand=False makes extract return a Series rather than a
    #     single-column DataFrame.
    df['year'] = (
        df['baseYear']
        .astype(str)
        .str.extract(r'(\d{4})', expand=False)
        .astype(float)
    )

    if df['year'].notna().any():
        # Per-year total, mean and record count of emissions.
        yearly_emissions = df.groupby('year')['emissions'].agg(['sum', 'mean', 'count']).reset_index()

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

        # Total emissions by year
        ax1.plot(yearly_emissions['year'], yearly_emissions['sum'],
                 marker='o', linewidth=2, markersize=8, color='red', alpha=0.8)
        ax1.set_title('Total Methane Emissions Over Time', fontsize=14, fontweight='bold')
        ax1.set_xlabel('Year')
        ax1.set_ylabel('Total Emissions (Mt CO₂e)')
        ax1.grid(True, alpha=0.3)

        # Average emissions by year
        ax2.plot(yearly_emissions['year'], yearly_emissions['mean'],
                 marker='s', linewidth=2, markersize=8, color='blue', alpha=0.8)
        ax2.set_title('Average Methane Emissions per Record Over Time', fontsize=14, fontweight='bold')
        ax2.set_xlabel('Year')
        ax2.set_ylabel('Average Emissions (Mt CO₂e)')
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        # Sector trends over time. nunique() ignores NaN, so a single real
        # year plus unparsable rows does not count as "more than one year".
        if df['year'].nunique() > 1:
            sector_trends = df.groupby(['year', 'type'])['emissions'].sum().reset_index()

            fig_trends = px.line(
                sector_trends,
                x='year',
                y='emissions',
                color='type',
                markers=True,
                title='Emissions Trends by Sector Over Time',
                labels={'emissions': 'Emissions (Mt CO₂e)', 'year': 'Year'}
            )
            fig_trends.update_layout(height=500)
            fig_trends.show()

# ## 11. Advanced Statistical Analysis

print("\n📊 ADVANCED STATISTICAL INSIGHTS:")
print("=" * 40)

# Concentration: how much of the grand total the largest emitters account for.
total_emissions = df['emissions'].sum()
country_emissions = df.groupby('country')['emissions'].sum().sort_values(ascending=False)

top10_share = country_emissions.head(10).sum() / total_emissions * 100
top5_share = country_emissions.head(5).sum() / total_emissions * 100
leader_name = country_emissions.index[0]
leader_share = country_emissions.iloc[0] / total_emissions * 100

print("📈 CONCENTRATION METRICS:")
print(f"  • Top 10 countries account for {top10_share:.1f}% of emissions")
print(f"  • Top 5 countries account for {top5_share:.1f}% of emissions")
print(f"  • Top country ({leader_name}) accounts for {leader_share:.1f}% of emissions")

# Per-region total, mean, standard deviation and record count.
regional_stats = df.groupby('region')['emissions'].agg(['sum', 'mean', 'std', 'count'])
print("\n🌍 REGIONAL STATISTICS:")
for region_name, row in regional_stats.iterrows():
    print(f"  • {region_name}:")
    print(f"    - Total: {row['sum']:.1f} Mt CO₂e ({row['sum']/total_emissions*100:.1f}%)")
    print(f"    - Average: {row['mean']:.1f} Mt CO₂e")
    print(f"    - Std Dev: {row['std']:.1f}")
    print(f"    - Records: {int(row['count'])}")

# Same aggregates per sector ('type'); the standard deviation is computed but
# not printed for sectors.
sector_stats = df.groupby('type')['emissions'].agg(['sum', 'mean', 'std', 'count'])
print("\n🏭 SECTOR STATISTICS:")
for sector_name, row in sector_stats.iterrows():
    print(f"  • {sector_name}:")
    print(f"    - Total: {row['sum']:.1f} Mt CO₂e ({row['sum']/total_emissions*100:.1f}%)")
    print(f"    - Average: {row['mean']:.1f} Mt CO₂e")
    print(f"    - Records: {int(row['count'])}")

# ## 12. Data Quality and Completeness Report

print("\n✅ DATA QUALITY REPORT:")
print("=" * 35)

# A record is "complete" when none of its columns is null.
# NOTE(review): by this point df may also carry the derived 'year' column
# added in the temporal section, so rows whose baseYear yielded no year are
# counted as incomplete here — confirm that this is intended.
record_count = len(df)
complete_count = len(df.dropna())
emission_values = df['emissions']

quality_metrics = {
    'Total Records': record_count,
    'Complete Records': complete_count,
    'Completeness Rate': f"{(complete_count / record_count * 100):.1f}%",
    'Unique Countries': df['country'].nunique(),
    'Unique Regions': df['region'].nunique(),
    'Unique Sectors': df['type'].nunique(),
    'Emission Range': f"{emission_values.min():.1f} - {emission_values.max():.1f} Mt CO₂e",
    'Average Emission': f"{emission_values.mean():.1f} Mt CO₂e",
}

for metric, value in quality_metrics.items():
    print(f"  • {metric}: {value}")

# ## 13. Key Findings and Insights

print("\n🎯 KEY FINDINGS:")
print("=" * 20)

# Each aggregate is computed once and reused in the sentences below.
type_totals = df.groupby('type')['emissions'].sum()
region_totals = df.groupby('region')['emissions'].sum()
country_totals = df.groupby('country')['emissions'].sum()

# Coefficient of variation: standard deviation relative to the mean.
emissions_cv = df['emissions'].std() / df['emissions'].mean()
variability_label = 'high' if emissions_cv > 1 else 'moderate'

# Fraction of rows with no null in any column, bucketed into a verbal grade.
completeness = len(df.dropna()) / len(df)
if completeness > 0.95:
    quality_label = 'excellent'
elif completeness > 0.8:
    quality_label = 'good'
else:
    quality_label = 'moderate'

findings = [
    f"Dataset contains {len(df)} records across {df['country'].nunique()} countries",
    # The leading sector is simply whichever type has the largest total; no
    # special case is needed for any particular sector name.
    f"{type_totals.idxmax()} sector dominates with {type_totals.max():.1f} Mt CO₂e",
    f"{region_totals.idxmax()} region leads in total emissions",
    f"Top emitter is {country_totals.idxmax()} with {country_totals.max():.1f} Mt CO₂e",
    f"Emissions show {variability_label} variability (CV: {emissions_cv:.2f})",
    f"Data quality is {quality_label} with {completeness * 100:.1f}% completeness",
]

for i, finding in enumerate(findings, 1):
    print(f"  {i}. {finding}")

print("\n🔄 NEXT STEPS:")
print("=" * 15)
next_steps = (
    "Data cleaning and preprocessing",
    "Feature engineering and transformation",
    "Statistical analysis and hypothesis testing",
    "Visualization and dashboard development",
    "Predictive modeling and forecasting",
)
for i, step in enumerate(next_steps, 1):
    print(f"  {i}. {step}")

print("\n✨ EXPLORATION COMPLETE!")
print("=" * 25)