In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot defaults: seaborn's "whitegrid" style and a 10x6 default figure size
sns.set_style("whitegrid")
plt.rc('figure', figsize=[10, 6])

# --- 1. DATASET SIMULATION (Replacing real data loading) ---
# Stand-in for loading a CSV or calling an API. Every column is drawn from
# NumPy's global RNG, so the draws below must stay in this exact order for the
# fixed seed to reproduce the same table.
print("--- 1. Generating Synthetic SDG 13 Data ---")
np.random.seed(42)
N_SAMPLES = 1000
COUNTRIES = ['USA', 'CHN', 'IND', 'DEU', 'BRA', 'NGA']

# Feature columns, added one at a time (insertion order == DataFrame column order)
data = {}
data['Country'] = np.random.choice(COUNTRIES, N_SAMPLES)
data['Year'] = np.random.randint(2000, 2020, N_SAMPLES)
# Normal draws can land below a plausible minimum; floor them (lower bound only)
data['GDP_per_capita'] = np.clip(np.random.normal(30000, 25000, N_SAMPLES), 1000, None)
data['Population_Total'] = np.clip(
    np.random.normal(50_000_000, 100_000_000, N_SAMPLES), 100_000, None
)
data['Fossil_Fuel_Pct'] = np.random.uniform(50, 95, N_SAMPLES)
data['Industrial_Output_Index'] = np.random.normal(120, 30, N_SAMPLES)

df = pd.DataFrame(data)

# Target variable: CO2 emissions (kt), simulated as a linear combination of
# GDP, population and fossil-fuel share plus Gaussian noise, floored at 10000.
gdp_term = 0.05 * df['GDP_per_capita']
population_term = 1.2e-4 * df['Population_Total']
fossil_term = 50000 * (df['Fossil_Fuel_Pct'] / 100)
noise = np.random.normal(0, 50000, N_SAMPLES)
df['CO2_Emissions_kt'] = (gdp_term + population_term + fossil_term + noise).clip(lower=10000)

print(f"Dataset Head:\n{df.head()}")
print(f"\nDataset Shape: {df.shape}")

# Separate the target column (y) from the feature columns (X)
target_column = 'CO2_Emissions_kt'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# --------------------------------------------------------------------------------------

# --- 2. PREPROCESSING PIPELINE ---
print("\n--- 2. Setting Up Preprocessing Pipeline ---")

# Column groups: one categorical column, four numeric ones
categorical_features = ['Country']
numerical_features = [
    'GDP_per_capita',
    'Population_Total',
    'Fossil_Fuel_Pct',
    'Industrial_Output_Index',
]

# One transformer per column type
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# 'Year' is standardized together with the numeric columns
scaled_columns = [*numerical_features, 'Year']

# Route each column group to its transformer; any column not listed would be
# passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, scaled_columns),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

# --------------------------------------------------------------------------------------

# --- 3. MODEL BUILDING (Pipeline Integration) ---
print("\n--- 3. Building and Training Model Pipeline (Random Forest Regressor) ---")

# 100-tree random forest with a fixed seed, using all available cores
forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Chain preprocessing and the regressor so both are fitted on the training rows only
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

# Fit the whole chain on the training split
model_pipeline.fit(X_train, y_train)
print("Model training complete.")

# --------------------------------------------------------------------------------------

# --- 4. EVALUATION ---
print("\n--- 4. Model Evaluation ---")

# Predict on the held-out rows, then score against the true targets
y_pred = model_pipeline.predict(X_test)
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:,.2f} kt")
print(f"R-squared (R2) Score: {r2:.4f}")

# --------------------------------------------------------------------------------------

# --- 5. VISUALIZATION and INTERPRETATION (Feature Importance) ---
print("\n--- 5. Visualization and Interpretation ---")

# A. Actual vs. Predicted Plot
# Pair each true value with its prediction so seaborn can plot by column name
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

plt.figure(figsize=(10, 6))
sns.scatterplot(data=results_df, x='Actual', y='Predicted', alpha=0.6)

# Dashed y = x reference line spanning the observed target range; points on
# this line are perfect predictions.
lowest, highest = y_test.min(), y_test.max()
plt.plot([lowest, highest], [lowest, highest], 'r--', lw=2, label='Ideal Prediction')

plt.title('Actual vs. Predicted CO2 Emissions (kt) - SDG 13 Forecast')
plt.xlabel('Actual CO2 Emissions (kt)')
plt.ylabel('Predicted CO2 Emissions (kt)')
plt.legend()
plt.show()

# B. Feature Importance Plot
rf_model = model_pipeline.named_steps['regressor']
fitted_preprocessor = model_pipeline.named_steps['preprocessor']

# Take the output column names from the fitted ColumnTransformer itself rather
# than rebuilding them by hand, so the labels always line up with the columns
# the forest was actually trained on (including any passthrough columns).
# The transformer prefixes names with "<step>__" (e.g. "num__Year"); strip that
# prefix so the plot shows the plain feature names.
feature_names = [
    name.split('__', 1)[-1]
    for name in fitted_preprocessor.get_feature_names_out()
]

# Importance per transformed feature, most important first
feature_importances = pd.Series(rf_model.feature_importances_, index=feature_names).sort_values(ascending=False)
top_features = feature_importances.head(10)

plt.figure(figsize=(12, 6))
# seaborn >= 0.13 deprecates `palette` without `hue`; assigning the bar labels
# to `hue` keeps the one-colour-per-bar look. dodge=False keeps bars full width
# on older seaborn versions.
ax = sns.barplot(
    x=top_features.values,
    y=top_features.index,
    hue=top_features.index,
    palette="viridis",
    dodge=False,
)
# The hue legend would only repeat the y-axis labels; drop it if one was drawn
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
plt.title('Top 10 Feature Importance for CO2 Emission Prediction')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.show()

--- 1. Generating Synthetic SDG 13 Data ---


TypeError: clip() got an unexpected keyword argument 'min'

ModuleNotFoundError: No module named 'tensorflow'