In [None]:
# CO2 Emissions Calculator Notebook

## 0. Import the Libraries Used Throughout the Notebook


# pandas: For working with tables (DataFrames)
import pandas as pd
# numpy: For numerical calculations and arrays
import numpy as np
# sklearn: For machine learning
from sklearn.model_selection import train_test_split # To split data for training and testing
from sklearn.tree import DecisionTreeRegressor      # Decision Tree
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error # To measure how good the model is
# matplotlib and seaborn: For plotting graphs
import matplotlib.pyplot as plt
import seaborn as sns
# kagglehub: To download datasets from Kaggle
import kagglehub
# os: For file and folder operations (like listing files in a folder)
import os

In [None]:
# Part 1: Download and Load the Dataset from Kaggle
# Downloads the dataset folder with kagglehub, picks a CSV file from it, loads it
# into the DataFrame `df`, and prints a quick overview of the table.
print("\n--- 1. Download and Load the Dataset from Kaggle ---")

# dataset_download() returns the local folder that holds the dataset files.
dataset_path = kagglehub.dataset_download("anshtanwar/global-data-on-sustainable-energy")
print(f"Folder where dataset files are located: {dataset_path}")

# os.listdir() returns names in arbitrary order, so sort them first: the same
# CSV is then chosen on every run, even if the folder holds several files.
csv_file_name = next(
    (file for file in sorted(os.listdir(dataset_path)) if file.lower().endswith('.csv')),
    None,
)

if csv_file_name is None:
    # Raise instead of calling exit(): inside a notebook kernel exit() requests a
    # kernel shutdown rather than reporting an error, and the statements below
    # would fail confusingly on a missing file name.
    raise FileNotFoundError(
        f"OH NO! We couldn't find a CSV file in the '{dataset_path}' folder. "
        "There might have been a problem with the download. Please check."
    )

full_csv_path = os.path.join(dataset_path, csv_file_name)

df = pd.read_csv(full_csv_path)
print(f"\nGreat! We successfully read the '{full_csv_path}' file and loaded it into the 'df' table.")

print("\nFirst look at our data table (first 5 rows):")
print(df.head()) # head() shows us the first few rows of the table

print("\nWhat is the size of our table? (number of rows, number of columns):")
print(df.shape)

print("\nWhat are the names of the columns in our table?:")
print(df.columns)

print("\nMore detailed information about our columns (data types, are there any missing values?):")
df.info() # info() tells the type of each column and how many non-null values there are

In [None]:
# Part 2: What Will We Predict? (Target Variable) and Which Information Will We Use? (Features)
# Validates that the target column exists in `df` and builds `df_filtered`,
# a copy of the table without the rows whose target value is missing.
print("\n--- 2. Define Our Target and the Information We Will Use ---")
# What we want to predict: 'Value_co2_emissions_kt_by_country'
# i.e., CO2 emission amounts of countries
target_column_name = 'Value_co2_emissions_kt_by_country'

# Does this column really exist in our table?
# Raise instead of calling exit(): inside a notebook kernel exit() requests a
# kernel shutdown rather than stopping the cell with a readable error.
if target_column_name not in df.columns:
    raise ValueError(
        f"OH NO! It seems there is no column named '{target_column_name}' in our table. "
        "Are you sure you wrote the column name correctly? Or the dataset might have changed."
    )
print(f"\nAlright! Our target is set: {target_column_name}")

# A row without a target value cannot be used for supervised training,
# so rows where the target column is null are removed.
df_filtered = df.dropna(subset=[target_column_name])
print(f"\nNumber of rows with a value (non-null) in the '{target_column_name}' column: {df_filtered.shape[0]}")

# What if no rows are left?
if df_filtered.empty:
    raise ValueError(
        f"Oops! After cleaning the nulls in the '{target_column_name}' column, no data remained. "
        "This is a bit strange. You might want to check the dataset."
    )

# If more than half of the rows were removed, warn the reader.
if df_filtered.shape[0] < 0.5 * df.shape[0]:
    print("CAUTION: We removed a significant portion of the data due to null values in the target column. This might affect the model's result.")

In [None]:
# Part 3: Prepare the Data for Machine Learning (Data Preprocessing)
# Builds the feature table `X` (all numeric columns except the target) and the
# target series `y` from `df_filtered`, then fills missing feature values with
# the column mean.
print("\n--- 3. Let's Organize Our Data a Bit for Machine Learning ---")

# Only numeric columns are used; text columns such as 'Entity' (country name) are left out for now.
numeric_cols = df_filtered.select_dtypes(include=np.number).columns.tolist()

# Let's remove our target column from the numerical columns, the rest will be our "features".
features_candidates = [col for col in numeric_cols if col != target_column_name]
print(f"\nNumerical features (columns) we consider using to train the model: {features_candidates}")

# What if we can't find any numerical features?
# Raise instead of calling exit(): inside a notebook kernel exit() requests a
# kernel shutdown rather than stopping the cell with a readable error.
if not features_candidates:
    raise ValueError(
        "OH NO! We couldn't find any numerical features to use for the model. "
        "There might be an issue with the dataset or the column types might be different."
    )

# Our features as 'X', our target as 'y'
X = df_filtered[features_candidates].copy()
y = df_filtered[target_column_name].copy()

# NOTE(review): the means below are computed on all rows, before the train/test
# split, so test rows influence the imputed values. Consider imputing after the
# split using training-set means only.
all_null_columns = []
for col in X.columns:
    if X[col].isnull().any(): # If there is at least one null value in this column
        mean_val = X[col].mean() # Calculate the mean of the column
        if pd.isna(mean_val):
            # The column holds no values at all, so there is no mean to impute with.
            all_null_columns.append(col)
            continue
        # Assign the result back instead of `X[col].fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary object and does not
        # update X under pandas Copy-on-Write (it warns in pandas 2.x).
        X[col] = X[col].fillna(mean_val)
        print(f"Nulls in the '{col}' column were filled with the mean value ({mean_val:.2f}).")

if all_null_columns:
    X = X.drop(columns=all_null_columns)
    print(f"Columns that contain no values at all were dropped: {all_null_columns}")
    if X.shape[1] == 0:
        raise ValueError("No usable numerical features remained after dropping the all-null columns.")

print("\nLet's look at the final state of our features (X) (first 5 rows):")
print(X.head())
print("\nTotal number of null values remaining in our features (X) (hopefully zero!):")
print(X.isnull().sum().sum())

In [None]:
# Part 4: Split the Data for Training and Testing the Model
print("\n--- 4. Let's Split Our Data into Training and Test Sets ---")

# 20% of the rows are held out for testing; the remaining 80% train the model.
# A fixed random_state makes the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each of the four pieces in a single print call.
shape_report = "\n".join([
    f"\nNumber of features for training (X_train): {X_train.shape}",
    f"Number of features for testing (X_test): {X_test.shape}",
    f"Number of targets for training (y_train): {y_train.shape}",
    f"Number of targets for testing (y_test): {y_test.shape}",
])
print(shape_report)

In [None]:
# Part 5: Build and Train Our Decision Tree Model
print("\n--- 5. Let's Create and Train Our Decision Tree Model ---")

# Settings of the Decision Tree Regressor, collected in one place.
tree_settings = {
    "max_depth": 8,            # Maximum depth of the tree; too deep a tree might overfit.
    "random_state": 42,        # For reproducibility of results.
    "min_samples_split": 10,   # Minimum number of samples required to split a node.
    "min_samples_leaf": 5,     # Minimum number of samples required at a leaf node.
}
dt_regressor = DecisionTreeRegressor(**tree_settings)

# Fit on the training data: the model learns to predict y_train from X_train.
dt_regressor.fit(X_train, y_train)
print("\nFantastic! Our Decision Tree model has been successfully trained.")

In [None]:
# Part 6: Measure How Successful Our Model Is
print("\n--- 6. Let's Look at Our Model's Report Card: Performance Evaluation ---")

# Ask the trained model for predictions on the held-out test features.
y_pred_dt = dt_regressor.predict(X_test)

# Four complementary error measures, all comparing y_test with the predictions:
#   MSE  - mean of the squared errors; reacts strongly to large errors.
#   RMSE - square root of MSE; expressed in the same unit as the target.
#   MAE  - mean of the absolute errors; the easiest one to interpret.
#   R2   - share of the target's variation explained by the model; close to 1 is very good.
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

report_card = "\n".join([
    "\nPerformance of Our Decision Tree Regression Model:",
    f"Mean Squared Error (MSE): {mse_dt:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse_dt:.2f}",
    f"Mean Absolute Error (MAE): {mae_dt:.2f}",
    f"R-squared (R2) Score: {r2_dt:.2f} (The closer this value is to 1, the better!)",
])
print(report_card)

In [None]:
# Part 7: Which Features Were More Important for Prediction? (Feature Importances)
# Tabulates the importance the fitted tree assigns to each feature and plots
# the ten highest-ranked ones as a horizontal bar chart.
print("\n--- 7. Which Information Was More Important for Our Model? (Feature Importances) ---")

importances = dt_regressor.feature_importances_
feature_names = X_train.columns # Names of the features (columns)

# Let's convert these into a more readable table
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
# Sort from most to least important
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nImportance Levels of Features (From most important to least):")
print(feature_importance_df)

plt.figure(figsize=(12, 8))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
# y-variable is also assigned to `hue`. dodge=False keeps one full-width bar per
# feature on seaborn versions that would otherwise dodge bars by hue level.
ax = sns.barplot(
    x='Importance',
    y='Feature',
    hue='Feature',
    data=feature_importance_df.head(10),
    palette="viridis",
    dodge=False,
)
# The legend would only repeat the y-axis labels; remove it if one was drawn.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Top 10 Most Important Features (For CO2 Emission Prediction)')
plt.xlabel('Importance Level (The higher, the more important)')
plt.ylabel('Features (Our Information Sources)')
plt.tight_layout()
print("\nPreparing feature importance graph...")
plt.show() # Display the graph on the screen

In [None]:
# Part 8: How Close Are Our Predictions to Actual Values? Let's Plot!
print("\n--- 8. How Close Are Our Predictions to Reality? Let's Plot! ---")

# --- Figure 1: actual vs. predicted values ---------------------------------
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(
    y_test,
    y_pred_dt,
    alpha=0.6,
    edgecolors='w',
    linewidth=0.5,
    label='Our Predictions',
)
# The diagonal spans the full range covered by both the actual and predicted values.
min_val = min(y_test.min(), y_pred_dt.min())
max_val = max(y_test.max(), y_pred_dt.max())
scatter_ax.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, label='Perfect Match Line')
scatter_ax.set_xlabel('Actual CO2 Emissions (kt)')
scatter_ax.set_ylabel('Our Predicted CO2 Emissions (kt)')
scatter_ax.set_title('Actual Values vs. Our Predictions')
scatter_ax.legend()
scatter_ax.grid(True)
print("\nPreparing comparison graph of actual and predicted values...")
plt.show()

# --- Figure 2: distribution of the prediction errors -----------------------
# Error = Actual Value - Predicted Value
residuals = y_test - y_pred_dt
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
# Histogram (frequency graph) of the errors; kde=True adds the density curve.
sns.histplot(residuals, kde=True, bins=30, ax=hist_ax)
hist_ax.set_xlabel('Error Amount (Actual Value - Predicted Value)')
hist_ax.set_ylabel('How Many Times This Error Occurred (Frequency)')
hist_ax.set_title('Distribution of Our Prediction Errors (Residuals)')
# Reference line at zero: ideally the errors are centred around it.
hist_ax.axvline(0, color='red', linestyle='--', lw=1.5)
hist_ax.grid(True)
print("\nPreparing distribution graph of prediction errors...")
plt.show()
# Ideally, we expect errors to show a normal distribution around 0.

In [None]:
# Part 9: Can We Improve Our Model a Bit More? (Simple Optimization)
# Tries several 'max_depth' values and keeps the best one. The depth is chosen on
# a validation set carved out of the training data, NOT on the test set: picking
# a setting by its test score would make the reported test R2 optimistically
# biased. The test set is used once at the end, for the chosen depth only.
print("\n--- 9. Could We Improve the Model by Changing Its Settings? (max_depth Optimization) ---")

depths_to_try = [3, 5, 8, 10, 12, 15, 20] # Different depths we will try

# Hold out 25% of the training rows as a validation set for comparing depths.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

best_val_r2 = -float('inf') # Best validation R2 seen so far (start below any possible score)
best_depth_found = None # We will write the best depth here
optimization_results = [] # One record per tried depth, used for the plot below

print("\nR2 scores (on the validation set) for different 'max_depth' values:")
for depth in depths_to_try:
    # Same settings as the main model, except for 'max_depth'
    temp_model = DecisionTreeRegressor(max_depth=depth, random_state=42, min_samples_split=10, min_samples_leaf=5)
    temp_model.fit(X_fit, y_fit) # Train on the reduced training set
    y_pred_temp = temp_model.predict(X_val) # Predict the validation rows
    r2_temp = r2_score(y_val, y_pred_temp) # Validation R2 for this depth
    optimization_results.append({'depth': depth, 'r2_score': r2_temp})
    print(f"R2 Score when max_depth = {depth} = {r2_temp:.4f}")
    # If this R2 score is better than any so far, let's save it as the best
    if r2_temp > best_val_r2:
        best_val_r2 = r2_temp
        best_depth_found = depth

# Refit with the chosen depth on the full training set and score it once on
# the untouched test set; this is the figure reported as the model's quality.
best_model = DecisionTreeRegressor(max_depth=best_depth_found, random_state=42, min_samples_split=10, min_samples_leaf=5)
best_model.fit(X_train, y_train)
best_r2_so_far = r2_score(y_test, best_model.predict(X_test))

print(f"\nBest performing max_depth: {best_depth_found} (validation R2: {best_val_r2:.4f}, test R2 after refitting on the full training set: {best_r2_so_far:.4f})")

# Let's show the optimization results in a graph
results_df_for_plot = pd.DataFrame(optimization_results)
plt.figure(figsize=(10,6))
sns.lineplot(x='depth', y='r2_score', data=results_df_for_plot, marker='o', linestyle='-')
plt.title('Effect of Different Maximum Depth (max_depth) Values on R2 Score')
plt.xlabel('Maximum Depth (max_depth)')
plt.ylabel('R2 Score (On Validation Set)')
plt.xticks(depths_to_try) # Let's show all tried depths on the X-axis
plt.grid(True, which="both", ls="--")
print("\nPreparing max_depth optimization graph...")
plt.show()

In [None]:
# Part 10: Summarize what was learned and list follow-up ideas.
# The R2 score is shown as a percentage of explained variation.
explained_pct = best_r2_so_far * 100
summary_lines = [
    "\n--- 10. What We Learned and What's Next? ---",
    f"* Our model, with its best setting (max_depth={best_depth_found}), could explain approximately {explained_pct:.1f}% of the variation in CO2 emissions. (R2 Score: {best_r2_so_far:.3f})",
    "* We saw which information (features) was more useful in predicting CO2 emissions.",
    "* Thanks to the graphs we plotted, we better understood our model's predictions and errors.",
    "\nWhat can be done next (Ideas):",
    "  * Maybe we can include text information like 'Entity' (country name) in the model somehow (e.g., One-Hot Encoding).",
]
print("\n".join(summary_lines))