In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense, Dropout

from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns

import os
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and numerical
# warnings raised by the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

## Data Loading

In [None]:
# Competition input directory and the directory the submission is written to.
input_path = '/kaggle/input/ml-olympiad-co2-emissions-prediction-challenge'
output_path = '/kaggle/working/'

# Read the three competition CSVs in one pass; the order of the suffixes
# matches the order of the names on the left-hand side.
Train, Test, sample_submission = (
    pd.read_csv(input_path + suffix)
    for suffix in ('/train.csv', '/test.csv', '/sample_submission.csv')
)

In [None]:
# Row counts of the raw train and test frames.
print(len(Train), len(Test))

## Data Formatting

### Invalid Data Handling

In [None]:
# Build cleaned working frames while leaving the raw Train/Test frames intact:
# `replace` returns a new frame in which every ".." cell has become 0.
# NOTE(review): ".." presumably marks a missing value; zero-filling it will
# influence the per-country scaling done later — confirm this is intended.
train_df = Train.replace("..", 0)
test_df = Test.replace("..", 0)

## Data Preprocessing

In [None]:
# Distinct country names, taken from the raw training frame; every later
# per-country loop iterates over this array.
unique_countries = Train['Country Name'].unique()
# Distinct indicator names from the cleaned training frame.
# NOTE(review): not referenced again in the visible cells.
unique_indicators = train_df['Indicator'].unique()

In [None]:
def preprocess_country_data(input_df, country_name, input_dict, drop_positions=(5, 6, 7)):
    """Reshape one country's rows into a numeric frame and store it.

    The rows of ``input_df`` whose 'Country Name' equals ``country_name`` are
    selected, the 'Country Code' and 'Country Name' columns are removed when
    present, and the remainder is transposed: each original row becomes a
    column and each remaining original column becomes a row. The first
    remaining column (the indicator names in this notebook) supplies the new
    column labels and is then removed from the rows. The index is reset to
    0..n-1 and all values are cast to float.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Table containing at least a 'Country Name' column.
    country_name : str
        Country whose rows are extracted.
    input_dict : dict
        Mutated in place: the processed frame is stored under
        ``country_name``.
    drop_positions : sequence of int, default (5, 6, 7)
        Zero-based positions of the columns to discard after transposing.
        Columns are removed strictly by position, so a duplicated column
        label never causes additional columns to disappear.

    Returns
    -------
    pandas.DataFrame
        The same frame that was stored in ``input_dict[country_name]``.

    Raises
    ------
    IndexError
        If a position in ``drop_positions`` is outside the column range
        (for example when the country has too few rows).
    ValueError
        If a remaining cell cannot be converted to float.
    """
    # Rows belonging to the requested country, without the identifier columns.
    country_rows = input_df[input_df['Country Name'] == country_name].reset_index(drop=True)
    country_rows = country_rows.drop(['Country Code', 'Country Name'], axis=1, errors='ignore')

    # After transposing, the first row holds what used to be the first
    # column; promote it to column labels and remove it from the data.
    wide = country_rows.transpose()
    wide.columns = wide.iloc[0].values
    wide = wide.drop(wide.index[0])

    # Replace the original column names in the index with 0..n-1.
    wide.index = range(len(wide))
    wide = wide.astype(float)

    # Positional drop via a boolean mask (label-based dropping would also
    # remove any other column sharing a label with a dropped one).
    keep_mask = np.ones(wide.shape[1], dtype=bool)
    keep_mask[np.asarray(list(drop_positions), dtype=int)] = False
    wide = wide.loc[:, keep_mask]

    input_dict[country_name] = wide
    return wide

In [None]:
# Per-country training frames keyed by country name; preprocess_country_data
# fills the dict in place.
country_df_dict = {}

for country in tqdm(unique_countries, desc="Processing country data"):
    preprocess_country_data(train_df, country, country_df_dict)

In [None]:
# Spot-check the processed training frame of one country.
country_df = country_df_dict['Bangladesh']
country_df

### Normalization

In [None]:
# Standardise every country's frame with its own freshly fitted scaler and
# keep the transformed values keyed by country name.
country_df_dict_scaled = {}

for country in tqdm(unique_countries, desc="Scaling country data"):
    # `scaler` stays bound after the loop; at that point it is the scaler
    # fitted on the last country processed.
    scaler = StandardScaler()
    country_df_dict_scaled[country] = scaler.fit_transform(country_df_dict[country])


In [None]:
# Spot-check the scaler output for one country, wrapped in a DataFrame for
# printing.
country_df = country_df_dict_scaled['Bangladesh']
print(pd.DataFrame(country_df))

## Model Training

### Model Architecture

In [None]:
# Windowing and model-size settings used by the cells below.
n_future = 1   # Offset (in rows) from the end of an input window to its target row; also the Dense output width
n_past = 5  # Number of consecutive past rows (years) in each input window
n_predict = 5 # Number of trailing training windows passed to model.predict for each country
# Feature count per time step in the LSTM input shape.
# NOTE(review): hard-coded; presumably must equal the number of columns left
# by preprocess_country_data — confirm.
indicators_count = 9

In [None]:
# Two stacked LSTM layers, dropout, then a linear head that emits `n_future`
# values for each input window of shape (n_past, indicators_count).
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(n_past, indicators_count), return_sequences=True))
model.add(LSTM(32, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(n_future))

# Regression set-up: Adam optimiser with mean-squared-error loss.
model.compile(optimizer='adam', loss='mse')
model.summary()

### Training Data

In [None]:
# Scaled-space predictions per country, keyed by country name.
forecast_dict = {}

for country in tqdm(unique_countries, desc="Country data"):
    scaled = country_df_dict_scaled[country]

    # Sliding windows: each sample is `n_past` consecutive rows with every
    # column; its target is the last column of the row `n_future` steps after
    # the window starts to end (row i + n_future - 1 for a window ending at i - 1).
    target_rows = range(n_past, len(scaled) - n_future + 1)
    trainX = np.array([scaled[i - n_past:i, 0:scaled.shape[1]] for i in target_rows])
    trainY = np.array([scaled[i + n_future - 1:i + n_future, -1] for i in target_rows])

    print('trainX shape == {}.'.format(trainX.shape))
    print('trainY shape == {}.'.format(trainY.shape))

    # The one shared `model` keeps training from country to country, so its
    # weights carry over between iterations.
    model.fit(trainX, trainY, epochs=30, batch_size=16, validation_split=0.1, verbose=1)

    # NOTE(review): the inputs here are the last `n_predict` *training*
    # windows, so these are predictions for rows already in the training
    # frame rather than for unseen future rows — confirm this is intended.
    forecast_dict[country] = model.predict(trainX[-n_predict:])

In [None]:
# Inspect the (still scaled) predictions for one country.
print(forecast_dict['Afghanistan'])

### Inverse Scaling

In [None]:
# Map the scaled predictions back to original units, country by country.
og_forecast_dict = {}

for country in unique_countries:
    forecast = forecast_dict[country]
    country_frame = country_df_dict[country]

    # Every country was standardised with its own statistics, so the inverse
    # transform must use a scaler fitted on that same country's frame; a
    # scaler left over from another country would apply the wrong mean/std.
    # Re-fitting is deterministic and reproduces those statistics exactly.
    country_scaler = StandardScaler().fit(country_frame)

    # inverse_transform needs one column per fitted feature: tile the single
    # prediction column across all features and read back the last column,
    # which is the one used as the training target.
    forecast_copies = np.repeat(forecast, country_frame.shape[1], axis=-1)
    og_forecast_dict[country] = country_scaler.inverse_transform(forecast_copies)[:, -1]

In [None]:
# Inspect the inverse-scaled predictions for one country.
print(og_forecast_dict['Bangladesh'])

## Test Data Processing

### Add Predicted CO2 to Test Data

In [None]:
# Start again from the raw test frame and apply the same ".." -> 0 cleaning.
test_df = Test.replace("..", 0)

CO2_INDICATOR = 'CO2 emissions (metric tons per capita)'
FORECAST_YEARS = range(2016, 2021)  # Adjust the range according to your needs

# Countries that already carry a CO2 row in the test data. Built once up
# front instead of filtering the whole frame again for every country.
countries_with_co2 = set(
    test_df.loc[test_df['Indicator'] == CO2_INDICATOR, 'Country Name'].dropna()
)

# One new CO2 row per country that lacks one, holding the predicted values.
new_rows = []
for country in unique_countries:
    if country in countries_with_co2:
        continue  # Row already exists; do not add a second one

    forecasted_co2 = og_forecast_dict[country]

    new_row = {'Country Name': country, 'Indicator': CO2_INDICATOR}
    for i, year in enumerate(FORECAST_YEARS):
        # Year columns are filled with the i-th predicted value.
        new_row[f"{year} [YR{year}]"] = forecasted_co2[i]
    new_rows.append(new_row)

# Build the frame of new rows once and append it to the test frame.
new_rows_df = pd.DataFrame(new_rows)
test_df = pd.concat([test_df, new_rows_df], ignore_index=True)

In [None]:
# Spot-check one country's rows in the augmented test frame.
country_df = test_df[test_df['Country Name'] == 'Albania']
country_df

### Format Test Data

In [None]:
# Per-country test frames keyed by country name, produced by the same
# preprocessing function that was applied to the training data.
test_country_df_dict = {}

for country in tqdm(unique_countries, desc="Processing country data"):
    preprocess_country_data(test_df, country, test_country_df_dict)

In [None]:
# Spot-check the processed test frame of one country.
test_country_df_dict['Albania']

### Combine Test and Train

In [None]:
# For every country, stack the training rows on top of the test rows and
# renumber the combined rows 0..n-1.
# NOTE(review): pd.concat aligns on column labels, so this assumes both
# frames expose the same column names — confirm.
final_dict = {
    country: pd.concat(
        [country_df_dict[country], test_country_df_dict[country]],
        ignore_index=True,
    )
    for country in tqdm(unique_countries, desc="Combining country data")
}

In [None]:
# Spot-check the combined train + test frame of one country.
final_dict['Afghanistan']

### Normalization

In [None]:
# Standardise every combined frame with its own freshly fitted scaler and
# keep the transformed values keyed by country name.
final_dict_scaled = {}

for country in tqdm(unique_countries, desc="Scaling country data"):
    # `scaler` stays bound after the loop; at that point it is the scaler
    # fitted on the last country processed.
    scaler = StandardScaler()
    final_dict_scaled[country] = scaler.fit_transform(final_dict[country])

In [None]:
# Spot-check the scaled combined values for one country.
pd.DataFrame(final_dict_scaled['Albania'])

### Model Architecture

In [None]:
# Settings for the second pass over the combined data; only n_predict differs
# from the first pass.
n_future = 1   # Offset (in rows) from the end of an input window to its target row; also the Dense output width
n_past = 5  # Number of consecutive past rows (years) in each input window
n_predict = 10 # Number of trailing training windows passed to model.predict for each country
# Feature count per time step in the LSTM input shape.
# NOTE(review): hard-coded; presumably must equal the number of columns left
# by preprocess_country_data — confirm.
indicators_count = 9

In [None]:
# Rebuild the network from scratch (same layout as the first pass), so the
# second training run starts from newly initialised weights.
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(n_past, indicators_count), return_sequences=True))
model.add(LSTM(32, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(n_future))

# Regression set-up: Adam optimiser with mean-squared-error loss.
model.compile(optimizer='adam', loss='mse')
model.summary()

### Model Training

In [None]:
# Scaled-space predictions from the second pass, keyed by country name.
final_forecast_dict = {}

for country in tqdm(unique_countries, desc="Country data"):
    scaled = final_dict_scaled[country]

    # Sliding windows over the combined frame: each sample is `n_past`
    # consecutive rows with every column; its target is the last column of
    # row i + n_future - 1 for a window ending at row i - 1.
    target_rows = range(n_past, len(scaled) - n_future + 1)
    trainX = np.array([scaled[i - n_past:i, 0:scaled.shape[1]] for i in target_rows])
    trainY = np.array([scaled[i + n_future - 1:i + n_future, -1] for i in target_rows])

    print('trainX shape == {}.'.format(trainX.shape))
    print('trainY shape == {}.'.format(trainY.shape))

    # The one shared `model` keeps training from country to country, so its
    # weights carry over between iterations.
    model.fit(trainX, trainY, epochs=30, batch_size=16, validation_split=0.1, verbose=1)

    # NOTE(review): the inputs here are the last `n_predict` *training*
    # windows, so these are predictions for rows already in the combined
    # frame rather than for unseen future rows — confirm this is intended.
    final_forecast_dict[country] = model.predict(trainX[-n_predict:])

In [None]:
# Inspect the (still scaled) second-pass predictions for one country.
print(final_forecast_dict['Bangladesh'])

### Inverse Scaling

In [None]:
# Map the second-pass scaled predictions back to original units, country by
# country.
og_final_forecast_dict = {}

for country in unique_countries:
    final_forecast = final_forecast_dict[country]
    final_country_frame = final_dict[country]

    # Every combined frame was standardised with its own statistics, so the
    # inverse transform must use a scaler fitted on that same country's
    # frame; a scaler left over from another country would apply the wrong
    # mean/std. Re-fitting is deterministic and reproduces those statistics.
    country_scaler = StandardScaler().fit(final_country_frame)

    # inverse_transform needs one column per fitted feature: tile the single
    # prediction column across all features and read back the last column,
    # which is the one used as the training target.
    final_forecast_copies = np.repeat(final_forecast, final_country_frame.shape[1], axis=-1)
    og_final_forecast_dict[country] = country_scaler.inverse_transform(final_forecast_copies)[:, -1]

In [None]:
# Inspect the inverse-scaled second-pass predictions for one country.
og_final_forecast_dict['Bangladesh']

## Arrange Output for Submission

In [None]:
# One submission row per country: the five first-pass predictions fill the
# 2016-2020 columns, and the last second-pass prediction fills the 2030 column.
submission_years = range(2016, 2021)
target_year = 2030

new_rows = []
for country in unique_countries:
    first_pass = og_forecast_dict[country]
    last_second_pass = og_final_forecast_dict[country][-1]

    row = {'Country Name': country}
    row.update(
        {f"{yr} [YR{yr}]": first_pass[pos] for pos, yr in enumerate(submission_years)}
    )
    row[f"{target_year} [YR{target_year}]"] = last_second_pass

    new_rows.append(row)

submission_df = pd.DataFrame(new_rows)
submission_df

In [None]:
# Export the DataFrame to a CSV file. os.path.join inserts exactly one
# separator, whether or not output_path already ends with a slash, so the
# resulting path never contains a doubled '/'.
submission_df.to_csv(os.path.join(output_path, 'submission.csv'), index=False)