# 1. Model Training

This notebook is the single source of truth for training the sea level prediction models. It performs the following steps:
1. Loads and preprocesses the historical greenhouse gas (GHG) emissions and sea level data.
2. Splits the data into training, validation, and test sets using a **chronological** split, which is appropriate for time-series data.
3. Initializes two models: a baseline linear model and a 2-hidden-layer non-linear model.
4. Trains both models on the training data.
5. Saves the trained model objects to the `../models/` directory for later use in analysis and prediction.

### 1.1 Setup and Data Preparation

In [None]:
import pandas as pd
import numpy as np
import sys
import os

# Add src directory to path to import neural_networks module
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
from neural_networks import NeuralNetwork_0hl, NeuralNetwork_2hl

# Set random seed for reproducibility
# (seeds NumPy's global RNG only; whether the model classes draw from it is not visible here — TODO confirm)
np.random.seed(42)

# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# Data Loading
# --- Scenario emissions from the AR6 workbook ("Data" sheet) ---
df_pred_raw = pd.read_excel("../data/AR6-SYR-LR-F2-5-Panel(a).xlsx", sheet_name="Data")
# Drop the listed row labels; the rows that remain are presumably the past-emissions row
# plus the four scenario rows renamed below — verify against the workbook layout.
df_pred = df_pred_raw.drop([1,2,3,4,6,7,8,10,11,13,14,16])
df_pred = df_pred.drop(['Unnamed: 1',2019], axis=1)
# Capture the first remaining row (column positions 1..6) BEFORE reshaping; elements 2:5 of
# this array are later written into years 2012-2014 of the historical series.
# NOTE(review): positions 1..6 presumably correspond to years 2010-2015 — confirm.
GHG_past_raw = df_pred.iloc[[0]].values[0,1:7]
# Re-orient the table: the label column becomes the index, then transpose so that
# rows are years and columns are the individual series.
df_pred = df_pred.rename(columns={'spm_cat (year)': 'Year'})
df_pred = df_pred.set_index('Year')
df_pred = df_pred.transpose()
# The past-emissions series is not a scenario; remove it from the scenario table.
df_pred = df_pred.drop('Past GHG emissions (Black line) ', axis=1)
# Shorten the verbose scenario column names (keys must match the sheet's labels exactly,
# including their irregular spacing).
df_pred = df_pred.rename(columns={'Trend from implemented policies (Lowest bound of  red shading ) ': 'Trend from implemented policies','Limit warming to 2°C (>67%) or return warming to 1.5°C (>50%) after a high overshoot, NDCs until 2030 (Median , dark navy blue line )': 'Limit warming to 2°C or return warming to 1.5°C after a high overshoot', 'Limit warming to 2°C (>67%) (Median , dark green line )': 'Limit warming to 2°C', 'Limit warming to 1.5°C (>50%) with no or limited overshoot ( Median ligh blue line ) ': 'Limit warming to 1.5°C'})
df_pred = df_pred.drop([2010,2011,2012,2013,2014], axis=0)
# Fill the gaps between consecutive rows by linear interpolation: for every index year
# except the last, rows year+1 .. year+4 are interpolated between year and year+5.
# This relies on the remaining index being spaced exactly 5 years apart.
for year in df_pred.index[:-1]:
    diff = df_pred.loc[year+5] - df_pred.loc[year]
    for i in range(4):
        df_pred.loc[year+i+1] = df_pred.loc[year] + (i+1)*diff/5
# The interpolated rows were appended at the end; restore chronological order.
df_pred = df_pred.sort_index()
# --- Historical total GHG emissions (Our World in Data, fetched over the network) ---
df_past_GHG2 = pd.read_csv("https://ourworldindata.org/grapher/total-ghg-emissions.csv?v=1&csvType=full&useColumnShortNames=true", storage_options = {'User-Agent': 'Our World In Data data fetch/1.0'})
# Keep only the 'World' entity and index the single remaining value column by year.
df_past_GHG2 = df_past_GHG2.loc[df_past_GHG2['Entity'] == 'World']
df_past_GHG2 = df_past_GHG2.drop(['Entity','Code'], axis=1)
df_past_GHG2 = df_past_GHG2.set_index('Year')
# Scale by 1e-9 (presumably tonnes -> gigatonnes, to match the workbook's units — TODO confirm).
df_past_GHG2.annual_emissions_ghg_total_co2eq *= 10**(-9)
GHG_past_comb = df_past_GHG2.copy()
# Overwrite 2012-2014 with the values taken from the workbook's first row (elements 2:5).
GHG_past_comb.loc[[2012, 2013, 2014], 'annual_emissions_ghg_total_co2eq'] = [float(emi) for emi in GHG_past_raw[2:5]]
# Remove years 2015-2023 (np.arange excludes the stop value 2024), so the series ends in 2014.
del_years = np.arange(2015,2024)
GHG_past_comb = GHG_past_comb.drop(del_years)
# --- Sea level observations (Our World in Data, fetched over the network) ---
df_sealevel = pd.read_csv("https://ourworldindata.org/grapher/sea-level.csv?v=1&csvType=full&useColumnShortNames=true", storage_options = {'User-Agent': 'Our World In Data data fetch/1.0'})
# Discard identifier columns and two of the sea-level series, then drop rows with missing values.
df_sealevel = df_sealevel.drop(['Entity','Code','sea_level_church_and_white_2011','sea_level_average'], axis=1)
df_sealevel = df_sealevel.dropna()
# Convert the 'Day' values to datetime64 so the calendar year can be extracted.
df_sealevel['Day'] = [np.datetime64(day) for day in df_sealevel['Day']]
# Average all observations within each calendar year; the result is indexed by year.
df_sealevel = df_sealevel.groupby(df_sealevel.Day.dt.year).mean()
# Remove the 'Day' column if the aggregation kept one (errors='ignore' tolerates its absence).
df_sealevel = df_sealevel.drop('Day', axis=1, errors='ignore')

# Normalization
# Z-score every column: subtract the column mean, then divide by the column standard deviation.
# NOTE(review): both statistics are computed over all available years, including the years
# that are later assigned to the validation and test splits — confirm this is intended.
GHG_past_norm = GHG_past_comb.sub(GHG_past_comb.mean()).div(GHG_past_comb.std())
sealevel_norm = df_sealevel.sub(df_sealevel.mean()).div(df_sealevel.std())

# Sequence and Splitting 
def get_GHG_sequence(n_years, df_GHG, start_year, end_year, df_target=None):
    """Build sliding-window samples that pair past emissions with a target-year value.

    For every target year ``t`` in ``[start_year, end_year]`` (inclusive) the input
    window consists of the ``n_years`` rows of ``df_GHG`` labelled ``t - n_years`` to
    ``t - 1``; the target is the row of ``df_target`` labelled ``t``.

    Parameters
    ----------
    n_years : int
        Number of consecutive years in each input window (must be >= 1).
    df_GHG : pandas.DataFrame or pandas.Series
        Emissions indexed by year.
    start_year, end_year : int
        First and last target year (both inclusive).
    df_target : pandas.DataFrame or pandas.Series, optional
        Target values indexed by year. Defaults to the notebook-level
        ``sealevel_norm`` so existing four-argument calls keep working.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with one window per target year and ``y`` with the matching targets.

    Raises
    ------
    ValueError
        If ``n_years`` is smaller than 1, or if an input window does not contain
        exactly ``n_years`` rows (e.g. the emissions series starts too late).
    KeyError
        If a target year is missing from ``df_target``.
    """
    if n_years < 1:
        raise ValueError(f"n_years must be >= 1, got {n_years}")
    if df_target is None:
        # Backward-compatible default: the globally defined normalised sea-level data.
        df_target = sealevel_norm

    X, y = [], []
    for target_year in range(start_year, end_year + 1):
        # The window ends the year before the target, so the target year's own
        # emissions are never part of the input.
        last_input_year = target_year - 1
        first_input_year = last_input_year - n_years + 1
        # Label-based slicing is inclusive on both ends.
        window = df_GHG.loc[first_input_year:last_input_year]
        if len(window) != n_years:
            # Fail early with a clear message instead of building ragged samples.
            raise ValueError(
                f"Input window {first_input_year}-{last_input_year} for target year "
                f"{target_year} has {len(window)} rows, expected {n_years}"
            )
        X.append(window.to_numpy())
        y.append(df_target.loc[target_year].values)
    return np.array(X), np.array(y)

# Length of each emissions window (in years) and the last target year of every split.
timespan = 15
train_end_year, validation_end_year, test_end_year = 2000, 2007, 2014

# Consecutive target-year ranges keep the split chronological:
# training targets start in 1970, and each later split begins the year after the previous one ends.
split_ranges = {
    'train': (1970, train_end_year),
    'val': (train_end_year + 1, validation_end_year),
    'test': (validation_end_year + 1, test_end_year),
}
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    get_GHG_sequence(timespan, GHG_past_norm, first_year, last_year)
    for first_year, last_year in split_ranges.values()
)

# Report how many samples ended up in each split.
for label, features in (('Training', X_train), ('Validation', X_val), ('Test', X_test)):
    print(f'{label} set size: {len(features)}')

### 1.2 Train Baseline Model (Linear)

In [None]:
print('Training Baseline Model')
# Baseline network (described in this notebook as the linear model): one input per year of the window.
nn_base = NeuralNetwork_0hl(input_size=timespan, output_size=1)
# Flatten every sample to a 1-D feature vector while always keeping the sample axis.
# A bare np.squeeze would drop *every* length-1 axis, including the sample axis
# if the training set ever contained a single sample.
X_train_flat = X_train.reshape(len(X_train), -1)
# The return value is kept for later inspection (presumably the training MSE history — TODO confirm).
mse_base_train = nn_base.train(X_train_flat, y_train, epochs=20000, learningrate=0.001, print_output=False)
print('Training complete.')

# Save the model
nn_base.save_model('../models/baseline_model.pkl')
print('Baseline model saved to ../models/baseline_model.pkl')

### 1.3 Train 2-Hidden-Layer Model (Non-Linear)

In [None]:
print('- Training 2-Hidden-Layer Model ---')
# Non-linear network with two hidden layers (8 and 4 units) and one input per year of the window.
nn_2hl = NeuralNetwork_2hl(input_size=timespan, hidden_size1=8, hidden_size2=4, output_size=1)
# Flatten every sample to a 1-D feature vector while always keeping the sample axis.
# A bare np.squeeze would drop *every* length-1 axis, including the sample axis
# if the training set ever contained a single sample.
X_train_flat = X_train.reshape(len(X_train), -1)
# The return value is kept for later inspection (presumably the training MSE history — TODO confirm).
mse_2hl_train = nn_2hl.train(X_train_flat, y_train, epochs=20000, learningrate=0.001, print_output=False)
print('Training complete.')

# Save the model
nn_2hl.save_model('../models/2hl_model.pkl')
print('2-layer model saved to ../models/2hl_model.pkl')