# Ensemble Learning Project


In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, PowerTransformer
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
# Reading the datasets
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test_final = pd.read_csv('X_test_final.csv')

In [4]:
france_dataset = pd.read_csv('france_dataset.csv')
germany_dataset = pd.read_csv('germany_dataset.csv')
training_dataset = pd.read_csv('training_dataset.csv')

In [5]:
print(training_dataset['COUNTRY'].value_counts())
print(training_dataset['COUNTRY'].value_counts(1))

FR    851
DE    643
Name: COUNTRY, dtype: int64
FR    0.569612
DE    0.430388
Name: COUNTRY, dtype: float64


# Modeling

In [6]:
germany_dataset.head()

Unnamed: 0,ID,DAY_ID,GAS_RET,COAL_RET,CARBON_RET,DE_TEMP,DE_WIND,DE_RAIN,DE_GAS,DE_COAL,DE_HYDRO,DE_NUCLEAR,DE_SOLAR,DE_WINDPOW,DE_LIGNITE,DE_CONSUMPTION,DE_RESIDUAL_LOAD,DE_NET_IMPORT,DE_NET_EXPORT,DE_FR_EXCHANGE,FR_DE_EXCHANGE,TARGET
0,297,720,0.911652,-0.296168,1.073948,0.350938,-0.499409,-1.114838,0.487818,-1.473817,-0.368417,-0.205547,1.751523,-0.01009,-2.330557,-0.983324,-1.191889,0.27087,-0.27087,-0.839586,0.839586,-0.260356
1,819,116,-0.359866,-0.203952,-0.376234,1.595158,1.143607,1.608304,0.882313,-0.042992,1.282374,-2.509255,1.172155,-0.103994,-0.35448,-0.055692,-0.178397,0.851082,-0.851082,0.237105,-0.237105,-0.133381
2,918,406,1.17076,0.133643,0.033874,1.241892,1.383171,-0.194927,-0.18843,-0.354327,-0.168264,-1.879712,-0.324789,1.448078,-0.237658,0.532116,-0.795593,0.173123,-0.173123,0.339942,-0.339942,0.196312
3,283,1175,0.122818,0.220077,5.453331,-0.634046,-0.138918,0.974454,0.544008,-1.183566,0.52361,-0.646234,0.509242,0.35921,-2.244028,-0.328286,-0.676137,1.046122,-1.046122,-1.380464,1.380464,-0.025477
4,158,309,0.689483,1.095473,0.342798,-0.667496,-1.106067,0.194458,2.170761,1.831623,1.223032,0.125765,-1.234093,-1.210165,0.892261,1.028987,2.156285,0.391261,-0.391261,1.129663,-1.129663,0.460278


In [7]:
france_dataset.head()

Unnamed: 0,ID,DAY_ID,GAS_RET,COAL_RET,CARBON_RET,FR_TEMP,FR_WIND,FR_RAIN,FR_GAS,FR_COAL,FR_HYDRO,FR_NUCLEAR,FR_SOLAR,FR_WINDPOW,FR_CONSUMPTION,FR_RESIDUAL_LOAD,FR_NET_IMPORT,FR_NET_EXPORT,DE_FR_EXCHANGE,FR_DE_EXCHANGE,TARGET
0,1054,206,0.339041,0.124552,-0.002445,-0.063404,-0.28316,-0.556356,-0.213766,0.288782,0.207838,-0.190463,1.248911,-0.26946,-0.427458,-0.444661,-0.69286,0.69286,-0.606523,0.606523,0.028313
1,2049,501,-0.659091,0.047114,-0.490365,1.831241,0.828412,-0.770457,0.42694,-0.762153,-0.807112,-2.185961,3.23738,-0.10735,-1.003452,-1.183194,1.130838,-1.130838,-0.022063,0.022063,-0.112516
2,1924,687,0.535974,0.743338,0.204952,0.114836,0.487608,-0.313338,2.122241,0.777053,0.779142,0.735137,-0.371039,-0.141239,1.978665,1.947273,1.682587,-1.682587,1.021305,-1.021305,-0.18084
3,1101,818,0.245109,1.526606,2.614378,0.729495,-1.01156,-0.42455,-0.240862,-0.274975,-0.795983,0.176935,0.723587,-0.564498,-0.617038,-0.526267,-0.990324,0.990324,-0.92499,0.92499,-0.071733
4,1520,467,0.891049,0.861408,1.124457,0.472708,-0.917234,-0.193837,0.306422,-0.775944,0.593251,-1.920695,2.054491,-0.245628,-0.76512,-0.860628,0.200305,-0.200305,-0.71749,0.71749,0.932105


## Baseline Model

In [20]:
#Create X_train and y_train for France
training_set_x_fr = france_dataset.copy()
training_set_x_fr = training_set_x_fr.sort_values(by='DAY_ID', ascending=True)
training_set_x_fr = training_set_x_fr.set_index('ID')
training_set_y_fr = training_set_x_fr['TARGET']
training_set_x_fr = training_set_x_fr.drop(['TARGET'], axis=1)

In [22]:
#Create X_train and y_train for Germany
training_set_x_de = germany_dataset.copy()
training_set_x_de = training_set_x_de.sort_values(by='DAY_ID', ascending=True)
training_set_x_de = training_set_x_de.set_index('ID')
training_set_y_de = training_set_x_de['TARGET']
training_set_x_de = training_set_x_de.drop(['TARGET'], axis=1)

In [29]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import spearmanr
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

In [28]:
# Define the number of splits for the kfold validation
num_splits = 5

# Define the random seed for the algorithms
random_seed = 42

# Define regression models
regression_models = {
    'Decision Tree': DecisionTreeRegressor(random_state=random_seed),
    'ExtraTrees Regressor': ExtraTreesRegressor(random_state=random_seed),
    'AdaBoost Regressor': AdaBoostRegressor(random_state=random_seed),
    'Linear Regression': LinearRegression(),
    'Random Forest Regression': RandomForestRegressor(random_state=random_seed),
    'XGBoost': XGBRegressor(random_state=random_seed),
    'LightGBM': LGBMRegressor(random_state=random_seed)
}

# Define the scoring functions
scoring = {
    'RMSE': make_scorer(mean_squared_error, squared=False),
    'Spearman Correlation': make_scorer(lambda y, y_pred: spearmanr(y, y_pred)[0]),
    'MAE': make_scorer(mean_absolute_error)
}

# Evaluate regression models
regression_results = {}
for model_name, model in regression_models.items():
    print(f"Evaluating {model_name} for regression...")

    # Perform cross-validation
    scores = cross_validate(model, training_set_x_fr, training_set_y_fr, cv=kfold, scoring=scoring)

    # Extract the results for each fold
    rmse_scores = scores['test_RMSE']
    spearman_scores = scores['test_Spearman Correlation']
    mae_scores = scores['test_MAE']

    regression_results[model_name] = {'RMSE': rmse_scores, 'Spearman Correlation': spearman_scores, 'MAE': mae_scores}

    for fold, (rmse, spearman, mae) in enumerate(zip(rmse_scores, spearman_scores, mae_scores), start=1):
        print(f"  Fold {fold}: RMSE: {rmse:.4f}, Spearman's Correlation: {spearman:.4f}, MAE: {mae:.4f}")

    print(f"Mean RMSE: {np.mean(rmse_scores):.4f}, Mean Spearman's Correlation: {np.mean(spearman_scores):.4f}, Mean MAE: {np.mean(mae_scores):.4f}")
    print("\n")

# Print the final results for regression
print("\nResults Summary for Regression:")
for model_name, scores in regression_results.items():
    print(f"{model_name}: Mean RMSE: {np.mean(scores['RMSE']):.4f}, Mean Spearman Correlation: {np.mean(scores['Spearman Correlation']):.4f}, Mean MAE: {np.mean(scores['MAE']):.4f}")

Evaluating Decision Tree for regression...
  Fold 1: RMSE: 1.5302, Spearman's Correlation: 0.1277, MAE: 0.8852
  Fold 2: RMSE: 1.5614, Spearman's Correlation: 0.0866, MAE: 0.9215
  Fold 3: RMSE: 1.4122, Spearman's Correlation: -0.0525, MAE: 0.8472
  Fold 4: RMSE: 1.4250, Spearman's Correlation: -0.0076, MAE: 0.8115
  Fold 5: RMSE: 1.5639, Spearman's Correlation: 0.1766, MAE: 0.9034
Mean RMSE: 1.4985, Mean Spearman's Correlation: 0.0662, Mean MAE: 0.8738


Evaluating ExtraTrees Regressor for regression...
  Fold 1: RMSE: 0.9422, Spearman's Correlation: 0.1108, MAE: 0.5680
  Fold 2: RMSE: 1.0146, Spearman's Correlation: 0.0439, MAE: 0.5869
  Fold 3: RMSE: 1.0575, Spearman's Correlation: 0.0472, MAE: 0.5981
  Fold 4: RMSE: 1.0354, Spearman's Correlation: 0.0999, MAE: 0.5587
  Fold 5: RMSE: 1.3064, Spearman's Correlation: 0.1614, MAE: 0.6522
Mean RMSE: 1.0712, Mean Spearman's Correlation: 0.0927, Mean MAE: 0.5928


Evaluating AdaBoost Regressor for regression...
  Fold 1: RMSE: 0.9732, Spe

In [30]:
# Define the number of splits for the kfold validation
num_splits = 5

# Define the random seed for the algorithms
random_seed = 42

# Define regression models
regression_models = {
    'Decision Tree': DecisionTreeRegressor(random_state=random_seed),
    'ExtraTrees Regressor': ExtraTreesRegressor(random_state=random_seed),
    'AdaBoost Regressor': AdaBoostRegressor(random_state=random_seed),
    'Linear Regression': LinearRegression(),
    'Random Forest Regression': RandomForestRegressor(random_state=random_seed),
    'XGBoost': XGBRegressor(random_state=random_seed),
    'LightGBM': LGBMRegressor(random_state=random_seed)
}

# Define the scoring functions
scoring = {
    'RMSE': make_scorer(mean_squared_error, squared=False),
    'Spearman Correlation': make_scorer(lambda y, y_pred: spearmanr(y, y_pred)[0]),
    'MAE': make_scorer(mean_absolute_error)
}

# Evaluate regression models
regression_results = {}
for model_name, model in regression_models.items():
    print(f"Evaluating {model_name} for regression...")

    # Perform cross-validation
    scores = cross_validate(model, training_set_x_de, training_set_y_de, cv=kfold, scoring=scoring)

    # Extract the results for each fold
    rmse_scores = scores['test_RMSE']
    spearman_scores = scores['test_Spearman Correlation']
    mae_scores = scores['test_MAE']

    regression_results[model_name] = {'RMSE': rmse_scores, 'Spearman Correlation': spearman_scores, 'MAE': mae_scores}

    for fold, (rmse, spearman, mae) in enumerate(zip(rmse_scores, spearman_scores, mae_scores), start=1):
        print(f"  Fold {fold}: RMSE: {rmse:.4f}, Spearman's Correlation: {spearman:.4f}, MAE: {mae:.4f}")

    print(f"Mean RMSE: {np.mean(rmse_scores):.4f}, Mean Spearman's Correlation: {np.mean(spearman_scores):.4f}, Mean MAE: {np.mean(mae_scores):.4f}")
    print("\n")

# Print the final results for regression
print("\nResults Summary for Regression:")
for model_name, scores in regression_results.items():
    print(f"{model_name}: Mean RMSE: {np.mean(scores['RMSE']):.4f}, Mean Spearman Correlation: {np.mean(scores['Spearman Correlation']):.4f}, Mean MAE: {np.mean(scores['MAE']):.4f}")

Evaluating Decision Tree for regression...
  Fold 1: RMSE: 1.4798, Spearman's Correlation: 0.2054, MAE: 0.8987
  Fold 2: RMSE: 1.4972, Spearman's Correlation: 0.1444, MAE: 0.9651
  Fold 3: RMSE: 1.2052, Spearman's Correlation: 0.0621, MAE: 0.8197
  Fold 4: RMSE: 1.2213, Spearman's Correlation: 0.0726, MAE: 0.8348
  Fold 5: RMSE: 1.6667, Spearman's Correlation: 0.0589, MAE: 1.0506
Mean RMSE: 1.4140, Mean Spearman's Correlation: 0.1087, Mean MAE: 0.9138


Evaluating ExtraTrees Regressor for regression...
  Fold 1: RMSE: 1.1215, Spearman's Correlation: 0.2040, MAE: 0.6967
  Fold 2: RMSE: 0.9228, Spearman's Correlation: 0.3739, MAE: 0.5883
  Fold 3: RMSE: 1.0208, Spearman's Correlation: 0.3329, MAE: 0.6760
  Fold 4: RMSE: 0.6892, Spearman's Correlation: 0.3275, MAE: 0.5013
  Fold 5: RMSE: 1.3589, Spearman's Correlation: 0.2097, MAE: 0.8492
Mean RMSE: 1.0227, Mean Spearman's Correlation: 0.2896, Mean MAE: 0.6623


Evaluating AdaBoost Regressor for regression...
  Fold 1: RMSE: 1.1871, Spear

## Feature Engineering: Feature Creation

In [32]:
# 1. Price and Production Ratios
france_dataset['gas_price_to_production_ratio'] = france_dataset['GAS_RET'] / france_dataset['FR_GAS']
germany_dataset['gas_price_to_production_ratio'] = germany_dataset['GAS_RET'] / germany_dataset['DE_GAS']

# 2. Daily Consumption Variations
france_dataset['consumption_variation'] = france_dataset['FR_CONSUMPTION'].pct_change() * 100  # Percentage change
germany_dataset['consumption_variation'] = germany_dataset['DE_CONSUMPTION'].pct_change() * 100  # Percentage change
# Forward-fill missing values
france_dataset['consumption_variation'].fillna(0, inplace=True)
germany_dataset['consumption_variation'].fillna(0, inplace=True)


# 3. Moving Averages for Consumption
window_size_moving_average = 3  # You can adjust the window size based on the characteristics of your data
france_dataset['consumption_moving_average'] = france_dataset['FR_CONSUMPTION'].rolling(window=window_size_moving_average).mean()
france_dataset['gas_moving_average'] = france_dataset['GAS_RET'].rolling(window=window_size_moving_average).mean()
france_dataset['coal_moving_average'] = france_dataset['COAL_RET'].rolling(window=window_size_moving_average).mean()
france_dataset['carbon_moving_average'] = france_dataset['CARBON_RET'].rolling(window=window_size_moving_average).mean()

france_dataset['consumption_moving_average'].fillna(france_dataset['consumption_moving_average'].mean(), inplace=True)
france_dataset['gas_moving_average'].fillna(france_dataset['gas_moving_average'].mean(), inplace=True)
france_dataset['coal_moving_average'].fillna(france_dataset['coal_moving_average'].mean(), inplace=True)
france_dataset['carbon_moving_average'].fillna(france_dataset['carbon_moving_average'].mean(), inplace=True)

window_size_moving_average = 3  # You can adjust the window size based on the characteristics of your data
germany_dataset['consumption_moving_average'] = germany_dataset['DE_CONSUMPTION'].rolling(window=window_size_moving_average).mean()
germany_dataset['gas_moving_average'] = germany_dataset['GAS_RET'].rolling(window=window_size_moving_average).mean()
germany_dataset['coal_moving_average'] = germany_dataset['COAL_RET'].rolling(window=window_size_moving_average).mean()
germany_dataset['carbon_moving_average'] = germany_dataset['CARBON_RET'].rolling(window=window_size_moving_average).mean()

germany_dataset['consumption_moving_average'].fillna(germany_dataset['consumption_moving_average'].mean(), inplace=True)
germany_dataset['gas_moving_average'].fillna(germany_dataset['gas_moving_average'].mean(), inplace=True)
germany_dataset['coal_moving_average'].fillna(germany_dataset['coal_moving_average'].mean(), inplace=True)
germany_dataset['carbon_moving_average'].fillna(germany_dataset['carbon_moving_average'].mean(), inplace=True)

In [31]:
france_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 851 entries, 0 to 850
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                851 non-null    int64  
 1   DAY_ID            851 non-null    int64  
 2   GAS_RET           851 non-null    float64
 3   COAL_RET          851 non-null    float64
 4   CARBON_RET        851 non-null    float64
 5   FR_TEMP           851 non-null    float64
 6   FR_WIND           851 non-null    float64
 7   FR_RAIN           851 non-null    float64
 8   FR_GAS            851 non-null    float64
 9   FR_COAL           851 non-null    float64
 10  FR_HYDRO          851 non-null    float64
 11  FR_NUCLEAR        851 non-null    float64
 12  FR_SOLAR          851 non-null    float64
 13  FR_WINDPOW        851 non-null    float64
 14  FR_CONSUMPTION    851 non-null    float64
 15  FR_RESIDUAL_LOAD  851 non-null    float64
 16  FR_NET_IMPORT     851 non-null    float64
 1

In [33]:
# FRANCE
# Create a new feature for renewable energy consumption
france_dataset['FR_RENEWABLE_CONSUMPTION'] = france_dataset['FR_HYDRO'] + X_train['FR_SOLAR'] + X_train['FR_WINDPOW']
germany_dataset['DE_RENEWABLE_CONSUMPTION'] = germany_dataset['DE_HYDRO'] + germany_dataset['DE_SOLAR'] + germany_dataset['DE_WINDPOW']

# Create a new feature for non-renewable energy consumption
france_dataset['FR_NON_RENEWABLE_CONSUMPTION'] = france_dataset['FR_GAS'] + france_dataset['FR_COAL'] + france_dataset['FR_NUCLEAR']
germany_dataset['DE_NON_RENEWABLE_CONSUMPTION'] = germany_dataset['DE_GAS'] + germany_dataset['DE_COAL'] + germany_dataset['DE_NUCLEAR'] + germany_dataset['DE_LIGNITE']

# Create a new feature for net energy exchange
france_dataset['FR_NET_ENERGY_EXCHANGE'] = france_dataset['FR_NET_IMPORT'] - france_dataset['FR_NET_EXPORT']
germany_dataset['DE_NET_ENERGY_EXCHANGE'] = germany_dataset['DE_NET_IMPORT'] - germany_dataset['DE_NET_EXPORT']

# Create a new feature for total energy production
france_dataset['FR_TOTAL_PRODUCTION'] = france_dataset['FR_GAS'] + france_dataset['FR_COAL'] + france_dataset['FR_HYDRO'] + france_dataset['FR_NUCLEAR'] + france_dataset['FR_SOLAR'] + france_dataset['FR_WINDPOW']
germany_dataset['DE_TOTAL_PRODUCTION'] = germany_dataset['DE_GAS'] + germany_dataset['DE_COAL'] + germany_dataset['DE_HYDRO'] + germany_dataset['DE_NUCLEAR'] + germany_dataset['DE_SOLAR'] + germany_dataset['DE_WINDPOW'] + germany_dataset['DE_LIGNITE']

# Create a new feature for percentage of renewable energy consumption w.r.t total produced in the country
france_dataset['FR_PERCENT_RENEWABLE_PROD'] = (france_dataset['FR_RENEWABLE_CONSUMPTION'] / france_dataset['FR_TOTAL_PRODUCTION']) * 100
germany_dataset['DE_PERCENT_RENEWABLE_PROD'] = (germany_dataset['DE_RENEWABLE_CONSUMPTION'] / germany_dataset['DE_TOTAL_PRODUCTION']) * 100


# Create a new feature for percentage of non-renewable energy consumption
france_dataset['FR_PERCENT_NON_RENEWABLE_PROD'] = (france_dataset['FR_NON_RENEWABLE_CONSUMPTION'] / france_dataset['FR_TOTAL_PRODUCTION']) * 100
germany_dataset['DE_PERCENT_NON_RENEWABLE_PROD'] = (germany_dataset['DE_NON_RENEWABLE_CONSUMPTION'] / germany_dataset['DE_TOTAL_PRODUCTION']) * 100


# Create a new feature for percentage of renewable energy consumption
france_dataset['FR_PERCENT_RENEWABLE'] = (france_dataset['FR_RENEWABLE_CONSUMPTION'] / france_dataset['FR_CONSUMPTION']) * 100
germany_dataset['DE_PERCENT_RENEWABLE'] = (germany_dataset['DE_RENEWABLE_CONSUMPTION'] / germany_dataset['DE_CONSUMPTION']) * 100

# Create a new feature for percentage of non-renewable energy consumption
france_dataset['FR_PERCENT_NON_RENEWABLE'] = (france_dataset['FR_NON_RENEWABLE_CONSUMPTION'] / france_dataset['FR_CONSUMPTION']) * 100
germany_dataset['DE_PERCENT_NON_RENEWABLE'] = (germany_dataset['DE_NON_RENEWABLE_CONSUMPTION'] / germany_dataset['DE_CONSUMPTION']) * 100

## Experiment 1 - Dropping High Correlated Features

In [36]:
germany_info = ['ID','DAY_ID','GAS_RET',
                            'COAL_RET',
                            'CARBON_RET',
                            'DE_TEMP',
                            'DE_WIND',
                            'DE_RAIN',
                          #  'DE_GAS',
                           # 'DE_COAL',
                            'DE_HYDRO',
                            'DE_NUCLEAR',
                            'DE_SOLAR',
                          #  'DE_WINDPOW',
                            'DE_LIGNITE',
                            'DE_CONSUMPTION',
                            'DE_RESIDUAL_LOAD',
                            'DE_NET_IMPORT',
                            'DE_NET_EXPORT',
                            'DE_FR_EXCHANGE',
                            'FR_DE_EXCHANGE',
                            'TARGET',
                            'gas_price_to_production_ratio',
                            'consumption_variation',
                            'consumption_moving_average',
                            'gas_moving_average',
                            'coal_moving_average',
                            'carbon_moving_average']

france_info = ['ID','DAY_ID','GAS_RET',
                            'COAL_RET',
                            'CARBON_RET',
                            'FR_TEMP',
                            'FR_WIND',
                            'FR_RAIN',
                            'FR_GAS',
                            'FR_COAL',
                            'FR_HYDRO',
                            'FR_NUCLEAR',
                            'FR_SOLAR',
                            'FR_WINDPOW',
                            'FR_CONSUMPTION',
                           # 'FR_RESIDUAL_LOAD',
                            #'FR_NET_IMPORT',
                            'FR_NET_EXPORT',
                            'DE_FR_EXCHANGE',
                            'FR_DE_EXCHANGE',
                            'TARGET',
                            'gas_price_to_production_ratio',
                            'consumption_variation',
                            'consumption_moving_average',
                            'gas_moving_average',
                            'coal_moving_average',
                            'carbon_moving_average']

#### France

In [37]:
france_order = france_dataset[france_info]
x_train_clean_fr = france_order.sort_values(by='DAY_ID', ascending=True)
x_train_clean_fr = x_train_clean_fr.set_index('ID')
y_train_clean_fr = x_train_clean_fr['TARGET']
x_train_clean_fr = x_train_clean_fr.drop(['TARGET'], axis=1)

In [39]:
# Define the number of splits for the kfold validation
num_splits = 5

# Define the random seed for the algorithms
random_seed = 42

# Define regression models
regression_models = {
    'Decision Tree': DecisionTreeRegressor(random_state=random_seed),
    'ExtraTrees Regressor': ExtraTreesRegressor(random_state=random_seed),
    'AdaBoost Regressor': AdaBoostRegressor(random_state=random_seed),
    'Linear Regression': LinearRegression(),
    'Random Forest Regression': RandomForestRegressor(random_state=random_seed),
    'XGBoost': XGBRegressor(random_state=random_seed),
    'LightGBM': LGBMRegressor(random_state=random_seed)
}

# Define the scoring functions
scoring = {
    'RMSE': make_scorer(mean_squared_error, squared=False),
    'Spearman Correlation': make_scorer(lambda y, y_pred: spearmanr(y, y_pred)[0]),
    'MAE': make_scorer(mean_absolute_error)
}

# Evaluate regression models
regression_results = {}
for model_name, model in regression_models.items():
    print(f"Evaluating {model_name} for regression...")

    # Perform cross-validation
    scores = cross_validate(model, x_train_clean_fr, y_train_clean_fr, cv=kfold, scoring=scoring)

    # Extract the results for each fold
    rmse_scores = scores['test_RMSE']
    spearman_scores = scores['test_Spearman Correlation']
    mae_scores = scores['test_MAE']

    regression_results[model_name] = {'RMSE': rmse_scores, 'Spearman Correlation': spearman_scores, 'MAE': mae_scores}

    for fold, (rmse, spearman, mae) in enumerate(zip(rmse_scores, spearman_scores, mae_scores), start=1):
        print(f"  Fold {fold}: RMSE: {rmse:.4f}, Spearman's Correlation: {spearman:.4f}, MAE: {mae:.4f}")

    print(f"Mean RMSE: {np.mean(rmse_scores):.4f}, Mean Spearman's Correlation: {np.mean(spearman_scores):.4f}, Mean MAE: {np.mean(mae_scores):.4f}")
    print("\n")

# Print the final results for regression
print("\nResults Summary for Regression:")
for model_name, scores in regression_results.items():
    print(f"{model_name}: Mean RMSE: {np.mean(scores['RMSE']):.4f}, Mean Spearman Correlation: {np.mean(scores['Spearman Correlation']):.4f}, Mean MAE: {np.mean(scores['MAE']):.4f}")

Evaluating Decision Tree for regression...
  Fold 1: RMSE: 1.6814, Spearman's Correlation: -0.1080, MAE: 1.0291
  Fold 2: RMSE: 1.5745, Spearman's Correlation: 0.0179, MAE: 0.9198
  Fold 3: RMSE: 1.7022, Spearman's Correlation: -0.0264, MAE: 0.9897
  Fold 4: RMSE: 1.4534, Spearman's Correlation: 0.0238, MAE: 0.8331
  Fold 5: RMSE: 1.6176, Spearman's Correlation: 0.0334, MAE: 0.9927
Mean RMSE: 1.6058, Mean Spearman's Correlation: -0.0119, Mean MAE: 0.9529


Evaluating ExtraTrees Regressor for regression...
  Fold 1: RMSE: 0.9548, Spearman's Correlation: 0.1122, MAE: 0.5655
  Fold 2: RMSE: 1.0193, Spearman's Correlation: 0.0036, MAE: 0.5792
  Fold 3: RMSE: 1.0677, Spearman's Correlation: -0.0300, MAE: 0.5978
  Fold 4: RMSE: 0.9946, Spearman's Correlation: 0.1018, MAE: 0.5451
  Fold 5: RMSE: 1.3094, Spearman's Correlation: 0.1639, MAE: 0.6380
Mean RMSE: 1.0692, Mean Spearman's Correlation: 0.0703, Mean MAE: 0.5851


Evaluating AdaBoost Regressor for regression...
  Fold 1: RMSE: 1.0070, S

#### Germany

In [38]:
germany_order = germany_dataset[germany_info]
x_train_clean_de = germany_order.sort_values(by='DAY_ID', ascending=True)
x_train_clean_de = x_train_clean_de.set_index('ID')
y_train_clean_de = x_train_clean_de['TARGET']
x_train_clean_de = x_train_clean_de.drop(['TARGET'], axis=1)

In [40]:
# Define the number of splits for the kfold validation
num_splits = 5

# Define the random seed for the algorithms
random_seed = 42

# Define regression models
regression_models = {
    'Decision Tree': DecisionTreeRegressor(random_state=random_seed),
    'ExtraTrees Regressor': ExtraTreesRegressor(random_state=random_seed),
    'AdaBoost Regressor': AdaBoostRegressor(random_state=random_seed),
    'Linear Regression': LinearRegression(),
    'Random Forest Regression': RandomForestRegressor(random_state=random_seed),
    'XGBoost': XGBRegressor(random_state=random_seed),
    'LightGBM': LGBMRegressor(random_state=random_seed)
}

# Define the scoring functions
scoring = {
    'RMSE': make_scorer(mean_squared_error, squared=False),
    'Spearman Correlation': make_scorer(lambda y, y_pred: spearmanr(y, y_pred)[0]),
    'MAE': make_scorer(mean_absolute_error)
}

# Evaluate regression models
regression_results = {}
for model_name, model in regression_models.items():
    print(f"Evaluating {model_name} for regression...")

    # Perform cross-validation
    scores = cross_validate(model, x_train_clean_de, y_train_clean_de, cv=kfold, scoring=scoring)

    # Extract the results for each fold
    rmse_scores = scores['test_RMSE']
    spearman_scores = scores['test_Spearman Correlation']
    mae_scores = scores['test_MAE']

    regression_results[model_name] = {'RMSE': rmse_scores, 'Spearman Correlation': spearman_scores, 'MAE': mae_scores}

    for fold, (rmse, spearman, mae) in enumerate(zip(rmse_scores, spearman_scores, mae_scores), start=1):
        print(f"  Fold {fold}: RMSE: {rmse:.4f}, Spearman's Correlation: {spearman:.4f}, MAE: {mae:.4f}")

    print(f"Mean RMSE: {np.mean(rmse_scores):.4f}, Mean Spearman's Correlation: {np.mean(spearman_scores):.4f}, Mean MAE: {np.mean(mae_scores):.4f}")
    print("\n")

# Print the final results for regression
print("\nResults Summary for Regression:")
for model_name, scores in regression_results.items():
    print(f"{model_name}: Mean RMSE: {np.mean(scores['RMSE']):.4f}, Mean Spearman Correlation: {np.mean(scores['Spearman Correlation']):.4f}, Mean MAE: {np.mean(scores['MAE']):.4f}")

Evaluating Decision Tree for regression...
  Fold 1: RMSE: 1.5206, Spearman's Correlation: 0.1194, MAE: 0.9540
  Fold 2: RMSE: 1.3465, Spearman's Correlation: 0.2069, MAE: 0.8433
  Fold 3: RMSE: 1.3959, Spearman's Correlation: 0.1359, MAE: 0.8929
  Fold 4: RMSE: 1.3077, Spearman's Correlation: 0.0586, MAE: 0.8646
  Fold 5: RMSE: 1.6317, Spearman's Correlation: 0.1554, MAE: 1.0889
Mean RMSE: 1.4405, Mean Spearman's Correlation: 0.1352, Mean MAE: 0.9287


Evaluating ExtraTrees Regressor for regression...
  Fold 1: RMSE: 1.1018, Spearman's Correlation: 0.2866, MAE: 0.6944
  Fold 2: RMSE: 0.9284, Spearman's Correlation: 0.3845, MAE: 0.5895
  Fold 3: RMSE: 1.0496, Spearman's Correlation: 0.2174, MAE: 0.6986
  Fold 4: RMSE: 0.7143, Spearman's Correlation: 0.2735, MAE: 0.5349
  Fold 5: RMSE: 1.3628, Spearman's Correlation: 0.2469, MAE: 0.8409
Mean RMSE: 1.0314, Mean Spearman's Correlation: 0.2818, Mean MAE: 0.6717


Evaluating AdaBoost Regressor for regression...
  Fold 1: RMSE: 1.1874, Spear

## Experiment 2- Adding Feature Creation

In [73]:
germany_info = ['ID','DAY_ID','GAS_RET',
                            'COAL_RET',
                            'CARBON_RET',
                            'DE_TEMP',
                            'DE_WIND',
                            'DE_RAIN',
                            'DE_GAS',
                            'DE_COAL',
                            'DE_HYDRO',
                            'DE_NUCLEAR',
                            'DE_SOLAR',
                            'DE_WINDPOW',
                            'DE_LIGNITE',
                            'DE_CONSUMPTION',
                            'DE_RESIDUAL_LOAD',
                           # 'DE_NET_IMPORT',
                            'DE_NET_EXPORT',
                            'DE_FR_EXCHANGE',
                            'FR_DE_EXCHANGE',
                            'TARGET',
                           # 'gas_price_to_production_ratio',
                           # 'consumption_variation',
                            'consumption_moving_average',
                            'gas_moving_average',
                            'coal_moving_average',
                            'carbon_moving_average',
                           # 'DE_RENEWABLE_CONSUMPTION',
                           # 'DE_NON_RENEWABLE_CONSUMPTION',
                          #  'DE_NET_ENERGY_EXCHANGE',
                          #  'DE_TOTAL_PRODUCTION',
                            'DE_PERCENT_RENEWABLE_PROD',
                            'DE_PERCENT_NON_RENEWABLE_PROD',
                            'DE_PERCENT_RENEWABLE',
                            'DE_PERCENT_NON_RENEWABLE']

france_info = ['ID','DAY_ID','GAS_RET',
                            'COAL_RET',
                            'CARBON_RET',
                            'FR_TEMP',
                            'FR_WIND',
                            'FR_RAIN',
                            'FR_GAS',
                            'FR_COAL',
                            'FR_HYDRO',
                            'FR_NUCLEAR',
                            'FR_SOLAR',
                            'FR_WINDPOW',
                            'FR_CONSUMPTION',
                           # 'FR_RESIDUAL_LOAD',
                            #'FR_NET_IMPORT',
                            'FR_NET_EXPORT',
                            'DE_FR_EXCHANGE',
                            'FR_DE_EXCHANGE',
                            'TARGET',
                            'gas_price_to_production_ratio',
                            'consumption_variation',
                            'consumption_moving_average',
                            'gas_moving_average',
                            'coal_moving_average',
                            'carbon_moving_average',
                           # 'FR_RENEWABLE_CONSUMPTION',
                            #'FR_NON_RENEWABLE_CONSUMPTION',
                            'FR_NET_ENERGY_EXCHANGE',
                            #'FR_TOTAL_PRODUCTION',
                            'FR_PERCENT_RENEWABLE_PROD',
                            'FR_PERCENT_NON_RENEWABLE_PROD',
                            'FR_PERCENT_RENEWABLE',
                            'FR_PERCENT_NON_RENEWABLE']

#### France

In [49]:
france_order = france_dataset[france_info]
x_train_clean_fr = france_order.sort_values(by='DAY_ID', ascending=True)
x_train_clean_fr = x_train_clean_fr.set_index('ID')
y_train_clean_fr = x_train_clean_fr['TARGET']
x_train_clean_fr = x_train_clean_fr.drop(['TARGET'], axis=1)

In [50]:
# Define the number of splits for the kfold validation
num_splits = 5

# Define the random seed for the algorithms
random_seed = 42

# Define regression models
regression_models = {
    'Decision Tree': DecisionTreeRegressor(random_state=random_seed),
    'ExtraTrees Regressor': ExtraTreesRegressor(random_state=random_seed),
    'AdaBoost Regressor': AdaBoostRegressor(random_state=random_seed),
    'Linear Regression': LinearRegression(),
    'Random Forest Regression': RandomForestRegressor(random_state=random_seed),
    'XGBoost': XGBRegressor(random_state=random_seed),
    'LightGBM': LGBMRegressor(random_state=random_seed)
}

# Define the scoring functions
scoring = {
    'RMSE': make_scorer(mean_squared_error, squared=False),
    'Spearman Correlation': make_scorer(lambda y, y_pred: spearmanr(y, y_pred)[0]),
    'MAE': make_scorer(mean_absolute_error)
}

# Evaluate regression models
regression_results = {}
for model_name, model in regression_models.items():
    print(f"Evaluating {model_name} for regression...")

    # Perform cross-validation
    scores = cross_validate(model, x_train_clean_fr, y_train_clean_fr, cv=kfold, scoring=scoring)

    # Extract the results for each fold
    rmse_scores = scores['test_RMSE']
    spearman_scores = scores['test_Spearman Correlation']
    mae_scores = scores['test_MAE']

    regression_results[model_name] = {'RMSE': rmse_scores, 'Spearman Correlation': spearman_scores, 'MAE': mae_scores}

    for fold, (rmse, spearman, mae) in enumerate(zip(rmse_scores, spearman_scores, mae_scores), start=1):
        print(f"  Fold {fold}: RMSE: {rmse:.4f}, Spearman's Correlation: {spearman:.4f}, MAE: {mae:.4f}")

    print(f"Mean RMSE: {np.mean(rmse_scores):.4f}, Mean Spearman's Correlation: {np.mean(spearman_scores):.4f}, Mean MAE: {np.mean(mae_scores):.4f}")
    print("\n")

# Print the final results for regression
print("\nResults Summary for Regression:")
for model_name, scores in regression_results.items():
    print(f"{model_name}: Mean RMSE: {np.mean(scores['RMSE']):.4f}, Mean Spearman Correlation: {np.mean(scores['Spearman Correlation']):.4f}, Mean MAE: {np.mean(scores['MAE']):.4f}")

Evaluating Decision Tree for regression...
  Fold 1: RMSE: 1.3437, Spearman's Correlation: 0.0849, MAE: 0.7899
  Fold 2: RMSE: 1.4021, Spearman's Correlation: -0.0720, MAE: 0.8346
  Fold 3: RMSE: 1.6171, Spearman's Correlation: 0.0053, MAE: 0.9242
  Fold 4: RMSE: 1.4100, Spearman's Correlation: 0.0289, MAE: 0.7932
  Fold 5: RMSE: 1.6227, Spearman's Correlation: 0.0699, MAE: 0.9352
Mean RMSE: 1.4791, Mean Spearman's Correlation: 0.0234, Mean MAE: 0.8554


Evaluating ExtraTrees Regressor for regression...
  Fold 1: RMSE: 0.9372, Spearman's Correlation: 0.1826, MAE: 0.5643
  Fold 2: RMSE: 1.0343, Spearman's Correlation: 0.0118, MAE: 0.5844
  Fold 3: RMSE: 1.0429, Spearman's Correlation: 0.0171, MAE: 0.5766
  Fold 4: RMSE: 1.0156, Spearman's Correlation: 0.0994, MAE: 0.5549
  Fold 5: RMSE: 1.3010, Spearman's Correlation: 0.1859, MAE: 0.6351
Mean RMSE: 1.0662, Mean Spearman's Correlation: 0.0993, Mean MAE: 0.5831


Evaluating AdaBoost Regressor for regression...
  Fold 1: RMSE: 0.9997, Spea

#### Germany

In [59]:
germany_order = germany_dataset[germany_info]
x_train_clean_de = germany_order.sort_values(by='DAY_ID', ascending=True)
x_train_clean_de = x_train_clean_de.set_index('ID')
y_train_clean_de = x_train_clean_de['TARGET']
x_train_clean_de = x_train_clean_de.drop(['TARGET'], axis=1)

In [60]:
# Define the number of splits for the kfold validation
num_splits = 5

# Define the random seed for the algorithms
random_seed = 42

# Define regression models
regression_models = {
    'Decision Tree': DecisionTreeRegressor(random_state=random_seed),
    'ExtraTrees Regressor': ExtraTreesRegressor(random_state=random_seed),
    'AdaBoost Regressor': AdaBoostRegressor(random_state=random_seed),
    'Linear Regression': LinearRegression(),
    'Random Forest Regression': RandomForestRegressor(random_state=random_seed),
    'XGBoost': XGBRegressor(random_state=random_seed),
    'LightGBM': LGBMRegressor(random_state=random_seed)
}

# Define the scoring functions
scoring = {
    'RMSE': make_scorer(mean_squared_error, squared=False),
    'Spearman Correlation': make_scorer(lambda y, y_pred: spearmanr(y, y_pred)[0]),
    'MAE': make_scorer(mean_absolute_error)
}

# Evaluate regression models
regression_results = {}
for model_name, model in regression_models.items():
    print(f"Evaluating {model_name} for regression...")

    # Perform cross-validation
    scores = cross_validate(model, x_train_clean_de, y_train_clean_de, cv=kfold, scoring=scoring)

    # Extract the results for each fold
    rmse_scores = scores['test_RMSE']
    spearman_scores = scores['test_Spearman Correlation']
    mae_scores = scores['test_MAE']

    regression_results[model_name] = {'RMSE': rmse_scores, 'Spearman Correlation': spearman_scores, 'MAE': mae_scores}

    for fold, (rmse, spearman, mae) in enumerate(zip(rmse_scores, spearman_scores, mae_scores), start=1):
        print(f"  Fold {fold}: RMSE: {rmse:.4f}, Spearman's Correlation: {spearman:.4f}, MAE: {mae:.4f}")

    print(f"Mean RMSE: {np.mean(rmse_scores):.4f}, Mean Spearman's Correlation: {np.mean(spearman_scores):.4f}, Mean MAE: {np.mean(mae_scores):.4f}")
    print("\n")

# Print the final results for regression
print("\nResults Summary for Regression:")
for model_name, scores in regression_results.items():
    print(f"{model_name}: Mean RMSE: {np.mean(scores['RMSE']):.4f}, Mean Spearman Correlation: {np.mean(scores['Spearman Correlation']):.4f}, Mean MAE: {np.mean(scores['MAE']):.4f}")

Evaluating Decision Tree for regression...
  Fold 1: RMSE: 1.5764, Spearman's Correlation: 0.0450, MAE: 1.0124
  Fold 2: RMSE: 1.5936, Spearman's Correlation: 0.0045, MAE: 1.0216
  Fold 3: RMSE: 1.4253, Spearman's Correlation: 0.0776, MAE: 0.8954
  Fold 4: RMSE: 1.1627, Spearman's Correlation: 0.0670, MAE: 0.7946
  Fold 5: RMSE: 1.5947, Spearman's Correlation: 0.1708, MAE: 1.0593
Mean RMSE: 1.4705, Mean Spearman's Correlation: 0.0730, Mean MAE: 0.9567


Evaluating ExtraTrees Regressor for regression...
  Fold 1: RMSE: 1.1020, Spearman's Correlation: 0.2608, MAE: 0.6980
  Fold 2: RMSE: 0.9261, Spearman's Correlation: 0.3667, MAE: 0.5951
  Fold 3: RMSE: 1.0231, Spearman's Correlation: 0.3388, MAE: 0.6844
  Fold 4: RMSE: 0.7123, Spearman's Correlation: 0.2976, MAE: 0.5245
  Fold 5: RMSE: 1.3249, Spearman's Correlation: 0.2935, MAE: 0.8130
Mean RMSE: 1.0177, Mean Spearman's Correlation: 0.3115, Mean MAE: 0.6630


Evaluating AdaBoost Regressor for regression...
  Fold 1: RMSE: 1.1499, Spear

### Best model and Feature Importance

In [74]:
france_order = france_dataset[france_info]
x_train_clean_fr = france_order.sort_values(by='DAY_ID', ascending=True)
x_train_clean_fr = x_train_clean_fr.set_index('ID')
y_train_clean_fr = x_train_clean_fr['TARGET']
x_train_clean_fr = x_train_clean_fr.drop(['TARGET'], axis=1)

In [75]:
germany_order = germany_dataset[germany_info]
x_train_clean_de = germany_order.sort_values(by='DAY_ID', ascending=True)
x_train_clean_de = x_train_clean_de.set_index('ID')
y_train_clean_de = x_train_clean_de['TARGET']
x_train_clean_de = x_train_clean_de.drop(['TARGET'], axis=1)

#### France
Extratrees Regressor

In [78]:
# Now, train the model with the entire training set to apply to the Test data
et_model_fr = ExtraTreesRegressor(random_state=random_seed)
et_model_fr.fit(x_train_clean_fr,y_train_clean_fr)

In [79]:
from sklearn.inspection import permutation_importance

# Assuming et_model_fr is your trained Extra Trees Regressor
# and x_test_clean_de, y_test_clean_fr are your test features and target

# Calculate feature importance using permutation method
result = permutation_importance(et_model_fr, x_train_clean_fr, y_train_clean_fr, n_repeats=10, random_state=random_seed)

# Get the importance scores
importance_scores = result.importances_mean

# Get the feature names
feature_names = x_train_clean_fr.columns

# Create a DataFrame to display the feature importance
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance_scores})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print or visualize the feature importance
print("Feature Importance (Permutation Method):")
print(feature_importance_df)

Feature Importance (Permutation Method):
                          Feature  Importance
12                     FR_WINDPOW    0.242077
9                        FR_HYDRO    0.076297
1                         GAS_RET    0.067962
3                      CARBON_RET    0.056095
19     consumption_moving_average    0.050428
22          carbon_moving_average    0.039849
13                 FR_CONSUMPTION    0.038410
18          consumption_variation    0.035528
16                 FR_DE_EXCHANGE    0.034296
7                          FR_GAS    0.033616
11                       FR_SOLAR    0.030851
21            coal_moving_average    0.030114
25  FR_PERCENT_NON_RENEWABLE_PROD    0.029806
20             gas_moving_average    0.027631
2                        COAL_RET    0.026954
6                         FR_RAIN    0.023381
15                 DE_FR_EXCHANGE    0.021341
0                          DAY_ID    0.017331
5                         FR_WIND    0.016384
10                     FR_NUCLEAR    0.

#### Germany
Extratrees RegreEssor

In [80]:
# Now, train the model with the entire training set to apply to the Test data
et_model_de = ExtraTreesRegressor(random_state=random_seed)
et_model_de.fit(x_train_clean_de,y_train_clean_de)

In [83]:
from sklearn.inspection import permutation_importance

# Assuming et_model_fr is your trained Extra Trees Regressor
# and x_test_clean_de, y_test_clean_fr are your test features and target

# Calculate feature importance using permutation method
result = permutation_importance(et_model_de, x_train_clean_de, y_train_clean_de, n_repeats=10, random_state=random_seed)

# Get the importance scores
importance_scores = result.importances_mean

# Get the feature names
feature_names = x_train_clean_de.columns

# Create a DataFrame to display the feature importance
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance_scores})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print or visualize the feature importance
print("Feature Importance (Permutation Method):")
print(feature_importance_df)

Feature Importance (Permutation Method):
                          Feature  Importance
12                     DE_WINDPOW    0.233582
16                  DE_NET_EXPORT    0.163891
15               DE_RESIDUAL_LOAD    0.132818
11                       DE_SOLAR    0.116512
13                     DE_LIGNITE    0.076306
5                         DE_WIND    0.061685
19     consumption_moving_average    0.039313
2                        COAL_RET    0.032519
7                          DE_GAS    0.029091
20             gas_moving_average    0.026123
9                        DE_HYDRO    0.022386
1                         GAS_RET    0.021092
6                         DE_RAIN    0.019338
3                      CARBON_RET    0.018802
14                 DE_CONSUMPTION    0.017270
4                         DE_TEMP    0.016450
22          carbon_moving_average    0.014830
18                 FR_DE_EXCHANGE    0.014459
21            coal_moving_average    0.013777
17                 DE_FR_EXCHANGE    0.