### Modeling

#### Imports

In [12]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
import xgboost as xgb

#### Data Preprocessing 

In [24]:
def load_data(separate_country=False):
    # Load train and set
    X_train     = pd.read_csv('X_train.csv', index_col='ID')
    Y_train     = pd.read_csv('Y_train.csv', index_col='ID')
    X_test      = pd.read_csv('X_test.csv', index_col='ID')
    Y_test      = pd.read_csv('Y_test.csv', index_col='ID')

    # If separate country return separate dataset for each country
    if separate_country:
        # Join features and target for preprocessing
        train_df    = X_train.join(Y_train)
        test_df     = X_test.join(Y_test)

        # Split training data into DE and FR datasets
        train_fr    = train_df[train_df.COUNTRY=='FR']
        train_de    = train_df[train_df.COUNTRY=='DE']

        # Split test data into DE and FR datasets
        test_fr     = test_df[test_df.COUNTRY=='FR']
        test_de     = test_df[test_df.COUNTRY=='DE']

        X_train_fr  = train_fr.drop(columns=['TARGET','COUNTRY','DAY_ID'])
        X_train_de  = train_de.drop(columns=['TARGET','COUNTRY','DAY_ID'])
        Y_train_fr  = train_fr[['TARGET']]
        Y_train_de  = train_de[['TARGET']]
        
        X_test_fr  = test_fr.drop(columns=['TARGET','COUNTRY','DAY_ID'])
        X_test_de  = test_de.drop(columns=['TARGET','COUNTRY','DAY_ID'])
        Y_test_fr  = test_fr[['TARGET']]
        Y_test_de  = test_de[['TARGET']]
    
        return [X_train_fr, Y_train_fr, X_train_de, Y_train_de, X_test_fr, Y_test_fr, X_test_de, Y_test_de]
    
    # If NOT separate country then return full train and test data
    else:
        ohc                 = OneHotEncoder(drop='first')
        X_train['COUNTRY']  = ohc.fit_transform(X_train.COUNTRY.values.reshape(-1,1)).toarray()
        X_train.drop(columns=['DAY_ID'], inplace=True)
        
        X_test['COUNTRY']   = ohc.fit_transform(X_test.COUNTRY.values.reshape(-1,1)).toarray()
        X_test.drop(columns=['DAY_ID'], inplace=True)
        
        return [X_train, Y_train, X_test, Y_test]

In [25]:
X_train, Y_train, X_test, Y_test = load_data(separate_country=False)

#### Modeling

Decision Tree

In [11]:
dt  = DecisionTreeRegressor()

pgrid = {"max_depth": [3, 5, 7, 10, 15, 20, 25, 30],
      "min_samples_split": [2, 3, 5, 10, 15, 20]}

grid_search = GridSearchCV(DecisionTreeRegressor(), param_grid=pgrid, cv=10, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, Y_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_.score(X_test, Y_test))

ValueError: 
All the 480 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
48 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/tree/_classes.py", line 1320, in fit
    super()._fit(
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/tree/_classes.py", line 242, in _fit
    X, y = self._validate_data(
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 617, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/utils/validation.py", line 915, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/pandas/core/generic.py", line 2084, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'DE'

--------------------------------------------------------------------------------
432 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/tree/_classes.py", line 1320, in fit
    super()._fit(
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/tree/_classes.py", line 242, in _fit
    X, y = self._validate_data(
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 617, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/utils/validation.py", line 915, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/peterkeszthelyi/Library/Python/3.9/lib/python/site-packages/pandas/core/generic.py", line 2084, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'FR'


In [29]:
mean_squared_error(Y_test_fr, rf.predict(X_test_fr))

1.234642040974661

In [17]:
#pd.DataFrame(index=rf.feature_names_in_, data=rf.feature_importances_).sort_values(by=0,ascending=False).plot(kind='bar')
pd.DataFrame(index=rf.feature_names_in_, data=rf.feature_importances_).sort_values(by=0,ascending=False)

Unnamed: 0,0
FR_SOLAR_LAG2,0.060660
DE_TEMP_LAG0,0.030368
FR_RESIDUAL_LOAD_LAG8,0.021692
GAS_RET_LAG11,0.020461
DE_NUCLEAR_LAG3,0.014675
...,...
FR_DE_EXCHANGE_LAG10,0.000177
FR_RAIN_LAG7,0.000163
FR_NET_IMPORT_LAG6,0.000147
DE_FR_EXCHANGE_LAG0,0.000130


#### Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Define parameter grids for hyperparameter tuning
param_grid_decision_tree = {'max_depth': [None, 10, 20, 30],
                            'min_samples_split': [2, 5, 10]}

param_grid_adaboost = {'n_estimators': [50, 100, 200],
                       'learning_rate': [0.01, 0.1, 1.0]}

param_grid_bagging = {'n_estimators': [10, 50, 100],
                      'max_samples': [0.5, 0.8, 1.0],
                      'max_features': [0.5, 0.8, 1.0]}

param_grid_gradient_boosting = {'n_estimators': [50, 100, 200],
                                'learning_rate': [0.01, 0.1, 1.0],
                                'max_depth': [3, 5, 10]}

param_grid_random_forest = {'n_estimators': [50, 100, 200],
                            'max_depth': [None, 10, 20],
                            'min_samples_split': [2, 5, 10]}

param_grid_extra_trees = {'n_estimators': [50, 100, 200],
                          'max_depth': [None, 10, 20],
                          'min_samples_split': [2, 5, 10]}

# Initialize models
decision_tree = DecisionTreeRegressor()
adaboost = AdaBoostRegressor(base_estimator=decision_tree)
bagging = BaggingRegressor(base_estimator=decision_tree)
gradient_boosting = GradientBoostingRegressor()
random_forest = RandomForestRegressor()
extra_trees = ExtraTreesRegressor()

# List of models and corresponding parameter grids
models = [(decision_tree, param_grid_decision_tree),
          (adaboost, param_grid_adaboost),
          (bagging, param_grid_bagging),
          (gradient_boosting, param_grid_gradient_boosting),
          (random_forest, param_grid_random_forest),
          (extra_trees, param_grid_extra_trees)]

# Fit and tune each model
for model, param_grid in models:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, Y_train)
    
    # Best hyperparameters
    best_params = grid_search.best_params_
    
    # Best model
    best_model = grid_search.best_estimator_
    
    # Evaluate on test set
    predictions = best_model.predict(X_test)
    mse = mean_squared_error(Y_test, predictions)
    
    # Print results
    print("Model:", model.__class__.__name__)
    print("Best Parameters:", best_params)
    print("Mean Squared Error (Test):", mse)
    print("\n")


Model: DecisionTreeRegressor
Best Parameters: {'max_depth': 10, 'min_samples_split': 10}
Mean Squared Error (Test): 1.4903456298393765




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Model: AdaBoostRegressor
Best Parameters: {'learning_rate': 1.0, 'n_estimators': 100}
Mean Squared Error (Test): 1.6936049692508026




  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, war

Model: BaggingRegressor
Best Parameters: {'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 100}
Mean Squared Error (Test): 1.1633569364971386




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

KeyboardInterrupt: 