# Feature engineering

## Library importation

In [None]:
# Traitement de données
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split, KFold
from typing import List
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.base import clone

## Download the dataset

In [2]:
X = pd.read_csv("X_train_NHkHMNU.csv")
y = pd.read_csv("y_train_ZAN5mwg.csv")

df = pd.concat([X, y], axis=1)

df = df.drop(df.columns[-2], axis=1)

## Feature Engineering

Feature engineering is a key step in a machine Learning project. This step prepares the data for the models. Here are the steps we followed to prepare the dataset : 

**Remove columns that have -1 correlation**

Some vairables have -1 correlation :
- `DE_NET_EXPORT` and `DE_NET_IMPORT`
- `FR_NET_EXPORT` and `FR_NET_IMPORT`
- `DE_FR_EXCHANGE` and `FR_DE_EXCHANGE`

Moreover they have the same correlation with the other variables. So keeping both variables doesn't add meaning full information. That is why we chose to drop one of the variables from each -1 correlation.

**Remove `FR_COAL` variable**

This variable is not diversified. Thus its values are not interesting to keep.

**Split the dataset**

As decided thanks to the data analysis, we splited the dataset into two : french and german dataset.

**Remove Nan Values from both dataset**

The proportion of Nan values as well as the few rows we have for each dataset were the reasons why we chose to replace nan values by the median of each column.

**Create additionnal columns according to a Threshold**

Seuils pour df_fr
- COAL_RET < 0.8
- FR_CONSUMPTION > 1.5
- FR_NUCLEAR < -1.8
- FR_HYDRO < -0.4

Seuils pour df_de
- DE_CONSUMPTION > 1.2
- DE_NET_EXPORT > -0.45
- DE_WINDPOW > 0.3

Transformation "ReLu"

**Remove Columns that have a low correlation with the TARGET variable**

Each variables whose spearman corelation with the `TARGET` variable is lower than 0.05 will be removed from the dataset. We don't consider those variables to have a correlation high enough to have a positive impact on models' performance.

In [None]:
def drop_columns(df, columns):
    for c in columns:
        df.drop(columns=c, inplace=True, errors="ignore")

def compute_median(df):
    numeric_cols = df.select_dtypes(include=["number"]).columns
    medians = df[numeric_cols].median()
    return medians

def missing_values_changed_with_median(df, medians):
    numeric_cols = df.select_dtypes(include=["number"]).columns
    df[numeric_cols] = df[numeric_cols].fillna(medians[numeric_cols])
    return df

def add_threshold_columns(df: pd.DataFrame, column_name: str, threshold: float, way: str):
    message = column_name + "_THRESHOLD_" + str(threshold)
    # when way = "sup", we want to keep only values that are higher than the threshold
    # else we keep the values that are lower than the threshold
    if way == "sup":
        df[message] = df[column_name].where(df[column_name] >= threshold, 0)
    else:
        df[message] = df[column_name].where(df[column_name] <= threshold, 0)

def compute_quantiles(df, low = 0.25, high = 0.75, coeff=5):
    bounds = {}
    for column in df.select_dtypes(include=["number"]).columns:
        Q1 = df[column].quantile(low)
        Q3 = df[column].quantile(high)
        delta = Q3 - Q1
        lower_bound = Q1 - coeff * delta
        upper_bound = Q3 + coeff * delta
        bounds[column] = (lower_bound, upper_bound)
    return bounds

def outliers_filter(df, bounds):
    filter_ = pd.Series(True, index=df.index)
    for column, (low, high) in bounds.items():
        if column in df.columns:
            filter_ &= (df[column] >= low) & (df[column] <= high)
    return filter_

def feature_engineering(df, medians, threshold, columns_kept):
    # remove unecessary columns
    columns_name = ["DE_NET_IMPORT", "FR_NET_IMPORT", "DE_FR_EXCHANGE"]
    drop_columns(df, columns_name)

    # remove FR_COAL
    drop_columns(df, ["FR_COAL"])

    # modify missing values
    df = missing_values_changed_with_median(df, medians)

    # add threshold columns to the french dataset
    for key, value in threshold.items():
        add_threshold_columns(df, key, value[0], value[1])

    # drop columns that are not in the list or that have not _THRESHOLD_ in their name
    to_keep = [c for c in df.columns if (c in columns_kept) or ("_THRESHOLD_" in c)]
    df = df[to_keep]
    
    return df

def transform_one_country(df, threshold, columns_kept, standardisation = True):
    X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=["TARGET"]), df["TARGET"], test_size=0.2, random_state=42)

    medians = compute_median(X_train)

    X_train = feature_engineering(X_train, medians, threshold, columns_kept)
    X_test = feature_engineering(X_test, medians, threshold, columns_kept)

    # filter : remove outliers from the train data
    bounds = compute_quantiles(X_train)
    filter_ = outliers_filter(X_train, bounds)
    X_train = X_train[filter_]
    y_train = y_train[filter_]

    if standardisation:
        # Standardisation
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled  = scaler.transform(X_test)

        # X_train_fr_scaled, X_test_fr_scaled, X_train_de_scaled, X_test_de_scaled are not dataframe, 
        # we prefer to work with dataframe to keep columns name
        X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
        X_test  = pd.DataFrame(X_test_scaled,  index=X_test.index,  columns=X_test.columns)


    return X_train, X_test, y_train, y_test

def transform(df, threshold_fr, threshold_de, columns_kept_fr, columns_kept_de, standardisation=True):
    # split the dataset
    df_fr = df[df["COUNTRY"] == "FR"].copy()
    df_de = df[df["COUNTRY"] == "DE"].copy()

    X_train_fr, X_test_fr, y_train_fr, y_test_fr = transform_one_country(
        df_fr, threshold_fr, columns_kept_fr, standardisation=standardisation
    )

    X_train_de, X_test_de, y_train_de, y_test_de = transform_one_country(
        df_de, threshold_de, columns_kept_de, standardisation=standardisation
    )

    return (
        X_train_fr, X_test_fr, y_train_fr, y_test_fr,
        X_train_de, X_test_de, y_train_de, y_test_de
    )

In [None]:
threshold_fr = {"COAL_RET": [0.8, "inf"],
                "FR_CONSUMPTION": [1.5, "sup"],
                "FR_NUCLEAR": [-1.8, "inf"],
                "FR_HYDRO":[-0.4, "inf"]                
                }

threshold_de = {"DE_CONSUMPTION": [1.2, "sup"],
                "DE_NET_EXPORT": [-0.45, "sup"],
                "DE_WINDPOW": [0.3, "sup"]
}

# COLONNES RECUPEREES TEMPORAIREMENT A LA MAIN CAR SEPARATIONN DES FICHIERS ANALYSES ET ENGINEERING
# A RECUPER DES VARIBALES QUAND LE RASSEMBLEMENT DES FICHIERS SERA FAIT
columns_kept_fr = ["DE_NET_EXPORT",
                "DE_HYDRO",
                "DE_WINDPOW",
                "FR_WINDPOW",
                "GAS_RET",
                "CARBON_RET"]

columns_kept_de = ["DE_NET_EXPORT",
                "DE_GAS",
                "DE_COAL",
                "DE_HYDRO",
                "DE_WINDPOW",
                "FR_WINDPOW",
                "DE_LIGNITE",
                "DE_RESIDUAL_LOAD",
                "DE_WIND"]

X_train_fr, X_test_fr, y_train_fr, y_test_fr, X_train_de, X_test_de, y_train_de, y_test_de = transform(df,
                                                                                                       threshold_fr,
                                                                                                       threshold_de,
                                                                                                       columns_kept_fr,
                                                                                                       columns_kept_de)

## Pipeline for all models

We observe that if our features engineering seems very relevant for simple and interpretable models, however models that handle better the complexity and non linear relationsip didn't require as feature engineering than a simple linear regression. For that purpose the goal of this part is to do a general pipeline using the last feature engineering pipeline to have a flexible way of testing new models. Furthermore since the observation of an important part of outliers in the French side, make the relationships very noisy, we will remove the extreme outliers, only on training data. We also aim to have the possibilitie to use a different model for France and Allemagne since the optimal model for each could be different. Finally in our objective to avoid overfitting we will use K-fold optimization.

In [None]:
def spearman_corr(y_true, y_pred):
    return spearmanr(y_true, y_pred).correlation

def kfold_score(model, X, y, k=5):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = []

    for train_idx, test_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model_ = clone(model)  
        model_.fit(X_train, y_train)
        y_pred = model_.predict(X_val)

        scores.append(spearman_corr(y_val, y_pred))

    return np.mean(scores), np.std(scores)

In [None]:
def Pipeline_All(df, fr_model, de_model, features_engineering= True, remove_outliers = True, k = 5):
    if features_engineering == True:
        df_fr, df_de = feature_engineering(df, remove_outliers)
    else:
        df_fr = df[df["COUNTRY"] == "FR"].drop(columns="COUNTRY").fillna(0)
        df_de = df[df["COUNTRY"] == "DE"].drop(columns="COUNTRY").fillna(0)

    X_fr = df_fr.drop(columns=["TARGET"])
    y_fr = df_fr["TARGET"]

    X_de = df_de.drop(columns=["TARGET"])
    y_de = df_de["TARGET"]

    X_train_fr, X_test_fr, y_train_fr, y_test_fr = train_test_split(X_fr, y_fr, test_size=0.2, random_state=42)
    X_train_de, X_test_de, y_train_de, y_test_de = train_test_split(X_de, y_de, test_size=0.2, random_state=42)

    fr_mean, fr_std = kfold_score(fr_model, X_train_fr, y_train_fr, k=k)

    de_mean, de_std = kfold_score(de_model, X_train_de, y_train_de, k=k)

    fr_model.fit(X_train_fr, y_train_fr)
    de_model.fit(X_train_de, y_train_de)

    y_pred_test_fr = fr_model.predict(X_test_fr)
    y_pred_test_de = de_model.predict(X_test_de)

    fr_test_score = spearman_corr(y_test_fr, y_pred_test_fr)
    de_test_score = spearman_corr(y_test_de, y_pred_test_de)

    y_true_global = np.concatenate([y_test_fr, y_test_de])
    y_pred_global = np.concatenate([y_pred_test_fr, y_pred_test_de])

    spearman_global = spearman_corr(y_true_global, y_pred_global)

    return {
        "fr_kfold": (fr_mean, fr_std),
        "de_kfold": (de_mean, de_std),
        "spearman_fr_test": fr_test_score,
        "spearman_de_test": de_test_score,
        "spearman_global_test": spearman_global,
    }


### Basic Model

The first step is to test the simpliest model with almost no feature engineering, to have a sort of reference model and to not considerate all the models less performant. In this first implementation the dataset isn't separate between France and Germany, all the columns are keep and there is no transformation on the columns. The model used is a linear regression.

In [7]:
X_all = df.drop(columns=["TARGET", "COUNTRY"]).fillna(0)
y_all = df["TARGET"]


X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)
y_pred_test  = lr.predict(X_test)


print("Spearman train : {:.1f}%".format(100 * spearman_corr(y_train, y_pred_train)))
print("Spearman test  : {:.1f}%".format(100 * spearman_corr(y_test,  y_pred_test)))

Spearman train : 28.9%
Spearman test  : 19.5%


## Models with our Pipeline

### Linear Regression


In [8]:
Pipeline_All(df, LinearRegression(), LinearRegression())

TypeError: Feature_engineering() missing 2 required positional arguments: 'threshold' and 'columns_kept'

We can see an important improvement of our spearman score, with an amelioration of 8% comparing to the reference model (from 19% to 27%). This justify our global strategy at least for Linear Regression.

### Polynomiale Regression 

In [None]:
poly_model_fr = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("lr", LinearRegression())
])

poly_model_de = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("lr", LinearRegression())
])

Pipeline_All(df, poly_model_fr, poly_model_de)

{'fr_kfold': (0.13778979428894014, 0.10676547796809663),
 'de_kfold': (0.1761269917353287, 0.1291168563390837),
 'spearman_fr_test': 0.2646904412907821,
 'spearman_de_test': 0.32879025044722726,
 'spearman_global_test': 0.29068113675983126}

With polynomial regression, we keep improving our performance, however this model seems adapted only for the french dataset an hybrid model (polynomial regression for the french dataset and linear regression for the deutsh one)

#

### A simple hybrid model

In [None]:
Pipeline_All(df, poly_model_fr, LinearRegression())

{'fr_kfold': (0.13778979428894014, 0.10676547796809663),
 'de_kfold': (0.3019722817513048, 0.06163762273514753),
 'spearman_fr_test': 0.2646904412907821,
 'spearman_de_test': 0.38370974955277287,
 'spearman_global_test': 0.30588975694833276}

### Decision Tree Regressor 

In [None]:
spearman_score = make_scorer(spearman_corr, greater_is_better=True)
fr_param_grid = {
    "model__max_depth": [3, 4, 5, 7],
    "model__min_samples_leaf": [10, 20, 50],
    "model__min_samples_split": [10, 20, 30]
}

fr_base = Pipeline([("model", DecisionTreeRegressor(random_state=42))])

fr_search = GridSearchCV(
    estimator=fr_base,
    param_grid=fr_param_grid,
    scoring=spearman_score,
    cv=5,
    n_jobs=1,
    refit=True
)

de_param_grid = {
    "model__max_depth": [3, 4, 5, 7, 10, 15],
    "model__min_samples_leaf": [5, 10, 20, 30, 50],
    "model__min_samples_split": [5, 10, 20]
}

de_base = Pipeline([("model", DecisionTreeRegressor(random_state=42))])

de_search = GridSearchCV(
    estimator=de_base,
    param_grid=de_param_grid,
    scoring=spearman_score,
    cv=5,
    n_jobs=1,
    refit=True
)

Pipeline_All(df, fr_search, de_search, features_engineering=  True)




{'fr_kfold': (0.1030746921499699, 0.08823846652670929),
 'de_kfold': (0.22776058012535377, 0.054213820605364386),
 'spearman_fr_test': 0.24802168128052768,
 'spearman_de_test': 0.20495986869711383,
 'spearman_global_test': 0.1890557652292927}

We observe poor performance, especially for the German dataset. Essayer d'optimiser ça???