# Feature engineering

## Library imports

In [None]:
# Traitement de données
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split
from typing import List

## Data loading and dataset split

In [2]:
# Load the feature table and the target table from the working directory.
X = pd.read_csv("X_train_NHkHMNU.csv")
y = pd.read_csv("y_train_ZAN5mwg.csv")

# Put the two tables side by side (rows are paired by index), then discard
# whatever column ends up second-to-last in the combined frame.
# NOTE(review): dropping by label removes every column carrying that label,
# so a name shared by X and y would disappear from both — confirm intended.
_combined = pd.concat([X, y], axis="columns")
df = _combined.drop(columns=_combined.columns[-2])

In [5]:
# Split the combined frame by COUNTRY value; each subset is an independent
# copy, so later column assignments on it do not touch `df`.
is_french = df["COUNTRY"].eq("FR")
is_german = df["COUNTRY"].eq("DE")
df_fr = df.loc[is_french].copy()
df_de = df.loc[is_german].copy()

## Replace NaN values in both datasets

In [6]:
def _impute_numeric_with_median(frame: pd.DataFrame) -> pd.Index:
    """Fill NaNs in the numeric columns of ``frame`` with each column's median.

    The frame is modified in place. The medians are computed on ``frame``
    itself, so each country frame is imputed with its own statistics.

    Returns the index of numeric column labels that were processed.
    """
    numeric_cols = frame.select_dtypes(include=["number"]).columns
    medians = frame[numeric_cols].median()
    frame[numeric_cols] = frame[numeric_cols].fillna(medians)
    return numeric_cols


# French dataset
numeric_cols_fr = _impute_numeric_with_median(df_fr)

# German dataset
numeric_cols_de = _impute_numeric_with_median(df_de)

## Create additional columns that represent a threshold

##### Seuils pour df_fr
- COAL_RET < 0.8
- FR_CONSUMPTION > 1.5
- FR_NUCLEAR < -1.8
- FR_HYDRO < -0.4

##### Seuils pour df_de
- DE_NET_IMPORT < 0.45
- DE_CONSUMPTION > 1.2
- DE_NET_EXPORT > -0.45
- DE_WINDPOW > 0.3


Transformation de type "ReLU" : la valeur est conservée si elle franchit le seuil, sinon elle est remplacée par 0.

In [7]:
def AddSeuilColumn(df: pd.DataFrame, column_name: str, seuil: float, way: str) -> None:
    """Add a thresholded ("ReLU"-like) copy of a column to ``df``, in place.

    The new column is named ``<column_name>_Threshold_<seuil>``. Each row keeps
    the original value when it is on the selected side of the threshold
    (threshold included) and is set to 0 otherwise.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to modify; the new column is written directly into it.
    column_name : str
        Name of the existing column to threshold.
    seuil : float
        Threshold value; its ``str()`` form is used in the new column name.
    way : str
        ``"sup"`` keeps values ``>= seuil``; ``"inf"`` keeps values ``<= seuil``.

    Raises
    ------
    ValueError
        If ``way`` is neither ``"sup"`` nor ``"inf"``. Previously any other
        value silently behaved like ``"inf"``, so a typo produced a wrong
        feature without warning.
    """
    if way == "sup":
        keep = df[column_name] >= seuil
    elif way == "inf":
        keep = df[column_name] <= seuil
    else:
        raise ValueError(f"way must be 'sup' or 'inf', got {way!r}")

    message = column_name + "_Threshold_" + str(seuil)
    # Rows where `keep` is False (including NaN comparisons) become 0.
    df[message] = df[column_name].where(keep, 0)

# Threshold settings per country frame, keyed by source column.
# Each value is a [seuil, way] pair, where way is "sup" or "inf".
threshold_fr = {
    column: [seuil, way]
    for column, seuil, way in (
        ("COAL_RET", 0.8, "inf"),
        ("FR_CONSUMPTION", 1.5, "sup"),
        ("FR_NUCLEAR", -1.8, "inf"),
        ("FR_HYDRO", -0.4, "inf"),
    )
}

threshold_de = {
    column: [seuil, way]
    for column, seuil, way in (
        ("DE_NET_IMPORT", 0.45, "inf"),
        ("DE_CONSUMPTION", 1.2, "sup"),
        ("DE_NET_EXPORT", -0.45, "sup"),
        ("DE_WINDPOW", 0.3, "sup"),
    )
}

# Apply every configured threshold to its country frame (French first, then
# German); each call writes one new threshold column into the frame.
for frame, thresholds in ((df_fr, threshold_fr), (df_de, threshold_de)):
    for column, (seuil, way) in thresholds.items():
        AddSeuilColumn(frame, column, seuil, way)

In [8]:
# Display the French frame to check the newly added *_Threshold_* columns.
df_fr

Unnamed: 0,DAY_ID,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,DE_FR_EXCHANGE,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_NET_IMPORT,FR_NET_IMPORT,...,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET,TARGET,COAL_RET_Threshold_0.8,FR_CONSUMPTION_Threshold_1.5,FR_NUCLEAR_Threshold_-1.8,FR_HYDRO_Threshold_-0.4
0,206,FR,0.210099,-0.427458,-0.606523,0.606523,-0.244606,0.692860,0.244606,-0.692860,...,-1.069070,-0.063404,0.339041,0.124552,-0.002445,0.028313,0.124552,0.000000,0.000000,0.000000
1,501,FR,-0.022399,-1.003452,-0.022063,0.022063,-0.573520,-1.130838,0.573520,1.130838,...,0.437419,1.831241,-0.659091,0.047114,-0.490365,-0.112516,0.047114,0.000000,-2.185961,-0.807112
2,687,FR,1.395035,1.978665,1.021305,-1.021305,-0.622021,-1.682587,0.622021,1.682587,...,0.684884,0.114836,0.535974,0.743338,0.204952,-0.180840,0.743338,1.978665,0.000000,0.000000
4,818,FR,0.143807,-0.617038,-0.924990,0.924990,-0.244606,0.990324,0.244606,-0.990324,...,0.614338,0.729495,0.245109,1.526606,2.614378,-0.071733,0.000000,0.000000,0.000000,-0.795983
5,467,FR,-0.295296,-0.765120,-0.717490,0.717490,-1.117139,-0.200305,1.117139,0.200305,...,0.102046,0.472708,0.891049,0.861408,1.124457,0.932105,0.000000,0.000000,-1.920695,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1483,510,FR,0.422357,-0.704613,-1.019784,1.019784,-0.977214,1.112333,0.977214,-1.112333,...,-0.322520,-0.555211,1.946355,0.867074,1.322433,0.108277,0.000000,0.000000,0.000000,0.000000
1486,985,FR,0.117491,0.944372,1.171116,-1.171116,1.403843,0.499653,-1.403843,-0.499653,...,0.324165,0.829517,0.494188,1.011794,1.472650,0.827636,0.000000,0.000000,0.000000,0.000000
1487,905,FR,0.968724,0.459382,0.996808,-0.996808,0.101161,-1.048997,-0.101161,1.048997,...,-2.413150,-2.069991,1.145686,0.335645,0.606318,0.049618,0.335645,0.000000,0.000000,0.000000
1490,887,FR,1.618582,1.752840,0.611392,-0.611392,0.449153,-0.152146,-0.449153,0.152146,...,-0.009017,-0.012600,0.932633,-0.085690,0.356356,-0.063546,-0.085690,1.752840,0.000000,0.000000


In [9]:
# Display the German frame to check the newly added *_Threshold_* columns.
df_de

Unnamed: 0,DAY_ID,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,DE_FR_EXCHANGE,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_NET_IMPORT,FR_NET_IMPORT,...,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET,TARGET,DE_NET_IMPORT_Threshold_0.45,DE_CONSUMPTION_Threshold_1.2,DE_NET_EXPORT_Threshold_-0.45,DE_WINDPOW_Threshold_0.3
3,720,DE,-0.983324,-0.849198,-0.839586,0.839586,-0.270870,0.563230,0.270870,-0.563230,...,0.350938,-0.417514,0.911652,-0.296168,1.073948,-0.260356,0.270870,0.000000,-0.270870,0.000000
11,116,DE,-0.055692,-0.811337,0.237105,-0.237105,-0.851082,-1.091142,0.851082,1.091142,...,1.595158,0.158463,-0.359866,-0.203952,-0.376234,-0.133381,0.000000,0.000000,0.000000,0.000000
12,406,DE,0.532116,-0.331101,0.339942,-0.339942,-0.173123,-1.312029,0.173123,1.312029,...,1.241892,-0.206340,1.170760,0.133643,0.033874,0.196312,0.173123,0.000000,-0.173123,1.448078
13,1175,DE,-0.328286,-1.062255,-1.380464,1.380464,-1.046122,1.002243,1.046122,-1.002243,...,-0.634046,-0.168491,0.122818,0.220077,5.453331,-0.025477,0.000000,0.000000,0.000000,0.359210
14,309,DE,1.028987,1.629315,1.129663,-1.129663,-0.391261,-1.823117,0.391261,1.823117,...,-0.667496,-1.566773,0.689483,1.095473,0.342798,0.460278,0.391261,0.000000,-0.391261,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485,78,DE,0.810509,-0.235078,0.962203,-0.962203,0.971934,-0.121857,-0.971934,0.121857,...,1.400533,1.512197,0.793561,0.848558,0.517578,-0.015261,-0.971934,0.000000,0.971934,1.619323
1488,87,DE,-0.295522,-1.051247,-1.651718,1.651718,-1.969871,0.074044,1.969871,-0.074044,...,1.120555,0.865314,-0.423802,0.067325,1.558886,0.110206,0.000000,0.000000,0.000000,0.000000
1489,809,DE,1.529204,1.106682,-1.855327,1.855327,-0.218658,1.450426,0.218658,-1.450426,...,-0.035360,-0.032517,0.876984,0.819520,1.320373,-0.172597,0.218658,1.529204,-0.218658,0.000000
1491,1083,DE,0.856399,0.489199,-0.255778,0.255778,-1.531544,-0.829568,1.531544,0.829568,...,0.256338,0.402316,-1.112899,-0.237835,0.067152,0.151797,0.000000,0.000000,0.000000,0.000000


## Columns that should be kept, removed or transformed

#### Colonnes à garder df_fr:
- COAL_RET (seuil)
- FR_CONSUMPTION (seuil)
- FR_NUCLEAR (seuil)
- FR_HYDRO (seuil)

Parce que le coefficient de Spearman avec la TARGET est > 0.07

- DE_NET_EXPORT   
- DE_NET_IMPORT    
- DE_HYDRO         
- DE_WINDPOW      
- FR_WINDPOW      
- GAS_RET          
- CARBON_RET       

Ce serait bien de réduire le nombre de colonnes non-seuil en faisant une PCA (et de regrouper ensuite les variables PCA et les variables seuil).

#### Colonnes à garder df_de

- DE_NET_IMPORT (seuil)
- DE_CONSUMPTION (seuil)
- DE_NET_EXPORT (seuil)
- DE_WINDPOW (seuil)

Parce que le coefficient de Spearman avec la TARGET est > 0.10

- DE_GAS              
- DE_COAL             
- DE_HYDRO            
- FR_WINDPOW         
- DE_LIGNITE          
- DE_RESIDUAL_LOAD    
- DE_WIND            -


Ce serait bien de réduire le nombre de colonnes non-seuil en faisant une PCA (et de regrouper ensuite les variables PCA et les variables seuil).

## Modèle de base

In [32]:
# Baseline: a single linear regression on all rows (both countries), using
# every column of `df` except the target and the COUNTRY label.
X_all = df.drop(columns=["TARGET", "COUNTRY"])
y_all = df["TARGET"]


X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# `df` itself is never imputed in this notebook (only the df_fr / df_de copies
# are), and LinearRegression rejects NaN inputs. Fill the gaps with medians
# computed on the training split only, so no information from the test rows
# leaks into the fit. When there are no NaNs this is a no-op.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)
y_pred_test  = lr.predict(X_test)

def spearman_corr(y_true, y_pred):
    """Return the Spearman rank correlation between targets and predictions.

    Thin wrapper around ``scipy.stats.spearmanr`` that discards the p-value
    and returns only the correlation coefficient.
    """
    result = spearmanr(y_true, y_pred)
    return result.correlation

# Report the rank correlation on both splits, expressed as a percentage.
train_score = 100 * spearman_corr(y_train, y_pred_train)
test_score = 100 * spearman_corr(y_test, y_pred_test)
print("Corrélation (Spearman) train : {:.1f}%".format(train_score))
print("Corrélation (Spearman) test  : {:.1f}%".format(test_score))

Corrélation (Spearman) train : 29.2%
Corrélation (Spearman) test  : 20.2%


#### Modèle sur df_fr (en utilisant le df contenant les colonnes issues de la PCA plus les colonnes seuils)

Et regression pour le moment


#

#### Modèle sur df_de (en utilisant le df contenant les colonnes issues de la PCA plus les colonnes seuils)

Regression aussi

#### Assemblage des deux modèles et train/test comme le modèle de base pour comparer 
S'il reste du temps, il faudra penser à faire une validation croisée k-fold pour détecter l'overfitting : on divise les données en 5 morceaux, à chaque itération on change les morceaux qui servent à l'entraînement et celui qui sert au test, et on vérifie que le modèle généralise bien.

#### Autres modèles à faire ensuite

### Polynomial Regression (vue en lab2)

### Decision Tree Regressor (vue lab2)