In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.dummy import DummyRegressor

In [2]:
# Read the training table from the transformed-data directory.
train_csv_path = './data/transformed/train_final.csv'
df_train = pd.read_csv(train_csv_path)

# Train - Test

In [3]:
# Separate the 'target' column from the predictors.
y = df_train['target']
X = df_train.drop(columns='target')

# Hold out 0.2% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.002,
    random_state=42,
)

In [4]:
# Preview the training features; the notebook renders a truncated view of the frame.
X_train

Unnamed: 0,county,is_business,product_type,is_consumption,row_id,prediction_unit_id,euros_per_mwh,gas_price,year,sin_month,cos_month,sin_day,cos_day,sin_time,cos_time
423986,10,0,3,0,424112,39,181.38,77.91,2022,0.500000,8.660254e-01,-0.651372,-0.758758,-0.631088,-0.775711
629766,14,0,3,0,629892,54,90.84,80.00,2022,1.000000,6.123234e-17,-0.937752,0.347305,-0.398401,-0.917211
1579289,2,1,1,1,1579819,65,42.25,66.88,2023,0.500000,8.660254e-01,0.299363,-0.954139,0.730836,0.682553
1299685,15,0,1,1,1300079,57,94.92,116.65,2022,-0.866025,5.000000e-01,-0.790776,-0.612106,0.887885,0.460065
238386,13,0,1,0,238512,50,94.07,83.52,2021,-0.500000,8.660254e-01,-0.651372,-0.758758,-0.887885,0.460065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,13,0,3,0,259304,51,217.06,84.64,2021,-0.500000,8.660254e-01,-0.848644,0.528964,-0.942261,-0.334880
1414414,5,1,3,0,1414944,23,179.96,133.74,2022,-0.500000,8.660254e-01,-0.988468,0.151428,0.269797,0.962917
131932,15,0,1,0,131932,57,72.45,67.52,2021,-0.866025,5.000000e-01,0.101168,-0.994869,-0.398401,-0.917211
671155,7,1,3,1,671549,30,87.16,116.88,2022,0.866025,-5.000000e-01,0.988468,0.151428,0.136167,-0.990686


In [5]:
# Count missing values in the training target.
y_train.isna().sum()

0

In [6]:
# Number of rows in the training split.
len(X_train)

2010409

In [7]:
# Fit an ordinary least-squares baseline on the training split.
reg = LinearRegression().fit(X_train, y_train)

# In-sample predictions, used only to gauge the training error.
y_train_hat = reg.predict(X_train)

# Training MAE, computed with the same scikit-learn metric that the other
# evaluation cells of this notebook use, so the numbers are directly comparable.
mean_absolute_error(y_train, y_train_hat)

365.938814875946

In [8]:
def model_evaluator(model_trained, X=None, y=None):
    """Print MAE, MSE and MAPE for a fitted model on an evaluation set.

    Parameters
    ----------
    model_trained : estimator
        Any fitted object exposing ``predict(X)``.
    X : array-like, optional
        Features to evaluate on. Defaults to the notebook-level ``X_test``.
    y : array-like, optional
        True targets matching ``X``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    None
        The three metrics are printed, one per line.

    Notes
    -----
    MAPE divides each error by the magnitude of the true value, so targets
    at or very close to zero can make it astronomically large. Treat a huge
    MAPE as a sign of near-zero targets rather than as a usable score.
    """
    # Resolve the defaults at call time so the current hold-out split is used.
    features = X_test if X is None else X
    targets = y_test if y is None else y

    predictions = model_trained.predict(features)
    print('MAE', mean_absolute_error(targets, predictions))
    print('MSE', mean_squared_error(targets, predictions))
    print('MAPE:', mean_absolute_percentage_error(targets, predictions))

In [9]:
# Reference baseline using the 'mean' strategy, scored with the shared evaluator.
dummy_regr = DummyRegressor(strategy='mean')
dummy_regr.fit(X_train, y_train)
model_evaluator(dummy_regr)

MAE 373.89438219792544
MSE 867131.8698371053
MAPE: 2.219556908757536e+17


In [10]:
# Score the fitted linear regression with the same evaluator used for the dummy baseline.
model_evaluator(reg)

MAE 369.7301161563031
MSE 758084.0555965513
MAPE: 1.6783701191721706e+17


In [11]:
import xgboost as xgb

In [20]:
# One result record is appended per (n_estimators, max_depth) combination tried.
list_scores = []

# Each range below currently yields a single value (45 and 128 respectively).
# NOTE(review): the saved output of this cell reports max_depth 60, not 128;
# re-run the cell to bring code and output back in sync.
for i in range(45, 46, 15):
    for j in range(128, 129, 2):
        print(f"Test avec les paramètres n_estimators à {i} et max_depth à {j} :")

        # Build the regressor, then fit it in a separate step.
        temp_clf = xgb.XGBRegressor(
            n_estimators=i,
            max_depth=j,
            random_state=0,
            learning_rate=1.0,
        )
        temp_clf.fit(X_train, y_train)
        print("Entraînement terminé")

        # Predict on both splits so a train/test gap is visible in the results.
        # Note: this rebinds `y_train_hat` from the linear-regression cell.
        y_train_hat = temp_clf.predict(X_train)
        y_test_hat = temp_clf.predict(X_test)
        train_mae = mean_absolute_error(y_train, y_train_hat)
        test_mae = mean_absolute_error(y_test, y_test_hat)

        list_scores.append(
            dict(
                n_estimators=i,
                max_depth=j,
                train_mae=train_mae,
                test_mae=test_mae,
            )
        )

print("Fin de la série d'entraînement")

Test avec les paramètres n_estimators à 45 et max_depth à 60 :
Entraînement terminé
Fin de la série d'entraînement


In [21]:
# Collect the per-run records into a table.
scores = pd.DataFrame(list_scores)

# Rank the runs by test MAE, lowest first.
scores.sort_values(by='test_mae')

Unnamed: 0,n_estimators,max_depth,train_mae,test_mae
0,45,60,0.00057,35.385106
