In [None]:
import pandas as pd
import numpy as np

# Chargement des données

In [None]:
# Every CSV stores its row identifier in the first column, so it is read as the index.
x_train = (
    pd.read_csv('/x_train_final.csv', index_col=0)
    # the training file carries an extra 'Unnamed: 0' column that duplicates the index
    .drop(columns=['Unnamed: 0'])
)
x_test = pd.read_csv('/x_test_final.csv', index_col=0)

y_train = pd.read_csv('/y_train_final_j5KGWWK.csv', index_col=0)
y_sample = pd.read_csv('/y_sample_final.csv', index_col=0)

In [None]:
# Peek at the first rows to check that everything loaded correctly
x_train.head()

Unnamed: 0,train,gare,date,arret,p2q0,p3q0,p4q0,p0q2,p0q3,p0q4
0,VBXNMF,KYF,2023-04-03,8,0.0,0.0,1.0,-3.0,-1.0,-2.0
1,VBXNMF,JLR,2023-04-03,9,0.0,0.0,0.0,1.0,0.0,1.0
2,VBXNMF,EOH,2023-04-03,10,-1.0,0.0,0.0,-1.0,0.0,0.0
3,VBXNMF,VXY,2023-04-03,11,-1.0,-1.0,0.0,2.0,-2.0,0.0
4,VBXNMF,OCB,2023-04-03,12,-1.0,-1.0,-1.0,-1.0,3.0,2.0


Définition des fonctions de formatage

In [None]:
def format_df_x(df):
    """Return a formatted copy of a feature frame.

    The six ``p*q*`` columns are cast to ``int`` and the ``date`` column is
    parsed and replaced by three derived columns: ``annee`` (year),
    ``mois`` (month) and ``joursemaine`` (weekday, Monday = 0).
    The input frame is left untouched.
    """
    int_columns = ['p2q0', 'p3q0', 'p4q0', 'p0q2', 'p0q3', 'p0q4']

    # work on a copy so the caller's frame is never modified
    formatted = df.copy()
    formatted[int_columns] = formatted[int_columns].astype(int)

    # parse the dates once, then expose the calendar parts as plain columns
    dates = pd.to_datetime(formatted['date'])
    formatted['annee'] = dates.dt.year
    formatted['mois'] = dates.dt.month
    formatted['joursemaine'] = dates.dt.weekday

    # the raw date is no longer needed once its parts are extracted
    return formatted.drop(columns=['date'])

In [None]:
def format_df_y(df):
    """Return a copy of the target frame with its ``p0q0`` column cast to ``int``."""
    return df.assign(p0q0=df['p0q0'].astype(int))

Création de df plus propres

In [None]:
# Format features and targets.
# NOTE: the same names are rebound, so this cell cannot be re-run without reloading
# the data first: format_df_x expects a 'date' column, which it drops.
x_train, x_test = format_df_x(x_train), format_df_x(x_test)
y_train = format_df_y(y_train)

In [None]:
# Formatted training features: 'date' is replaced by annee / mois / joursemaine
x_train.head()

Unnamed: 0,train,gare,arret,p2q0,p3q0,p4q0,p0q2,p0q3,p0q4,annee,mois,joursemaine
0,VBXNMF,KYF,8,0,0,1,-3,-1,-2,2023,4,0
1,VBXNMF,JLR,9,0,0,0,1,0,1,2023,4,0
2,VBXNMF,EOH,10,-1,0,0,-1,0,0,2023,4,0
3,VBXNMF,VXY,11,-1,-1,0,2,-2,0,2023,4,0
4,VBXNMF,OCB,12,-1,-1,-1,-1,3,2,2023,4,0


In [None]:
# Formatted test features
x_test.head()

Unnamed: 0,train,gare,arret,p2q0,p3q0,p4q0,p0q2,p0q3,p0q4,annee,mois,joursemaine
0,ZPQEKP,VXY,12,0,0,-2,-4,-2,-4,2023,11,0
1,KIQSRA,VXY,12,0,0,-1,1,-1,0,2023,11,0
2,QQJYYT,VXY,12,0,1,-1,1,-1,1,2023,11,0
3,FVKYMZ,VXY,12,0,0,-1,-1,0,-1,2023,11,0
4,GXNZBY,AZA,12,1,-2,0,0,0,0,2023,11,0


In [None]:
# Formatted targets (p0q0 cast to int)
y_train.head()

Unnamed: 0,p0q0
0,-1
1,-1
2,-1
3,1
4,3


In [None]:
# (rows, columns) of the formatted training features
x_train.shape

(667264, 12)

In [None]:
# (rows, columns) of the targets; the row count should match x_train
y_train.shape

(667264, 1)

# Pipeline

In [None]:
# Hold out 30 % of the training rows for validation (fixed seed for reproducibility)
from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test = train_test_split(
    x_train,
    y_train,
    test_size=0.3,
    random_state=42,
)

Faisons un premier pipeline. L'idée de départ était de commencer par RandomForest puis de passer à XGBoost ; la cellule ci-dessous utilise directement XGBoost (`XGBRegressor`).

In [None]:
pip install xgboost



In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer

In [None]:
# Check the syntax for selecting two columns at once
x_train[['train', 'gare']]

Unnamed: 0,train,gare
0,VBXNMF,KYF
1,VBXNMF,JLR
2,VBXNMF,EOH
3,VBXNMF,VXY
4,VBXNMF,OCB
...,...,...
667259,BFNJJK,DGG
667260,BFNJJK,KDN
667261,BFNJJK,TVD
667262,BFNJJK,TXP


In [None]:
# Categorical identifiers, one-hot encoded later in the pipeline
one_hot_data = data_train.loc[:, ['train', 'gare']]
# Every other column is handed to the model as-is
data_numeric = data_train.drop(['train', 'gare'], axis=1)

In [None]:
# Columns that will be passed through without encoding
data_numeric.columns

Index(['arret', 'p2q0', 'p3q0', 'p4q0', 'p0q2', 'p0q3', 'p0q4', 'annee',
       'mois', 'joursemaine'],
      dtype='object')

In [None]:
# Baseline pipeline:
#   - 'train' and 'gare' are one-hot encoded (categories unseen at fit time are ignored)
#   - every other column is passed through unchanged
one_hot_encoding_columns, numeric_columns = one_hot_data.columns, data_numeric.columns

preprocessor = ColumnTransformer(
    transformers=[
        ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore'), one_hot_encoding_columns),
        ('passthrough-numeric', 'passthrough', numeric_columns),
    ]
)
model = make_pipeline(preprocessor, XGBRegressor())

model.fit(data_train, target_train)

In [None]:
# Performance check: mean absolute error of the baseline pipeline on the held-out split
from sklearn.metrics import mean_absolute_error
predictions = model.predict(data_test)
mae = mean_absolute_error(target_test, predictions)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.7828776836395264


# Pipeline avec df ext

In [None]:
# Load the extended ('ext') versions of the test and train features
x_test_ext = pd.read_csv('/x_test_ext.csv', index_col=0)
x_train_ext = pd.read_csv('/x_train_ext.csv', index_col=0)


In [None]:
# Preview of the extended training features
x_train_ext.head()

Unnamed: 0,train,gare,arret,p2q0,p3q0,p4q0,p0q2,p0q3,p0q4,mois,...,RR,TN,TX,TM,DG,FFM,FXY,DXY,DRR,vacances
0,VBXNMF,KYF,8,0,0,1,-3,-1,-2,4,...,0.0,5.4,12.9,8.5,0.0,3.9,6.2,60.0,0.0,0
1,VBXNMF,JLR,9,0,0,0,1,0,1,4,...,0.0,5.4,12.9,8.5,0.0,3.9,6.2,60.0,0.0,0
2,VBXNMF,EOH,10,-1,0,0,-1,0,0,4,...,0.0,5.4,12.9,8.5,0.0,3.9,6.2,60.0,0.0,0
3,VBXNMF,VXY,11,-1,-1,0,2,-2,0,4,...,0.0,5.4,12.9,8.5,0.0,3.9,6.2,60.0,0.0,0
4,VBXNMF,OCB,12,-1,-1,-1,-1,3,2,4,...,0.0,5.4,12.9,8.5,0.0,3.9,6.2,60.0,0.0,0


In [None]:
# Preview of the extended test features
x_test_ext.head()

Unnamed: 0,train,gare,arret,p2q0,p3q0,p4q0,p0q2,p0q3,p0q4,mois,...,RR,TN,TX,TM,DG,FFM,FXY,DXY,DRR,vacances
0,ZPQEKP,VXY,12,0,0,-2,-4,-2,-4,11,...,1.4,11.9,17.1,14.5,0.0,4.4,7.7,250.0,243.0,0
1,KIQSRA,VXY,12,0,0,-1,1,-1,0,11,...,1.4,11.9,17.1,14.5,0.0,4.4,7.7,250.0,243.0,0
2,QQJYYT,VXY,12,0,1,-1,1,-1,1,11,...,1.4,11.9,17.1,14.5,0.0,4.4,7.7,250.0,243.0,0
3,FVKYMZ,VXY,12,0,0,-1,-1,0,-1,11,...,1.4,11.9,17.1,14.5,0.0,4.4,7.7,250.0,243.0,0
4,GXNZBY,AZA,12,1,-2,0,0,0,0,11,...,1.4,11.9,17.1,14.5,0.0,4.4,7.7,250.0,243.0,0


In [None]:
# (rows, columns) of the extended training features
x_train_ext.shape

(667264, 21)

In [None]:
# (rows, columns) of the extended test features
x_test_ext.shape

(20657, 21)

In [None]:
# Same 70/30 split with the same seed, this time on the extended features.
# NOTE: this rebinds data_train / data_test / target_train / target_test.
from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test = train_test_split(
    x_train_ext, y_train, test_size=0.3, random_state=42
)

In [None]:
# These must be recomputed: the extended split has additional columns
one_hot_data = data_train.loc[:, ['train', 'gare']]
data_numeric = data_train.drop(['train', 'gare'], axis=1)

In [None]:
# Extended-feature pipeline.
# 'train' and 'gare' are still one-hot encoded; every other column of the
# extended split is passed through untouched.
one_hot_encoding_columns, numeric_columns = one_hot_data.columns, data_numeric.columns

preprocessor = ColumnTransformer(
    transformers=[
        ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore'), one_hot_encoding_columns),
        ('passthrough-numeric', 'passthrough', numeric_columns),
    ]
)

# Hard-coded XGBoost hyperparameters (they look like the output of a tuning run — TODO confirm)
model_ext = make_pipeline(
    preprocessor,
    XGBRegressor(**{
        'n_estimators': 917,
        'max_depth': 10,
        'learning_rate': 0.28441939194671634,
        'subsample': 0.9998173252736964,
        'colsample_bytree': 0.5683749396215096,
        'min_child_weight': 1,
    }),
)

model_ext.fit(data_train, target_train)

In [None]:
# Performance check
# Moment of truth: try to beat the baseline MAE of 0.7828
from sklearn.metrics import mean_absolute_error
predictions = model_ext.predict(data_test)
mae = mean_absolute_error(target_test, predictions)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.7311258912086487


C'est un peu mieux (MAE de 0.731 contre 0.783), même si le gain a l'air assez marginal. Peu importe : on va entraîner le XGBoost sur ces données étendues, et même si ça ne nous fait gagner qu'une place au classement, c'est toujours ça de pris.

# LASSO

On va essayer un Lasso pour voir quelles colonnes ont réellement de l'importance et, le cas échéant, vérifier si le modèle marche mieux ainsi.

In [None]:
# prompt (translated): build a Lasso that outputs which columns of x_train_ext really
# influence the model. 'train' and 'gare' are one-hot encoded before entering the
# model, since they are strings and would otherwise cause problems.

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# Prepare the data for Lasso (same column split as the XGBoost pipeline)
one_hot_data = data_train[['train', 'gare']]
data_numeric = data_train.drop(columns=['train', 'gare'])

one_hot_encoding_columns = one_hot_data.columns
numeric_columns = data_numeric.columns

# Lasso gets its own preprocessor, kept separate from the global `preprocessor`
# so that the XGBoost pipelines are unaffected:
#   - unlike XGBoost, Lasso rejects NaN, so numeric columns are median-imputed
#     (the un-imputed version of this cell failed with "Input X contains NaN");
#   - the L1 penalty is scale-sensitive, so numeric columns are standardised,
#     which also makes coefficient magnitudes comparable across features.
lasso_preprocessor = ColumnTransformer(
    [
        ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore'), one_hot_encoding_columns),
        ('numeric', make_pipeline(SimpleImputer(strategy='median'), StandardScaler()), numeric_columns),
    ],
    # keep plain column names (no 'transformer__' prefix) in the coefficient table
    verbose_feature_names_out=False,
)

# Create and train the Lasso model
lasso_model = make_pipeline(
    lasso_preprocessor,
    Lasso(alpha=0.1),  # alpha may need tuning
)
lasso_model.fit(data_train, target_train)

# Feature names come from the fitted transformer itself, so they stay aligned with
# the coefficients even if preprocessing drops a column (e.g. an all-NaN one).
feature_names = lasso_model.named_steps['columntransformer'].get_feature_names_out()
# ravel() guarantees a 1-D vector even though the target is passed as a one-column frame
lasso_coefficients = lasso_model.named_steps['lasso'].coef_.ravel()

# One row per feature, most influential (largest absolute coefficient) first
feature_importance = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': lasso_coefficients})
    .sort_values('Coefficient', key=abs, ascending=False)
)

# Lasso drives the coefficients of irrelevant features to exactly zero:
# display only the features that survived the penalty
feature_importance[feature_importance['Coefficient'] != 0]


ValueError: Input X contains NaN.
Lasso does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

# Fine tuning

Je ne sais pas le faire moi-même, donc je laisse Gemini m'écrire une recherche Optuna pour trouver le meilleur modèle XGBoost, et je ferai une autre soumission avec.

In [None]:
pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
# prompt (translated): use optuna to find the best model over the XGBoost hyperparameters

import numpy as np
import optuna
from sklearn.metrics import mean_absolute_error

def objective(trial):
    """Optuna objective: validation MAE of an XGBoost pipeline for one trial.

    Fits on the global ``data_train`` / ``target_train`` split and scores on
    ``data_test`` / ``target_test``. Lower is better.
    """
    # Search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
    }

    # Create and train the XGBoost pipeline with the suggested hyperparameters
    model = make_pipeline(
        preprocessor,
        XGBRegressor(**params)
    )
    model.fit(data_train, target_train)

    # Evaluate the model and return the MAE
    # NOTE: data_test is also the set used to report scores elsewhere in the notebook,
    # so the best value found here is an optimistic estimate.
    predictions = model.predict(data_test)
    mae = mean_absolute_error(target_test, predictions)
    return mae

# Create an Optuna study; the sampler is seeded so the search is reproducible
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Optimize the hyperparameters
study.optimize(objective, n_trials=100) # Adjust n_trials as needed

# Print the best hyperparameters and MAE
print('Best trial:')
trial = study.best_trial
print(f'  Value: {trial.value}')
print(f'  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

# Train a final model with the best hyperparameters
best_params = study.best_params
best_model = make_pipeline(
    preprocessor,
    XGBRegressor(**best_params)
)
# Refit on the entire *extended* training set: both the preprocessor's column list
# and the tuned hyperparameters come from the extended split, and x_train does not
# contain the extra columns the preprocessor selects.
best_model.fit(x_train_ext, y_train)


# Now you can use 'best_model' to make predictions on new data


# Now you can use 'best_model' to make predictions on new data


[I 2025-03-18 10:09:20,323] A new study created in memory with name: no-name-6f8968fa-53c4-443d-aac9-5d1205564395
[I 2025-03-18 10:10:09,116] Trial 0 finished with value: 0.7759221196174622 and parameters: {'n_estimators': 519, 'max_depth': 8, 'learning_rate': 0.19370020761329085, 'subsample': 0.8926508218092969, 'colsample_bytree': 0.7844657939828439, 'min_child_weight': 8}. Best is trial 0 with value: 0.7759221196174622.
[I 2025-03-18 10:10:26,719] Trial 1 finished with value: 0.8162814378738403 and parameters: {'n_estimators': 250, 'max_depth': 5, 'learning_rate': 0.12771843865850677, 'subsample': 0.6489404090081216, 'colsample_bytree': 0.7142807188718877, 'min_child_weight': 4}. Best is trial 0 with value: 0.7759221196174622.
[I 2025-03-18 10:11:49,848] Trial 2 finished with value: 0.7749087810516357 and parameters: {'n_estimators': 766, 'max_depth': 7, 'learning_rate': 0.05651072842592158, 'subsample': 0.9590120433927528, 'colsample_bytree': 0.8869350616957978, 'min_child_weight':

n_estimators = 666, max_depth = 9, learning_rate = 0.1496192393803131, subsample = 0.964966489787134, colsample_bytree = 0.9156754709631588, min_child_weight = 10

# Predictions

faisons les prédictions (on passe l'étape validation)

In [None]:
# Predict with the baseline pipeline `model` on the (non-extended) test features
pred = model.predict(x_test)

In [None]:
# Raw predictions, before rounding
pred

array([-1.2079622e+00,  1.1350021e-01,  2.3593764e-01, ...,
       -1.1265864e+00,  3.4785736e-01, -1.6905501e-04], dtype=float32)

In [None]:
# One row per test sample, indexed like x_test
submission = pd.DataFrame({'p0q0': pred}, index=x_test.index)

In [None]:
# First rows of the unrounded baseline submission
submission.head()

Unnamed: 0,p0q0
0,-1.207962
1,0.1135
2,0.235938
3,-0.34339
4,-0.199952


In [None]:
# The target is an integer, so round the regression output and cast it to int
submission_final = submission.assign(p0q0=submission['p0q0'].round().astype(int))
submission_final.head()

Unnamed: 0,p0q0
0,-1
1,0
2,0
3,0
4,0


In [None]:
# The submission should have one row per test sample (compare with x_test.shape)
submission_final.shape

(20657, 1)

In [None]:
# (rows, columns) of the test features, for comparison with the submission
x_test.shape

(20657, 12)

In [None]:
# Write the rounded baseline submission to disk
submission_final.to_csv('submission_final_adrien_2.csv')

# Prédiction best model

In [None]:
# Predict with the extended-feature pipeline `model_ext` on the extended test features
best_pred = model_ext.predict(x_test_ext)

In [None]:
# Index the submission by the frame the predictions were actually computed from
# (x_test_ext), rather than by the separately loaded x_test.
submission = pd.DataFrame({'p0q0': best_pred}, index=x_test_ext.index)

In [None]:
# First rows of the unrounded extended-model submission
submission.head()

Unnamed: 0,p0q0
0,-0.572579
1,0.436224
2,0.532242
3,0.254028
4,-0.336687


In [None]:
# The target is an integer, so round the regression output and cast it to int
submission_final = submission.assign(p0q0=submission['p0q0'].round().astype(int))
submission_final.head()

Unnamed: 0,p0q0
0,-1
1,0
2,1
3,0
4,0


In [None]:
# Write the rounded extended-model submission to disk
submission_final.to_csv('submission_final_adrien_4.csv')