In [4]:
import pandas as pd
import numpy as np

from skrub import TableVectorizer
import xgboost as xgb

from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline

import holidays

from datetime import datetime

from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from sklearn.compose import ColumnTransformer


from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split

import optuna

import datetime

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

from lightgbm import LGBMRegressor



In [5]:
# Import the files
df_train = pd.read_parquet("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data/train.parquet")
df_test = pd.read_parquet("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data/final_test.parquet")

In [6]:
# Add external data : weather data
weather = pd.read_csv(
    "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/weather_data.csv.gz",
    parse_dates=["AAAAMMJJHH"],
    date_format="%Y%m%d%H",
    compression="gzip",
    sep=";",
).rename(columns={"AAAAMMJJHH": "date"})

weather = weather[
    (weather["date"] >= df_train["date"].min() - datetime.timedelta(hours=1))
    & (weather["date"] <= df_test["date"].max() + datetime.timedelta(hours=1))
]

weather_reduced = (
    weather.drop(columns=["NUM_POSTE", "NOM_USUEL", "LAT", "LON", "QDXI3S"])
    .groupby("date")
    .mean()
    .dropna(axis=1, how="all")
    .interpolate(method="linear")
)

weather_reduced = (
    weather_reduced
    .drop(columns=[
        "PSTAT", "DD", "PMER", "PMERMIN", "QNEIGETOT", "QTCHAUSSEE", "ALTI", "QDRR1", "DXY", "FXY",
        "QTNSOL", "QPMER", "DXI", "QFF", "QGLO2", "QGLO", "FF", "QHFXI3S", "QINS2", "QINS",
        "QFXI3S", "RR1", "NEIGETOT", 'HXI', 'HFXI3S', "HTN", "HTX", "HUN", "HUX", "FXI3S",
        "T10", "T20", "T50", "T100", "TNSOL", "TN50", "TCHAUSSEE", "TN", "TX"
    ])
    .dropna(axis=1, how="all")
    .loc[:, weather_reduced.nunique(dropna=True) > 1]
    .drop(columns=["QTD", "QTN", "QUN", "QUX", "QTSV", "QTX", "GLO2", "INS2", "UN", "UX"])
)

# We merge :
df_train = df_train.merge(weather_reduced, left_on="date", right_on="date", how="left")
df_test = df_test.merge(weather_reduced, left_on="date", right_on="date", how="left")

In [7]:
# Add jour ferie data
jour_feries = (
    pd.read_csv(
        "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/jours_feries_metropole.csv",
        date_format="%Y%m%d%H"  # Ensure date format is handled correctly
    )
    .drop(columns=["annee", "zone"])  # Drop unnecessary columns
)

# Convert 'date' column to datetime
jour_feries['date'] = pd.to_datetime(jour_feries['date'])

# Filter rows based on the date range of df_train and df_test
jour_feries = jour_feries[
    (jour_feries["date"] >= df_train["date"].min() - datetime.timedelta(hours=1))
    & (jour_feries["date"] <= df_test["date"].max() + datetime.timedelta(hours=1))
]

In [8]:
# Add mouvements sociaux data :
mouvements_sociaux = (
    pd.read_csv(
        "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/mouvements-sociaux-depuis-2002.csv",
        date_format="%Y%m%d%H",
        sep=";"
    )
    .drop(columns=['date_de_fin', 'Organisations syndicales', 'Métiers ciblés par le préavis',
                   'Population devant travailler ciblee par le préavis', 'Nombre de grévistes du préavis'])  # Drop unnecessary columns
)

mouvements_sociaux['Date'] = pd.to_datetime(mouvements_sociaux['Date'])

mouvements_sociaux = mouvements_sociaux[
    (mouvements_sociaux["Date"] >= df_train["date"].min() - datetime.timedelta(hours=1))
    & (mouvements_sociaux["Date"] <= df_test["date"].max() + datetime.timedelta(hours=1))
]

mouvements_sociaux = mouvements_sociaux[mouvements_sociaux['Date'] != pd.Timestamp('2021-03-08')]

In [9]:
# Extract the date feature on different time scales :

fr_holidays = holidays.France()

def _encode_dates(X):
    X = X.copy()  # Modify a copy of X

    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # Creation of a binary variable depicting if the day is a weekend
    X["is_weekend"] = np.where(X["weekday"] + 1 > 5, 1, 0)

    # Add a feature to indicate if the day is a holiday in France
    X["is_holiday"] = X["date"].apply(lambda d: 1 if d in fr_holidays else 0)

    # Add a feature to indicate if it is a jour férié in France
    X["is_jour_ferie"] = X["date"].dt.date.isin(jour_feries["date"]).astype(int)

    # Add a feature to indicate if it is a jour of "mouvement social" in France
    X["is_jour_mouvement_social"] = X["date"].dt.date.isin(mouvements_sociaux["Date"]).astype(int)

    # Add morning rush and evening rush features
    X["is_working_day"] = np.where((X["weekday"] + 1 <= 5), 1, 0)
    X["morning_rush"] = ((X["hour"].between(7, 9)) & X["is_working_day"]).astype(int)
    X["evening_rush"] = ((X["hour"].between(17, 19)) & X["is_working_day"]).astype(int)

    # Add the season feature
    def season_date(date):
        if (date > datetime.datetime(2020, 9, 21)) & (date < datetime.datetime(2020, 12, 21)):
            return 1  # Autumn
        if (date > datetime.datetime(2020, 12, 20)) & (date < datetime.datetime(2021, 3, 20)):
            return 2  # Winter
        if (date > datetime.datetime(2021, 3, 19)) & (date < datetime.datetime(2021, 6, 21)):
            return 3  # Spring
        if ((date > datetime.datetime(2021, 6, 20)) & (date < datetime.datetime(2021, 9, 22))) or \
           ((date > datetime.datetime(2020, 6, 19)) & (date < datetime.datetime(2020, 9, 22))):
            return 4  # Summer
        return 0  # Fallback if none matches

    X["season"] = X["date"].apply(season_date)

    return X

df_train = _encode_dates(df_train)
df_test = _encode_dates(df_test)


In [10]:
import geopandas as gpd
from shapely.geometry import Point

# To add an "arrondissement" feature based on latitute ande longitude
def arrondissement(X, shapefile_path="/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/arrondissements.shp"):

    arrondissements = gpd.read_file(shapefile_path)

    # Create a GeoDataFrame for the input dataset
    X = X.copy()  # Work on a copy of the dataset
    X["geometry"] = X.apply(lambda row: Point(row["longitude"], row["latitude"]), axis=1)
    gdf = gpd.GeoDataFrame(X, geometry="geometry", crs=arrondissements.crs)

    # Perform a spatial join to match points to arrondissements
    merged = gpd.sjoin(gdf, arrondissements, how="left", predicate="within")

    # Extract the arrondissement code (e.g., "c_ar") and fill missing values with 21
    X["district"] = merged["c_ar"].fillna(21).astype(int)

    # Drop the geometry column (optional, if not needed further)
    X = X.drop(columns=["geometry"])

    return X

df_train = arrondissement(df_train)
df_test = arrondissement(df_test)

In [11]:
# To add covid features : one binary feature for lockdown and one binary feature for curfew periods
def covid_features(data):
    # Lockdown periods
    lockdown_periods = [
        ("2020-10-30", "2020-12-15"),
        ("2021-04-03", "2021-05-03"),
    ]

    # Binary column for lockdown
    data["is_lockdown"] = 0
    for start_date, end_date in lockdown_periods:
        data.loc[
            (data["date"] >= start_date) & (data["date"] < end_date),
            "is_lockdown"
        ] = 1

    # Curfew periods with specific time restrictions
    curfew_periods = [
        ("2020-10-17", "2020-10-30", 21, 6),  # Curfew from 9 PM to 6 AM
        ("2020-12-16", "2021-01-15", 20, 6),  # Curfew from 8 PM to 6 AM
        ("2021-01-15", "2021-03-20", 19, 6),  # Curfew from 7 PM to 6 AM
        ("2021-03-20", "2021-04-03", 18, 6),  # Curfew from 6 PM to 6 AM
        ("2021-05-03", "2021-06-09", 19, 6),  # Curfew from 7 PM to 6 AM
        ("2021-06-09", "2021-06-20", 23, 6),  # Curfew from 11 PM to 6 AM
    ]

    # Binary column for curfew
    data["is_curfew"] = 0
    for start_date, end_date, start_hour, end_hour in curfew_periods:
        data.loc[
            (data["date"] >= start_date) & (data["date"] < end_date)
            & ((data["hour"] >= start_hour) | (data["hour"] < end_hour)),
            "is_curfew"
        ] = 1

    return data

# Apply the function to your datasets
df_train = covid_features(df_train)
df_test = covid_features(df_test)


# remove the date column
df_train = df_train.drop(columns=['date', 'is_working_day'])
df_test = df_test.drop(columns=['date', 'is_working_day'])


# remove the date column
# df_train = df_train.drop(columns=['date'])
# df_test = df_test.drop(columns=['date'])

In [12]:
df_train = df_train.drop(columns=["counter_id", "site_id", "counter_technical_id", "coordinates"])
df_test = df_test.drop(columns=["counter_id", "site_id", "counter_technical_id", "coordinates"])

In [13]:
'''
# Extract features from counter_installation_date
for df in [df_train, df_test]:
    df["installation_year"] = df["counter_installation_date"].dt.year
    df["installation_month"] = df["counter_installation_date"].dt.month

df_train = df_train.drop(columns=["counter_installation_date"])
df_test = df_test.drop(columns=["counter_installation_date"])
'''


'\n# Extract features from counter_installation_date\nfor df in [df_train, df_test]:\n    df["installation_year"] = df["counter_installation_date"].dt.year\n    df["installation_month"] = df["counter_installation_date"].dt.month\n\ndf_train = df_train.drop(columns=["counter_installation_date"])\ndf_test = df_test.drop(columns=["counter_installation_date"])\n'

In [14]:
df_train.dtypes

counter_name                       category
site_name                          category
bike_count                          float64
counter_installation_date    datetime64[us]
latitude                            float64
longitude                           float64
log_bike_count                      float64
DRR1                                float64
HXY                                 float64
FXI                                 float64
T                                   float64
QT                                  float64
TD                                  float64
DG                                  float64
U                                   float64
QU                                  float64
DHUMI40                             float64
DHUMI80                             float64
TSV                                 float64
VV                                  float64
WW                                  float64
GLO                                 float64
INS                             

In [15]:
# Preprocessing test :

ordinal_cols = [
    "counter_installation_date"
]

onehot_cols = [
    "counter_name",
    "site_name",
]

scale_cols = [
    "latitude",
    "longitude",
    "year",
    "month",
    "day",
    "weekday",
    "is_weekend",
    "hour",
    "is_holiday",
    "is_jour_ferie",
    "is_jour_mouvement_social",
    "morning_rush",
    "evening_rush",
    "season",
    "district",
    "is_lockdown",
    "is_curfew",
    "T", "TD", "DG", "U", "QU", "DHUMI40", "DHUMI80", "TSV", "VV", "WW", "GLO", "INS",
]

scaler = StandardScaler()
onehot = OneHotEncoder(sparse_output=False)
ordinal = OrdinalEncoder()


# Create the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("scale", scaler, scale_cols),
        ("onehot", onehot, onehot_cols),
        ("ordinal", ordinal, ordinal_cols),
    ]
)

# Define the full pipeline
def create_pipeline(params):
    model = LGBMRegressor(**params, random_state=42)
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    return pipeline

In [16]:
X_train = df_train.drop(columns=["bike_count", "log_bike_count"])
y_train = df_train["log_bike_count"]

X_test = df_test.copy()

In [17]:
# Split the subset into train and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [18]:
# Define the Optuna objective function
def objective(trial):
    # Suggest hyperparameters
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", -1, 10),  # -1 means no limit
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-8, 10.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0),
    }

    # Create pipeline with suggested parameters
    pipeline = create_pipeline(param)

    # Train the pipeline
    pipeline.fit(X_train_split, y_train_split)

    # Predict on validation set
    y_pred = pipeline.predict(X_val_split)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val_split, y_pred))
    return rmse

# Create an Optuna study and optimize
study = optuna.create_study(direction="minimize")  # Minimize RMSE
study.optimize(objective, n_trials=50, timeout=1200)  # Adjust n_trials and timeout as needed

# Get the best parameters and score
print("Best Parameters:", study.best_params)
print("Best RMSE:", study.best_value)

[I 2024-12-11 14:32:18,074] A new study created in memory with name: no-name-82ff662e-d218-405c-854b-edbc02ade25d


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010755 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:20,452] Trial 0 finished with value: 0.41084683257252286 and parameters: {'n_estimators': 116, 'learning_rate': 0.2946849073590842, 'max_depth': 7, 'num_leaves': 96, 'min_child_samples': 44, 'min_child_weight': 7.309480745631088, 'subsample': 0.512619694124286, 'colsample_bytree': 0.666596754608566, 'reg_alpha': 6.36654963598149, 'reg_lambda': 4.852063960759792}. Best is trial 0 with value: 0.41084683257252286.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010683 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:24,000] Trial 1 finished with value: 0.38511771390858823 and parameters: {'n_estimators': 235, 'learning_rate': 0.2767729366480475, 'max_depth': 8, 'num_leaves': 53, 'min_child_samples': 36, 'min_child_weight': 1.3538851991549625, 'subsample': 0.7638974825391194, 'colsample_bytree': 0.5680217834830527, 'reg_alpha': 4.698917578241141, 'reg_lambda': 9.444990325824781}. Best is trial 1 with value: 0.38511771390858823.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011101 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:25,934] Trial 2 finished with value: 0.5585662608476133 and parameters: {'n_estimators': 314, 'learning_rate': 0.15785026029064367, 'max_depth': 3, 'num_leaves': 51, 'min_child_samples': 42, 'min_child_weight': 4.5288422323930915, 'subsample': 0.6924135811107822, 'colsample_bytree': 0.6582512012809618, 'reg_alpha': 0.035542146615395724, 'reg_lambda': 0.23617320857950466}. Best is trial 1 with value: 0.38511771390858823.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:28,782] Trial 3 finished with value: 0.7307265523284227 and parameters: {'n_estimators': 154, 'learning_rate': 0.014224511192042343, 'max_depth': 8, 'num_leaves': 51, 'min_child_samples': 43, 'min_child_weight': 7.851391688974217, 'subsample': 0.7965613409271524, 'colsample_bytree': 0.5799002599731915, 'reg_alpha': 4.605684010631518, 'reg_lambda': 0.7506507634851236}. Best is trial 1 with value: 0.38511771390858823.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010486 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:31,375] Trial 4 finished with value: 0.4106802113032166 and parameters: {'n_estimators': 134, 'learning_rate': 0.27086451301171793, 'max_depth': -1, 'num_leaves': 47, 'min_child_samples': 20, 'min_child_weight': 6.866044414482618, 'subsample': 0.7838488492677855, 'colsample_bytree': 0.5128851368181386, 'reg_alpha': 9.319794938365416, 'reg_lambda': 1.3244026962688062}. Best is trial 1 with value: 0.38511771390858823.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010505 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:32,801] Trial 5 finished with value: 0.8392635766916792 and parameters: {'n_estimators': 185, 'learning_rate': 0.1948434240136297, 'max_depth': 1, 'num_leaves': 44, 'min_child_samples': 17, 'min_child_weight': 9.622155355972204, 'subsample': 0.9707760233698353, 'colsample_bytree': 0.5178743889623831, 'reg_alpha': 6.354630786038359, 'reg_lambda': 8.027337632340608}. Best is trial 1 with value: 0.38511771390858823.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012321 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:34,536] Trial 6 finished with value: 0.4423268351808032 and parameters: {'n_estimators': 130, 'learning_rate': 0.20623076572455212, 'max_depth': 10, 'num_leaves': 25, 'min_child_samples': 33, 'min_child_weight': 7.471796508764883, 'subsample': 0.6270353619305624, 'colsample_bytree': 0.8564050803954888, 'reg_alpha': 7.313308146469816, 'reg_lambda': 3.964809206792346}. Best is trial 1 with value: 0.38511771390858823.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:37,384] Trial 7 finished with value: 0.37848030737637384 and parameters: {'n_estimators': 142, 'learning_rate': 0.29533201198540654, 'max_depth': 8, 'num_leaves': 94, 'min_child_samples': 15, 'min_child_weight': 3.099605895486466, 'subsample': 0.8653134263946272, 'colsample_bytree': 0.8787675751583099, 'reg_alpha': 7.949263468598635, 'reg_lambda': 2.374833629018138}. Best is trial 7 with value: 0.37848030737637384.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010385 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:38,950] Trial 8 finished with value: 0.8236052696891426 and parameters: {'n_estimators': 434, 'learning_rate': 0.11301137016395599, 'max_depth': 1, 'num_leaves': 96, 'min_child_samples': 45, 'min_child_weight': 5.632214292525414, 'subsample': 0.9228530425735224, 'colsample_bytree': 0.899865082542399, 'reg_alpha': 1.1380406091630304, 'reg_lambda': 3.75756856168435}. Best is trial 7 with value: 0.37848030737637384.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:40,966] Trial 9 finished with value: 0.511153633982163 and parameters: {'n_estimators': 195, 'learning_rate': 0.18209075193073698, 'max_depth': 4, 'num_leaves': 70, 'min_child_samples': 40, 'min_child_weight': 7.010648595705355, 'subsample': 0.5955280768649158, 'colsample_bytree': 0.8239259685988409, 'reg_alpha': 5.729996317472562, 'reg_lambda': 0.27173937283435107}. Best is trial 7 with value: 0.37848030737637384.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011500 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:47,059] Trial 10 finished with value: 0.39163238074959045 and parameters: {'n_estimators': 351, 'learning_rate': 0.061721094073143704, 'max_depth': 10, 'num_leaves': 74, 'min_child_samples': 11, 'min_child_weight': 1.5982790331151973, 'subsample': 0.8830820053808683, 'colsample_bytree': 0.9876255384851784, 'reg_alpha': 9.913223167904809, 'reg_lambda': 6.437846875497397}. Best is trial 7 with value: 0.37848030737637384.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010647 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:50,875] Trial 11 finished with value: 0.3782904580971034 and parameters: {'n_estimators': 244, 'learning_rate': 0.2399310396016418, 'max_depth': 7, 'num_leaves': 80, 'min_child_samples': 28, 'min_child_weight': 1.3801969062551191, 'subsample': 0.8526007808502738, 'colsample_bytree': 0.746474887144793, 'reg_alpha': 3.3903143830447693, 'reg_lambda': 9.501002095234131}. Best is trial 11 with value: 0.3782904580971034.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:54,111] Trial 12 finished with value: 0.3931924024421476 and parameters: {'n_estimators': 259, 'learning_rate': 0.23879912447584756, 'max_depth': 6, 'num_leaves': 82, 'min_child_samples': 25, 'min_child_weight': 3.213629087502473, 'subsample': 0.8619420454972752, 'colsample_bytree': 0.7587434877182058, 'reg_alpha': 2.834481687490226, 'reg_lambda': 2.091309346308231}. Best is trial 11 with value: 0.3782904580971034.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:32:57,175] Trial 13 finished with value: 0.397512650243372 and parameters: {'n_estimators': 245, 'learning_rate': 0.23826240853857128, 'max_depth': 6, 'num_leaves': 85, 'min_child_samples': 27, 'min_child_weight': 0.21601733412086244, 'subsample': 0.9888295811114409, 'colsample_bytree': 0.7564239246840917, 'reg_alpha': 2.8270843658854212, 'reg_lambda': 9.990635824562636}. Best is trial 11 with value: 0.3782904580971034.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:33:03,170] Trial 14 finished with value: 0.3576284565224436 and parameters: {'n_estimators': 380, 'learning_rate': 0.2359669902080042, 'max_depth': 9, 'num_leaves': 85, 'min_child_samples': 11, 'min_child_weight': 3.1933162624500415, 'subsample': 0.8432615067857263, 'colsample_bytree': 0.9312532475405868, 'reg_alpha': 8.23464565512593, 'reg_lambda': 6.944736811445308}. Best is trial 14 with value: 0.3576284565224436.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:33:08,666] Trial 15 finished with value: 0.35650017201372297 and parameters: {'n_estimators': 395, 'learning_rate': 0.23453640671618584, 'max_depth': 10, 'num_leaves': 66, 'min_child_samples': 50, 'min_child_weight': 2.992224459721964, 'subsample': 0.7086215507343497, 'colsample_bytree': 0.9981483283773371, 'reg_alpha': 2.816355341922786, 'reg_lambda': 7.160441821705223}. Best is trial 15 with value: 0.35650017201372297.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009621 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:33:15,359] Trial 16 finished with value: 0.3660567309904391 and parameters: {'n_estimators': 497, 'learning_rate': 0.12060109944720425, 'max_depth': 10, 'num_leaves': 63, 'min_child_samples': 50, 'min_child_weight': 3.7389578037607545, 'subsample': 0.7058780086580194, 'colsample_bytree': 0.9863033113738611, 'reg_alpha': 8.317909627596473, 'reg_lambda': 7.1884417036207555}. Best is trial 15 with value: 0.35650017201372297.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010793 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:33:20,753] Trial 17 finished with value: 0.3604987656861655 and parameters: {'n_estimators': 397, 'learning_rate': 0.2095701888889186, 'max_depth': 9, 'num_leaves': 64, 'min_child_samples': 50, 'min_child_weight': 5.22383741561786, 'subsample': 0.6838500446573337, 'colsample_bytree': 0.9342980859672086, 'reg_alpha': 1.352426266993218, 'reg_lambda': 6.39854062510668}. Best is trial 15 with value: 0.35650017201372297.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013316 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:33:24,381] Trial 18 finished with value: 0.41799970951808335 and parameters: {'n_estimators': 407, 'learning_rate': 0.14715051084430114, 'max_depth': 5, 'num_leaves': 39, 'min_child_samples': 22, 'min_child_weight': 2.0763042897063233, 'subsample': 0.6092542969814875, 'colsample_bytree': 0.9374652163169342, 'reg_alpha': 3.981237918018313, 'reg_lambda': 7.956119065453167}. Best is trial 15 with value: 0.35650017201372297.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009549 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:33:31,570] Trial 19 finished with value: 0.3510063912781407 and parameters: {'n_estimators': 483, 'learning_rate': 0.2507707148900671, 'max_depth': 9, 'num_leaves': 88, 'min_child_samples': 33, 'min_child_weight': 0.1560964794790567, 'subsample': 0.8152852482246818, 'colsample_bytree': 0.8139344311239654, 'reg_alpha': 1.5919296220230412, 'reg_lambda': 5.882831592332525}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011655 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:33:34,310] Trial 20 finished with value: 0.47825243140024115 and parameters: {'n_estimators': 498, 'learning_rate': 0.25774637133799744, 'max_depth': 3, 'num_leaves': 34, 'min_child_samples': 36, 'min_child_weight': 0.6591908124302568, 'subsample': 0.7281098960138287, 'colsample_bytree': 0.8114742168329676, 'reg_alpha': 1.9918341865654978, 'reg_lambda': 5.225767495709632}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012638 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:33:40,716] Trial 21 finished with value: 0.352065683076571 and parameters: {'n_estimators': 443, 'learning_rate': 0.22485250950066918, 'max_depth': 9, 'num_leaves': 88, 'min_child_samples': 10, 'min_child_weight': 2.4279418610985912, 'subsample': 0.8091804406931642, 'colsample_bytree': 0.9994555049126286, 'reg_alpha': 0.4814242818280987, 'reg_lambda': 6.205170389812221}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010937 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:33:47,395] Trial 22 finished with value: 0.3517477663633856 and parameters: {'n_estimators': 448, 'learning_rate': 0.212960276968158, 'max_depth': 9, 'num_leaves': 90, 'min_child_samples': 32, 'min_child_weight': 2.5201635089268226, 'subsample': 0.7979148536500382, 'colsample_bytree': 0.9995238902941, 'reg_alpha': 0.06493021323051879, 'reg_lambda': 5.890729194500819}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011958 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:33:54,147] Trial 23 finished with value: 0.35303186829354466 and parameters: {'n_estimators': 451, 'learning_rate': 0.17311150897018593, 'max_depth': 9, 'num_leaves': 90, 'min_child_samples': 32, 'min_child_weight': 2.108128483606473, 'subsample': 0.8030536627885234, 'colsample_bytree': 0.9526335971273833, 'reg_alpha': 0.280330457458535, 'reg_lambda': 5.103574566768224}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:34:00,784] Trial 24 finished with value: 0.3600857499504798 and parameters: {'n_estimators': 455, 'learning_rate': 0.21355676336905843, 'max_depth': 7, 'num_leaves': 100, 'min_child_samples': 37, 'min_child_weight': 0.1342128910906287, 'subsample': 0.9072989396549138, 'colsample_bytree': 0.7001613574682032, 'reg_alpha': 0.8261051815605293, 'reg_lambda': 5.740839441884671}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010056 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:34:05,709] Trial 25 finished with value: 0.3584447795426647 and parameters: {'n_estimators': 349, 'learning_rate': 0.2575948812875714, 'max_depth': 8, 'num_leaves': 75, 'min_child_samples': 30, 'min_child_weight': 4.138889306489876, 'subsample': 0.8257908998788653, 'colsample_bytree': 0.849388343387847, 'reg_alpha': 1.9659409644275248, 'reg_lambda': 3.91497284911451}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009992 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:34:10,944] Trial 26 finished with value: 0.3818093791746583 and parameters: {'n_estimators': 473, 'learning_rate': 0.1375921697035345, 'max_depth': 6, 'num_leaves': 89, 'min_child_samples': 24, 'min_child_weight': 2.509058807996657, 'subsample': 0.7612653963444399, 'colsample_bytree': 0.906722597390749, 'reg_alpha': 1.74639735595868, 'reg_lambda': 8.424457974904199}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011898 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:34:16,992] Trial 27 finished with value: 0.3560467900489839 and parameters: {'n_estimators': 436, 'learning_rate': 0.21952589518802323, 'max_depth': 9, 'num_leaves': 78, 'min_child_samples': 33, 'min_child_weight': 1.0201932642236937, 'subsample': 0.9382142357413379, 'colsample_bytree': 0.8117312974476285, 'reg_alpha': 0.6111321634235967, 'reg_lambda': 6.037439564665703}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009602 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:34:23,696] Trial 28 finished with value: 0.35718930631725243 and parameters: {'n_estimators': 474, 'learning_rate': 0.18667890714768479, 'max_depth': 7, 'num_leaves': 90, 'min_child_samples': 39, 'min_child_weight': 5.999705428084403, 'subsample': 0.8125013371506431, 'colsample_bytree': 0.9608895916847897, 'reg_alpha': 0.1715902276707299, 'reg_lambda': 4.517772533161866}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:34:27,341] Trial 29 finished with value: 0.40010000375427257 and parameters: {'n_estimators': 359, 'learning_rate': 0.28144136885731397, 'max_depth': 5, 'num_leaves': 100, 'min_child_samples': 29, 'min_child_weight': 2.0496634022764706, 'subsample': 0.5392955101148663, 'colsample_bytree': 0.6594395784956744, 'reg_alpha': 2.3084327302432284, 'reg_lambda': 2.8833595060514274}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:34:32,923] Trial 30 finished with value: 0.35427161781417243 and parameters: {'n_estimators': 421, 'learning_rate': 0.25550617929594593, 'max_depth': 9, 'num_leaves': 70, 'min_child_samples': 19, 'min_child_weight': 0.7518565410866882, 'subsample': 0.6608810591213728, 'colsample_bytree': 0.9082675028590228, 'reg_alpha': 1.1860845949726326, 'reg_lambda': 5.667597349260882}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009876 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:34:40,071] Trial 31 finished with value: 0.3533141671270002 and parameters: {'n_estimators': 458, 'learning_rate': 0.1708948737793826, 'max_depth': 9, 'num_leaves': 92, 'min_child_samples': 32, 'min_child_weight': 1.958603201946134, 'subsample': 0.7521485852987501, 'colsample_bytree': 0.9596799192694154, 'reg_alpha': 0.4568777427461308, 'reg_lambda': 4.98532277711666}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:34:46,884] Trial 32 finished with value: 0.3568155341890791 and parameters: {'n_estimators': 450, 'learning_rate': 0.16753001470763634, 'max_depth': 8, 'num_leaves': 86, 'min_child_samples': 32, 'min_child_weight': 2.4541304993649433, 'subsample': 0.7867112009760896, 'colsample_bytree': 0.9738102384457741, 'reg_alpha': 0.011447321189866577, 'reg_lambda': 5.244149672418106}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011283 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:34:53,790] Trial 33 finished with value: 0.3563701461891979 and parameters: {'n_estimators': 479, 'learning_rate': 0.19499745036390653, 'max_depth': 7, 'num_leaves': 91, 'min_child_samples': 35, 'min_child_weight': 3.9019689954563055, 'subsample': 0.8165200866779911, 'colsample_bytree': 0.9997958586281781, 'reg_alpha': 0.8235983831277892, 'reg_lambda': 4.576623541292268}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:34:58,841] Trial 34 finished with value: 0.36500778413858864 and parameters: {'n_estimators': 291, 'learning_rate': 0.2214187058271881, 'max_depth': 8, 'num_leaves': 82, 'min_child_samples': 24, 'min_child_weight': 1.4064271324420472, 'subsample': 0.738688733839264, 'colsample_bytree': 0.9454094501060419, 'reg_alpha': 1.5776160643175925, 'reg_lambda': 6.649446471374485}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014492 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:35:06,352] Trial 35 finished with value: 0.3654824618340699 and parameters: {'n_estimators': 426, 'learning_rate': 0.09103200987594331, 'max_depth': 10, 'num_leaves': 96, 'min_child_samples': 38, 'min_child_weight': 4.3865518717336345, 'subsample': 0.7865363187307637, 'colsample_bytree': 0.8797632348322588, 'reg_alpha': 0.6064865483358357, 'reg_lambda': 7.661546334308069}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008948 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:35:08,454] Trial 36 finished with value: 0.4119524604782953 and parameters: {'n_estimators': 100, 'learning_rate': 0.2762987954031938, 'max_depth': 9, 'num_leaves': 58, 'min_child_samples': 27, 'min_child_weight': 2.5279542337616734, 'subsample': 0.8312249870792208, 'colsample_bytree': 0.6033829333895732, 'reg_alpha': 4.08189446249898, 'reg_lambda': 3.2726341668456085}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014285 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:35:16,215] Trial 37 finished with value: 0.35418437138055386 and parameters: {'n_estimators': 482, 'learning_rate': 0.15742895354586636, 'max_depth': 10, 'num_leaves': 87, 'min_child_samples': 34, 'min_child_weight': 8.97715191117064, 'subsample': 0.8898043274400775, 'colsample_bytree': 0.7185109393645823, 'reg_alpha': 2.393641101528763, 'reg_lambda': 5.699580178152642}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010977 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:35:21,179] Trial 38 finished with value: 0.3638406144590865 and parameters: {'n_estimators': 316, 'learning_rate': 0.1972602816450193, 'max_depth': 8, 'num_leaves': 76, 'min_child_samples': 31, 'min_child_weight': 4.731288036533615, 'subsample': 0.7680171446546041, 'colsample_bytree': 0.9679518836699174, 'reg_alpha': 0.043777557187979085, 'reg_lambda': 8.703388504158994}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:35:26,054] Trial 39 finished with value: 0.36192798547889016 and parameters: {'n_estimators': 383, 'learning_rate': 0.22320370469842032, 'max_depth': -1, 'num_leaves': 55, 'min_child_samples': 41, 'min_child_weight': 0.6594829871221789, 'subsample': 0.7258714601166204, 'colsample_bytree': 0.7871952242514216, 'reg_alpha': 1.1545399826086982, 'reg_lambda': 4.384176948426808}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015257 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:35:34,075] Trial 40 finished with value: 0.4540361955297852 and parameters: {'n_estimators': 453, 'learning_rate': 0.01942658771854705, 'max_depth': 9, 'num_leaves': 94, 'min_child_samples': 46, 'min_child_weight': 3.729907792710784, 'subsample': 0.8051510073797582, 'colsample_bytree': 0.859869954983524, 'reg_alpha': 5.250451779612637, 'reg_lambda': 6.168638025262265}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011920 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:35:41,107] Trial 41 finished with value: 0.3510561827387197 and parameters: {'n_estimators': 455, 'learning_rate': 0.17453626060842473, 'max_depth': 9, 'num_leaves': 92, 'min_child_samples': 31, 'min_child_weight': 1.7592187935841992, 'subsample': 0.7512528307569601, 'colsample_bytree': 0.9608845828215827, 'reg_alpha': 0.4590115651365315, 'reg_lambda': 4.988908579695974}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:35:47,813] Trial 42 finished with value: 0.35722005757329695 and parameters: {'n_estimators': 415, 'learning_rate': 0.1820299474095713, 'max_depth': 8, 'num_leaves': 98, 'min_child_samples': 30, 'min_child_weight': 1.7058476810105472, 'subsample': 0.7726685664810637, 'colsample_bytree': 0.9145731611628497, 'reg_alpha': 0.5057169688448491, 'reg_lambda': 5.7080072984403}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:35:49,408] Trial 43 finished with value: 0.7978413268008894 and parameters: {'n_estimators': 436, 'learning_rate': 0.17031754586659947, 'max_depth': 1, 'num_leaves': 83, 'min_child_samples': 35, 'min_child_weight': 1.0527478106375905, 'subsample': 0.7985079908869112, 'colsample_bytree': 0.8813818165708632, 'reg_alpha': 1.025933639822659, 'reg_lambda': 5.046807527906691}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010799 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:35:57,385] Trial 44 finished with value: 0.3533389791488348 and parameters: {'n_estimators': 500, 'learning_rate': 0.13824010386068347, 'max_depth': 10, 'num_leaves': 93, 'min_child_samples': 26, 'min_child_weight': 0.013582709153095784, 'subsample': 0.6659787655867453, 'colsample_bytree': 0.9510236693828692, 'reg_alpha': 1.503945904040369, 'reg_lambda': 3.5469177610863065}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009481 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:36:03,844] Trial 45 finished with value: 0.3581350583361934 and parameters: {'n_estimators': 466, 'learning_rate': 0.26319156649124764, 'max_depth': 7, 'num_leaves': 71, 'min_child_samples': 16, 'min_child_weight': 2.8326292099512096, 'subsample': 0.8776646538296212, 'colsample_bytree': 0.6210157855971864, 'reg_alpha': 0.3090768419537636, 'reg_lambda': 4.114661749843147}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011771 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:36:07,347] Trial 46 finished with value: 0.3930193119543841 and parameters: {'n_estimators': 445, 'learning_rate': 0.20333610027420795, 'max_depth': 9, 'num_leaves': 21, 'min_child_samples': 22, 'min_child_weight': 1.1345642045956459, 'subsample': 0.8316487037600374, 'colsample_bytree': 0.9748478873366351, 'reg_alpha': 6.661422336700827, 'reg_lambda': 6.791717619300617}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011694 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:36:13,084] Trial 47 finished with value: 0.35475544941489895 and parameters: {'n_estimators': 367, 'learning_rate': 0.29536159816475616, 'max_depth': 8, 'num_leaves': 88, 'min_child_samples': 29, 'min_child_weight': 3.5100849192550823, 'subsample': 0.8461116237244611, 'colsample_bytree': 0.9241680402945938, 'reg_alpha': 2.251836864117885, 'reg_lambda': 7.5191709542593586}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009850 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:36:17,991] Trial 48 finished with value: 0.35738083105484375 and parameters: {'n_estimators': 331, 'learning_rate': 0.2506261871935168, 'max_depth': 10, 'num_leaves': 79, 'min_child_samples': 14, 'min_child_weight': 2.2739977776297113, 'subsample': 0.7515230369141785, 'colsample_bytree': 0.8375990432754102, 'reg_alpha': 0.8467302329442337, 'reg_lambda': 5.394671089443039}. Best is trial 19 with value: 0.3510063912781407.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 397461, number of used features: 114
[LightGBM] [Info] Start training from score 3.078263


[I 2024-12-11 14:36:20,636] Trial 49 finished with value: 0.4159048334946339 and parameters: {'n_estimators': 198, 'learning_rate': 0.1810233177410724, 'max_depth': 6, 'num_leaves': 95, 'min_child_samples': 33, 'min_child_weight': 0.4706259252748718, 'subsample': 0.8623854314281354, 'colsample_bytree': 0.8882591632983785, 'reg_alpha': 1.6347369868037709, 'reg_lambda': 4.695545136340472}. Best is trial 19 with value: 0.3510063912781407.


Best Parameters: {'n_estimators': 483, 'learning_rate': 0.2507707148900671, 'max_depth': 9, 'num_leaves': 88, 'min_child_samples': 33, 'min_child_weight': 0.1560964794790567, 'subsample': 0.8152852482246818, 'colsample_bytree': 0.8139344311239654, 'reg_alpha': 1.5919296220230412, 'reg_lambda': 5.882831592332525}
Best RMSE: 0.3510063912781407


In [19]:
# Train the final model with the best parameters on the full dataset
best_params = study.best_params
final_pipeline = create_pipeline(best_params)
final_pipeline.fit(X_train, y_train)


# best_model = xgb.XGBRegressor(**best_params, random_state=42)
# best_model.fit(X_train, y_train)  # Use the full training set for the final model

# Predict on the test set
# y_predictions = best_model.predict(X_test)
y_predictions = final_pipeline.predict(X_test)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017857 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2338
[LightGBM] [Info] Number of data points in the train set: 496827, number of used features: 114
[LightGBM] [Info] Start training from score 3.079917


In [20]:
print(y_predictions)

[0.3189075  1.58037598 1.95246595 ... 5.18143148 4.57624437 3.33050345]


In [21]:
pd.DataFrame(y_predictions, columns=["log_bike_count"]).reset_index().rename(
    columns={"index": "Id"}
).to_csv("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/predictions_LightGBM_Optuna.csv", index=False)