In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import utils
from feature_engineering import codify_date, codify_date_2, remove_outliers, get_X_y, covid_19, covid_19_2, add_weather2
from feature_engineering import add_weather, add_lag_and_rolling_features
from utils import handle_missing_values
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import optuna
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import uniform, randint


In [2]:
from sklearn.preprocessing import FunctionTransformer

def _sin_encode(x, period):
    """Return the sine of ``x`` mapped onto one cycle of length ``period``."""
    return np.sin(x / period * 2 * np.pi)


def sin_transformer(period):
    """Build a transformer giving the sine part of a cyclical encoding.

    Parameters
    ----------
    period : number
        Length of one full cycle of the encoded feature (the notebook uses
        24 for the ``hour`` column).

    Returns
    -------
    FunctionTransformer
        Transformer that maps ``x`` to ``sin(x / period * 2 * pi)``.
    """
    # A module-level function with kw_args is used instead of a lambda:
    # lambdas cannot be serialized by the standard pickle module, which
    # would make any pipeline containing this transformer unpicklable.
    return FunctionTransformer(_sin_encode, kw_args={"period": period})


def _cos_encode(x, period):
    """Return the cosine of ``x`` mapped onto one cycle of length ``period``."""
    return np.cos(x / period * 2 * np.pi)


def cos_transformer(period):
    """Build a transformer giving the cosine part of a cyclical encoding.

    Parameters
    ----------
    period : number
        Length of one full cycle of the encoded feature (the notebook uses
        24 for the ``hour`` column).

    Returns
    -------
    FunctionTransformer
        Transformer that maps ``x`` to ``cos(x / period * 2 * pi)``.
    """
    # A module-level function with kw_args is used instead of a lambda:
    # lambdas cannot be serialized by the standard pickle module, which
    # would make any pipeline containing this transformer unpicklable.
    return FunctionTransformer(_cos_encode, kw_args={"period": period})

In [3]:
# Read the raw training table and run it through the feature-engineering
# helpers in order: codify_date_2 -> remove_outliers -> covid_19_2.
data = pd.read_parquet(Path("data", "train.parquet"))
mdata = (
    data
    .pipe(codify_date_2)
    .pipe(remove_outliers)
    .pipe(covid_19_2)
)
# Currently disabled follow-up steps: add_weather2(mdata) and
# handle_missing_values(mdata, "linear").

# Split the engineered frame into the feature table and the target.
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])
  covid_19_index = pd.read_csv(Path("data") / "Covid_19_Index.csv")


Unnamed: 0,StringencyIndex_Average,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,46.76,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
1,46.76,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
2,46.76,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
3,46.76,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
4,46.76,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [4]:
# Read the final test table and apply the same date and Covid-19 helpers
# that the training data goes through (no outlier removal here).
X_test = (
    pd.read_parquet(Path("data", "final_test.parquet"))
    .pipe(codify_date_2)
    .pipe(covid_19_2)
)
# Currently disabled follow-up steps: add_weather2(X_test) and
# handle_missing_values(X_test, "linear").

X_test.head()

  covid_19_index = pd.read_csv(Path("data") / "Covid_19_Index.csv")


Unnamed: 0,date,StringencyIndex_Average,counter_id,counter_name,site_id,site_name,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,datetime,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,2021-09-10,43.77,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2021-09-10 01:00:00,2021,9,10,4,1,False,False
1,2021-09-10,43.77,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2021-09-10 13:00:00,2021,9,10,4,13,False,False
2,2021-09-10,43.77,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2021-09-10 17:00:00,2021,9,10,4,17,False,False
3,2021-09-10,43.77,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2021-09-10 19:00:00,2021,9,10,4,19,False,False
4,2021-09-10,43.77,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2021-09-10 22:00:00,2021,9,10,4,22,False,False


In [5]:
# Build a mapping between the row order of the raw test file ("index1") and
# the row labels of the processed X_test ("index2"), so predictions can later
# be put back into the raw file's order.

# Re-read the raw test file and remember each row's original index label
# before any processing is applied.
X_original = pd.read_parquet(Path("data") / "final_test.parquet")
X_original.loc[:, "index1"] = X_original.index
X_original = codify_date_2(X_original)

# Join the processed and raw rows on (datetime, counter_name).
# "index2" is attached to a temporary copy via assign() rather than written
# into X_test itself, so X_test does not gain a helper column that the
# training features X do not have.
# NOTE(review): assumes (datetime, counter_name) identifies a row uniquely;
# duplicate keys would multiply rows in the merge - confirm.
merged_df = X_test.assign(index2=X_test.index).merge(
    X_original, on=["datetime", "counter_name"], how="left"
)

# Order the mapping by the raw file's row order.
merged_df = merged_df.sort_values("index1")
display(merged_df[["index1", "index2"]])


Unnamed: 0,index1,index2
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
...,...,...
51435,51435,51435
51436,51436,51436
51437,51437,51437
51438,51438,51438


In [6]:
# Remove the identifier, location and raw timestamp columns from the test
# frame; they are not listed among the model's feature columns.
X_test = X_test.drop(
    [
        "counter_id",
        "site_id",
        "site_name",
        "counter_installation_date",
        "coordinates",
        "counter_technical_id",
        "latitude",
        "longitude",
        "datetime",
        "date",
    ],
    axis="columns",
)

In [7]:
# Disabled cell: the code below is wrapped in a triple-quoted string, so none
# of it executes; as the cell's only expression, the string is simply echoed
# as the cell output.
# If re-enabled, it would restrict X and X_test to the same X_cols column list
# and display both frames.
# NOTE(review): "t" is not among the columns shown by X.head() in the loading
# cell, so X[X_cols] would presumably raise a KeyError unless the weather step
# is re-enabled first - confirm.
"""X_cols = ['StringencyIndex_Average', 'counter_name', 'year', 'month', 'day',
       'day_of_week', 'hour', 'is_weekend', 'IsHoliday', "t"]

X = X[X_cols]
X_test = X_test[X_cols]

display(X)
display(X_test)"""

'X_cols = [\'StringencyIndex_Average\', \'counter_name\', \'year\', \'month\', \'day\',\n       \'day_of_week\', \'hour\', \'is_weekend\', \'IsHoliday\', "t"]\n\nX = X[X_cols]\nX_test = X_test[X_cols]\n\ndisplay(X)\ndisplay(X_test)'

In [8]:
# Disabled cell: the code below is wrapped in a triple-quoted string, so none
# of it executes; as the cell's only expression, the string is simply echoed
# as the cell output.
# If re-enabled, it would fit an XGBRegressor with default hyperparameters
# behind a one-hot / cyclical-hour preprocessor, predict on X_test, reorder
# the predictions with merged_df["index2"], and write submission_maxim.csv.
"""#numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ("hour_sin", sin_transformer(24), ["hour"]),
    ("hour_cos", cos_transformer(24), ["hour"]),
], remainder='passthrough')

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor())
])

# Fit the pipeline to the training data
pipeline.fit(X, y)

y_pred = pipeline.predict(X_test)
y_pred = y_pred[merged_df["index2"]]

results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
results.to_csv("submission_maxim.csv", index=False)"""

'#numerical_columns = [\'t\', \'rr1\', \'u\', \'ht_neige\', \'raf10\', \'ff\', \'ww\', \'etat_sol\', \'tend\']\ncategorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "is_weekend", "IsHoliday"]\n\npreprocessor = ColumnTransformer(transformers=[\n    ("cat", OneHotEncoder(handle_unknown=\'ignore\', sparse_output=False), categorical_columns),\n    ("hour_sin", sin_transformer(24), ["hour"]),\n    ("hour_cos", cos_transformer(24), ["hour"]),\n], remainder=\'passthrough\')\n\npipeline = Pipeline([\n    ("preprocessor", preprocessor),\n    ("regressor", XGBRegressor())\n])\n\n# Fit the pipeline to the training data\npipeline.fit(X, y)\n\ny_pred = pipeline.predict(X_test)\ny_pred = y_pred[merged_df["index2"]]\n\nresults = pd.DataFrame(\n    dict(\n        Id=np.arange(y_pred.shape[0]),\n        log_bike_count=y_pred,\n    )\n)\nresults.to_csv("submission_maxim.csv", index=False)'

In [9]:
# Disabled cell: the code below is wrapped in a triple-quoted string, so none
# of it executes; as the cell's only expression, the string is simply echoed
# as the cell output.
# If re-enabled, it would tune the XGBRegressor pipeline with an Optuna study
# (50 trials, 5-fold cross-validation), refit with the best parameters, and
# write submission_maxim_tuned_optuna.csv.
# NOTE(review): the objective returns the mean of scores produced by
# make_scorer(mean_squared_error, greater_is_better=False), i.e. negated MSE,
# while the study is created with direction="minimize". Minimizing a negated
# error favours the trials with the LARGEST error; switch to
# direction="maximize" or return the positive MSE before re-enabling.
# NOTE(review): trial.suggest_loguniform is presumably deprecated in recent
# Optuna releases in favour of suggest_float(..., log=True) - confirm against
# the installed version.
"""# Define the numerical and categorical columns
#numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "is_weekend", "IsHoliday"]

# Create the preprocessor
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ("hour_sin", sin_transformer(24), ["hour"]),
    ("hour_cos", cos_transformer(24), ["hour"]),
], remainder='passthrough')

# Define the objective function for Optuna optimization
def objective(trial):
    # Define the hyperparameter search space
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 10.0),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10.0),
    }

    # Define the pipeline with the trial's hyperparameters
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", XGBRegressor(**params, random_state=42))
    ])

    # Perform cross-validation and return the negative mean squared error
    scores = cross_val_score(pipeline, X, y, cv=5, scoring=make_scorer(mean_squared_error, greater_is_better=False))
    return np.mean(scores)

# Create an Optuna study and optimize the objective
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)

# Train the final model with the best hyperparameters
best_params = study.best_params
final_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor(**best_params, random_state=42))
])
final_pipeline.fit(X, y)

# Make predictions on the test set
y_pred = final_pipeline.predict(X_test)
y_pred = y_pred[merged_df["index2"]]

# Save the predictions to a CSV file
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
results.to_csv("submission_maxim_tuned_optuna.csv", index=False)

print("Tuned model predictions saved to submission_maxim_tuned_optuna.csv")"""

'# Define the numerical and categorical columns\n#numerical_columns = [\'t\', \'rr1\', \'u\', \'ht_neige\', \'raf10\', \'ff\', \'ww\', \'etat_sol\', \'tend\']\ncategorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "is_weekend", "IsHoliday"]\n\n# Create the preprocessor\npreprocessor = ColumnTransformer(transformers=[\n    ("cat", OneHotEncoder(handle_unknown=\'ignore\', sparse_output=False), categorical_columns),\n    ("hour_sin", sin_transformer(24), ["hour"]),\n    ("hour_cos", cos_transformer(24), ["hour"]),\n], remainder=\'passthrough\')\n\n# Define the objective function for Optuna optimization\ndef objective(trial):\n    # Define the hyperparameter search space\n    params = {\n        "n_estimators": trial.suggest_int("n_estimators", 100, 500),\n        "max_depth": trial.suggest_int("max_depth", 3, 10),\n        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),\n        "subsample": trial.suggest_float("subsample", 0.6, 1.0),\n      

In [None]:
# Tune the XGBoost pipeline with a randomized hyperparameter search, then
# predict on the test set and write the submission file.

# Columns that are one-hot encoded. "hour" is encoded cyclically (sin/cos with
# a period of 24) instead, and every other column is passed through unchanged.
#numerical_columns = ['u']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "is_weekend", "IsHoliday"]

# Create the preprocessor
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ("hour_sin", sin_transformer(24), ["hour"]),
    ("hour_cos", cos_transformer(24), ["hour"]),
], remainder='passthrough')

# Define the pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor(random_state=42))
])

# Search space for RandomizedSearchCV.
# scipy's uniform(loc, scale) samples from [loc, loc + scale] and
# randint(low, high) samples integers in [low, high), so e.g. learning_rate
# is drawn from [0.01, 0.21] and max_depth from 3..9.
param_distributions = {
    "regressor__n_estimators": randint(100, 500),
    "regressor__max_depth": randint(3, 10),
    "regressor__learning_rate": uniform(0.01, 0.2),
    "regressor__subsample": uniform(0.6, 0.4),
    "regressor__colsample_bytree": uniform(0.6, 0.4),
    "regressor__gamma": uniform(0, 5),
    "regressor__reg_alpha": uniform(1e-5, 10),
    "regressor__reg_lambda": uniform(1e-5, 10),
}

# Define RandomizedSearchCV
# NOTE(review): an integer cv uses unshuffled K-fold splits; since the rows
# are hourly counts, a time-aware splitter may give a more honest score -
# confirm before relying on the CV numbers.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=50,  # Number of parameter settings to try
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=1,
    random_state=2,
    n_jobs=-1  # Use all available cores
)

# Fit RandomizedSearchCV
random_search.fit(X, y)

# With the default refit=True the search has already refit the best
# parameter setting on the whole of (X, y), so best_estimator_ is ready to
# use; fitting it again would only repeat a full training run.
best_pipeline = random_search.best_estimator_

# Print the best parameters
print("Best parameters found:", random_search.best_params_)

# Make predictions on the test set
y_pred = best_pipeline.predict(X_test)
# Reorder the predictions into the raw test file's row order using the
# mapping built in the earlier merge cell (assumes the index2 values are
# valid positions into y_pred).
y_pred = y_pred[merged_df["index2"].to_numpy()]

# Save the predictions to a CSV file
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
results.to_csv("submission_maxim_tuned_randomsearch.csv", index=False)

print("Tuned model predictions saved to submission_maxim_tuned_randomsearch.csv")

'# Define the numerical and categorical columns\n#numerical_columns = [\'u\']\ncategorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "is_weekend", "IsHoliday"]\n\n# Create the preprocessor\npreprocessor = ColumnTransformer(transformers=[\n    ("cat", OneHotEncoder(handle_unknown=\'ignore\', sparse_output=False), categorical_columns),\n    ("hour_sin", sin_transformer(24), ["hour"]),\n    ("hour_cos", cos_transformer(24), ["hour"]),\n], remainder=\'passthrough\')\n\n# Define the pipeline\npipeline = Pipeline([\n    ("preprocessor", preprocessor),\n    ("regressor", XGBRegressor(random_state=42))\n])\n\n# Define the parameter grid for RandomizedSearchCV\nparam_distributions = {\n    "regressor__n_estimators": randint(100, 500),\n    "regressor__max_depth": randint(3, 10),\n    "regressor__learning_rate": uniform(0.01, 0.2),\n    "regressor__subsample": uniform(0.6, 0.4),\n    "regressor__colsample_bytree": uniform(0.6, 0.4),\n    "regressor__gamma": uniform(0, 5),