In [11]:
import pandas as pd
import numpy as np

from skrub import TableVectorizer
import xgboost as xgb
from sklearn.pipeline import Pipeline

import datetime

import matplotlib.pyplot as plt

import holidays

from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import RandomizedSearchCV



In [12]:
# Import the files
df_train = pd.read_parquet("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data/train.parquet")
df_test = pd.read_parquet("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data/final_test.parquet")


In [13]:
# Add external data : weather data
weather = pd.read_csv(
    "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/weather_data.csv.gz",
    parse_dates=["AAAAMMJJHH"],
    date_format="%Y%m%d%H",
    compression="gzip",
    sep=";",
).rename(columns={"AAAAMMJJHH": "date"})

weather = weather[
    (weather["date"] >= df_train["date"].min() - datetime.timedelta(hours=1))
    & (weather["date"] <= df_test["date"].max() + datetime.timedelta(hours=1))
]

weather_reduced = (
    weather.drop(columns=["NUM_POSTE", "NOM_USUEL", "LAT", "LON", "QDXI3S"])
    .groupby("date")
    .mean()
    .dropna(axis=1, how="all")
    .interpolate(method="linear")
)

# Perform standardization, PCA, and create a DataFrame in one pipeline
n_components = 11  # Retain ~90% of variance
pca_columns = [f'pca_feature_{i+1}' for i in range(n_components)]  # Create PCA column names

weather_pca_df = pd.DataFrame(
    PCA(n_components=n_components).fit_transform(
        StandardScaler().fit_transform(weather_reduced)
    ),
    columns=pca_columns,
    index=weather_reduced.index  # Retain original index
)

# We merge only the TEMPERATURE feature
df_train = df_train.merge(weather_pca_df, left_on="date", right_on="date", how="left")
df_test = df_test.merge(weather_pca_df, left_on="date", right_on="date", how="left")

In [14]:
# Extract the date feature on different time scales :
fr_holidays = holidays.France()

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # creation of a binary varible depicting if day in weekend
    X["is_weekend"] = np.where(X["weekday"] + 1 > 5, 1, 0)

    # Add a feature to indicate if the day is a holiday in France
    X["is_holiday"] = X["date"].apply(lambda d: 1 if d in fr_holidays else 0)

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

df_train = _encode_dates(df_train)
df_test = _encode_dates(df_test)


In [15]:
# Preprocessing :
# Extract features from counter_installation_date
for df in [df_train, df_test]:
    df["installation_year"] = df["counter_installation_date"].dt.year
    df["installation_month"] = df["counter_installation_date"].dt.month

df_train = df_train.drop(columns=["counter_installation_date"])
df_test = df_test.drop(columns=["counter_installation_date"])

# Label encode high-cardinality categorical features
label_encoders = {}
for col in ["counter_id", "site_id", "counter_name", "site_name", "counter_technical_id", "coordinates"]:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.fit_transform(df_test[col])
    label_encoders[col] = le

In [16]:
'''
# Define bins and labels for temperature categories in Kelvin
bins = [-float('inf'), 278.15, 283, 298, 308.15, float('inf')]  # Updated Kelvin thresholds
labels = ['very_cold', 'cold', 'moderate', 'warm', 'very_hot']

# Create a new categorical feature for temperature
training_set_merged['temp_category'] = pd.cut(training_set_merged['temperature'], bins=bins, labels=labels)
testing_set_merged['temp_category'] = pd.cut(testing_set_merged['temperature'], bins=bins, labels=labels)

# One-hot encode the categories for the model
training_set_merged = pd.get_dummies(training_set_merged, columns=['temp_category'], drop_first=True)
testing_set_merged = pd.get_dummies(testing_set_merged, columns=['temp_category'], drop_first=True)

# remove temperature column :
training_set_merged = training_set_merged.drop(columns=['temperature'])
testing_set_merged = testing_set_merged.drop(columns=['temperature'])
'''

"\n# Define bins and labels for temperature categories in Kelvin\nbins = [-float('inf'), 278.15, 283, 298, 308.15, float('inf')]  # Updated Kelvin thresholds\nlabels = ['very_cold', 'cold', 'moderate', 'warm', 'very_hot']\n\n# Create a new categorical feature for temperature\ntraining_set_merged['temp_category'] = pd.cut(training_set_merged['temperature'], bins=bins, labels=labels)\ntesting_set_merged['temp_category'] = pd.cut(testing_set_merged['temperature'], bins=bins, labels=labels)\n\n# One-hot encode the categories for the model\ntraining_set_merged = pd.get_dummies(training_set_merged, columns=['temp_category'], drop_first=True)\ntesting_set_merged = pd.get_dummies(testing_set_merged, columns=['temp_category'], drop_first=True)\n\n# remove temperature column :\ntraining_set_merged = training_set_merged.drop(columns=['temperature'])\ntesting_set_merged = testing_set_merged.drop(columns=['temperature'])\n"

In [17]:
X_train = df_train.drop(columns=["bike_count", "log_bike_count"])
y_train = df_train["log_bike_count"]

X_test = df_test.copy()

In [18]:
# Define the XGBoost model
xgb_model = xgb.XGBRegressor()

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 6, 9, 12],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

In [19]:
# Define the XGBoost model
xgb_model = xgb.XGBRegressor()

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 6, 9, 12],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

In [20]:
# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,  # Number of parameter settings sampled
    scoring='neg_mean_squared_error',  # Use appropriate scoring metric
    cv=5,  # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1  # Use all available processors
)

# Perform the search
random_search.fit(X_train, y_train)

# Get the best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", -random_search.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits




Best Parameters: {'subsample': 0.7, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.1, 'colsample_bytree': 0.6}
Best Score: 0.8679456370183681


In [21]:
# Update the model with the best parameters
best_xgb_model = random_search.best_estimator_

# Initialize the XGBoost regressor
model = best_xgb_model

# Fit the model
model.fit(
    X_train, y_train,
)

# Make Predictions on Test Data
y_predictions = model.predict(X_test)

In [22]:
'''
# Initialize the XGBoost regressor
model = XGBRegressor(
    # objective="reg:squarederror",  # Use squared error for regression
    max_depth=6,                  # Maximum depth of the trees
    learning_rate=0.1,            # Step size shrinkage
    n_estimators=500,             # Number of boosting rounds
    subsample=0.8,                # Fraction of samples for training each tree
    colsample_bytree=0.8,         # Fraction of features for each tree
    random_state=42,              # Reproducibility
)

# Fit the model
model.fit(
    X_train, y_train,
)

# Make Predictions on Test Data
y_predictions = model.predict(X_test)
'''


'\n# Initialize the XGBoost regressor\nmodel = XGBRegressor(\n    # objective="reg:squarederror",  # Use squared error for regression\n    max_depth=6,                  # Maximum depth of the trees\n    learning_rate=0.1,            # Step size shrinkage\n    n_estimators=500,             # Number of boosting rounds\n    subsample=0.8,                # Fraction of samples for training each tree\n    colsample_bytree=0.8,         # Fraction of features for each tree\n    random_state=42,              # Reproducibility\n)\n\n# Fit the model\nmodel.fit(\n    X_train, y_train,\n)\n\n# Make Predictions on Test Data\ny_predictions = model.predict(X_test)\n'

In [23]:
print(y_predictions)

[0.2882888 1.371596  2.1574965 ... 4.9524865 4.6113596 3.711801 ]


In [24]:
pd.DataFrame(y_predictions, columns=["log_bike_count"]).reset_index().rename(
    columns={"index": "Id"}
).to_csv("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/predictions_XGboost_PCA_weather.csv", index=False)

In [25]:
# We can print the RMSE on the training data :
y_train_predictions = model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_train_predictions, squared=False)
print(f"Training RMSE: {rmse_train}")

Training RMSE: 0.3437612628878113




In [26]:
# code to get feature importance :


# Step 1: Extract the preprocessor and feature names
# Retrieve the preprocessor from the pipeline
preprocessor = pipeline.named_steps['preprocessor']

# Get the feature names after preprocessing
feature_names = preprocessor.get_feature_names_out()

# Step 2: Extract the trained XGBoost model and feature importance
xgb_model = pipeline.named_steps['model']

# Get feature importances from the trained XGBoost model
feature_importance = xgb_model.feature_importances_

# Step 3: Combine feature names and importance scores into a DataFrame
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

# Sort features by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Display top features
print("Top Features by Importance:")
importance_df.head(40)

NameError: name 'pipeline' is not defined