In [1]:
import datetime
import os
from pathlib import Path

import holidays
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from skrub import TableVectorizer


In [2]:
# Load the train / test splits.
# The project root defaults to the original author's checkout, but it can be
# redirected with the BIKE_COUNTERS_DIR environment variable so the notebook
# runs on another machine without editing this cell.
PROJECT_DIR = Path(
    os.environ.get(
        "BIKE_COUNTERS_DIR",
        "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters",
    )
)
DATA_DIR = PROJECT_DIR / "data"

df_train = pd.read_parquet(DATA_DIR / "train.parquet")
df_test = pd.read_parquet(DATA_DIR / "final_test.parquet")


In [3]:
# Add external data: weather observations.
# The project root defaults to the original author's checkout and can be
# overridden with the BIKE_COUNTERS_DIR environment variable.
PROJECT_DIR = Path(
    os.environ.get(
        "BIKE_COUNTERS_DIR",
        "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters",
    )
)

# The timestamp column "AAAAMMJJHH" (year-month-day-hour) is parsed and renamed to "date".
weather = pd.read_csv(
    PROJECT_DIR / "external_data" / "weather_data.csv.gz",
    parse_dates=["AAAAMMJJHH"],
    date_format="%Y%m%d%H",
    compression="gzip",
    sep=";",
).rename(columns={"AAAAMMJJHH": "date"})

# Keep only the observations spanning the train/test period, padded by one hour
# on each side (both bounds inclusive).
one_hour = datetime.timedelta(hours=1)
weather = weather[
    weather["date"].between(
        df_train["date"].min() - one_hour,
        df_test["date"].max() + one_hour,
    )
]

# Drop the identifier / location columns, then average the remaining columns over
# the rows sharing a timestamp (presumably one row per station -- confirm).
# Columns that are entirely empty are removed and remaining gaps are filled by
# linear interpolation between neighbouring rows.
weather_reduced = (
    weather.drop(columns=["NUM_POSTE", "NOM_USUEL", "LAT", "LON", "QDXI3S"])
    .groupby("date")
    .mean()
    .dropna(axis=1, how="all")
    .interpolate(method="linear")
)

# We merge only the TEMPERATURE feature (column "T").
# NOTE(review): with a left join, any timestamp absent from the weather file
# yields NaN in "T"; interpolate() above only fills NaNs in rows that exist.
df_train = df_train.merge(weather_reduced["T"], left_on="date", right_on="date", how="left")
df_test = df_test.merge(weather_reduced["T"], left_on="date", right_on="date", how="left")

In [4]:
# Extract the date features on different time scales.
# French holiday calendar from the `holidays` package; _encode_dates below
# checks each date for membership in it.
fr_holidays = holidays.France()

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # creation of a binary varible depicting if day in weekend
    X["is_weekend"] = np.where(X["weekday"] + 1 > 5, 1, 0)

    # Add a feature to indicate if the day is a holiday in France
    X["is_holiday"] = X["date"].apply(lambda d: 1 if d in fr_holidays else 0)

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

# Replace both frames with their date-encoded versions (the "date" column is dropped).
# Note: df_train / df_test are overwritten, so re-running these lines without
# re-running the cells above fails because "date" no longer exists.
df_train = _encode_dates(df_train)
df_test = _encode_dates(df_test)


In [None]:
def add_temperature_category(
    df,
    column="T",
    bins=(-float("inf"), 278.15, 283, 298, 308.15, float("inf")),
    labels=("very_cold", "cold", "moderate", "warm", "very_hot"),
):
    """Replace a numeric temperature column by one-hot temperature categories.

    Optional experiment, not called anywhere in the notebook by default.
    Apply it to both the training and the test frame if you want to try it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the temperature column. It is not modified.
    column : str
        Name of the temperature column (the weather merge adds it as ``"T"``).
    bins : sequence of float
        Bin edges, right-inclusive. The defaults are the Kelvin thresholds of
        the original experiment -- confirm the unit of ``column`` before use.
    labels : sequence of str
        One label per bin, in increasing temperature order.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``column`` and with ``temp_category_<label>``
        indicator columns. The first label is dropped as the reference level.
    """
    # Create a categorical feature for temperature
    categories = pd.cut(df[column], bins=list(bins), labels=list(labels))

    # Remove the raw temperature, then one-hot encode the categories for the model
    return pd.get_dummies(
        df.drop(columns=[column]).assign(temp_category=categories),
        columns=["temp_category"],
        drop_first=True,
    )

In [5]:
# Target: the log-transformed count. Features: every column except the two count columns.
y_train = df_train["log_bike_count"]
X_train = df_train.drop(["bike_count", "log_bike_count"], axis="columns")

# The test frame is used as-is; it is copied so X_test and df_test stay independent.
X_test = df_test.copy()

In [6]:
# Two-step pipeline: a TableVectorizer turns the table into model inputs,
# then an XGBoost regressor is fitted on them. Both use their default settings.
pipeline = Pipeline(
    [
        ("preprocessor", TableVectorizer()),
        ("model", xgb.XGBRegressor()),
    ]
)

In [7]:
# Fit the pipeline on the training data, then predict for the test data.
# fit() returns the fitted pipeline itself, so both steps are chained.
y_predictions = pipeline.fit(X_train, y_train).predict(X_test)

In [8]:
# Quick look at the predictions (the printed array is abbreviated with "..." when long).
print(y_predictions)

[0.36846828 1.4865953  1.9088609  ... 5.093462   4.612706   3.8784165 ]


In [9]:
# Display the 'preprocessor' step (the TableVectorizer) of the pipeline.
pipeline.named_steps['preprocessor']

In [10]:
# Write the predictions to CSV: one row per prediction, with "Id" being its
# position in y_predictions.
# The project root defaults to the original author's checkout and can be
# overridden with the BIKE_COUNTERS_DIR environment variable.
PROJECT_DIR = Path(
    os.environ.get(
        "BIKE_COUNTERS_DIR",
        "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters",
    )
)

submission = (
    pd.DataFrame(y_predictions, columns=["log_bike_count"])
    .rename_axis("Id")  # name the default 0..n-1 index ...
    .reset_index()  # ... and turn it into the leading "Id" column
)
submission.to_csv(
    PROJECT_DIR / "predictions_XGboost_vsimple_weather_newdata.csv",
    index=False,
)

In [11]:
# RMSE on the training data. The model was fitted on these rows, so this
# figure does not estimate generalisation error.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
y_train_predictions = pipeline.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predictions))
print(f"Training RMSE: {rmse_train}")

Training RMSE: 0.409135913118312




In [12]:
# Feature importance of the fitted model.

# Pull the two fitted steps out of the pipeline.
xgb_model = pipeline.named_steps["model"]
preprocessor = pipeline.named_steps["preprocessor"]

# Importance scores from the XGBoost model, and the names of the features
# produced by the preprocessor.
feature_importance = xgb_model.feature_importances_
feature_names = preprocessor.get_feature_names_out()

# Pair each feature name with its score and rank from most to least important.
importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": feature_importance}
).sort_values("Importance", ascending=False)

# Show the 40 highest-ranked features.
print("Top Features by Importance:")
importance_df.head(40)

Top Features by Importance:


Unnamed: 0,Feature,Importance
76,site_name_Face au 40 quai D'Issy,0.106894
59,"counter_name: bagnolet, grenelle, ville (29)",0.079005
49,"counter_name: totem, diderot, des",0.064987
162,hour,0.060866
51,"counter_name: 28, sébastopol, voltaire (21)",0.054447
46,"counter_name: invalides, diderot, de",0.053683
93,counter_installation_date_day,0.045529
40,"counter_name: boulevard, ne, no",0.044414
38,"counter_name: bercy, austerlitz, aubervilliers",0.040601
155,latitude,0.040077
