In [1]:
import pandas as pd

import numpy as np

from sklearn.metrics import mean_squared_error

import datetime
import holidays

from skrub import TableVectorizer
import xgboost as xgb
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the training set and the final test set from their parquet files.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run as-is on the original author's machine.
train_path = "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data/train.parquet"
test_path = "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data/final_test.parquet"

df_train = pd.read_parquet(train_path)
df_test = pd.read_parquet(test_path)


In [3]:
# External data: weather observations. Timestamps come from the AAAAMMJJHH
# column (year, month, day, hour) and are exposed under the name "date".
weather_path = "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/weather_data.csv.gz"
weather = pd.read_csv(
    weather_path,
    sep=";",
    compression="gzip",
    parse_dates=["AAAAMMJJHH"],
    date_format="%Y%m%d%H",
)
weather = weather.rename(columns={"AAAAMMJJHH": "date"})

# Keep only the observations that fall inside the train/test period,
# widened by one hour on each side (both bounds inclusive).
one_hour = datetime.timedelta(hours=1)
window_start = df_train["date"].min() - one_hour
window_end = df_test["date"].max() + one_hour
weather = weather[weather["date"].between(window_start, window_end)]

# Average all rows sharing the same timestamp, discard variables that are
# entirely missing, and fill the remaining gaps by linear interpolation.
dropped_columns = ["NUM_POSTE", "NOM_USUEL", "LAT", "LON", "QDXI3S"]
mean_by_date = weather.drop(columns=dropped_columns).groupby("date").mean()
weather_reduced = mean_by_date.dropna(axis=1, how="all").interpolate(method="linear")

# Only the temperature ("T") is joined onto the bike data; the left join keeps
# every row of the original train/test frames.
temperature = weather_reduced["T"]
merge_options = dict(left_on="date", right_on="date", how="left")
df_train = df_train.merge(temperature, **merge_options)
df_test = df_test.merge(temperature, **merge_options)

In [4]:
# Expand the date column into calendar features at several time scales.
# fr_holidays is the French holiday calendar that _encode_dates (defined below)
# queries to build its is_holiday flag.
fr_holidays = holidays.France()

def _encode_dates(X):
    """Return a copy of ``X`` with its ``date`` column expanded into calendar features.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding a datetime ``date`` column (it is read through ``.dt``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` without ``date`` and with the added columns ``year``,
        ``month``, ``day``, ``weekday`` (pandas convention, Monday=0),
        ``hour``, ``is_weekend`` (0/1) and ``is_holiday`` (0/1, membership of
        the calendar day in the module-level ``fr_holidays``).
    """
    X = X.copy()  # work on a copy so the caller's frame is left untouched
    timestamps = X["date"]

    # Plain calendar components of the timestamp.
    X["year"] = timestamps.dt.year
    X["month"] = timestamps.dt.month
    X["day"] = timestamps.dt.day
    X["weekday"] = timestamps.dt.weekday
    X["hour"] = timestamps.dt.hour

    # Binary weekend flag: weekday 5 (Saturday) or 6 (Sunday).
    X["is_weekend"] = np.where(X["weekday"] >= 5, 1, 0)

    # Binary French-holiday flag. Holiday membership only depends on the
    # calendar day, so each distinct day is looked up once and the result is
    # broadcast back to the rows, instead of one Python-level lookup per row.
    calendar_days = timestamps.dt.normalize()
    holiday_flag_by_day = {
        day: 1 if day in fr_holidays else 0
        for day in calendar_days.drop_duplicates()
    }
    X["is_holiday"] = calendar_days.map(holiday_flag_by_day).astype("int64")

    # The raw timestamp is no longer needed once its components are extracted.
    return X.drop(columns=["date"])

# Apply the same date expansion to the training and the test frames.
df_train, df_test = (_encode_dates(frame) for frame in (df_train, df_test))

In [5]:
# Separate the regression target (log_bike_count) from the predictors; the raw
# bike_count is removed from the predictors as well.
target_columns = ["bike_count", "log_bike_count"]
y_train = df_train["log_bike_count"]
X_train = df_train.drop(columns=target_columns)

# Work on a copy so that the later in-place feature edits leave df_test intact.
X_test = df_test.copy()

In [6]:
# Predictors stored as strings or categoricals have to become numeric before scaling.
non_numeric_columns = X_train.select_dtypes(include=["object", "category"]).columns
print("Non-numeric columns:", non_numeric_columns)

# Replace each of these columns by integer codes. The encoder is re-fitted on
# the training values of every column, and that same fit is then applied to
# the training and the test column.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder; it
# raises on test values never seen during fit, and only the fit for the last
# column survives the loop.
label_encoder = LabelEncoder()
for col in non_numeric_columns:
    label_encoder.fit(X_train[col])
    X_train[col] = label_encoder.transform(X_train[col])
    X_test[col] = label_encoder.transform(X_test[col])

Non-numeric columns: Index(['counter_id', 'counter_name', 'site_name', 'coordinates',
       'counter_technical_id'],
      dtype='object')


In [7]:
# Turn counter_installation_date into numeric features: only the installation
# year and month are kept (no day-level or "days since installation" feature).
installation_column = "counter_installation_date"
for frame in (X_train, X_test):
    installed_on = frame[installation_column].dt
    frame["installation_year"] = installed_on.year
    frame["installation_month"] = installed_on.month

# The raw datetime column is dropped once its components have been extracted.
X_train, X_test = (
    frame.drop(columns=[installation_column]) for frame in (X_train, X_test)
)

In [8]:
# Standardise every feature. The scaler learns its statistics from the
# training data only and is then applied unchanged to both sets.
# From here on X_train and X_test hold the scaler's output rather than the
# original DataFrames.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [9]:
# Ridge regression estimator (created here, fitted in a later cell);
# alpha is the regularization strength.
ridge_alpha = 1.0
ridge = Ridge(alpha=ridge_alpha)

In [10]:
# Fit the ridge model on the training data, then predict the test set.
# fit() returns the fitted estimator itself, so the two steps are chained.
y_predictions = ridge.fit(X_train, y_train).predict(X_test)


In [11]:
# Quick look at the test-set predictions; the model was fitted on
# log_bike_count, so these values are on that scale.
print(y_predictions)

[1.9131636  2.91841086 2.84188461 ... 3.58696274 3.6056182  3.61752599]


In [12]:
# Training RMSE on the log_bike_count scale. It is computed on the data the
# model was fitted on, so it is an in-sample figure.
# The square root is taken explicitly rather than through
# mean_squared_error(..., squared=False): that argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
y_train_predictions = ridge.predict(X_train)
rmse_train = float(np.sqrt(mean_squared_error(y_train, y_train_predictions)))
print(f"Training RMSE: {rmse_train}")

Training RMSE: 1.4606961368208042




In [13]:
# Write the submission file: one row per test prediction, with the row position
# exposed as the "Id" column followed by the predicted log_bike_count.
# NOTE(review): the output path is absolute and machine-specific.
submission_path = "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/predictions_option_vsimple_with_weather.csv"
submission = pd.DataFrame(y_predictions, columns=["log_bike_count"])
submission = submission.rename_axis("Id").reset_index()
submission.to_csv(submission_path, index=False)