In [83]:
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import holidays

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
import lightgbm as lgb

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [84]:
# Project-local helper module; it provides the data loaders used in this notebook.
import utils

# Load the training features (X) and the regression target (y).
# NOTE(review): y is presumably the log bike count (the submissions below write a
# "log_bike_count" column) -- confirm against utils.get_train_data.
X, y = utils.get_train_data()
# Preview the first two rows to check the raw schema.
X.head(2)

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
400125,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233
408305,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233


In [85]:
def _encode_dates(X):
    """Return a copy of ``X`` enriched with calendar features derived from ``date``.

    Added columns: ``year``, ``month``, ``day``, ``weekday`` (Monday = 0) and
    ``hour`` taken from the timestamp, plus two boolean flags:

    * ``is_weekend`` -- the row falls on a Saturday or Sunday;
    * ``is_holiday`` -- the calendar day is a French public holiday.

    The ``date`` column itself is converted to datetime and kept in the result
    (it is not dropped); the input frame is left untouched.
    """
    encoded = X.copy()  # never mutate the caller's frame

    # Coerce once and reuse the datetime accessor for every derived column.
    encoded["date"] = pd.to_datetime(encoded["date"])
    stamps = encoded["date"].dt

    for part in ("year", "month", "day", "weekday", "hour"):
        encoded[part] = getattr(stamps, part)

    # weekday runs 0..6, so Saturday (5) and Sunday (6) are exactly the values >= 5.
    encoded["is_weekend"] = encoded["weekday"].ge(5)

    # Holiday calendar restricted to the years actually present in the data.
    fr_holidays = holidays.France(years=encoded["year"].unique())
    encoded["is_holiday"] = stamps.date.isin(fr_holidays)

    return encoded


In [86]:
# Wrap _encode_dates as a scikit-learn transformer; validate=False hands the
# DataFrame to the function as-is instead of converting it to an array first.
date_encoder = FunctionTransformer(_encode_dates, validate=False)
# Replace X with its date-enriched copy (adds year/month/day/weekday/hour and the
# weekend/holiday flags, keeps 'date').
X = date_encoder.fit_transform(X)
# Preview the first two rows to check the new columns.
X.head(2)

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,year,month,day,weekday,hour,is_weekend,is_holiday
400125,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,2020,9,1,1,1,False,False
408305,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,2020,9,1,1,1,False,False


In [87]:
X['counter_id'][80:90]

627667    100056327-104056327
675566    100056331-104056331
685544    100056331-103056331
264224    100047542-103047542
273935    100047542-104047542
313454    100047546-104047546
323423    100047546-103047546
595358    100056226-104056226
608542    100056226-103056226
635519    100056329-104056329
Name: counter_id, dtype: category
Categories (56, object): ['100007049-101007049', '100007049-102007049', '100036718-103036718', '100036718-104036718', ..., '100063175-353277233', '100063175-353277235', '300014702-353245971', '300014702-353245972']

In [88]:
# Coerce 'date' to datetime so min/max compare timestamps rather than strings.
X['date'] = pd.to_datetime(X['date'])

# Time span covered by the training data.
date_column = X['date']
earliest_date, latest_date = date_column.min(), date_column.max()

print(f"Earliest date: {earliest_date}")
print(f"Latest date: {latest_date}")


Earliest date: 2020-09-01 01:00:00
Latest date: 2021-09-09 23:00:00


In [77]:
# Calendar days on which a strike took place (one row per day, Strike == 1).
# NOTE(review): every date below is in 2022-2023, while the training data printed
# above spans 2020-09-01 .. 2021-09-09, so the flag is 0 for every training row
# unless this list is extended to the period actually covered by the data.
strike_data = {'date': [datetime(2023, 2, 7), datetime(2023, 2, 16), datetime(2023, 3, 7),
                 datetime(2023, 1, 31), datetime(2022, 2, 18), datetime(2022, 3, 25),
                 datetime(2022, 5, 23), datetime(2022, 9, 29), datetime(2022, 10, 13)],
                'Strike': [1] * 9}

# Strike calendar as a DataFrame, sorted by ascending date.
strike = (
    pd.DataFrame(strike_data)
    .sort_values(by='date')
    .reset_index(drop=True)
)

# Flag every hourly row that falls on a strike day.
# The timestamps in X are hourly, so matching on the exact timestamp (as a merge on
# 'date' does) would only ever flag the 00:00 row of a strike day; compare calendar
# days instead. Assigning a column (rather than merging) also keeps X's original
# index and row order intact.
strike_days = set(strike.loc[strike['Strike'] == 1, 'date'].dt.date)
X['Strike'] = pd.to_datetime(X['date']).dt.date.isin(strike_days).astype(int)

#Create get_TimeOfDay_name and get_TimeOfDay functions 
def get_TimeOfDay_name(hour):
    """Return the label of the part of the day that ``hour`` belongs to.

    Each slot is open on the left and closed on the right (``lower < hour <= upper``).
    Any hour outside the listed slots -- 0 to 3 and 23 -- is labelled as night.
    """
    # (exclusive lower bound, inclusive upper bound, label), checked in order.
    day_slots = (
        (3, 6, 'Early morning 4:00AM - 6:00 AM'),
        (6, 10, 'Morning 7:00AM - 10:00 AM'),
        (10, 13, 'Middle of the day 11:00 AM - 1:00 PM'),
        (13, 17, 'Afternoon 2:00 PM - 5:00 PM'),
        (17, 22, 'Evening 6:00 PM - 10:00 PM'),
    )
    for lower, upper, label in day_slots:
        if lower < hour <= upper:
            return label
    return 'Night 11:00 PM - 3:00 AM'
  
def get_TimeOfDay(hour):
    """Return the numeric part-of-day bucket (1-6) for ``hour``.

    1: hours 4-6, 2: hours 7-10, 3: hours 11-13, 4: hours 14-17, 5: hours 18-22,
    6: everything else (hours 0-3 and 23).
    """
    # Anything that is not strictly after 3 belongs to the night bucket. Written as
    # "not >" so a value that compares False to everything (NaN) also lands there.
    if not hour > 3:
        return 6

    # Inclusive upper bound of buckets 1..5, in ascending order.
    for bucket, upper in enumerate((6, 10, 13, 17, 22), start=1):
        if hour <= upper:
            return bucket

    # Later than 22: night again.
    return 6

# Derive the numeric part-of-day bucket and its readable label from the hour column.
X['TimeOfDay'] = X['hour'].map(get_TimeOfDay)
X['TimeOfDay_name'] = X['hour'].map(get_TimeOfDay_name)

def _season_index(date):
    """Return the season number of ``date`` (1 Spring, 2 Summer, 3 Fall, 4 Winter).

    The season is decided from the month and day only, so it works for any year:

    * Spring: 20 March    - 20 June
    * Summer: 21 June     - 20 September
    * Fall:   21 September - 20 December
    * Winter: 21 December - 19 March

    Returns None for a missing date (None / NaT).
    """
    if pd.isna(date):
        return None

    month_day = (date.month, date.day)
    if (3, 20) <= month_day <= (6, 20):
        return 1
    if (6, 21) <= month_day <= (9, 20):
        return 2
    if (9, 21) <= month_day <= (12, 20):
        return 3
    # Everything else wraps around the new year: 21 December .. 19 March.
    return 4


def get_season_name(date):
    """Return 'Spring', 'Summer', 'Fall' or 'Winter' for ``date`` (None if missing).

    ``date`` may be a ``datetime`` or a pandas ``Timestamp``. Unlike a comparison
    against fixed 2022/2023 boundaries, this yields a season for every year.
    """
    season = _season_index(date)
    if season is None:
        return None
    return ('Spring', 'Summer', 'Fall', 'Winter')[season - 1]


def get_season(date):
    """Return the season of ``date`` as a number (None if the date is missing).

    1 = Spring, 2 = Summer, 3 = Fall, 4 = Winter; same boundaries as
    ``get_season_name``, valid for any year.
    """
    return _season_index(date)

# Derive the numeric season and its readable name from each row's timestamp.
X['Season'] = X['date'].map(get_season)
X['Season_name'] = X['date'].map(get_season_name)

In [89]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 496827 entries, 400125 to 135985
Data columns (total 17 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   date                       496827 non-null  datetime64[us]
 5   counter_installation_date  496827 non-null  datetime64[us]
 6   coordinates                496827 non-null  category      
 7   counter_technical_id       496827 non-null  category      
 8   latitude                   496827 non-null  float64       
 9   longitude                  496827 non-null  float64       
 10  year                       496827 non-null  int32         
 11  month                      496827 non-null  int32   

In [90]:
df = X.copy()

In [91]:
# One-hot encode the hour of day: the 'hour' column is replaced by one boolean
# 'hour_<h>' column per distinct hour present in X.
# NOTE: this cell is not re-runnable as written -- after the first run X no longer
# has an 'hour' column, so a second run fails on the missing column.
X = pd.get_dummies(X, columns=["hour"], prefix="hour")
# Preview the first two rows to check the new hour_* columns.
X.head(2)

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
400125,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,...,False,False,False,False,False,False,False,False,False,False
408305,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,...,False,False,False,False,False,False,False,False,False,False


In [92]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 496827 entries, 400125 to 135985
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   date                       496827 non-null  datetime64[us]
 5   counter_installation_date  496827 non-null  datetime64[us]
 6   coordinates                496827 non-null  category      
 7   counter_technical_id       496827 non-null  category      
 8   latitude                   496827 non-null  float64       
 9   longitude                  496827 non-null  float64       
 10  year                       496827 non-null  int32         
 11  month                      496827 non-null  int32   

In [93]:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    """Split ``X`` and ``y`` into a training part and a most-recent validation part.

    Rows whose ``date`` is at most ``max(date) - delta_threshold`` go to the training
    set; the remaining (latest) rows form the validation set.

    Parameters
    ----------
    X : pandas.DataFrame
        Features; must contain a datetime ``date`` column.
    y : array-like
        Targets, positionally aligned with the rows of ``X`` (NumPy array or pandas
        object). The index of a pandas ``y`` is ignored for the split.
    delta_threshold : str or timedelta-like, default "30 days"
        Length of the validation window, anything ``pd.Timedelta`` accepts.

    Returns
    -------
    X_train, y_train, X_valid, y_valid

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of rows.
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
        )

    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)

    # Use a plain positional boolean array: indexing y with a label-indexed boolean
    # Series fails when y is a pandas object whose index differs from X's.
    is_train = (X["date"] <= cutoff_date).to_numpy()

    X_train, X_valid = X.loc[is_train], X.loc[~is_train]
    y_train, y_valid = y[is_train], y[~is_train]

    return X_train, y_train, X_valid, y_valid

In [94]:
# Step 1: Fit the transformers on the training frame.
# NOTE(review): both transformers are fitted on all rows, i.e. before the temporal
# split, so the validation rows take part in fitting. This is kept as-is so the
# feature layout stays unchanged; fit on the training part only if a strictly
# leak-free validation score is required.

# One-hot encode the categorical variables
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
categorical_encoded = onehot_encoder.fit_transform(X[categorical_cols])

# Scale the int64/float64 columns to [0, 1]
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(X[numerical_cols])

# Columns handled by neither transformer (datetimes, int32 date parts, boolean
# flags and hour dummies) are passed through unchanged. X itself is NOT modified,
# so this cell can be re-run safely.
passthrough = X.drop(columns=[*categorical_cols, *numerical_cols])

# Step 2: Assemble the feature table as [passthrough | one-hot | scaled].
# Concatenating typed frames (instead of stacking everything into one object array)
# keeps every column's dtype, so no dtype re-inference is needed afterwards.
X_combined_df = pd.concat(
    [
        passthrough.reset_index(drop=True),
        pd.DataFrame(categorical_encoded),
        pd.DataFrame(numerical_scaled),
    ],
    axis=1,
)
X_combined_df.columns = [f"feature_{i}" for i in range(X_combined_df.shape[1])]
# The split below needs a column literally named 'date'.
X_combined_df["date"] = pd.to_datetime(X["date"]).values

# Step 3: Temporal train/validation split
X_train_split, y_train_split, X_test_split, y_test_split = train_test_split_temporal(X_combined_df, y)

# Step 4: Drop the datetime columns (only needed for splitting) and cast to float
datetime_columns = X_combined_df.select_dtypes(include=['datetime']).columns
print(f"Columns with datetime64[ns] dtype: {datetime_columns.tolist()}")

X_train_split = X_train_split.drop(columns=datetime_columns).astype(float)
X_test_split = X_test_split.drop(columns=datetime_columns).astype(float)



Columns with datetime64[ns] dtype: ['feature_0', 'feature_1', 'date']
Columns with datetime64[ns] dtype: ['feature_0', 'feature_1', 'date']


In [96]:
# Load the competition test set and derive the same date features as for training.
final_test = utils.get_test_data()
date_encoder = FunctionTransformer(_encode_dates, validate=False)
final_test = date_encoder.fit_transform(final_test)

# Declare all 24 hours up front so the hour_0 .. hour_23 dummy columns always exist,
# in that order, even if some hour never occurs in the test period.
final_test["hour"] = pd.Categorical(final_test["hour"], categories=list(range(24)))
final_test = pd.get_dummies(final_test, columns=["hour"], prefix="hour")

# NOTE: any extra feature added to the training frame (strike flag, time of day,
# season, ...) must be added to final_test here as well, before the transforms
# below, otherwise the train and test feature layouts diverge.

# Reuse the encoder and scaler that were fitted on the TRAINING data (and the column
# lists chosen there). Fitting fresh transformers on the test set would scale and
# encode the test rows differently from the rows the models were trained on.
categorical_encoded = onehot_encoder.transform(final_test[categorical_cols])
numerical_scaled = scaler.transform(final_test[numerical_cols])

# Columns handled by neither transformer are passed through unchanged.
test_passthrough = final_test.drop(columns=[*categorical_cols, *numerical_cols])

# Same layout as the training features: [passthrough | one-hot | scaled], with
# positional feature_<i> names so they line up with the names used for training.
final_test_combined = pd.concat(
    [
        test_passthrough.reset_index(drop=True),
        pd.DataFrame(categorical_encoded),
        pd.DataFrame(numerical_scaled),
    ],
    axis=1,
)
final_test_combined.columns = [f"feature_{i}" for i in range(final_test_combined.shape[1])]

# Datetime columns are not model inputs: drop them, then cast everything to float.
datetime_columns = final_test_combined.select_dtypes(include=['datetime']).columns
print(f"Columns with datetime64[ns] dtype: {datetime_columns.tolist()}")

final_test_combined = final_test_combined.drop(columns=datetime_columns).astype(float)


Columns with datetime64[ns] dtype: ['feature_0', 'feature_1', 'date']


In [97]:
# Candidate regressors, keyed by the name used in the log lines and file names.
models = {
    "XGBoost": xgb.XGBRegressor(random_state=42, verbosity=1),
    "LightGBM": lgb.LGBMRegressor(random_state=42),
}

# Fit every candidate on the training split and score it on the hold-out split.
results = {}
for name, model in models.items():
    model.fit(X_train_split, y_train_split)
    y_pred = model.predict(X_test_split)

    # Root mean squared error on the hold-out rows
    rmse = np.sqrt(mean_squared_error(y_test_split, y_pred))
    results[name] = rmse
    print(f"Model: {name}, RMSE: {rmse}")

# Leaderboard: one row per model, lowest RMSE first.
results_df = pd.Series(results, name='RMSE').sort_values().to_frame()
print(results_df)

# Write one submission file per fitted model.
for name, model in models.items():
    predictions = model.predict(final_test_combined)
    submission_path = f"submission_{name}.csv"
    submission = pd.DataFrame(
        {"id": final_test.index, "log_bike_count": predictions.ravel()}
    )
    submission.to_csv(submission_path, index=False)

Model: XGBoost, RMSE: 0.5126279064139162
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 601
[LightGBM] [Info] Number of data points in the train set: 456507, number of used features: 235
[LightGBM] [Info] Start training from score 3.048868
Model: LightGBM, RMSE: 0.5368720878935426
              RMSE
XGBoost   0.512628
LightGBM  0.536872


In [67]:
# Base learners, both trained on the training split only.
rf = RandomForestRegressor(
    n_estimators=100,  # Fewer trees
    max_depth=20,      # Limit depth
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,         # Utilize multiple cores
)
lgb_model = lgb.LGBMRegressor(random_state=42)

rf.fit(X_train_split, y_train_split)
lgb_model.fit(X_train_split, y_train_split)

# Base-model predictions on the hold-out split are the meta-model's input features
# (one column per base model).
rf_pred = rf.predict(X_test_split)
lgb_pred = lgb_model.predict(X_test_split)
stacked_features = np.column_stack((rf_pred, lgb_pred))

meta_model = xgb.XGBRegressor(random_state=42)

# Score the stack on out-of-fold predictions: each hold-out row is predicted by a
# meta-model that was not trained on it. Fitting the meta-model on these rows and
# scoring it on the very same rows would report a training error, not a
# generalisation estimate.
# NOTE(review): the folds are plain contiguous K-folds, not time-ordered ones.
final_pred = cross_val_predict(meta_model, stacked_features, y_test_split, cv=5)

rmse = np.sqrt(mean_squared_error(y_test_split, final_pred))
print(f"RMSE of Stacked Model: {rmse}")

# Refit the meta-model on all hold-out rows so it is ready to predict on new data.
meta_model.fit(stacked_features, y_test_split)


KeyboardInterrupt: 

In [None]:
# Base-model predictions on the competition test features, one column per model,
# in the same (rf, lgb) order the meta-model was trained with.
rf_pred = rf.predict(final_test_combined)
lgb_pred = lgb_model.predict(final_test_combined)
stacked_features = np.column_stack((rf_pred, lgb_pred))

# The meta-model turns the two base predictions into the final prediction.
predictions = meta_model.predict(stacked_features)

# Write the submission file.
submission_path = "submission_meta_model.csv"
submission = pd.DataFrame(
    {"id": final_test.index, "log_bike_count": predictions.ravel()}
)
submission.to_csv(submission_path, index=False)