In [40]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder

import holidays
import seaborn as sns


In [41]:
# `utils` is the project-local helper module that knows where the data lives.
import utils

# Load the training features (X) and the matching target (y).
# NOTE(review): the exact type of `y` depends on `utils.get_train_data` — confirm.
X, y = utils.get_train_data()
# Preview the first two rows of the feature frame.
X.head(2)

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
400125,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233
408305,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233


In [42]:
def _encode_dates(X):
    """Return a copy of ``X`` enriched with calendar features derived from ``date``.

    The ``date`` column is parsed to datetime and kept in the result (later
    cells still split on it). The columns ``year``, ``month``, ``day``,
    ``weekday``, ``hour``, ``is_weekend`` and ``is_holiday`` are added, in that
    order. The input frame itself is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the calendar features.
    """
    timestamps = pd.to_datetime(X["date"])
    parts = timestamps.dt

    # French public holidays for every calendar year present in the data.
    fr_holidays = holidays.France(years=parts.year.unique())

    return X.assign(
        date=timestamps,
        year=parts.year,
        month=parts.month,
        day=parts.day,
        weekday=parts.weekday,
        hour=parts.hour,
        # Monday is 0, so Saturday and Sunday are 5 and 6.
        is_weekend=parts.weekday.isin([5, 6]),
        # Compare plain dates (time of day stripped) with the holiday calendar.
        is_holiday=parts.date.isin(fr_holidays),
    )


In [43]:
# Wrap the date encoder in a FunctionTransformer so it exposes the usual
# scikit-learn fit/transform interface. validate=False lets the DataFrame
# through unchanged instead of converting it to an array first.
date_encoder = FunctionTransformer(_encode_dates, validate=False)
# Note: X is rebound to its encoded version. _encode_dates keeps every original
# column (including `date`) and appends the calendar features.
X = date_encoder.fit_transform(X)
X.head(2)

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,year,month,day,weekday,hour,is_weekend,is_holiday
400125,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,2020,9,1,1,1,False,False
408305,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233,2020,9,1,1,1,False,False


In [44]:
# X = pd.get_dummies(X, columns=["hour"], prefix="hour")
# X.head(2)

In [45]:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    """Split features and target chronologically, holding out the latest period.

    Rows whose ``date`` falls within ``delta_threshold`` of the most recent
    date in ``X`` form the validation set; everything earlier is training data.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a datetime ``"date"`` column.
    y : numpy.ndarray or pandas.Series / DataFrame
        Target values, aligned with the rows of ``X`` *by position*.
    delta_threshold : str or pandas.Timedelta, default "30 days"
        Length of the validation window, in any form ``pd.Timedelta`` accepts.

    Returns
    -------
    X_train, y_train, X_valid, y_valid
        Note the order: features and target of the training part first, then
        those of the validation part.
    """
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)

    # Use a plain NumPy boolean mask: indexing `y` with a boolean *Series*
    # would align on index labels and fail (or mis-select) whenever `y` is a
    # pandas object whose index differs from X's.
    is_train = (X["date"] <= cutoff_date).to_numpy()

    X_train, X_valid = X.loc[is_train], X.loc[~is_train]
    y_train, y_valid = y[is_train], y[~is_train]

    return X_train, y_train, X_valid, y_valid

In [46]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import StandardScaler


# Step 1: Preprocessing
# One-hot encode the categorical variables. The fitted encoder and the column
# list are kept as globals so the test set can be transformed the same way.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
categorical_encoded = onehot_encoder.fit_transform(X[categorical_cols])

# Scale the int64/float64 columns to [0, 1].
# NOTE(review): encoder and scaler are fitted on all rows, i.e. before the
# temporal split below, so the validation period influences the scaling.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(X[numerical_cols])

# Columns that were neither encoded nor scaled (the datetime columns and any
# other dtype). Build a new frame instead of dropping in place: mutating X
# would make this cell fail when it is run a second time.
X_passthrough = (
    X
    .drop(columns=categorical_cols)
    .drop(columns=numerical_cols)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)

# Combine all features: pass-through columns first, then one-hot, then scaled.
# Because the pass-through part still holds timestamps, the result is an
# object-dtype array; it is converted to float further down.
X_combined = np.hstack([X_passthrough.values, categorical_encoded, numerical_scaled])

# Step 2: Reshape for LSTM
# LSTM requires 3D input: (samples, timesteps, features)
# Assuming each sample has a single timestep
X_reshaped = X_combined.reshape(X_combined.shape[0], 1, X_combined.shape[1])

# Step 3: Temporal Train-Test Split
# Put the combined matrix back into a DataFrame with generic feature names and
# re-attach the date column that the temporal split needs.
X_combined_df = pd.DataFrame(X_combined, columns=[f"feature_{i}" for i in range(X_combined.shape[1])])
X_combined_df["date"] = X_passthrough["date"].values


# Apply temporal train-test split
X_train_split, y_train_split, X_test_split, y_test_split = train_test_split_temporal(X_combined_df, y)


# Remove every datetime column after splitting: pandas re-infers datetimes
# from the object array, so this covers the stacked timestamp columns as well
# as `date`. Both splits are row subsets of the same frame, so one lookup is
# enough for both.
datetime_columns = X_train_split.select_dtypes(include=['datetime64[ns]']).columns
print(f"Columns with datetime64[ns] dtype: {datetime_columns.tolist()}")

X_train_split = X_train_split.drop(columns=datetime_columns)
X_test_split = X_test_split.drop(columns=datetime_columns)

# The remaining columns are object dtype (see above); make them numeric.
X_train_split = X_train_split.astype(float)
X_test_split = X_test_split.astype(float)



Columns with datetime64[ns] dtype: ['feature_0', 'feature_1', 'date']
Columns with datetime64[ns] dtype: ['feature_0', 'feature_1', 'date']


In [47]:
# Load the test features and give them the same calendar encoding as the
# training features.
final_test = utils.get_test_data()
date_encoder = FunctionTransformer(_encode_dates, validate=False)
# _encode_dates is a plain function with no fitted state, so fit_transform
# does not learn anything from the test set here.
final_test = date_encoder.fit_transform(final_test)
# final_test = pd.get_dummies(final_test, columns=["hour"], prefix="hour")
# final_test.head(2)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet
import xgboost as xgb
import lightgbm as lgb

# Candidate regressors keyed by display name; the name is reused later for
# reporting and for the submission file names. Uncomment entries to add them
# to the comparison.
models = {
    # "Elastic Net": ElasticNet(random_state=42),
    # "XGBoost": xgb.XGBRegressor(random_state=42, verbosity=1),
    # "LightGBM": lgb.LGBMRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(
        n_estimators=100,     # number of trees
        max_depth=20,         # cap tree depth
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1,            # use all available cores
    ),
}

# Validation RMSE per model name.
results = {}

for name, model in models.items():
    # Fit on the training window, then score on the held-out window.
    y_pred = model.fit(X_train_split, y_train_split).predict(X_test_split)

    rmse = np.sqrt(mean_squared_error(y_test_split, y_pred))
    results[name] = rmse
    print(f"Model: {name}, RMSE: {rmse}")

# One row per model, best (lowest RMSE) first.
results_df = (
    pd.Series(results, name="RMSE", dtype=float)
    .sort_values()
    .to_frame()
)

print(results_df)


Model: Elastic Net, RMSE: 1.4149543225433936
Model: XGBoost, RMSE: 0.5120635104018924
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033888 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 456507, number of used features: 212
[LightGBM] [Info] Start training from score 3.048868
Model: LightGBM, RMSE: 0.5152787320057891
Model: Random Forest, RMSE: 0.7521767458309048
                   RMSE
XGBoost        0.512064
LightGBM       0.515279
Random Forest  0.752177
Elastic Net    1.414954


In [None]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import StandardScaler


# Step 1: Preprocessing
# Reuse the encoder, the scaler and the column lists that were fitted on the
# training data (`onehot_encoder`, `scaler`, `categorical_cols`,
# `numerical_cols` from the training preprocessing cell). Fitting fresh
# transformers on the test set would give it a different category layout and
# a different scaling than the features the models were trained on.
categorical_encoded = onehot_encoder.transform(final_test[categorical_cols])
numerical_scaled = scaler.transform(final_test[numerical_cols])

# Columns that were neither encoded nor scaled. A new frame is built instead
# of dropping in place, so `final_test` stays intact and the cell can be
# re-run.
test_passthrough = (
    final_test
    .drop(columns=categorical_cols)
    .drop(columns=numerical_cols)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)

# Combine all features in the same order as for training:
# pass-through columns, then one-hot columns, then scaled columns.
X_combined = np.hstack([test_passthrough.values, categorical_encoded, numerical_scaled])

# Step 2: Reshape for LSTM
# LSTM requires 3D input: (samples, timesteps, features)
# Assuming each sample has a single timestep
X_reshaped = X_combined.reshape(X_combined.shape[0], 1, X_combined.shape[1])

# Step 3: Rebuild a DataFrame with the same generic feature names as the
# training frame (no split is needed for the test set).
X_combined_df = pd.DataFrame(X_combined, columns=[f"feature_{i}" for i in range(X_combined.shape[1])])
X_combined_df["date"] = test_passthrough["date"].values


# Drop every datetime column, exactly as was done for the training features.
datetime_columns = X_combined_df.select_dtypes(include=['datetime64[ns]']).columns
print(f"Columns with datetime64[ns] dtype: {datetime_columns.tolist()}")

X_combined_df = X_combined_df.drop(columns=datetime_columns)

# The remaining columns come out of an object array; make them numeric.
X_combined_df = X_combined_df.astype(float)



Columns with datetime64[ns] dtype: ['feature_0', 'feature_1', 'date']


In [39]:
# Write one submission CSV per fitted model in `models`.
for name, model in models.items():
    # At this point X_combined_df must hold the preprocessed *test* features
    # (the test preprocessing cell rebinds that name).
    predictions = model.predict(X_combined_df)
    # The index of the test frame is used as the row id.
    submission = pd.DataFrame({"id": final_test.index, "log_bike_count": predictions.flatten()})
    # The model's display name goes into the file name verbatim (spaces included).
    submission_path = f"submission_{name}.csv"
    submission.to_csv(submission_path, index=False)

In [18]:
from tensorflow.keras.layers import Input

# Keras recurrent layers expect (samples, timesteps, features); every row is
# treated as a sequence with a single timestep. The 3-D arrays get their own
# names so the 2-D train/validation frames stay usable by the other models and
# this cell can be re-run.
X_train_seq = np.asarray(X_train_split, dtype=float).reshape(len(X_train_split), 1, -1)
X_test_seq = np.asarray(X_test_split, dtype=float).reshape(len(X_test_split), 1, -1)

# Step 4: Define LSTM Model
# An explicit Input layer replaces the `input_shape=` argument on the first
# layer, which recent Keras versions warn about.
model = Sequential([
    Input(shape=X_train_seq.shape[1:]),
    LSTM(50, activation="relu"),
    Dense(1),
])
model.compile(optimizer="adam", loss="mse")

# Step 5: Train the Model
model.fit(X_train_seq, y_train_split, epochs=10, batch_size=32, verbose=2)

# Step 6: Evaluate the Model (the reported loss is the MSE on the validation window)
loss = model.evaluate(X_test_seq, y_test_split)
print(f"Test Loss: {loss}")

  super().__init__(**kwargs)


Epoch 1/10
14266/14266 - 41s - 3ms/step - loss: 3.4890
Epoch 2/10


KeyboardInterrupt: 