In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
data = pd.read_parquet(Path("data") / "train.parquet")
data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585


In [3]:
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

In [4]:
from sklearn.preprocessing import FunctionTransformer

date_encoder = FunctionTransformer(_encode_dates, validate=False)
sample_encoded = date_encoder.fit_transform(data[["date"]]).head()
sample_encoded

Unnamed: 0,year,month,day,weekday,hour
48321,2020,9,1,1,2
48324,2020,9,1,1,3
48327,2020,9,1,1,4
48330,2020,9,1,1,15
48333,2020,9,1,1,18


In [5]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(sparse_output=False)

enc.fit_transform(sample_encoded[["hour"]])

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

In [6]:
import utils

X, y = utils.get_train_data()
X.head(2)

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
400125,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233
408305,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233


In [7]:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = (X["date"] <= cutoff_date)
    X_train, X_valid = X.loc[mask], X.loc[~mask]
    y_train, y_valid = y[mask], y[~mask]

    return X_train, y_train, X_valid, y_valid

In [8]:
X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)

print(
    f'Train: n_samples={X_train.shape[0]},  {X_train["date"].min()} to {X_train["date"].max()}'
)
print(
    f'Valid: n_samples={X_valid.shape[0]},  {X_valid["date"].min()} to {X_valid["date"].max()}'
)

Train: n_samples=456507,  2020-09-01 01:00:00 to 2021-08-10 23:00:00
Valid: n_samples=40320,  2021-08-11 00:00:00 to 2021-09-09 23:00:00


In [9]:
_encode_dates(X_train[["date"]]).columns.tolist()

['year', 'month', 'day', 'weekday', 'hour']

In [10]:
X_train.head(10)

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
400125,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233
408305,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233
87516,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.85372,2.35702
98518,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.85372,2.35702
875137,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2020-09-01 01:00:00,2020-07-22,"48.88529,2.32666",Y2H20073268,48.88529,2.32666
882956,100063175-353277235,20 Avenue de Clichy SE-NO,100063175,20 Avenue de Clichy,2020-09-01 01:00:00,2020-07-22,"48.88529,2.32666",Y2H20073268,48.88529,2.32666
754676,100056336-106056336,27 quai de la Tournelle NO-SE,100056336,27 quai de la Tournelle,2020-09-01 01:00:00,2019-11-14,"48.85013,2.35423",Y2H19070383,48.85013,2.35423
766471,100056336-105056336,27 quai de la Tournelle SE-NO,100056336,27 quai de la Tournelle,2020-09-01 01:00:00,2019-11-14,"48.85013,2.35423",Y2H19070383,48.85013,2.35423
48428,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 01:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
57884,100007049-101007049,28 boulevard Diderot O-E,100007049,28 boulevard Diderot,2020-09-01 01:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ]
)

regressor = LinearRegression()

pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)

In [14]:
from sklearn.metrics import mean_squared_error

print(
    f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}"
)
print(
    f"Valid set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}"
)

Train set, RMSE=0.80
Valid set, RMSE=0.73




In [15]:
print("Baseline mean prediction.")
print(
    f"Train set, RMSE={mean_squared_error(y_train, np.full(y_train.shape, y_train.mean()), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_valid, np.full(y_valid.shape, y_valid.mean()), squared=False):.2f}"
)

Baseline mean prediction.
Train set, RMSE=1.67
Test set, RMSE=1.44




In [16]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp39-cp39-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Downloading tensorflow_intel-2.18.0-cp39-cp39-win_amd64.whl.metadata (4.9 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow-

In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split

# Assuming X_train and y_train are your data inputs and labels

# Step 1: Preprocessing
# One-hot encode the categorical variables
categorical_cols = ["counter_name", "site_name"]
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
categorical_encoded = onehot_encoder.fit_transform(X_train[categorical_cols])

# Normalize numerical columns
numerical_cols = ["latitude", "longitude"]
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(X_train[numerical_cols])

# Extract temporal features from 'date'
X_train["date"] = pd.to_datetime(X_train["date"])
X_train["year"] = X_train["date"].dt.year
X_train["month"] = X_train["date"].dt.month
X_train["day"] = X_train["date"].dt.day
X_train["hour"] = X_train["date"].dt.hour

temporal_cols = ["year", "month", "day", "hour"]

# Combine all features
X_combined = np.hstack(
    [categorical_encoded, numerical_scaled, X_train[temporal_cols].values]
)

# Step 2: Reshape for LSTM
# LSTM requires 3D input: (samples, timesteps, features)
# Assuming each sample has a single timestep
X_reshaped = X_combined.reshape(X_combined.shape[0], 1, X_combined.shape[1])

# Step 3: Train-Test Split
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_reshaped, y_train, test_size=0.2, random_state=42
)

# Step 4: Define LSTM Model
model = Sequential()
model.add(LSTM(50, activation="relu", input_shape=(X_reshaped.shape[1], X_reshaped.shape[2])))
model.add(Dense(1))
model.compile(optimizer="adam", loss="mse")

# Step 5: Train the Model
model.fit(X_train_split, y_train_split, epochs=50, batch_size=32, verbose=2)

# Step 6: Evaluate the Model
loss = model.evaluate(X_test_split, y_test_split)
print(f"Test Loss: {loss}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["date"] = pd.to_datetime(X_train["date"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["year"] = X_train["date"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["month"] = X_train["date"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Tr

Epoch 1/50
11413/11413 - 66s - 6ms/step - loss: 2.4664
Epoch 2/50
11413/11413 - 63s - 6ms/step - loss: 2.2973
Epoch 3/50
11413/11413 - 64s - 6ms/step - loss: 2.2671
Epoch 4/50
11413/11413 - 53s - 5ms/step - loss: 2.2391
Epoch 5/50
11413/11413 - 52s - 5ms/step - loss: 1.9901
Epoch 6/50
11413/11413 - 53s - 5ms/step - loss: 1.1374
Epoch 7/50
11413/11413 - 54s - 5ms/step - loss: 1.0807
Epoch 8/50
11413/11413 - 51s - 5ms/step - loss: 1.0618
Epoch 9/50
11413/11413 - 52s - 5ms/step - loss: 1.0515
Epoch 10/50
11413/11413 - 52s - 5ms/step - loss: 1.0500
Epoch 11/50
11413/11413 - 51s - 4ms/step - loss: 1.0487
Epoch 12/50
11413/11413 - 58s - 5ms/step - loss: 1.0435
Epoch 13/50
11413/11413 - 73s - 6ms/step - loss: 1.0427
Epoch 14/50
11413/11413 - 52s - 5ms/step - loss: 1.0432
Epoch 15/50
11413/11413 - 86s - 8ms/step - loss: 1.0412
Epoch 16/50
11413/11413 - 64s - 6ms/step - loss: 1.0389
Epoch 17/50
11413/11413 - 86s - 8ms/step - loss: 1.0375
Epoch 18/50
11413/11413 - 69s - 6ms/step - loss: 1.0327
E

In [28]:
import pandas as pd
import numpy as np
import zipfile

# Step 1: Load the test data
final_test = pd.read_parquet('data/final_test.parquet')

# Step 2: Preprocess the test data
# Apply the same transformations as in training
categorical_encoded_test = onehot_encoder.transform(final_test[categorical_cols])
numerical_scaled_test = scaler.transform(final_test[numerical_cols])

# Extract temporal features
final_test["date"] = pd.to_datetime(final_test["date"])
final_test["year"] = final_test["date"].dt.year
final_test["month"] = final_test["date"].dt.month
final_test["day"] = final_test["date"].dt.day
final_test["hour"] = final_test["date"].dt.hour

X_test_combined = np.hstack(
    [categorical_encoded_test, numerical_scaled_test, final_test[temporal_cols].values]
)

# Reshape for LSTM input
X_test_reshaped = X_test_combined.reshape(X_test_combined.shape[0], 1, X_test_combined.shape[1])

# Step 3: Predict using the trained model
predictions = model.predict(X_test_reshaped)

# Step 4: Prepare the submission file
submission = pd.DataFrame({"id": final_test.index, "log_bike_count": predictions.flatten()})
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)

# Step 5: Compress the file
with zipfile.ZipFile("submission.zip", "w") as zipf:
    zipf.write(submission_path)


[1m1608/1608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step
