In [396]:
import pandas as pd
import numpy as np
from pathlib import Path
import holidays
import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from prophet import Prophet
from statsmodels.tsa.statespace.sarimax import SARIMAX
from feature_engineering import codify_date, codify_date_2, remove_outliers, get_X_y, covid_19, covid_19_2


In [397]:
# Read the train / test splits (parquet) and the external weather table (csv).
data_dir = Path("data")

df = pd.read_parquet(data_dir / "train.parquet")
test = pd.read_parquet(data_dir / "final_test.parquet")

# Snapshot of the test set as loaded, kept so later transformations can be checked against it.
test_old = test.copy()

weather_data = pd.read_csv("data/external_data.csv")

In [398]:
# Column names of the test snapshot taken right after loading (before any feature engineering).
test_old.columns

Index(['counter_id', 'counter_name', 'site_id', 'site_name', 'date',
       'counter_installation_date', 'coordinates', 'counter_technical_id',
       'latitude', 'longitude'],
      dtype='object')

In [399]:
# Run the project's date helper (imported from feature_engineering) on both splits,
# train first and then test. Judging by the column listings printed in this notebook it
# adds datetime, year, month, day, day_of_week, hour, is_weekend and IsHoliday.
df, test = (codify_date_2(frame) for frame in (df, test))

In [400]:
# Baseline experiment: calendar-only features, scored with 5-fold cross-validation.
target = 'log_bike_count'
features = ['hour', 'month', 'IsHoliday', 'day', "day_of_week", "is_weekend"]

# Design matrix and response taken straight from the training frame
X, y = df[features], df[target]

# Gradient-boosted trees with a fixed seed so reruns give the same scores
model = HistGradientBoostingRegressor(random_state=42, max_iter=100)

# scikit-learn scorers follow "higher is better", hence the *negative* MSE:
# values closer to zero mean a better fit
cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Per-fold scores, then their mean
print("Negative MSE scores from cross-validation:", cv_scores)
print("Average Negative MSE:", cv_scores.mean())

Negative MSE scores from cross-validation: [-0.96288667 -0.55461495 -0.64306749 -0.95816682 -1.43161343]
Average Negative MSE: -0.9100698711407886


In [401]:
# Train a fresh model on the complete training set (fit returns the estimator itself)
model = HistGradientBoostingRegressor(random_state=42, max_iter=100).fit(X, y)

# Predict log counts for the test rows from the same feature columns used in training
X_test = test[features]
y_pred_test = model.predict(X_test)

# Submission table: one row per test observation, identified by the test frame's index
submission_columns = {
    'Id': test.index,
    'log_bike_count': y_pred_test,
}
output_df = pd.DataFrame(submission_columns)

# Writing the CSV happens in a separate cell; here we only peek at the first rows
print(output_df.head())



   Id  log_bike_count
0   0        1.471516
1   1        4.456159
2   2        5.158623
3   3        4.210475
4   4        3.285660


In [402]:
# Persist the calendar-only predictions as a submission file, without the index column
submission_path = "submission_maxim_2.csv"
output_df.to_csv(submission_path, index=False)

# Now including weather

In [403]:
# Load the external weather observations, parse their timestamps and drop fully
# duplicated rows. A chained call is used instead of inplace=True so that re-running
# the cell always starts from the file contents.
weather = (
    pd.read_csv('data/external_data.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .drop_duplicates()
)

# let's consider the "nearest preceding weather" --> biking behavior is influenced by previous weather; incorporating future information could introduce bias
# therefore: consider the "last 3h" variables (for consistency)

# Columns kept for modelling; the commented-out entries are candidates that were tried
# or considered and are deliberately left out for now.
relevant_columns = [
    'date',         # Date
    't',            # Temperature
    #'tn12',         # Minimum Temperature over 12 hours
    #'tn24',         # Minimum Temperature over 24 hours
    #'tx12',         # Maximum Temperature over 12 hours
    #'tx24',         # Maximum Temperature over 24 hours
    #'tminsol',      # Ground Temperature
    'rr1',          # Precipitation in last hour
    'rr3',          # Precipitation in last 3 hours
    #'rr6',          # Precipitation in last 6 hours
    #'rr12',         # Precipitation in last 12 hours
    #'rr24',         # Precipitation in last 24 hours
    'ht_neige',     # Total Snow Depth
    #'ssfrai',       # Fresh Snow Depth
    'ff',           # Wind Speed
    'raf10',        # Wind Gusts over 10 minutes
    'u',            # Humidity
    #'vv',           # ok
    'ww',           # Current Weather Condition
    'etat_sol',     # Ground Condition
    'tend',         # Pressure Trend in 3 hours
    #'tend24'        # Pressure Trend in 24 hours
]

# Explicit copy: later cells assign into weather['date'], and writing into a column
# subset of another frame is what triggers pandas' SettingWithCopyWarning.
weather = weather[relevant_columns].copy()


In [404]:
# merge_asof needs both key columns to be datetimes and both frames sorted by their key
df = (
    df.assign(datetime=df['datetime'].astype('datetime64[ns]'))
    .sort_values('datetime')
)
weather = (
    weather.assign(date=pd.to_datetime(weather['date']))
    .sort_values('date')
)

# For every count, attach the most recent weather record at or before its timestamp.
# Weather columns whose name clashes with a training column get the '_weather' suffix.
df = pd.merge_asof(
    left=df,
    right=weather,
    left_on='datetime',
    right_on='date',
    direction='backward',
    suffixes=('', '_weather'),
)

In [405]:
# Column names of the test frame after codify_date_2, before the weather merge below.
test.columns

Index(['counter_id', 'counter_name', 'site_id', 'site_name', 'date',
       'counter_installation_date', 'coordinates', 'counter_technical_id',
       'latitude', 'longitude', 'datetime', 'year', 'month', 'day',
       'day_of_week', 'hour', 'is_weekend', 'IsHoliday'],
      dtype='object')

In [406]:
# Attach weather to the test set while leaving its row order AND its index untouched,
# then check the result against the snapshot taken right after loading.
test['datetime'] = test['datetime'].astype('datetime64[ns]')

# Step 1: Reduce the snapshot to the original columns, with 'date' renamed to 'datetime'
# so it can be compared column-for-column with the merged frame
test_old = test_old.rename(columns={'date': 'datetime'})
columns_of_interest_old = ['counter_id', 'counter_name', 'site_id', 'site_name', 'datetime',
       'counter_installation_date', 'coordinates', 'counter_technical_id',
       'latitude', 'longitude']
test_old = test_old[columns_of_interest_old].copy()

# Step 2: merge_asof needs its left frame sorted by the key and hands back a fresh
# 0..n-1 index, so the original index travels along as a column
original_index_name = test.index.name
test['original_index'] = test.index

test_sorted = test.sort_values('datetime')
weather_sorted = weather.sort_values('date')

# Latest weather record at or before each test timestamp
test = pd.merge_asof(test_sorted, weather_sorted, left_on='datetime', right_on='date', direction='backward', suffixes=('', '_weather'))

# Step 3: Restore the original row order and put the original index back in place.
# Only sorting would leave the merge's positional index scrambled, which makes
# test.index useless as a row identifier afterwards.
test = (
    test.sort_values('original_index')
    .set_index('original_index')
    .rename_axis(original_index_name)
)

# Step 4: Compare the columns of interest before and after the merge/sort (with test_old)
columns_match_all_old = test_old.values == test[columns_of_interest_old].values

# Step 5: Print the results
print("Do all columns match? ", columns_match_all_old.all())
print("\nTest_old before merge (first 5 rows):")
print(test_old.head())
print("\nTest after merge and sorting (first 5 rows):")
print(test[columns_of_interest_old].head())


Do all columns match?  True

Test_old before merge (first 5 rows):
            counter_id              counter_name    site_id  \
0  100007049-102007049  28 boulevard Diderot E-O  100007049   
1  100007049-102007049  28 boulevard Diderot E-O  100007049   
2  100007049-102007049  28 boulevard Diderot E-O  100007049   
3  100007049-102007049  28 boulevard Diderot E-O  100007049   
4  100007049-102007049  28 boulevard Diderot E-O  100007049   

              site_name            datetime counter_installation_date  \
0  28 boulevard Diderot 2021-09-10 01:00:00                2013-01-18   
1  28 boulevard Diderot 2021-09-10 13:00:00                2013-01-18   
2  28 boulevard Diderot 2021-09-10 17:00:00                2013-01-18   
3  28 boulevard Diderot 2021-09-10 19:00:00                2013-01-18   
4  28 boulevard Diderot 2021-09-10 22:00:00                2013-01-18   

          coordinates counter_technical_id   latitude  longitude  
0  48.846028,2.375429          Y2H15027244  48.8

In [407]:
# Disabled draft: the code below is wrapped in a string literal, so none of it runs.
# It sketches the same sort -> merge_asof -> restore-order steps that the previous cell performs.
# Because the string is the cell's last expression, the notebook echoes it as output.
""" test['datetime'] = test['datetime'].astype('datetime64[ns]')

# Assuming test and weather are already loaded
# Step 1: Store the original order of the test dataset (before sorting)
test['original_index'] = test.index

# Step 2: Sort the test dataset by 'date' before merging with weather
test = test.sort_values('datetime')

# Step 3: Merge weather data (as you did before)
test = pd.merge_asof(test, weather, left_on='datetime', right_on='date', direction='backward')
# Step 4: Revert to the original order using 'original_index'
test = test.sort_values('original_index')

test["original_index"]"""

' test[\'datetime\'] = test[\'datetime\'].astype(\'datetime64[ns]\')\n\n# Assuming test and weather are already loaded\n# Step 1: Store the original order of the test dataset (before sorting)\ntest[\'original_index\'] = test.index\n\n# Step 2: Sort the test dataset by \'date\' before merging with weather\ntest = test.sort_values(\'datetime\')\n\n# Step 3: Merge weather data (as you did before)\ntest = pd.merge_asof(test, weather, left_on=\'datetime\', right_on=\'date\', direction=\'backward\')\n# Step 4: Revert to the original order using \'original_index\'\ntest = test.sort_values(\'original_index\')\n\ntest["original_index"]'

## Model (from here on)

In [408]:
# Second experiment: calendar features extended with the merged weather measurements.
# ('ht_neige' was merged in as well but is not part of this feature list.)
target = 'log_bike_count'
weather_columns = ['t', 'rr1', 'rr3', 'ff', 'raf10', 'u', 'ww', 'etat_sol', 'tend']
features = ['hour', 'month', 'IsHoliday', 'day', "day_of_week", "is_weekend"] + weather_columns

# Design matrix and response from the weather-enriched training frame
X, y = df[features], df[target]

# Same estimator settings as the calendar-only baseline
model = HistGradientBoostingRegressor(random_state=42, max_iter=100)

# 5-fold cross-validation; negative MSE, so values closer to zero are better
cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Per-fold scores, then their mean
print("Negative MSE scores from cross-validation:", cv_scores)
print("Average Negative MSE:", cv_scores.mean())


Negative MSE scores from cross-validation: [-0.95044582 -0.81106293 -0.90263914 -1.17604121 -1.10087524]
Average Negative MSE: -0.9882128666193614


In [409]:
# Column names of the test frame after the weather merge (weather columns now present).
test.columns

Index(['counter_id', 'counter_name', 'site_id', 'site_name', 'date',
       'counter_installation_date', 'coordinates', 'counter_technical_id',
       'latitude', 'longitude', 'datetime', 'year', 'month', 'day',
       'day_of_week', 'hour', 'is_weekend', 'IsHoliday', 'date_weather', 't',
       'rr1', 'rr3', 'ht_neige', 'ff', 'raf10', 'u', 'ww', 'etat_sol', 'tend'],
      dtype='object')

In [410]:
# Fit the weather-aware model on the full training set and build the submission table.

# Fresh estimator with the same settings that were cross-validated above
model = HistGradientBoostingRegressor(max_iter=100, random_state=42)

# Fit the model to the training data
model.fit(X, y)

# Same feature columns as in training, taken from the test frame
X_test = test[features]

# Make predictions on the test data
y_pred_test = model.predict(X_test)

# Ids are the row positions 0..n-1. They are derived from the number of predictions
# instead of a hard-coded row count, so the cell keeps working if the test set changes size.
output_df = pd.DataFrame({
    'Id': range(len(y_pred_test)),
    'log_bike_count': y_pred_test
})

# Display the first few rows of the output dataframe
print(output_df.head())



   Id  log_bike_count
0   0        1.494013
1   1        4.060301
2   2        4.887337
3   3        3.870088
4   4        3.209452


In [411]:
# Persist the weather-aware predictions as a submission file, without the index column
submission_path = "submission_maxim.csv"
output_df.to_csv(submission_path, index=False)

In [412]:
# (rows, columns) of the submission table: one row per prediction, columns Id and log_bike_count.
output_df.shape

(51440, 2)