In [85]:
import pandas as pd
import numpy as np

from skrub import TableVectorizer
import xgboost as xgb
from sklearn.pipeline import Pipeline

import holidays

from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import StandardScaler, LabelEncoder

import datetime

import geopandas as gpd
from shapely.geometry import Point


In [86]:
# Import the files
#df_train = pd.read_parquet("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data/train.parquet")
#df_test = pd.read_parquet("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data/final_test.parquet")

df_train = pd.read_parquet("/Users/srazjman/Python/bike_counters/data/train.parquet")
df_test = pd.read_parquet("/Users/srazjman/Python/bike_counters/data/final_test.parquet")

In [87]:
# Add jour ferie data
jour_feries = (
    pd.read_csv(
        #"/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/jours_feries_metropole.csv",
        "/Users/srazjman/Python/bike_counters/external_data/jours_feries_metropole.csv",
        date_format="%Y%m%d%H"  # Ensure date format is handled correctly
    )
    .drop(columns=["annee", "zone"])  # Drop unnecessary columns
)

# Convert 'date' column to datetime
jour_feries['date'] = pd.to_datetime(jour_feries['date'])

# Filter rows based on the date range of df_train and df_test
jour_feries = jour_feries[
    (jour_feries["date"] >= df_train["date"].min() - datetime.timedelta(hours=1))
    & (jour_feries["date"] <= df_test["date"].max() + datetime.timedelta(hours=1))
]

In [88]:
# Add mouvements sociaux data :
mouvements_sociaux = (
    pd.read_csv(
        #"/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/mouvements-sociaux-depuis-2002.csv",
         "/Users/srazjman/Python/bike_counters/external_data/mouvements-sociaux-depuis-2002.csv",
        date_format="%Y%m%d%H",
        sep=";"
    )
    .drop(columns=['date_de_fin', 'Organisations syndicales', 'Métiers ciblés par le préavis',
                   'Population devant travailler ciblee par le préavis', 'Nombre de grévistes du préavis'])  # Drop unnecessary columns
)

mouvements_sociaux['Date'] = pd.to_datetime(mouvements_sociaux['Date'])

mouvements_sociaux = mouvements_sociaux[
    (mouvements_sociaux["Date"] >= df_train["date"].min() - datetime.timedelta(hours=1))
    & (mouvements_sociaux["Date"] <= df_test["date"].max() + datetime.timedelta(hours=1))
]

mouvements_sociaux = mouvements_sociaux[mouvements_sociaux['Date'] != pd.Timestamp('2021-03-08')]

In [89]:
#Add list of Ratp Stop :
arrets_ratp = (
    pd.read_csv(
    #"/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/arrets.csv",
    "/Users/srazjman/Python/bike_counters/external_data/arrets.csv",
    sep=";"
    ).drop(columns=['ArRId', 'ArRVersion', 'ArRCreated', 'ArRChanged', 'ArRName', 'ArRType', 
                    'ArRXEpsg2154', 'ArRYEpsg2154', 'ArRTown', 'ArRPostalRegion', 'ArRAccessibility',
                      'ArRAudibleSignals', 'ArRVisualSigns', 'ArRFareZone', 'ZdAId'])	
)
arrets_ratp.head()

for df in [df_train, df_test]:

    df['geometry'] = df['coordinates'].apply(
        lambda coord: Point(map(float, coord.split(',')))
    )
    gdf_bike = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
    arrets_ratp['geometry'] = arrets_ratp['ArRGeopoint'].apply(
        lambda coord: Point(map(float, coord.split(',')))
    )
    gdf_arrets = gpd.GeoDataFrame(arrets_ratp, geometry='geometry', crs="EPSG:4326")
    #2:
    gdf_bike = gdf_bike.to_crs("EPSG:3857")
    gdf_arrets = gdf_arrets.to_crs("EPSG:3857")
    #3: 
    gdf_bike['buffer_1500m'] = gdf_bike['geometry'].buffer(1500)
    #4:
    gdf_arrets_in_buffer = gpd.sjoin(
        gdf_arrets,
        gdf_bike.set_geometry('buffer_1500m'),
        predicate='within'
    )
    gdf_arrets_in_buffer = gdf_arrets_in_buffer[['counter_id', 'geometry']].drop_duplicates()
    arrets_count = gdf_arrets_in_buffer.groupby('counter_id').size()
    df['arrets_count'] = df['counter_id'].map(arrets_count).fillna(0)
    print(df[['coordinates', 'arrets_count']].head())
    df = df.drop(columns=['geometry'])


  arrets_count = gdf_arrets_in_buffer.groupby('counter_id').size()


              coordinates  arrets_count
48321  48.846028,2.375429           235
48324  48.846028,2.375429           235
48327  48.846028,2.375429           235
48330  48.846028,2.375429           235
48333  48.846028,2.375429           235
          coordinates  arrets_count
0  48.846028,2.375429           235
1  48.846028,2.375429           235
2  48.846028,2.375429           235
3  48.846028,2.375429           235
4  48.846028,2.375429           235


  arrets_count = gdf_arrets_in_buffer.groupby('counter_id').size()


In [90]:
# Extract the date feature on different time scales :

fr_holidays = holidays.France()

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # creation of a binary varible depicting if day in weekend
    X["is_weekend"] = np.where(X["weekday"] + 1 > 5, 1, 0)

    # Add a feature to indicate if the day is a holiday in France
    X["is_holiday"] = X["date"].apply(lambda d: 1 if d in fr_holidays else 0)

    # and if it is a jour ferie in France :
    X["is_jour_ferie"] = X["date"].dt.date.isin(jour_feries['date']).astype(int)

    # and it is a jour of "mouvement social" in France :
    X["is_jour_mouvement_social"] = X["date"].dt.date.isin(mouvements_sociaux['Date']).astype(int)

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

df_train = _encode_dates(df_train)
df_test = _encode_dates(df_test)


In [91]:
# Preprocessing :

# Drop unnecessary columns :
# df_train = df_train.drop(columns=["coordinates", "counter_name", "site_name", "counter_technical_id"])
# df_test = df_test.drop(columns=["coordinates", "counter_name", "site_name", "counter_technical_id"])

# Extract features from counter_installation_date
for df in [df_train, df_test]:
    df["installation_year"] = df["counter_installation_date"].dt.year
    df["installation_month"] = df["counter_installation_date"].dt.month

df_train = df_train.drop(columns=["counter_installation_date"])
df_test = df_test.drop(columns=["counter_installation_date"])

# Label encode high-cardinality categorical features
label_encoders = {}
for col in ["counter_id", "site_id", "counter_name", "site_name", "counter_technical_id", "coordinates"]:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.fit_transform(df_test[col])
    label_encoders[col] = le

In [92]:
df_train = df_train.drop(columns = 'geometry')
df_test = df_test.drop(columns = 'geometry')
df_train.columns

Index(['counter_id', 'counter_name', 'site_id', 'site_name', 'bike_count',
       'coordinates', 'counter_technical_id', 'latitude', 'longitude',
       'log_bike_count', 'arrets_count', 'year', 'month', 'day', 'weekday',
       'hour', 'is_weekend', 'is_holiday', 'is_jour_ferie',
       'is_jour_mouvement_social', 'installation_year', 'installation_month'],
      dtype='object')

In [93]:
X_train = df_train.drop(columns=["bike_count", "log_bike_count"])
y_train = df_train["log_bike_count"]

X_test = df_test.copy()

In [94]:
'''
# Define the XGBoost model
xgb_model = xgb.XGBRegressor()

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [90,100,110],#100 / 90
    'learning_rate': [0.08, 0.1,0.15],#0.1/0.08 (950)
    'max_depth': [11, 12, 14],#12 / 14
    'subsample': [0.5, 0.6],#0.6
    'colsample_bytree': [0.5, 0.6, 0.7],#0.6
}'''

"\n# Define the XGBoost model\nxgb_model = xgb.XGBRegressor()\n\n# Define the parameter grid for tuning\nparam_grid = {\n    'n_estimators': [90,100,110],#100 / 90\n    'learning_rate': [0.08, 0.1,0.15],#0.1/0.08 (950)\n    'max_depth': [11, 12, 14],#12 / 14\n    'subsample': [0.5, 0.6],#0.6\n    'colsample_bytree': [0.5, 0.6, 0.7],#0.6\n}"

In [95]:
'''
# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,  # Number of parameter settings sampled
    scoring='neg_mean_squared_error',  # Use appropriate scoring metric
    cv=5,  # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1  # Use all available processors
)

# Perform the search
random_search.fit(X_train, y_train)

# Get the best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", -random_search.best_score_)
'''

'\n# Initialize RandomizedSearchCV\nrandom_search = RandomizedSearchCV(\n    estimator=xgb_model,\n    param_distributions=param_grid,\n    n_iter=50,  # Number of parameter settings sampled\n    scoring=\'neg_mean_squared_error\',  # Use appropriate scoring metric\n    cv=5,  # 5-fold cross-validation\n    verbose=1,\n    random_state=42,\n    n_jobs=-1  # Use all available processors\n)\n\n# Perform the search\nrandom_search.fit(X_train, y_train)\n\n# Get the best parameters and score\nprint("Best Parameters:", random_search.best_params_)\nprint("Best Score:", -random_search.best_score_)\n'

In [106]:
xgb_model = xgb.XGBRegressor()

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [100],#100 / 90
    'learning_rate': [0.09],#0.1/0.08 (950)
    'max_depth': [12],#12 / 14 /13
    'subsample': [0.6],#0.6
    'colsample_bytree': [0.6],#0.6
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,  # Number of parameter settings sampled
    scoring='neg_mean_squared_error',  # Use appropriate scoring metric
    cv=5,  # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1  # Use all available processors
)

# Perform the search
random_search.fit(X_train, y_train)

# Get the best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", -random_search.best_score_)
#0.8988



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'subsample': 0.6, 'n_estimators': 100, 'max_depth': 12, 'learning_rate': 0.09, 'colsample_bytree': 0.6}
Best Score: 0.9044222522831111


In [107]:
# Update the model with the best parameters
best_xgb_model = random_search.best_estimator_

In [108]:
'''
# Preprocessing pipeline
pipeline = Pipeline(
    steps=[
        ('preprocessor', TableVectorizer()),
        ('model', best_xgb_model),
    ]
)
'''

"\n# Preprocessing pipeline\npipeline = Pipeline(\n    steps=[\n        ('preprocessor', TableVectorizer()),\n        ('model', best_xgb_model),\n    ]\n)\n"

In [109]:
# Initialize the XGBoost regressor
model = best_xgb_model

# Fit the model
model.fit(
    X_train, y_train,
)

# Make Predictions on Test Data
y_predictions = model.predict(X_test)

In [110]:
print(y_predictions)

[0.39654157 1.6316073  2.1623495  ... 5.3431845  4.9070163  3.8263245 ]


In [111]:
#pd.DataFrame(y_predictions, columns=["log_bike_count"]).reset_index().rename(
#    columns={"index": "Id"}
#).to_csv("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/predictions_option_2_vsimple.csv", index=False)
pd.DataFrame(y_predictions, columns=["log_bike_count"]).reset_index().rename(
    columns={"index": "Id"}
).to_csv("/Users/srazjman/Python/bike_counters/predictions_option_2_vsimple_Ratp.csv", index=False)