In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from skrub import TableVectorizer
from tpot import TPOTRegressor






In [2]:
# Load the training and final-test sets from the project's data directory.
# The directory is named once so that both reads stay in sync.
DATA_DIR = "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data"
df_train = pd.read_parquet(f"{DATA_DIR}/train.parquet")
df_test = pd.read_parquet(f"{DATA_DIR}/final_test.parquet")

In [3]:
# Extract calendar features from the date column at several time scales:

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # creation of a binary varible depicting if day in weekend
    X["is_weekend"] = np.where(X["weekday"] + 1 > 5, 1, 0)

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

# Swap the raw timestamp for its calendar features in both splits.
df_train, df_test = (_encode_dates(frame) for frame in (df_train, df_test))


In [4]:
# Preview the first rows of the training frame after date encoding.
df_train.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count,year,month,day,weekday,hour,is_weekend
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0,2020,9,1,1,2,0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147,2020,9,1,1,3,0
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0,2020,9,1,1,4,0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438,2020,9,1,1,15,0
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585,2020,9,1,1,18,0


In [5]:
# Preview the first rows of the test frame after date encoding.
df_test.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,year,month,day,weekday,hour,is_weekend
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2021,9,10,4,1,0
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2021,9,10,4,13,0
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2021,9,10,4,17,0
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2021,9,10,4,19,0
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2021,9,10,4,22,0


In [6]:
# Summary statistics for the training frame.
df_train.describe()

Unnamed: 0,site_id,bike_count,counter_installation_date,latitude,longitude,log_bike_count,year,month,day,weekday,hour,is_weekend
count,496827.0,496827.0,496827,496827.0,496827.0,496827.0,496827.0,496827.0,496827.0,496827.0,496827.0,496827.0
mean,105345000.0,60.191475,2019-04-04 07:24:35.245911,48.854343,2.345479,3.079917,2020.679846,6.556904,15.458226,2.992172,11.50273,0.283354
min,100007000.0,0.0,2013-01-18 00:00:00,48.82636,2.26542,0.0,2020.0,1.0,1.0,0.0,0.0,0.0
25%,100047500.0,5.0,2018-11-29 00:00:00,48.840801,2.31444,1.791759,2020.0,4.0,8.0,1.0,6.0,0.0
50%,100056200.0,29.0,2019-11-06 00:00:00,48.85209,2.35387,3.401197,2021.0,7.0,15.0,3.0,12.0,0.0
75%,100056300.0,79.0,2019-12-11 00:00:00,48.86461,2.37587,4.382027,2021.0,9.0,23.0,5.0,18.0,1.0
max,300014700.0,1302.0,2020-11-29 00:00:00,48.89172,2.40969,7.172425,2021.0,12.0,31.0,6.0,23.0,1.0
std,32103460.0,87.590566,,0.018554,0.038026,1.659899,0.466536,3.423834,8.851485,1.995015,6.920936,0.450627


In [7]:
# Summary statistics for the test frame.
df_test.describe()

Unnamed: 0,site_id,counter_installation_date,latitude,longitude,year,month,day,weekday,hour,is_weekend
count,51440.0,51440,51440.0,51440.0,51440.0,51440.0,51440.0,51440.0,51440.0,51440.0
mean,107305000.0,2019-05-05 04:44:51.881804,48.854275,2.344642,2021.0,9.466116,15.090921,3.081532,11.49166,0.308865
min,100007000.0,2013-01-18 00:00:00,48.82636,2.26542,2021.0,9.0,1.0,0.0,0.0,0.0
25%,100047500.0,2018-11-29 00:00:00,48.83977,2.31179,2021.0,9.0,10.0,1.0,6.0,0.0
50%,100056300.0,2019-11-06 00:00:00,48.85209,2.35387,2021.0,9.0,15.0,3.0,11.0,0.0
75%,100056300.0,2019-12-12 00:00:00,48.86461,2.37587,2021.0,10.0,21.0,5.0,17.0,1.0
max,300014700.0,2020-11-29 00:00:00,48.89172,2.40969,2021.0,10.0,30.0,6.0,23.0,1.0
std,37388390.0,,0.018607,0.038257,0.0,0.498855,7.716252,2.042702,6.903709,0.46203


In [8]:
# The model is trained on the log-transformed count.
y_train = df_train["log_bike_count"]

# Both count columns are removed from the training features.
X_train = df_train.drop(["bike_count", "log_bike_count"], axis="columns")

# Work on a copy so later feature engineering leaves df_test untouched.
X_test = df_test.copy()

In [9]:
# Identify the feature columns that still hold strings/categories.
non_numeric_columns = X_train.select_dtypes(include=["object", "category"]).columns
print("Non-numeric columns:", non_numeric_columns)

# Convert them to integer codes.
# OrdinalEncoder is scikit-learn's encoder for feature columns (LabelEncoder is
# documented as a target encoder). Fitted on the training frame only, it gives
# the same sorted integer codes for known values, and maps any category that
# appears only in the test frame to -1 instead of raising at transform time.
# The guard keeps the cell re-runnable: once the columns are numeric there is
# nothing left to encode, and fitting on zero columns would raise.
if len(non_numeric_columns) > 0:
    ordinal_encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=-1,
        dtype=np.int64,
    )
    # np.asarray keeps positional slicing valid even if scikit-learn is
    # configured to return DataFrames.
    train_codes = np.asarray(ordinal_encoder.fit_transform(X_train[non_numeric_columns]))
    test_codes = np.asarray(ordinal_encoder.transform(X_test[non_numeric_columns]))

    # Assign column by column so each column is replaced by its integer codes.
    for position, col in enumerate(non_numeric_columns):
        X_train[col] = train_codes[:, position]
        X_test[col] = test_codes[:, position]

Non-numeric columns: Index(['counter_id', 'counter_name', 'site_name', 'coordinates',
       'counter_technical_id'],
      dtype='object')


In [10]:
# Reduce the counter installation timestamp to its year and discard the raw
# column. Finer-grained variants (month, day, days since installation) are
# not derived here.
X_train = X_train.assign(
    installation_year=X_train["counter_installation_date"].dt.year
).drop(columns=["counter_installation_date"])

X_test = X_test.assign(
    installation_year=X_test["counter_installation_date"].dt.year
).drop(columns=["counter_installation_date"])

In [11]:
# Inspect the training features after all encoding steps.
X_train.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,coordinates,counter_technical_id,latitude,longitude,year,month,day,weekday,hour,is_weekend,installation_year
48321,1,10,100007049,5,10,0,48.846028,2.375429,2020,9,1,1,2,0,2013
48324,1,10,100007049,5,10,0,48.846028,2.375429,2020,9,1,1,3,0,2013
48327,1,10,100007049,5,10,0,48.846028,2.375429,2020,9,1,1,4,0,2013
48330,1,10,100007049,5,10,0,48.846028,2.375429,2020,9,1,1,15,0,2013
48333,1,10,100007049,5,10,0,48.846028,2.375429,2020,9,1,1,18,0,2013


In [12]:
# Check the dtype of every feature column before handing the frame to TPOT.
X_train.dtypes

counter_id                int64
counter_name              int64
site_id                   int64
site_name                 int64
coordinates               int64
counter_technical_id      int64
latitude                float64
longitude               float64
year                      int32
month                     int32
day                       int32
weekday                   int32
hour                      int32
is_weekend                int64
installation_year         int32
dtype: object

In [13]:
# Configure the TPOT AutoML search for a regression pipeline.
tpot = TPOTRegressor(
    # Objective: RMSE, expressed as a negated score
    scoring='neg_root_mean_squared_error',
    # Search budget
    population_size=20,
    generations=5,
    # Reproducibility
    random_state=42,
    # Runtime behaviour: parallel workers and progress reporting
    n_jobs=-1,
    verbosity=2,
)

In [14]:
# Run the TPOT search on the training features and log-count target.
# NOTE(review): the output recorded for this cell shows the search stopping at
# 0/120 pipelines ("TPOT closed prematurely") followed by a RuntimeError, so
# this run did not reach the predict call below.
tpot.fit(X_train, y_train)

# Predict the log bike count for the test features with the fitted pipeline.
y_predictions = tpot.predict(X_test)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]



                                                                    
                                                                    
TPOT closed during evaluation in one generation.
                                                                    
                                                                    
TPOT closed prematurely. Will use the current best pipeline.
                                                                    

  (m.start(), m.end()) for m in re.finditer(", [\w]+__", dirty_string)


RuntimeError: A pipeline has not yet been optimized. Please call fit() first.

In [72]:
# Show the predicted log bike counts.
# NOTE(review): this cell's execution count (72) is out of sequence with the
# cells above, and the preceding fit cell ended in a RuntimeError; the printed
# array presumably comes from an earlier kernel state. Confirm with
# Restart Kernel -> Run All.
print(y_predictions)

[0.36719066 1.5280361  1.9501511  ... 5.169382   4.6053843  3.5775466 ]


In [None]:
# Build the submission table: one row per prediction, with the positional
# index exposed as an "Id" column next to the predicted log count.
submission = pd.DataFrame(y_predictions, columns=["log_bike_count"])
submission = submission.reset_index()
submission = submission.rename(columns={"index": "Id"})

# Write it next to the project data, without the DataFrame index.
submission_path = "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/predictions_option_2_vsimple_AUTOML.csv"
submission.to_csv(submission_path, index=False)