In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler


# Load the raw station and weather tables lazily with dask. assume_missing=True
# makes dask read integer-looking columns as floats so that missing values do
# not break dtype inference.
station_dataframe = dd.read_csv(
    'data/data_bicing_joined_HX.csv',
    assume_missing=True,
    delimiter=';',
)
weather_dataframe = dd.read_csv(
    'weather_data/weather.csv',
    assume_missing=True,
    delimiter=',',
)

# Keep only the rows whose status is IN_SERVICE.
station_dataframe = station_dataframe.loc[station_dataframe['status'] == 'IN_SERVICE']

# Source column -> short name, listed in the order the columns are selected
# (target first, then lags H-4 .. H-1). The 'Availlable' spelling is kept as-is
# because it has to match the column name in the CSV.
column_names = {'% Docks Availlable': 'percentage'}
column_names.update({f'% Docks Available H-{lag}': f'ctx-{lag}' for lag in range(4, 0, -1)})

bare_df = station_dataframe[
    ['station_id', 'year', 'month', 'day', 'hour', *column_names]
].rename(columns=column_names)

# Preview the first rows of the trimmed frame.
bare_df.head()

Unnamed: 0,station_id,year,month,day,hour,percentage,ctx-4,ctx-3,ctx-2,ctx-1
0,290.0,2019.0,7.0,22.0,8.0,0.751131,0.352941,0.352941,0.352941,0.504902
1,271.0,2022.0,6.0,10.0,21.0,0.769231,0.753968,0.659341,0.645022,0.686508
2,149.0,2022.0,6.0,8.0,20.0,0.735043,0.491582,0.675214,0.864198,0.801347
3,342.0,2020.0,2.0,4.0,4.0,0.777778,0.89899,0.831909,0.777778,0.777778
4,358.0,2021.0,5.0,28.0,8.0,0.943333,0.48,0.48,0.513333,0.766667


In [3]:
# Total number of elements (rows x columns) in bare_df; .compute() evaluates the lazy dask graph.
bare_df.size.compute()

148774780

In [15]:
def weather_prep(weather_df: dd.DataFrame) -> dd.DataFrame:
    """Average the weather readings two rows at a time and add a ``datetime`` column.

    Parameters
    ----------
    weather_df
        Frame with at least ``timestamp`` (seconds since the Unix epoch) and
        ``mm_precip`` columns. Every column must be averageable, because the
        whole frame goes through ``groupby(...).mean()``.

    Returns
    -------
    A new frame with one row per pair of input rows. ``mm_precip`` holds the
    pair's total, ``timestamp`` is an integer and ``datetime`` is derived from
    it. The input frame is not modified.
    """
    weather = weather_df.copy()

    # Rows 0-1 form group 0, rows 2-3 group 1, and so on (integer index // 2).
    weather = weather.groupby(weather.index // 2).mean()

    # mean * 2 turns the averaged precipitation back into the sum of the pair.
    weather['mm_precip'] = weather['mm_precip'] * 2

    # Shift the averaged timestamp back by 900 s before truncating to int.
    # NOTE(review): presumably the readings are 30 minutes apart, so this lands on
    # the first reading of each pair -- confirm against the weather source.
    weather['timestamp'] = (weather['timestamp'] - 900).astype(int)
    weather['datetime'] = weather['timestamp'].map(lambda x: pd.to_datetime(x, unit='s'))

    return weather

def weather_merge(weather_df:dd, station_data:dd) -> dd:
    """Left-join hourly weather onto the station rows through a ``datetime`` key.

    ``station_data`` must carry ``year``, ``month``, ``day`` and ``hour`` columns;
    ``weather_df`` must carry ``datetime``, ``temperature`` and ``mm_precip``.
    Returns a new frame containing every station row plus ``datetime``,
    ``temperature`` and ``mm_precip`` (missing where no weather row matches).
    Neither input is modified.
    """
    date_parts = ['year', 'month', 'day', 'hour']

    merged = station_data.copy()
    # Cast to int first so the string form is '2019', not '2019.0'.
    merged[date_parts] = merged[date_parts].astype(int)

    # Assemble 'Y-M-D H:00:00' text and let to_datetime parse it.
    date_text = (merged['year'].astype(str) + '-'
                 + merged['month'].astype(str) + '-'
                 + merged['day'].astype(str))
    time_text = merged['hour'].astype(str) + ':00:00'
    merged['datetime'] = dd.to_datetime(date_text + ' ' + time_text)

    weather_columns = weather_df.copy()[['datetime', 'temperature', 'mm_precip']]
    merged = merged.merge(weather_columns, on='datetime', how='left')

    # Disabled experiment: merge the weather of the previous 1-4 hours as extra columns.
    # for i in range(1,5):
    #     df_weather_shifted = weather.copy()
    #     df_weather_shifted['datetime'] = df_weather_shifted['datetime'] + pd.Timedelta(hours=-i)
    #     stations = stations.merge(df_weather_shifted[['datetime', 'temperature','mm_precip']], on='datetime', how='inner', suffixes=('', f'-{abs(i)}'))

    return merged


def extra_time_info(df: dd.DataFrame) -> dd.DataFrame:
    """Add weekend and time-of-day indicator columns derived from ``datetime``.

    Parameters
    ----------
    df
        Frame with a datetime-typed ``datetime`` column.

    Returns
    -------
    A copy of ``df`` with six extra int64 columns holding 0 or 1:

    * ``is_weekend`` -- day of week is 5 or 6 (Monday is 0)
    * ``timeframe1`` -- hour <= 4
    * ``timeframe2`` -- 5 <= hour <= 9
    * ``timeframe3`` -- 10 <= hour <= 14
    * ``timeframe4`` -- 15 <= hour <= 19
    * ``timeframe5`` -- hour >= 20

    Rows with a missing ``datetime`` get 0 in every indicator.
    """
    # Work on a copy, like the other helpers in this notebook, so re-running a
    # cell never changes the caller's frame.
    df = df.copy()

    day_of_week = df['datetime'].dt.dayofweek
    hour = df['datetime'].dt.hour

    # Vectorised comparisons replace the per-element lambdas. They need no dask
    # `meta` hint, so the same code runs on pandas and on dask frames.
    df['is_weekend'] = (day_of_week >= 5).astype('int64')

    df['timeframe1'] = (hour <= 4).astype('int64')
    df['timeframe2'] = ((hour >= 5) & (hour <= 9)).astype('int64')
    df['timeframe3'] = ((hour >= 10) & (hour <= 14)).astype('int64')
    df['timeframe4'] = ((hour >= 15) & (hour <= 19)).astype('int64')
    df['timeframe5'] = (hour >= 20).astype('int64')

    return df


def station_loc(id_lat_lon:dd, df:dd) -> dd:
    """Replace ``station_id`` in ``df`` with the station's ``lat`` and ``lon``.

    ``id_lat_lon`` must contain ``station_id``, ``lat`` and ``lon`` columns
    (an AssertionError is raised otherwise). It is reduced to one row per
    station, left-joined onto ``df``, and ``station_id`` is dropped from the
    result. Neither input is modified.
    """
    location_columns = ['station_id', 'lat', 'lon']

    available = list(id_lat_lon.columns)
    absent = [column for column in location_columns if column not in available]
    assert not absent, 'id_lat_lon must contain station_id, lat and lon columns'

    # One coordinate row per station, otherwise the join would multiply rows.
    unique_locations = id_lat_lon.copy().drop_duplicates(subset=['station_id'])

    located = df.copy().merge(unique_locations[location_columns], on='station_id', how='left')
    return located.drop(['station_id'], axis=1)


In [16]:
# Build the modelling table: aggregate the weather readings, then left-join
# them onto the station rows by datetime.
weather_prepped = weather_prep(weather_dataframe)

data_prepared = weather_merge(weather_prepped, bare_df)

# NOTE(review): the steps below are disabled, yet later cells select 'is_weekend',
# 'timeframe*', 'lat', 'lon' and lagged weather columns from data_prepared.
# Confirm how those columns are meant to be produced before a Restart & Run All.
# NOTE(review): the disabled call passes bare_df, which has no 'datetime' column
# (extra_time_info reads df['datetime']); presumably data_prepared was intended -- verify.
# data_prepared = extra_time_info(bare_df)

# data_prepared = data_prepared.drop(['datetime'], axis=1)

# locations = station_dataframe[['station_id', 'lat', 'lon']]

# data_prepared = station_loc(id_lat_lon=locations, df=data_prepared)






In [17]:
# Total number of elements (rows x columns) in data_prepared; .compute() evaluates the lazy dask graph.
data_prepared.size.compute()

193425466

In [6]:
# Final column layout: calendar/location features, then the four lagged steps
# (lag 4 first, lag 1 last), then the current weather and the target.
calendar_and_location = ['year', 'month', 'day', 'hour', 'is_weekend', 'timeframe1',
                         'timeframe2', 'timeframe3', 'timeframe4', 'timeframe5', 'lat', 'lon']
lagged_features = [
    f'{feature}-{lag}'
    for lag in range(4, 0, -1)
    for feature in ('temperature', 'mm_precip', 'ctx')
]
current_and_target = ['temperature', 'mm_precip', 'percentage']

data_prepared = data_prepared[calendar_and_location + lagged_features + current_and_target]

In [7]:
# Total number of elements (rows x columns) in data_prepared after the column selection; .compute() evaluates the lazy dask graph.
data_prepared.size.compute()

401881392

In [20]:
# Print the first rows as a box-drawn text table (DataFrame.to_markdown needs the optional 'tabulate' package).
print(data_prepared.head().to_markdown(tablefmt = 'fancy_grid'))


╒════╤════════╤═════════╤═══════╤════════╤══════════════╤══════════════╤══════════════╤══════════════╤══════════════╤═════════╤═════════╤═════════════════╤═══════════════╤══════════╤═════════════════╤═══════════════╤══════════╤═════════════════╤═══════════════╤══════════╤═════════════════╤═══════════════╤══════════╤═══════════════╤═════════════╤══════════════╕
│    │   year │   month │   day │   hour │   is_weekend │   timeframe1 │   timeframe2 │   timeframe3 │   timeframe4 │     lat │     lon │   temperature-4 │   mm_precip-4 │    ctx-4 │   temperature-3 │   mm_precip-3 │    ctx-3 │   temperature-2 │   mm_precip-2 │    ctx-2 │   temperature-1 │   mm_precip-1 │    ctx-1 │   temperature │   mm_precip │   percentage │
╞════╪════════╪═════════╪═══════╪════════╪══════════════╪══════════════╪══════════════╪══════════════╪══════════════╪═════════╪═════════╪═════════════════╪═══════════════╪══════════╪═════════════════╪═══════════════╪══════════╪═════════════════╪═══════════════╪══════════╪══

In [28]:
import xgboost as xgb
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Take a small sample for quick experiments.
# NOTE(review): on a dask frame head(n) reads only the first partition by default,
# so this may return fewer than 1000 rows and is not a random sample -- confirm
# that is acceptable.
reduced_data_prepared = data_prepared.head(1000)

# Keep only the rows recorded at hours 4, 9, 14, 19 and 23.
selected_hours = [4, 9, 14, 19, 23]
reduced_data_prepared = reduced_data_prepared.loc[
    reduced_data_prepared['hour'].isin(selected_hours)
]

# Target is 'percentage'; every other column is a feature.
y = reduced_data_prepared['percentage']
X = reduced_data_prepared.drop(columns=['percentage'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)



In [29]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Exhaustive search grid: 6 hyper-parameters x 3 values each = 729 candidates.
space = dict(
    learning_rate=[0.1, 0.01, 0.001],
    max_depth=[3, 5, 7],
    subsample=[0.8, 0.9, 1.0],
    reg_lambda=[0.1, 1.0, 10.0],
    reg_alpha=[0, 0.1, 1.0],
    n_estimators=[100, 200, 300],
)

xgb_model = XGBRegressor(objective='reg:squarederror', random_state=69)

# 3-fold cross-validation scored by negative MSE (higher is better).
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=space,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 729 candidates, totalling 2187 fits


GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None, gpu_id=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, m...
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estimators=100,
                                    n_jobs=None, num_parallel_tree=None,
                                    predictor=None, random_state=6

In [31]:
import json

# Persist the best hyper-parameters found by the grid search. The context
# manager closes the file even if serialisation raises, which the bare
# open()/close() pair did not guarantee.
with open('best_params.json', 'w') as params_file:
    json.dump(grid_search.best_params_, params_file)


In [None]:
# Fit one regressor with hand-picked settings and score it on the hold-out split.
# NOTE(review): these values are hard-coded rather than read from
# grid_search.best_params_ -- confirm that is intentional.
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=0.1,
    n_jobs=-1,
    random_state=123,
)
xgb_model.fit(X_train, y_train)

# Root-mean-squared error on the held-out rows.
y_pred = xgb_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

# BILSTM trials


In [19]:
import numpy as np
import pandas as pd
import tensorflow as tf
# The source dataframe used below is `data_prepared`.
# Extract the static data, the target and the time-series data into separate arrays.

# Features that have a single value per sample (no lag dimension).
static_data = data_prepared[
    ['year', 'month', 'day', 'hour', 'is_weekend', 'timeframe1', 'timeframe2',
     'timeframe3', 'timeframe4', 'lat', 'lon', 'temperature', 'mm_precip']
].values

# One (samples, 3) array per lag: temperature, precipitation and dock context.
# The individual timestep_* names are kept because the originals were notebook globals.
timestep_4, timestep_3, timestep_2, timestep_1 = (
    data_prepared[[f'temperature-{lag}', f'mm_precip-{lag}', f'ctx-{lag}']].values
    for lag in (4, 3, 2, 1)
)

# Stack from lag 4 to lag 1 along a new axis 1 -> (samples, 4, 3).
time_series_data = np.stack([timestep_4, timestep_3, timestep_2, timestep_1], axis=1)

# Target kept two-dimensional: (samples, 1).
target = data_prepared[['percentage']].values
# timestep_1 = np.column_stack((data_prepared[['temperature-1', 'mm_precip-1', 'ctx-1']].values, static_data))
# timestep_2 = np.column_stack((data_prepared[['temperature-2', 'mm_precip-2', 'ctx-2']].values, static_data))
# timestep_3 = np.column_stack((data_prepared[['temperature-3', 'mm_precip-3', 'ctx-3']].values, static_data))
# timestep_4 = np.column_stack((data_prepared[['temperature-4', 'mm_precip-4', 'ctx-4']].values, static_data))






In [19]:
import tensorflow as tf
from keras.layers import LSTM, Dense, Input, concatenate, Dropout

# Two-branch network: an LSTM over the 4-step x 3-feature lag window and a
# dense layer over the 13 static features, merged into one regression output.
ts_input = tf.keras.Input(shape=(4, 3), name='ts_input')
static_input = tf.keras.Input(shape=(13,), name='static_input')

# Time-series branch.
LSTMout = LSTM(32, activation='relu', return_sequences=False)(ts_input)
dropout_lstm = Dropout(0.2)(LSTMout)

# Static branch.
static_out = Dense(32, activation='relu')(static_input)
dropout_static = Dropout(0.2)(static_out)

# Merge the *dropout* outputs. Concatenating LSTMout/static_out directly left
# both Dropout layers disconnected from the model, so no dropout was applied.
merged_out = concatenate([dropout_lstm, dropout_static])
# NOTE(review): the target is a fraction; confirm 'relu' (unbounded above, can
# get stuck at 0) is the intended output activation.
merged_out = Dense(1, activation='relu')(merged_out)

model = tf.keras.Model(inputs=[ts_input, static_input], outputs=merged_out)

model.compile(optimizer='adam', loss='mse', metrics=['mse'])
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 ts_input (InputLayer)          [(None, 4, 3)]       0           []                               
                                                                                                  
 static_input (InputLayer)      [(None, 13)]         0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 32)           4608        ['ts_input[0][0]']               
                                                                                                  
 dense (Dense)                  (None, 32)           448         ['static_input[0][0]']           
                                                                                              

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

# Toy 4-row frame used to experiment with reshaping lagged columns for an LSTM:
# 'A' and 'Name' are static, 'p' is the target, 'p-k' / 't-k' are lag-k values.
testing = pd.DataFrame(data={'A': [1, 2, 3, 4], 'Name': ['Carlos', 'Jose', 'Maria', 'Juan'],
                             'p': [0.1, 0.2, 0.3, 0.4], 'p-1': [0.2, 0.3, 0.4, 0.5],
                             'p-2': [0.3, 0.4, 0.5, 0.6], 't-1': [10, 20, 30, 40],
                             't-2': [15, 25, 35, 45]})

# Pull the static columns, the target and one (rows, 2) array per lag out of the frame.
static_vars = testing[['A', 'Name']].values
target_var = testing['p'].values
timestep_1 = testing[['p-1', 't-1']].values
timestep_2 = testing[['p-2', 't-2']].values

# Side-by-side layout: [A, Name, p-2, t-2, p-1, t-1]. Because 'Name' is a string
# column the combined array is not a plain float array.
data = np.column_stack((static_vars, timestep_2, timestep_1))

# Build sliding windows over consecutive ROWS of `data`.
# NOTE(review): the frame only carries two lags (-1 and -2) while timesteps is 3, and
# the windows slide across rows rather than across the lag columns -- confirm intent.
timesteps = 3  # window length, in rows
features = data.shape[1] - 2  # columns left after skipping the two static ones (A, Name)
num_samples = len(data) - timesteps + 1  # 4 rows and a window of 3 -> 2 windows

data_3d = np.zeros((num_samples, timesteps, features))

for i in range(num_samples):
    # Columns 2 onward are the lag values; assigning into the float array converts them.
    data_3d[i] = data[i:i + timesteps, 2:]

# Inputs are the windows; the target for each window is taken from the window's last row.
X = data_3d
y = target_var[timesteps - 1:]

# Wrap the arrays in a tf.data pipeline.
# NOTE(review): X is already windowed and has only num_samples (2) entries, fewer than
# sequence_length (3); windowing it again here looks unintended -- verify that this
# yields any batches at all.
dataset = tf.keras.utils.timeseries_dataset_from_array(
    X,
    y,
    sequence_length=timesteps,
    batch_size=32  # Specify your desired batch size
)

# 'dataset' is the object returned by timeseries_dataset_from_array, intended as LSTM training input.



In [7]:
# Alternative layout: repeat the static columns in front of each lag's values,
# then stack the two lags along a new leading axis -> (2 lags, 4 rows, 4 columns).
static_part = testing[['A', 'Name']].values

timestep_1 = np.column_stack((static_part, testing[['p-1', 't-1']].values))
timestep_2 = np.column_stack((static_part, testing[['p-2', 't-2']].values))

# Lag 2 first, lag 1 second.
tensor = np.stack([timestep_2, timestep_1], axis=0)

tensor


array([[[1, 'Carlos', 0.3, 15.0],
        [2, 'Jose', 0.4, 25.0],
        [3, 'Maria', 0.5, 35.0],
        [4, 'Juan', 0.6, 45.0]],

       [[1, 'Carlos', 0.2, 10.0],
        [2, 'Jose', 0.3, 20.0],
        [3, 'Maria', 0.4, 30.0],
        [4, 'Juan', 0.5, 40.0]]], dtype=object)