In [None]:
import os
import shutil
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')


# Second Notebook

- We have already decided on a Machine Learning model (Random Forest).
- Let's check whether we can reduce its size without losing prediction power.

In [8]:
# Load the station data CSV
data_csv = os.path.join('data', 'final_merged_data.csv')
df = pd.read_csv(data_csv)

# Every distinct station id gets its own pair of models further down
station_id_column = df['station_id']
unique_stations = station_id_column.unique()

# Number of distinct stations (115 in the recorded run)
station_id_column.nunique()

115

In [9]:
# Approximate each row's temperature and humidity by the midpoint of its max/min readings
for quantity in ('air_temperature_celsius', 'relative_humidity_percent'):
    df[quantity] = (df[f'max_{quantity}'] + df[f'min_{quantity}']) / 2


cols_to_keep = [
    'station_id',            # Identifies which per-station model a row belongs to
    'num_bikes_available',   # Predicted value (target)
    'last_reported',         # Used to derive the day of the week
    'hour', 'minute',        # Combined into the time-of-day features
    'air_temperature_celsius',
    'relative_humidity_percent',
]

In [10]:
# Work on a copy restricted to the columns listed in cols_to_keep
df_testing_models = df[cols_to_keep].copy()

# Day of the week as a labelled categorical (pandas dayofweek codes: 0 = Monday ... 6 = Sunday)
weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
df_testing_models['last_reported'] = pd.to_datetime(df_testing_models['last_reported'])
day_codes = df_testing_models['last_reported'].dt.dayofweek
df_testing_models['day_of_week'] = pd.Categorical.from_codes(day_codes, categories=weekday_labels)

# Fraction of the day elapsed (range 0-1, assuming hour is 0-23 and minute is 0-59)
minutes_per_day = 24 * 60
minutes_since_midnight = df_testing_models['hour'] * 60 + df_testing_models['minute']
df_testing_models['time_of_day'] = minutes_since_midnight / minutes_per_day

# Project that fraction onto the unit circle so the end of one day sits next to the start of the next
day_angle = 2 * np.pi * df_testing_models['time_of_day']
df_testing_models['time_sin'] = np.sin(day_angle)
df_testing_models['time_cos'] = np.cos(day_angle)

# One-hot encode the weekday; drop_first removes the first category ('Mon'), which becomes the baseline
df_testing_models = pd.get_dummies(df_testing_models, columns=['day_of_week'], drop_first=True)

# Remove the raw columns that the engineered features replace
df_testing_models = df_testing_models.drop(
    columns=['last_reported', 'hour', 'minute', 'time_of_day']
)

In [11]:
# One DataFrame per station: that station's rows, without the id column, re-indexed from 0
station_dfs = {
    station: (
        df_testing_models.loc[df_testing_models['station_id'] == station]
        .drop(columns=['station_id'])
        .reset_index(drop=True)
    )
    for station in unique_stations
}

In [12]:
def _fit_evaluate_save(param_grid, cv, X_fit, y_fit, X_eval, y_eval, model_path):
    """
    Fit a RandomForestRegressor through GridSearchCV, score it on held-out data and pickle it.

    Parameters:
    - param_grid: Parameter grid handed to GridSearchCV.
    - cv: Number of cross-validation folds.
    - X_fit, y_fit: Data the grid search is fitted on.
    - X_eval, y_eval: Held-out data used to compute RMSE and MAE.
    - model_path: File the best estimator is pickled to.

    Returns:
    - (grid, rmse, mae): The fitted GridSearchCV object and the two held-out metrics.
    """
    grid = GridSearchCV(
        RandomForestRegressor(random_state=42), param_grid, cv=cv,
        scoring='neg_mean_squared_error', n_jobs=-1
    )
    grid.fit(X_fit, y_fit)
    best_model = grid.best_estimator_

    # Score on the held-out rows
    predictions = best_model.predict(X_eval)
    rmse = np.sqrt(mean_squared_error(y_eval, predictions))
    mae = mean_absolute_error(y_eval, predictions)

    # Persist the model
    with open(model_path, 'wb') as f:
        pickle.dump(best_model, f)

    return grid, rmse, mae


def train_rf_models_for_station(df_model, station_id, base_output_folder='model_outputs_Optimized'):
    """
    For a given station DataFrame, train two RandomForest models on the same train/test split:
      - A 'heavy' model: full training split, 200 trees, unlimited depth, 5-fold CV.
      - A 'light' model: a 50% sample of the training split cast to float32,
        50 trees, max depth 15, 3-fold CV.

    Both parameter grids contain a single candidate, so GridSearchCV does not choose
    between alternatives here; the cross-validation scores are not read by this function.

    Files written to '<base_output_folder>/<station_id>/':
      - heavy_model_station_<station_id>_RandomForest.pkl
      - light_model_station_<station_id>_RandomForest.pkl
      - results_station_<station_id>.txt (parameters, RMSE and MAE of both models)

    Parameters:
    - df_model: DataFrame containing the station data. Must contain 'time_sin', 'time_cos',
      'air_temperature_celsius', 'relative_humidity_percent' and 'num_bikes_available';
      any column whose name starts with 'day_of_week_' is also used as a feature.
    - station_id: Identifier of the station (used in folder and file names).
    - base_output_folder: Folder in which the per-station folder is created.

    Returns:
    - results_dict: Dictionary with keys 'heavy' and 'light', each mapping to a dict
      with keys 'rmse' and 'mae' (both measured on the 20% test split).
    """

    # Create a folder for the specific station inside the base output folder
    station_folder = os.path.join(base_output_folder, f"{station_id}")
    os.makedirs(station_folder, exist_ok=True)

    # Define feature columns and target
    feature_cols = [
        'time_sin', 'time_cos',
        'air_temperature_celsius',
        'relative_humidity_percent'
    ]
    # Include one-hot encoded day of week columns if available
    feature_cols += [col for col in df_model.columns if col.startswith('day_of_week_')]

    target = 'num_bikes_available'

    # Prepare X and y
    X = df_model[feature_cols]
    y = df_model[target]

    # Split data into training and test sets (20% test)
    # NOTE(review): rows are split at random; if consecutive readings of a station are
    # strongly correlated the test metrics may be optimistic - confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ---------------------------
    # HEAVY MODEL: full training split
    # ---------------------------
    heavy_param_grid = {
        'n_estimators': [200],      # This was tested before as best
        'max_depth': [None],        # This was tested before as best
        'min_samples_split': [2],   # This was tested before as best
        'min_samples_leaf': [1]     # This was tested before as best
    }
    heavy_model_filename = os.path.join(station_folder, f"heavy_model_station_{station_id}_RandomForest.pkl")
    heavy_grid, heavy_rmse, heavy_mae = _fit_evaluate_save(
        heavy_param_grid, 5, X_train, y_train, X_test, y_test, heavy_model_filename
    )

    # ---------------------------
    # LIGHT MODEL: reduced data, lighter data types, smaller forest
    # ---------------------------
    # Reduce training data (using 50% of the training set); targets are aligned by index label
    X_train_light = X_train.sample(frac=0.5, random_state=42)
    y_train_light = y_train.loc[X_train_light.index]

    # Convert features to a lighter type (float32)
    X_train_light = X_train_light.astype(np.float32)
    X_test_light = X_test.astype(np.float32)

    light_param_grid = {
        'n_estimators': [50],   # fewer trees
        'max_depth': [15],      # shallower trees
        'min_samples_split': [2],
        'min_samples_leaf': [1]
    }
    light_model_filename = os.path.join(station_folder, f"light_model_station_{station_id}_RandomForest.pkl")
    light_grid, light_rmse, light_mae = _fit_evaluate_save(
        light_param_grid, 3, X_train_light, y_train_light, X_test_light, y_test, light_model_filename
    )

    # Save results for both models in a text file
    results_filename = os.path.join(station_folder, f"results_station_{station_id}.txt")
    with open(results_filename, 'w') as f:
        f.write(f"Results for station {station_id}\n\n")
        f.write("HEAVY MODEL:\n")
        f.write("Kept Model: Random Forest Regressor (Full Data & Extensive Grid Search)\n")
        f.write("Best Parameters: " + str(heavy_grid.best_params_) + "\n")
        f.write(f"Test RMSE: {heavy_rmse:.2f}\n")
        f.write(f"Test MAE: {heavy_mae:.2f}\n\n")
        f.write("LIGHT MODEL:\n")
        f.write("Kept Model: Random Forest Regressor (Reduced Data, Lighter Data Types, Simplified Grid)\n")
        f.write("Best Parameters: " + str(light_grid.best_params_) + "\n")
        f.write(f"Test RMSE: {light_rmse:.2f}\n")
        f.write(f"Test MAE: {light_mae:.2f}\n")

    print(f"Station {station_id} models saved in '{station_folder}'.")
    print(f"Heavy Model - Best Parameters: {heavy_grid.best_params_}, RMSE: {heavy_rmse:.2f}, MAE: {heavy_mae:.2f}")
    print(f"Light Model - Best Parameters: {light_grid.best_params_}, RMSE: {light_rmse:.2f}, MAE: {light_mae:.2f}")

    # Dictionary of evaluation metrics for both models
    results_dict = {
        "heavy": {"rmse": heavy_rmse, "mae": heavy_mae},
        "light": {"rmse": light_rmse, "mae": light_mae}
    }

    return results_dict




In [13]:
# Train and evaluate both models for every station; metrics are collected per station id
all_results = {}
for station_id, station_df in station_dfs.items():
    print(f"\nTraining models for station {station_id}")
    all_results[station_id] = train_rf_models_for_station(station_df, station_id)

print("Evaluation metrics for all stations:", all_results, sep="\n")


Training models for station 10
Station 10 models saved in 'model_outputs_Optimized/10'.
Heavy Model - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}, RMSE: 1.09, MAE: 0.69
Light Model - Best Parameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}, RMSE: 1.45, MAE: 0.97

Training models for station 100
Station 100 models saved in 'model_outputs_Optimized/100'.
Heavy Model - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}, RMSE: 2.01, MAE: 1.13
Light Model - Best Parameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}, RMSE: 2.97, MAE: 1.80

Training models for station 109
Station 109 models saved in 'model_outputs_Optimized/109'.
Heavy Model - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}, RMSE: 2.40, MAE: 1.29
Light Model - Best Para

In [14]:
# all_results maps each station id to the metrics of both models:
# {
#     'station_id_1': {'heavy': {'rmse': 3.5, 'mae': 2.1}, 'light': {'rmse': 4.0, 'mae': 2.5}},
#     'station_id_2': {'heavy': {'rmse': 3.2, 'mae': 2.0}, 'light': {'rmse': 3.8, 'mae': 2.4}},
#      ...
# }

# Markdown table header
print("| Station | Heavy RMSE | Heavy MAE | Light RMSE | Light MAE |")
print("|---------|------------|-----------|------------|-----------|")

# Running sums used for the averages below
total_heavy_rmse = total_heavy_mae = 0
total_light_rmse = total_light_mae = 0
n_stations = len(all_results)

# One table row per station, accumulating the sums along the way
for station_id, metrics in all_results.items():
    heavy_metrics, light_metrics = metrics['heavy'], metrics['light']
    heavy_rmse, heavy_mae = heavy_metrics['rmse'], heavy_metrics['mae']
    light_rmse, light_mae = light_metrics['rmse'], light_metrics['mae']

    total_heavy_rmse += heavy_rmse
    total_heavy_mae += heavy_mae
    total_light_rmse += light_rmse
    total_light_mae += light_mae

    formatted_cells = [f"{value:.2f}" for value in (heavy_rmse, heavy_mae, light_rmse, light_mae)]
    row = f"| {station_id} | " + " | ".join(formatted_cells) + " |"
    print(row)

# Mean of each metric across stations
avg_heavy_rmse = total_heavy_rmse / n_stations
avg_heavy_mae = total_heavy_mae / n_stations
avg_light_rmse = total_light_rmse / n_stations
avg_light_mae = total_light_mae / n_stations






| Station | Heavy RMSE | Heavy MAE | Light RMSE | Light MAE |
|---------|------------|-----------|------------|-----------|
| 10 | 1.09 | 0.69 | 1.45 | 0.97 |
| 100 | 2.01 | 1.13 | 2.97 | 1.80 |
| 109 | 2.40 | 1.29 | 3.14 | 1.91 |
| 11 | 2.97 | 1.52 | 3.88 | 2.45 |
| 114 | 3.50 | 1.88 | 4.23 | 2.58 |
| 116 | 2.65 | 1.44 | 3.21 | 2.03 |
| 13 | 2.32 | 1.18 | 3.15 | 1.88 |
| 14 | 3.20 | 1.94 | 3.76 | 2.60 |
| 15 | 0.68 | 0.44 | 1.08 | 0.69 |
| 17 | 2.65 | 1.49 | 3.06 | 1.90 |
| 18 | 2.95 | 1.70 | 3.82 | 2.47 |
| 19 | 2.59 | 1.35 | 3.49 | 2.10 |
| 2 | 1.34 | 0.77 | 1.65 | 1.07 |
| 20 | 2.19 | 1.13 | 2.80 | 1.55 |
| 22 | 2.19 | 1.39 | 2.82 | 1.97 |
| 24 | 1.69 | 1.12 | 2.51 | 1.75 |
| 28 | 3.06 | 1.86 | 4.02 | 2.72 |
| 29 | 2.38 | 1.37 | 3.45 | 2.22 |
| 3 | 1.86 | 1.17 | 2.65 | 1.77 |
| 30 | 1.01 | 0.47 | 1.26 | 0.65 |
| 31 | 2.43 | 1.50 | 3.08 | 2.18 |
| 33 | 2.93 | 1.94 | 3.88 | 2.81 |
| 34 | 3.48 | 2.25 | 4.70 | 3.29 |
| 35 | 3.08 | 1.78 | 4.30 | 2.84 |
| 36 | 2.51 | 1.39 | 3.73 | 2.34 |

In [15]:
# Same table header as the per-station table, followed by one bold row holding the averages
print("| Station | Heavy RMSE | Heavy MAE | Light RMSE | Light MAE |")
print("|---------|------------|-----------|------------|-----------|")

average_cells = " | ".join(
    f"**{value:.2f}**"
    for value in (avg_heavy_rmse, avg_heavy_mae, avg_light_rmse, avg_light_mae)
)
print(f"| **Average** | {average_cells} |")

| Station | Heavy RMSE | Heavy MAE | Light RMSE | Light MAE |
|---------|------------|-----------|------------|-----------|
| **Average** | **2.42** | **1.37** | **3.24** | **2.06** |


In [18]:
# Average error added by using the light models instead of the heavy ones
rmse_gap = avg_light_rmse - avg_heavy_rmse
mae_gap = avg_light_mae - avg_heavy_mae

print(f"\nRMSE Difference (Light - Heavy) = {rmse_gap:.2f}")
print(f"MAE Difference (Light - Heavy) = {mae_gap:.2f}")


RMSE Difference (Light - Heavy) = 0.82
MAE Difference (Light - Heavy) = 0.69


## Model Sizes Comparison

In [17]:
# Base folder where station model folders are saved
base_output_folder = 'model_outputs_Optimized'

# Summed on-disk size (bytes) of every heavy / light pickle found below the base folder
total_heavy_size = 0
total_light_size = 0

for station_folder in os.listdir(base_output_folder):
    station_path = os.path.join(base_output_folder, station_folder)
    # Only station sub-folders are inspected
    if not os.path.isdir(station_path):
        continue

    for file in os.listdir(station_path):
        # Only pickled models count; the file name prefix decides which total they add to
        if not file.endswith(".pkl"):
            continue
        if file.startswith("heavy_model"):
            total_heavy_size += os.path.getsize(os.path.join(station_path, file))
        elif file.startswith("light_model"):
            total_light_size += os.path.getsize(os.path.join(station_path, file))

# Convert sizes from bytes to megabytes for easier reading (optional)
def bytes_to_mb(byte_size):
    """Return ``byte_size`` (a number of bytes) expressed in megabytes of 1024 * 1024 bytes."""
    bytes_per_mb = 1 << 20  # 1024 * 1024
    return byte_size / bytes_per_mb

heavy_mb = bytes_to_mb(total_heavy_size)
light_mb = bytes_to_mb(total_light_size)
print(f"Total heavy model size: {total_heavy_size} bytes ({heavy_mb:.2f} MB)")
print(f"Total light model size: {total_light_size} bytes ({light_mb:.2f} MB)")

# Percentage of the heavy models' total size that the light models save
saved_bytes = total_heavy_size - total_light_size
reduction = saved_bytes / total_heavy_size * 100
print(f"Size reduction from heavy to light model: {reduction:.2f}%")


Total heavy model size: 2042875300 bytes (1948.24 MB)
Total light model size: 277176553 bytes (264.34 MB)
Size reduction from heavy to light model: 86.43%


## Decision

- We reduced the total size of the models by 86.43%
- On average, the prediction error increased by:
    - 0.82 bikes RMSE
    - 0.69 bikes MAE

> Bike predictions can afford to lose almost 1 bike of accuracy in exchange for considerably faster and lighter software

In [None]:
# Base folder where station model folders are saved
base_output_folder = 'model_outputs_Optimized'
# Destination folder for lightweight models
destination_folder = 'pickle_models'

# Make sure the destination exists before copying anything into it
os.makedirs(destination_folder, exist_ok=True)

for station_folder in os.listdir(base_output_folder):
    station_path = os.path.join(base_output_folder, station_folder)
    # Only station sub-folders are inspected
    if not os.path.isdir(station_path):
        continue

    # Lightweight model pickles stored in this station folder
    light_model_files = [
        file for file in os.listdir(station_path)
        if file.startswith("light_model") and file.endswith(".pkl")
    ]
    for file in light_model_files:
        source_file_path = os.path.join(station_path, file)
        # New file name: model_station_{station_id}.pkl (the folder name is the station id)
        destination_file_path = os.path.join(
            destination_folder, f"model_station_{station_folder}.pkl"
        )
        # copy2 copies the file contents and attempts to preserve file metadata
        shutil.copy2(source_file_path, destination_file_path)
        print(f"Copied {source_file_path} to {destination_file_path}")


Copied model_outputs_Optimized/61/light_model_station_61_RandomForest.pkl to pickle_models/model_station_61.pkl
Copied model_outputs_Optimized/95/light_model_station_95_RandomForest.pkl to pickle_models/model_station_95.pkl
Copied model_outputs_Optimized/59/light_model_station_59_RandomForest.pkl to pickle_models/model_station_59.pkl
Copied model_outputs_Optimized/92/light_model_station_92_RandomForest.pkl to pickle_models/model_station_92.pkl
Copied model_outputs_Optimized/66/light_model_station_66_RandomForest.pkl to pickle_models/model_station_66.pkl
Copied model_outputs_Optimized/104/light_model_station_104_RandomForest.pkl to pickle_models/model_station_104.pkl
Copied model_outputs_Optimized/50/light_model_station_50_RandomForest.pkl to pickle_models/model_station_50.pkl
Copied model_outputs_Optimized/68/light_model_station_68_RandomForest.pkl to pickle_models/model_station_68.pkl
Copied model_outputs_Optimized/103/light_model_station_103_RandomForest.pkl to pickle_models/model_st