## Import Libraries

In [76]:
import sys


plot_dir = "../Plots"
os.makedirs(plot_dir, exist_ok=True)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import mlflow
import joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import os




## Import Data

In [62]:
data = pd.read_csv("../Data/AnalyzedData/SeoulBikeData_Analyzed.csv")
data

Unnamed: 0,Date,Rented_Bike_Count,Hour,Temperature,Humidity,Wind_speed,Visibility,Dew_point_temperature,Solar_Radiation,Rainfall,...,Holiday,Functioning_Day,is_Holiday_WorkingDay,is_clear_weather,is_rainy_weather,is_snowy_weather,Month,Day,Weekday,DayOfYear
0,2017-12-01,254.0,0.0,-5.2,37.0,2.2,2000.0,-17.6,0.0,0.0,...,0.0,1.0,0,0,0,0,12,1,4,335
1,2017-12-01,204.0,1.0,-5.5,38.0,0.8,2000.0,-17.6,0.0,0.0,...,0.0,1.0,0,0,0,0,12,1,4,335
2,2017-12-01,173.0,2.0,-6.0,39.0,1.0,2000.0,-17.7,0.0,0.0,...,0.0,1.0,0,0,0,0,12,1,4,335
3,2017-12-01,107.0,3.0,-6.2,40.0,0.9,2000.0,-17.6,0.0,0.0,...,0.0,1.0,0,0,0,0,12,1,4,335
4,2017-12-01,78.0,4.0,-6.0,36.0,2.3,2000.0,-18.6,0.0,0.0,...,0.0,1.0,0,0,0,0,12,1,4,335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2018-11-30,1003.0,19.0,4.2,34.0,2.6,1894.0,-10.3,0.0,0.0,...,0.0,1.0,0,0,0,0,11,30,4,334
8756,2018-11-30,764.0,20.0,3.4,37.0,2.3,2000.0,-9.9,0.0,0.0,...,0.0,1.0,0,0,0,0,11,30,4,334
8757,2018-11-30,694.0,21.0,2.6,39.0,0.3,1968.0,-9.9,0.0,0.0,...,0.0,1.0,0,0,0,0,11,30,4,334
8758,2018-11-30,712.0,22.0,2.1,41.0,1.0,1859.0,-9.8,0.0,0.0,...,0.0,1.0,0,0,0,0,11,30,4,334


In [63]:
# Ensure the 'Date' column is in datetime format
data['Date'] = pd.to_datetime(data['Date'], errors='coerce')

In [64]:
print(data.dtypes)

Date                     datetime64[ns]
Rented_Bike_Count               float64
Hour                            float64
Temperature                     float64
Humidity                        float64
Wind_speed                      float64
Visibility                      float64
Dew_point_temperature           float64
Solar_Radiation                 float64
Rainfall                        float64
Snowfall                        float64
Seasons                         float64
Holiday                         float64
Functioning_Day                 float64
is_Holiday_WorkingDay             int64
is_clear_weather                  int64
is_rainy_weather                  int64
is_snowy_weather                  int64
Month                             int64
Day                               int64
Weekday                           int64
DayOfYear                         int64
dtype: object


## MLflow URI

In [65]:
mlflow.set_tracking_uri("http://localhost:5000")

## Data Splitting

In [None]:
X = data.drop(columns=["Rented_Bike_Count","Date"])  # Features (all columns except 'count')
y = data["Rented_Bike_Count"]  # Target variable
# Perform an 80-20 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training data size: {X_train.shape}")
print(f"Testing data size: {X_test.shape}")

Training data size: (7008, 20)
Testing data size: (1752, 20)


## Model Training

In [None]:


# Define the Random Forest model
rf_model = RandomForestRegressor(
    n_estimators=100, 
    max_depth=10, 
    random_state=42, 
    n_jobs=-1
)

with mlflow.start_run():
    
    # Log model parameters
    mlflow.log_params({
        "model_type": "RandomForestRegressor",
        "n_estimators": 100,
        "max_depth": 10
    })
    
    # Train
    rf_model.fit(X_train, y_train)
    
    # Predict
    y_pred = rf_model.predict(X_test)
    
    # Evaluate
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    # Log metrics
    mlflow.log_metrics({"MAE": mae, "RMSE": rmse, "R2": r2})
    
    # Feature importance plot
    feature_importances = rf_model.feature_importances_
    plt.figure(figsize=(10, 6))
    plt.barh(X_train.columns, feature_importances)
    plt.title("Random Forest Feature Importance")
    plt.tight_layout()
    plt.savefig("rf_feature_importance.png")
    plt.close()
    
    # Log artifact and model
    mlflow.log_artifact("rf_feature_importance.png")
    mlflow.sklearn.log_model(rf_model, "rf_model")

# Print metrics
print(f"Random Forest training complete. MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")



Random Forest training complete. MAE: 144.03, RMSE: 246.81, R2: 0.845


['model.pkl']

## save model as joblib File

In [None]:
# Save model
joblib.dump(rf_model, 'model.pkl')

##  Example single prediction 

In [None]:

model = joblib.load("model.pkl")   

# You must use the same features (columns) as your X_train
sample = X_test.iloc[[0]]   # ✅ Use one real row from your test set
print("Sample input:")
print(sample)

# Make a single prediction
single_pred = model.predict(sample)
print("Single Prediction (Rented Bike Count):", single_pred[0])

# === 3. Batch prediction on test data ===
batch_pred = model.predict(X_test)

# Display few predictions
print("\nBatch Predictions (first 10):", batch_pred[:15])

# === 4. Save results to a CSV ===
results = X_test.copy()
results["Actual"] = y_test.values
results["Predicted"] = batch_pred
results.to_csv("bike_demand_predictions.csv", index=False)


print("\n✅ Predictions saved to 'bike_demand_predictions.csv'")


Sample input:
      Hour  Temperature  Humidity  Wind_speed  Visibility  \
6056   8.0         27.2      69.0         1.8      1999.0   

      Dew_point_temperature  Solar_Radiation  Rainfall  Snowfall  Seasons  \
6056                   21.0              0.7       0.0       0.0      3.0   

      Holiday  Functioning_Day  is_Holiday_WorkingDay  is_clear_weather  \
6056      0.0              1.0                      0                 0   

      is_rainy_weather  is_snowy_weather  Month  Day  Weekday  DayOfYear  
6056                 0                 0      8   10        4        222  
Single Prediction (Rented Bike Count): 1791.8707655324981

Batch Predictions (first 10): [1791.87076553  706.01388015  663.9700939  1796.50610807  801.25580059
  794.80346646 1685.26998347  769.99557571  460.68024581  462.78813467
  140.99579162  304.04308324  339.82456713  910.70273862 1317.82406758]

✅ Predictions saved to 'bike_demand_predictions.csv'
