In [4]:
#importing all libraries necessary for machine learning
import pandas as pd
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import joblib
import pickle
from datetime import datetime

In [5]:
# # i need to load in all 5 datasets which is the one dataset combined

# # Attempt to get the script directory; fallback to os.getcwd() if __file__ is not defined
# try:
#     script_dir = os.path.dirname(os.path.abspath(__file__))
# except NameError:
#     script_dir = os.getcwd()

# # Assuming your notebook is running from "COMP30830-SE-Group11-Dublin-Bike-Sharing-System/app/machine learning",
# # we move up two directories to reach the project root.
# project_root = os.path.abspath(os.path.join(script_dir, "..", ".."))

# # Define the correct path to the 'database' folder inside 'app'
# data_folder = os.path.join(project_root, "app", "machine learning")

# # Get all CSV file paths that match the pattern in the 'database' folder
# file_paths = glob.glob(os.path.join(data_folder, "MachineLearningData_*.csv"))

# # Read and concatenate all CSV files into one DataFrame
# data = pd.concat([pd.read_csv(file) for file in file_paths], ignore_index=True)

# # Handle missing values (drop rows with NaN in lagged features)
# data.dropna(inplace=True)

In [6]:
# Load the dataset
data = pd.read_csv("bike_weather_data_2.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'bike_weather_data_2.csv'

In [None]:
# Handle missing values (drop rows with NaN in lagged features)
data.dropna(inplace=True)

In [None]:
data['humidity'] = (data['min_humidity'] + data['max_humidity'])/2
data['pressure'] = (data['min_pressure'] + data['max_pressure'])/2

In [None]:
# Define features and target
features = ['station_id','max_temperature', 'min_temperature', 'humidity', 'pressure', 'hour', 'day']
target = 'num_bikes_available'


# 1. Using different Models to predict 

## 1.1 Build linear regression model and evaluation

In [None]:
X = data[features]
y = data[target]

# Split data into training and testing sets
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train_1, y_train_1)

# Make predictions
y_pred = model.predict(X_test_1)

# Evaluate the model
mae = mean_absolute_error(y_test_1, y_pred)
r2 = r2_score(y_test_1, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R² Score: {r2}")

Mean Absolute Error: 8.142964339007797
R² Score: -4.809119736259859e-05


The results, however, show poor predictive performance, with a Mean Absolute Error (MAE) of 8.14 and an R² score close to zero (–0.000046), indicating that the model explains virtually none of the variance in the target variable.

In [None]:
# Display model coefficients
print("\nModel Coefficients:")
for feature, coef in zip(features, model.coef_):
    print(f"{feature}: {coef}")
print(f"Intercept: {model.intercept_}")


Model Coefficients:
station_id: -0.00023217489363861016
max_temperature: 0.19905585836316605
min_temperature: -0.19533001422666177
humidity: 0.010560822426623996
pressure: -0.001709350329187733
hour: -0.005170924365627474
day: 0.00457836499203155
Intercept: 13.021677676353793


| Feature | Coefficient | Interpretation |
|-------------|-----------------|--------------------|
|station_id|–0.00023|Negligible negative effect; station ID alone doesn't meaningfully influence bike availability in a linear fashion.|
|max_temperature	|+0.19906|	Slight positive correlation: higher temperatures may increase bike usage, reducing availability.|
|min_temperature	|–0.19533	|Slight negative correlation; colder minimums may reduce usage and increase availability.|
|humidity	|+0.01056	|Very weak positive effect; may slightly increase availability.|
|pressure	|–0.00171	|Very minor negative effect; not significant.|
|hour	|–0.00517	|Slightly fewer bikes available at later hours.|
|day	|+0.00458	|Very minimal effect over the days.|
|intercept	|+13.02	|The predicted number of bikes when all features are zero, though not interpretable in practical terms.|
### Conclusion
While the signs of the coefficients align with some intuitive patterns (e.g., higher temperature reduces availability), the magnitude of all coefficients is very small, and the overall predictive power of the model is extremely low. This suggests that the relationship between the selected features and bike availability is likely non-linear, and cannot be effectively captured by a simple linear model.

Given the poor performance, linear regression is not suitable for this prediction task. More advanced, non-linear models (e.g., Random Forests, Gradient Boosting) are better equipped to handle the complex interactions between features in this context, as evidenced by the strong performance of the Random Forest model.

## 1.2 Using Random Forest algorithm and Evaluation

In [None]:
# Split data into training and testing sets
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y, test_size=0.3, random_state=12)

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=12)

# Train the model
rf_model.fit(X_train_2, y_train_2)


In [None]:
# Predict on the test set
y_pred = rf_model.predict(X_test_2)

# Calculate performance metrics
mse = mean_squared_error(y_test_2, y_pred)
r2 = r2_score(y_test_2, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 3.2608652145310204
R^2 Score: 0.965831446274333


## 1.3 Gradient Boosting model

In [None]:
# Split data into training and testing sets
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X, y, test_size=0.3, random_state=26)

# 初始化模型（你可以尝试不同的参数）
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=26)

# 训练模型
gb_model.fit(X_train_3, y_train_3)


In [None]:
# 进行预测
y_pred_gb = gb_model.predict(X_test_3)

# 模型评估
mse = mean_squared_error(y_test_3, y_pred_gb)
mae = mean_absolute_error(y_test_3, y_pred_gb)
r2 = r2_score(y_test_3, y_pred_gb)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")


Mean Squared Error: 68.19406276339608
Mean Absolute Error: 6.817643455797645
R^2 Score: 0.28490681396675765


The Gradient Boosting Regressor achieved a Mean Squared Error of 68.19, a Mean Absolute Error of 6.82, and an R² score of 0.285 on the test set. 

These results indicate that the model captures some of the patterns in the data but performs significantly worse than the Random Forest model, which achieved an R² of approximately 0.97. While Gradient Boosting generally performs well for structured data, in this case, it likely underperformed due to suboptimal hyperparameters or the model’s sensitivity to the data distribution.



## 1.3 Decide which model is better

In [None]:
# Save the model to a .pkl file
model_filename = "Dubike_random_forest_model.joblib"
joblib.dump(rf_model, model_filename)

with open(model_filename, "wb") as file:
    pickle.dump(model, file)

print(f"Model saved to {model_filename}")

Model saved to Dubike_random_forest_model.joblib


## 1.2 Make a prediction

In [None]:
# Load the saved model
with open("Dubike_random_forest_model.joblib", "rb") as file:
    model = pickle.load(file)

# Define new input data for prediction
new_data = pd.DataFrame({
    'station_id': [32],
    'max_temperature': [20],
    'min_temperature': [6],
    'humidity': [60],
    'pressure': [1002.94],
    'hour': [9],
    'day': [2]  # Example: 0 = Monday, 1 = Tuesday, etc.
})

# Make prediction
prediction = model.predict(new_data)
# Output prediction
print(f"Predicted number of available bikes: {prediction[0]}")

Predicted number of available bikes: 14.705277098796019


# Predice_based_on_weather

In [None]:

# Load the trained model
with open("Dubike_random_forest_model.joblib", "rb") as file:
    model = pickle.load(file)

def get_weather_forecast(city, date):
    """Stub function for weather forecast. Returns fixed weather data: REPLACE WITH CALL TO OPENWEATHER API
    """
    return {
        'max_temperature': 20.0,
        'min_temperature': 2,
        'humidity': 60.0,
        'wind_speed': 10.0,
        'precipitation': 0.0,
        'pressure': 1001.10
    }

def predict_bike_availability(station_id, city, date_str, time_str):
    """Predict the number of available bikes for a given city, date, and time."""
    # Parse input date and time
    date_time = datetime.strptime(f"{date_str} {time_str}", "%Y-%m-%d %H:%M")
    hour = date_time.hour
    day_of_week = date_time.weekday()

    # Use the function for weather forecast
    weather_features = get_weather_forecast(city, date_str)
    
    # Prepare input data for the model
    input_data = pd.DataFrame([{
        'station_id': station_id,
        'max_temperature': weather_features['max_temperature'],
        'min_temperature': weather_features['min_temperature'],
        'humidity': weather_features['humidity'],
        'pressure': weather_features['pressure'],
        'hour': hour,
        'day': day_of_week
    }])

    # Make prediction
    prediction = model.predict(input_data)
    return prediction[0]

# Example usage
city = "Dublin"
date_str = "2025-04-05"
time_str = "09:00"
station_id = 50

predicted_bikes = predict_bike_availability(station_id, city, date_str, time_str)
print(f"Predicted number of available bikes in {city} on {date_str} at {time_str}: {predicted_bikes}")


Predicted number of available bikes in Dublin on 2025-04-05 at 09:00: 15.499298307198972
