<a href="https://colab.research.google.com/github/maggiecope/comp351-ai-project/blob/main/Maggie_Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [23]:
# Load the combined wait-time / weather CSV into `df` and show a quick summary.
# The file is read with latin-1 decoding, as in the original cell.
file_path = "Wait times with Weather.csv"
df = pd.read_csv(file_path, encoding='latin-1')

# Assemble the textual summary once and emit it with a single print call.
summary_lines = [
    "Dataset loaded!",
    f"Shape: {df.shape}",
    "\nColumn names:",
    str(list(df.columns)),
]
print("\n".join(summary_lines))

# Last expression of the cell: rich preview of the first five rows.
df.head()

Dataset loaded!
Shape: (339064, 26)

Column names:
['Land', 'Ride', 'Wait Time', 'Local Time', 'Day of Week', 'Date', 'tempmax', 'tempmin', 'temp', 'humidity', 'precip', 'windgust', 'windspeed', 'winddir', 'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation', 'solarenergy', 'uvindex', 'severerisk', 'sunrise', 'sunset', 'moonphase', 'conditions', 'description']


Unnamed: 0,Land,Ride,Wait Time,Local Time,Day of Week,Date,tempmax,tempmin,temp,humidity,...,visibility,solarradiation,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description
0,Pixar Pier,Toy Story Midway Mania!,40.0,2024-09-23 17:30:04,Monday,2024-09-23,78.4,62.4,67.9,80.5,...,5.6,210.0,18.0,8.0,10.0,2024-09-23 06:40:56,2024-09-23 18:46:15,0.69,Partially cloudy,Clearing in the afternoon.
1,Cars Land,Mater's Graveyard JamBOOree,30.0,2024-09-23 17:45:03,Monday,2024-09-23,78.4,62.4,67.9,80.5,...,5.6,210.0,18.0,8.0,10.0,2024-09-23 06:40:56,2024-09-23 18:46:15,0.69,Partially cloudy,Clearing in the afternoon.
2,Hollywood Land,Mickey's PhilharMagic,10.0,2024-09-23 17:45:03,Monday,2024-09-23,78.4,62.4,67.9,80.5,...,5.6,210.0,18.0,8.0,10.0,2024-09-23 06:40:56,2024-09-23 18:46:15,0.69,Partially cloudy,Clearing in the afternoon.
3,Pixar Pier,Jumpin' Jellyfish,20.0,2024-09-23 17:45:03,Monday,2024-09-23,78.4,62.4,67.9,80.5,...,5.6,210.0,18.0,8.0,10.0,2024-09-23 06:40:56,2024-09-23 18:46:15,0.69,Partially cloudy,Clearing in the afternoon.
4,Avengers Campus,WEB SLINGERS: A Spider-Man Adventure,45.0,2024-09-23 18:00:04,Monday,2024-09-23,78.4,62.4,67.9,80.5,...,5.6,210.0,18.0,8.0,10.0,2024-09-23 06:40:56,2024-09-23 18:46:15,0.69,Partially cloudy,Clearing in the afternoon.


In [24]:
# Pre-processing: parse timestamp columns and derive time-of-day features.

# Parse every timestamp-like column; values that cannot be parsed become NaT
# because of errors="coerce".
for dt_col in ("Date", "Local Time", "sunrise", "sunset"):
    df[dt_col] = pd.to_datetime(df[dt_col], errors="coerce")

# Derived features, all relative to the observation's local timestamp.
local_time = df["Local Time"]
since_sunrise = local_time - df["sunrise"]
until_sunset = df["sunset"] - local_time

df["hour"] = local_time.dt.hour
df["minutes_since_sunrise"] = since_sunrise.dt.total_seconds() / 60
df["minutes_until_sunset"] = until_sunset.dt.total_seconds() / 60

print("Time features created!")

# Preview the new columns next to the timestamp they were derived from.
time_preview_cols = ["Local Time", "hour", "minutes_since_sunrise", "minutes_until_sunset"]
df[time_preview_cols].head()

Time features created!


Unnamed: 0,Local Time,hour,minutes_since_sunrise,minutes_until_sunset
0,2024-09-23 17:30:04,17,649.133333,76.183333
1,2024-09-23 17:45:03,17,664.116667,61.2
2,2024-09-23 17:45:03,17,664.116667,61.2
3,2024-09-23 17:45:03,17,664.116667,61.2
4,2024-09-23 18:00:04,18,679.133333,46.183333


In [25]:
# feature engineering
# Builds the model inputs: `X` (numeric columns + one-hot dummies) and the
# target `y` (the 'Wait Time' column).

# Columns that are one-hot encoded.
categorical_features = ['Ride', 'Land', 'Day of Week', 'conditions']

# Columns passed through unchanged (weather readings plus the derived time features).
numerical_features = [
    'tempmax', 'tempmin', 'temp', 'humidity', 'precip', 'windgust', 'windspeed',
    'winddir', 'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation',
    'solarenergy', 'uvindex', 'severerisk',
    'hour', 'minutes_since_sunrise', 'minutes_until_sunset'
]

# One-hot encode categorical features (first level of each is dropped).
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Select exactly the columns get_dummies created: those present in df_encoded
# but absent from df. Matching on the bare name prefix (e.g. "Land") would also
# pick up any unrelated column that merely starts with the same text.
# Iterating df_encoded.columns keeps the dummy columns in their original order.
original_columns = set(df.columns)
dummy_features = [col for col in df_encoded.columns if col not in original_columns]

# Create feature matrix X: numeric columns first, then the dummy columns.
X = df_encoded[numerical_features + dummy_features]

# Target variable
y = df['Wait Time']

print("Feature matrix shape:", X.shape)
print("Target variable shape:", y.shape)

Feature matrix shape: (339064, 62)
Target variable shape: (339064,)


In [26]:
# Train/test split: hold out 20% of the rows, shuffled with a fixed seed (42)
# so the split is reproducible.
# NOTE(review): the data preview shows rows from the same date carrying identical
# daily weather values; a row-level random split may therefore place
# near-duplicate days on both sides -- consider confirming with a date-based split.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
trainX, testX, trainY, testY = split_parts

print(f"Training samples: {len(trainX)}")
print(f"Testing samples: {len(testX)}")

Training samples: 271251
Testing samples: 67813


In [None]:
# Baseline random forest: 100 trees, otherwise library defaults.
# random_state pins the result; n_jobs=-1 uses every available core.
rf_baseline = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Fit on the training partition.
print("\nTraining...")
rf_baseline.fit(trainX, trainY)
print("Training complete!")

# Predict on both partitions (train first, then test).
train_pred_baseline, test_pred_baseline = (
    rf_baseline.predict(features) for features in (trainX, testX)
)

# Scores: R² on both partitions, RMSE and MAE on the held-out partition only.
train_r2_baseline = r2_score(trainY, train_pred_baseline)
test_r2_baseline = r2_score(testY, test_pred_baseline)
test_mse_baseline = mean_squared_error(testY, test_pred_baseline)
test_rmse_baseline = np.sqrt(test_mse_baseline)
test_mae_baseline = mean_absolute_error(testY, test_pred_baseline)

# Report.
print("\nBASELINE RANDOM FOREST RESULTS:")
print(
    f"Train R²: {train_r2_baseline:.4f}\n"
    f"Test R²: {test_r2_baseline:.4f}\n"
    f"Test RMSE: {test_rmse_baseline:.2f} minutes\n"
    f"Test MAE: {test_mae_baseline:.2f} minutes"
)


Training...


In [None]:
# Tuned Model
# A more regularised forest than the baseline: more trees, capped depth,
# larger minimum split/leaf sizes and sqrt feature sub-sampling per split.

# Create tuned model
rf_tuned = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt',
    bootstrap=True,      # required for the out-of-bag estimate below
    oob_score=True,      # have fit() also compute an out-of-bag R²
    random_state=42,
    n_jobs=-1
)

# Train
print("\nTraining...")
rf_tuned.fit(trainX, trainY)
print("Training complete!")

# Predictions
train_pred_tuned = rf_tuned.predict(trainX)
test_pred_tuned = rf_tuned.predict(testX)

# Metrics
train_r2_tuned = r2_score(trainY, train_pred_tuned)
test_r2_tuned = r2_score(testY, test_pred_tuned)
test_rmse_tuned = np.sqrt(mean_squared_error(testY, test_pred_tuned))
test_mae_tuned = mean_absolute_error(testY, test_pred_tuned)
# The out-of-bag score was requested via oob_score=True; read it so the extra
# work done during fit() is actually used. It is a generalisation estimate that
# does not touch the test partition.
oob_r2_tuned = rf_tuned.oob_score_

print("\nTUNED RANDOM FOREST RESULTS:")
print(f"Train R²: {train_r2_tuned:.4f}")
print(f"OOB R²: {oob_r2_tuned:.4f}")
print(f"Test R²: {test_r2_tuned:.4f}")
print(f"Test RMSE: {test_rmse_tuned:.2f} minutes")
print(f"Test MAE: {test_mae_tuned:.2f} minutes")

In [None]:
# Side-by-side comparison of the two forests, one row per model.
comparison_columns = ['Model', 'Train R²', 'Test R²', 'Test RMSE', 'Test MAE']
comparison_rows = [
    ('RF Baseline', train_r2_baseline, test_r2_baseline,
     test_rmse_baseline, test_mae_baseline),
    ('RF Tuned', train_r2_tuned, test_r2_tuned,
     test_rmse_tuned, test_mae_tuned),
]
comparison = pd.DataFrame(comparison_rows, columns=comparison_columns)

# Plain-text table without the positional index.
comparison_table = comparison.to_string(index=False)
print("\nMODEL COMPARISON:")
print(comparison_table)

In [None]:
# Rank the features by the tuned forest's importance scores, highest first.
importance_pairs = list(zip(X.columns, rf_tuned.feature_importances_))
feature_importance = pd.DataFrame(
    importance_pairs, columns=['Feature', 'Importance']
).sort_values(by='Importance', ascending=False)

# Show the 20 highest-ranked features as a plain-text table.
top_features_table = feature_importance.iloc[:20].to_string(index=False)
print("\nTop 20 Most Important Features:")
print(top_features_table)

In [None]:
# Visualize feature importance: horizontal bar chart of the 20 highest-ranked
# features from `feature_importance` (already sorted in descending order).
top_20 = feature_importance.head(20)
bar_positions = list(range(len(top_20)))

# Explicit figure/axes interface instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_20['Importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_20['Feature'])

# barh draws position 0 at the bottom; flip the axis so the most important
# feature appears at the top and the chart reads in the same order as the table.
ax.invert_yaxis()

ax.set_xlabel('Importance Score')
ax.set_title('Top 20 Most Important Features - Random Forest')
fig.tight_layout()
plt.show()