In [104]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from datetime import datetime, timedelta

In [105]:
kyoto_df = pd.read_csv('data/kyoto.csv')
japan_df = pd.read_csv('data/japan.csv')

**Data Preprocessing and Feature Engineering**

In [106]:
kyoto_df['bloom_date'] = pd.to_datetime(kyoto_df['bloom_date'], errors='coerce')
japan_df['bloom_date'] = pd.to_datetime(japan_df['bloom_date'], errors='coerce')

# Drop rows with missing bloom_date in Kyoto
kyoto_df = kyoto_df.dropna(subset=['bloom_date'])

# Standardize latitude, longitude, and altitude
scaler = StandardScaler()
kyoto_df[['lat', 'long', 'alt']] = scaler.fit_transform(kyoto_df[['lat', 'long', 'alt']])
japan_df[['lat', 'long', 'alt']] = scaler.transform(japan_df[['lat', 'long', 'alt']])

# Merge both datasets
combined_df = pd.concat([kyoto_df, japan_df], ignore_index=True)

# Create lag features for previous bloom dates
combined_df = combined_df.sort_values(by=['location', 'year'])
combined_df['bloom_doy_lag1'] = combined_df.groupby('location')['bloom_doy'].shift(1)
combined_df['bloom_doy_lag2'] = combined_df.groupby('location')['bloom_doy'].shift(2)
combined_df['bloom_doy_lag3'] = combined_df.groupby('location')['bloom_doy'].shift(3)

# Fill missing lag values with the mean bloom_doy of each location
combined_df =combined_df.fillna(combined_df.groupby('location')['bloom_doy'].mean())

**Model Selection and Training**

Using Random Forest for Exact Date

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
train_df = combined_df[combined_df['year'] < 2025]
train_df.fillna(train_df.mean(), inplace=True)

# Define features and target
features = ['lat', 'long', 'alt', 'year', 'bloom_doy_lag1', 'bloom_doy_lag2', 'bloom_doy_lag3']
target = 'bloom_doy'

# Train-test split for validation
X_train, X_val, y_train, y_val = train_test_split(train_df[features], train_df[target], test_size=0.2, random_state=42)

# Train Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=100)
model.fit(X_train, y_train)

# Model evaluation
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 2.966325379609538


  train_df.fillna(train_df.mean(), inplace=True)
  train_df.fillna(train_df.mean(), inplace=True)


In [109]:
# Extract latest available data (2024) for Kyoto
latest_kyoto = combined_df[(combined_df['location'] == 'kyoto')].sort_values(by='year').iloc[-1:].copy()
latest_kyoto['year'] = 2025  # Update year to 2025

# Use previous bloom_doy as lag values
latest_kyoto['bloom_doy_lag1'] = latest_kyoto['bloom_doy']
latest_kyoto['bloom_doy_lag2'] = latest_kyoto['bloom_doy_lag1']
latest_kyoto['bloom_doy_lag3'] = latest_kyoto['bloom_doy_lag2']

# Keep only necessary columns
latest_kyoto = latest_kyoto[['location', 'lat', 'long', 'alt', 'year', 'bloom_doy_lag1', 'bloom_doy_lag2', 'bloom_doy_lag3']]

latest_kyoto

Unnamed: 0,location,lat,long,alt,year,bloom_doy_lag1,bloom_doy_lag2,bloom_doy_lag3
337,kyoto,-7.105427e-15,2.842171e-14,0.0,2025,95,95,95


In [110]:
# Predict 2025 bloom DOY
latest_kyoto['predicted_bloom_doy'] = int(model.predict(latest_kyoto[features]))

# Convert DOY to actual date
latest_kyoto['predicted_bloom_date'] = latest_kyoto['predicted_bloom_doy'].apply(
    lambda doy: datetime(2025, 1, 1) + timedelta(days=int(doy) - 1)
)
latest_kyoto

Unnamed: 0,location,lat,long,alt,year,bloom_doy_lag1,bloom_doy_lag2,bloom_doy_lag3,predicted_bloom_doy,predicted_bloom_date
337,kyoto,-7.105427e-15,2.842171e-14,0.0,2025,95,95,95,89,2025-03-30


Using Gradient Boosting for Time Interval Prediction

In [92]:
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# Define Gradient Boosting models for quantile regression
gb_early = GradientBoostingRegressor(loss="quantile", alpha=0.1, n_estimators=100, random_state=42)
gb_late = GradientBoostingRegressor(loss="quantile", alpha=0.9, n_estimators=100, random_state=42)

# Train models using past bloom data
gb_early.fit(X_train, y_train)
gb_late.fit(X_train, y_train)

# Predict early and late bloom DOY for 2025
early_bloom_doy = gb_early.predict(latest_kyoto[features])
late_bloom_doy = gb_late.predict(latest_kyoto[features])

print(int(early_bloom_doy),int(late_bloom_doy))

# Convert DOY to actual dates
earliest_bloom_date = datetime(2025, 1, 1) + timedelta(days=int(early_bloom_doy[0]) - 1)
latest_bloom_date = datetime(2025, 1, 1) + timedelta(days=int(late_bloom_doy[0]) - 1)

print(f"Predicted Cherry Bloom Interval for Kyoto in 2025: {earliest_bloom_date} - {latest_bloom_date}")


85 96
Predicted Cherry Bloom Interval for Kyoto in 2025: 2025-03-26 00:00:00 - 2025-04-06 00:00:00
