## Approach 1: Treat the bloom prediction task as a multi-class classification problem.

For the first approach, we use the collected daily climate data to characterize each day (starting at least a week before the bloom peak day). This approach assigns a class label to each day: more than a week before blooming, a week before, 6 days before, ..., bloom peak day.

In [1]:
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from datetime import datetime
from random import sample
import matplotlib.pyplot as plt

### Load data extended with daily climate information

In [6]:
# Read the per-location daily weather table; the date columns arrive as
# 'YYYY-MM-DD' strings and are converted to datetime.date objects.
df = pd.read_csv("data_ext/all_locations_weather_daily.csv", low_memory=False)
for date_col in ("bloom_date", "day"):
    df[date_col] = df[date_col].apply(
        lambda text: datetime.strptime(text, "%Y-%m-%d").date()
    )
# Day of the year of each observation (1 for January 1st).
df["doy"] = df["day"].apply(lambda d: d.timetuple().tm_yday)
df.head(2)

(991246, 23)


Unnamed: 0,location,lat,long,alt,year,bloom_date,bloom_doy,tavg,tmin,tmax,...,wspd,wpgt,pres,tsun,day,3day_mean,7day_mean,heat_sum,class,doy
0,vancouver,49.2237,-123.1636,24.0,2022,2022-03-27,86,-4.3,-9.9,1.3,...,18.5,49.0,1021.5,,2022-01-01,,,-4.3,0,1
1,vancouver,49.2237,-123.1636,24.0,2022,2022-03-27,86,2.7,0.4,4.9,...,29.7,68.0,1004.7,,2022-01-02,,,-1.6,0,2


### Dealing with missing values for climate data

In [7]:
# Fill gaps in the daily climate columns.
#
# Explicit assignment is used instead of `df[col].fillna(..., inplace=True)`:
# the in-place call operates on a temporary column selection, which is
# deprecated in pandas 2.x and does not modify `df` under Copy-on-Write.
#
# The rolling means and the min/max temperatures fall back to the same day's
# average temperature; rows where `tavg` itself is missing therefore stay NaN.
for col in ("3day_mean", "7day_mean", "tmin", "tmax"):
    df[col] = df[col].fillna(df["tavg"])
# A missing precipitation reading is treated as zero precipitation.
df["prcp"] = df["prcp"].fillna(0)
# Remaining missing values per column.
print(df.isnull().sum(axis=0))

location           0
lat                0
long               0
alt                0
year               0
bloom_date         0
bloom_doy          0
tavg            4006
tmin            4005
tmax            4006
prcp               0
snow          632009
wdir          806953
wspd          710155
wpgt          917214
pres          817280
tsun          897648
day                0
3day_mean       4006
7day_mean       4006
heat_sum        4006
class              0
doy                0
dtype: int64


In [8]:
# Model inputs: site description, calendar position, and daily climate values.
feature_cols = [
    'lat', 'long', 'alt',
    'year', 'doy',
    'tavg', 'tmin', 'tmax', 'prcp',
    '3day_mean', '7day_mean', 'heat_sum',
]
# Fixed hold-out years and locations, so every model is assessed on the
# same validation set.
val_year = [1970, 1979, 2020]
val_locations = ['kyoto', 'liestal', 'washingtondc']

In [9]:
# Keep the location, the model features and the two target columns, then
# discard every row that still has a missing value.
df_ftr = df[['location', *feature_cols, 'class', 'bloom_doy']].dropna()

In [10]:
# Rows that belong to a validation location AND a validation year are held
# out; every other row is used for training.
ftr = df_ftr['year'].isin(val_year) & df_ftr['location'].isin(val_locations)
valid = df_ftr[ftr]
train = df_ftr[~ftr]
print(train.shape, valid.shape)

(986370, 15) (870, 15)


## Multi-class classification

In [11]:
import itertools 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error,r2_score,mean_squared_error

In [12]:
# Feature matrix and class target for the training and validation partitions.
X, y = train[feature_cols], train["class"]
X_valid, y_valid = valid[feature_cols], valid["class"]

In [13]:
# The classes are highly imbalanced, so each class is weighted by the
# inverse of its relative frequency in the training target.
class_share = y.value_counts() / len(y)
weights = dict(1 / class_share)
weights

{0: 1.0723604388722914,
 6: 103.68653421633553,
 7: 103.69743481917577,
 2: 103.70833771422562,
 3: 103.75197223098769,
 5: 103.76288659793813,
 4: 103.77380326144136,
 1: 103.78472222222223}

### Random Forest

In [14]:
# Random forest on the training rows. Class imbalance is handled through
# scikit-learn's class_weight="balanced" option; the manually computed
# `weights` dict is not passed to the model.
rf_classifier = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_depth=100,
    class_weight="balanced",
    random_state=42,
)
rf_classifier.fit(X, y)

# Per-class precision/recall on the held-out validation rows.
y_pred = rf_classifier.predict(X_valid)
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       807
           1       1.00      0.11      0.20         9
           2       0.00      0.00      0.00         9
           3       1.00      0.11      0.20         9
           4       0.50      0.11      0.18         9
           5       0.33      0.11      0.17         9
           6       0.20      0.11      0.14         9
           7       0.29      0.22      0.25         9

    accuracy                           0.93       870
   macro avg       0.53      0.22      0.26       870
weighted avg       0.91      0.93      0.91       870



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Report the random forest's predictions separately for every
# (validation year, validation location) combination.
val_pairs = list(itertools.product(val_year, val_locations))
for p in val_pairs:
    print(p)
    season_year, season_location = p
    # Row mask for this year/location, built once and reused for both the
    # features and the true labels.
    in_season = (df_ftr.year == season_year) & (df_ftr.location == season_location)
    y_pred = rf_classifier.predict(df_ftr.loc[in_season, feature_cols])
    print(y_pred)
    print(classification_report(df_ftr.loc[in_season, "class"], y_pred))

(1970, 'kyoto')
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 6 6 7 6 7 6 7 7 7 7]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       100
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.17      1.00      0.29         1

    accuracy                           0.91       107
   macro avg       0.15      0.24      0.16       107
weighted avg       0.94      0.91      0.92       107

(1970, 'liestal')
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.94      1.00      0.97       107
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1

    accuracy                           0.94       114
   macro avg       0.12      0.12      0.12       114
weighted avg       0.88      0.94      0.91       114

(1970, 'washingtondc')
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        99
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99        90
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1

    accuracy                           0.98        97
   macro avg       0.75      0.75      0.75        97
weighted avg       0.96      0.98      0.97        97

(1979, 'liestal')
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        98
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.92      1.00      0.96        85
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1

    accuracy                           0.92        92
   macro avg       0.12      0.12      0.12        92
weighted avg       0.85      0.92      0.89        92

(2020, 'kyoto')
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 4 5]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        85
           1       1.00      1.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.91      1.00      0.95        70
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1

    accuracy                           0.91        77
   macro avg       0.11      0.12      0.12        77
weighted avg       0.83      0.91      0.87        77

(2020, 'washingtondc')
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0]
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        73
           1       0.00      0.00      0.00         1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting on the same training data. No class or sample weights
# are passed to this fit call.
gb_classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=10,
    random_state=42,
)
gb_classifier.fit(X, y)

GradientBoostingClassifier(learning_rate=1.0, max_depth=10, random_state=42)

In [20]:
# Score the gradient-boosting model on the held-out validation rows.
y_pred = gb_classifier.predict(X_valid)
gb_report = classification_report(y_valid, y_pred)
print(gb_report)

              precision    recall  f1-score   support

           0       0.74      0.04      0.08       807
           1       0.02      0.11      0.04         9
           2       0.02      0.22      0.04         9
           3       0.03      0.22      0.06         9
           4       0.03      0.22      0.05         9
           5       0.00      0.00      0.00         9
           6       0.00      0.22      0.01         9
           7       0.00      0.00      0.00         9

    accuracy                           0.05       870
   macro avg       0.11      0.13      0.03       870
weighted avg       0.69      0.05      0.07       870



Unfortunately, the models tend to predict the null class. We did not train other classifiers and were unable to tune parameters due to lack of time. For this reason, we stuck with Approach 2.