## Read preprocessed data

In [1]:
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from datetime import datetime
from random import sample
import matplotlib.pyplot as plt

In [2]:
# Load the preprocessed per-day weather table and turn the two ISO-formatted
# date columns into `datetime.date` objects.
def _parse_iso_date(text):
    """Convert a 'YYYY-MM-DD' string into a `datetime.date`."""
    return datetime.strptime(text, '%Y-%m-%d').date()


df = pd.read_csv('data_ext/all_locations_weather_fixed.csv', low_memory=False)
for date_col in ('bloom_date', 'day'):
    df[date_col] = df[date_col].apply(_parse_iso_date)

# Day of year (1-based) of each observation.
df["doy"] = df["day"].apply(lambda d: d.timetuple().tm_yday)
print(df.shape)
df.head(2)

(991246, 23)


Unnamed: 0,location,lat,long,alt,year,bloom_date,bloom_doy,tavg,tmin,tmax,...,wspd,wpgt,pres,tsun,day,3day_mean,7day_mean,heat_sum,class,doy
0,vancouver,49.2237,-123.1636,24.0,2022,2022-03-27,86,-4.3,-9.9,1.3,...,18.5,49.0,1021.5,,2022-01-01,,,-4.3,0,1
1,vancouver,49.2237,-123.1636,24.0,2022,2022-03-27,86,2.7,0.4,4.9,...,29.7,68.0,1004.7,,2022-01-02,,,-1.6,0,2


In [3]:
# Imputation.
# Missing rolling means and daily extremes fall back to the daily average
# temperature; missing precipitation is treated as "no precipitation".
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on an intermediate
# Series, is deprecated as chained in-place assignment, and leaves `df`
# unchanged when pandas Copy-on-Write is active.
for col in ["3day_mean", "7day_mean", "tmin", "tmax"]:
    df[col] = df[col].fillna(df["tavg"])
df["prcp"] = df["prcp"].fillna(0)

# Remaining NaN count per column (rows whose `tavg` is itself missing cannot
# be filled by the fallback above).
print(df.isnull().sum(axis=0))

location           0
lat                0
long               0
alt                0
year               0
bloom_date         0
bloom_doy          0
tavg            3083
tmin            3083
tmax            3083
prcp               0
snow          632009
wdir          806953
wspd          710155
wpgt          917214
pres          817280
tsun          897648
day                0
3day_mean       3083
7day_mean       3083
heat_sum        3252
class              0
doy                0
dtype: int64


In [39]:
# Hold-out definition: rows whose location is in `val_locations` AND whose
# year is in `val_year` are used for validation.
val_locations = ['kyoto', 'liestal', 'washingtondc']
val_year = [1970, 1979, 2020]

# Model input columns, in the order they are passed to the estimators.
feature_cols = [
    'lat', 'long', 'alt',              # site description
    'year', 'doy',                     # calendar position
    'tavg', 'tmin', 'tmax', 'prcp',    # daily weather
    '3day_mean', '7day_mean',          # rolling temperature means
    'heat_sum',
]

In [32]:
# Modelling frame: site identifier, the model inputs and the two targets.
# The column list is derived from `feature_cols` so it cannot drift from the
# inputs actually fed to the estimators (resulting column order is unchanged:
# location, <feature_cols...>, class, bloom_doy).
model_cols = ['location'] + feature_cols + ['class', 'bloom_doy']

# Drop every row with a missing value in any selected column. Chaining
# `.dropna()` yields a fresh frame and keeps the cell idempotent, instead of
# mutating a selection of `df` in place.
df_ftr = df.loc[:, model_cols].dropna()

In [33]:
# A row belongs to the validation set only when BOTH its location and its
# year are among the held-out values; everything else is training data.
in_val_location = df_ftr['location'].isin(val_locations)
in_val_year = df_ftr['year'].isin(val_year)
ftr = in_val_location & in_val_year

valid = df_ftr[ftr]
train = df_ftr[~ftr]
print(train.shape, valid.shape)

(987124, 15) (870, 15)


## Multi-class classification

In [45]:
import itertools 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error,r2_score,mean_squared_error

In [34]:
# Feature matrices and `class` targets for the training and validation splits.
X, y = train[feature_cols], train["class"]
X_valid, y_valid = valid[feature_cols], valid["class"]

In [81]:
# Inverse-frequency class weights: each label is weighted by 1 / (its share
# of the training targets), so rarer labels receive larger weights.
class_share = y.value_counts() / len(y)
weights = dict(1 / class_share)
weights

{0: 1.07233378741562,
 2: 103.7657941763902,
 3: 103.7657941763902,
 4: 103.7657941763902,
 5: 103.7657941763902,
 6: 103.7657941763902,
 1: 103.7767031118587,
 7: 103.80944368493006}

In [31]:
# Random forest on the training split, evaluated on the pooled validation rows.
rf_params = dict(
    n_estimators=200,
    criterion='entropy',
    max_depth=100,
    class_weight="balanced",  # sklearn's built-in class re-weighting
    random_state=42,          # fixed seed
)
rf_classifier = RandomForestClassifier(**rf_params)
rf_classifier.fit(X, y)

y_pred = rf_classifier.predict(X_valid)
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       807
           1       1.00      0.22      0.36         9
           2       1.00      0.11      0.20         9
           3       0.50      0.11      0.18         9
           4       0.33      0.11      0.17         9
           5       0.25      0.11      0.15         9
           6       0.33      0.11      0.17         9
           7       0.20      0.22      0.21         9

    accuracy                           0.93       870
   macro avg       0.57      0.25      0.30       870
weighted avg       0.92      0.93      0.92       870



In [72]:
# Per-(year, location) breakdown of the classifier on the held-out data.
val_pairs = list(itertools.product(val_year, val_locations))
for year, location in val_pairs:
    print((year, location))

    # Select the rows once and reuse them for both features and labels
    # (avoids recomputing the mask and chained `df[cols][mask]` indexing).
    subset = df_ftr.loc[(df_ftr["year"] == year) & (df_ftr["location"] == location)]
    if subset.empty:
        # `predict` raises on an empty input; report and move on instead.
        print("no rows for this pair - skipped")
        continue

    y_pred = rf_classifier.predict(subset[feature_cols])
    print(y_pred)
    # zero_division=0 keeps the reported 0.00 for labels that are never
    # predicted while silencing the repeated undefined-metric warnings.
    print(classification_report(subset["class"], y_pred, zero_division=0))

(1970, 'kyoto')
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 7 5 7 6 5 6 7 7 7 7]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       100
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.14      1.00      0.25         1

    accuracy                           0.91       107
   macro avg       0.14      0.24      0.15       107
weighted avg       0.94      0.91      0.92       107

(1970, 'liestal')
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.94      1.00      0.97       107
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1

    accuracy                           0.94       114
   macro avg       0.12      0.12      0.12       114
weighted avg       0.88      0.94      0.91       114

(1970, 'washingtondc')
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        99
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        90
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1

    accuracy                           1.00        97
   macro avg       1.00      1.00      1.00        97
weighted avg       1.00      1.00      1.00        97

(1979, 'liestal')
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        98
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.93      1.00      0.97        85
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1

    accuracy                           0.92        92
   macro avg       0.12      0.12      0.12        92
weighted avg       0.86      0.92      0.89        92

(2020, 'kyoto')
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 3 4 0 4 7 5]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        85
           1       1.00      1.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.91      1.00      0.95        70
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1

    accuracy                           0.91        77
   macro avg       0.11      0.12      0.12        77
weighted avg       0.83      0.91      0.87        77

(2020, 'washingtondc')
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0]
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        73
           1       0.00      0.00      0.00         1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn import svm

# Support-vector classifier using the inverse-frequency `weights` dict as
# per-class weights and a one-vs-one decision function shape.
# NOTE(review): this cell has no execution count, so it was apparently never
# run to completion. The scikit-learn docs warn that SVC fit time grows at
# least quadratically with the number of samples - confirm this is feasible
# on the full training split (or subsample) before relying on it.
svc_options = {
    'decision_function_shape': 'ovo',
    'class_weight': weights,
}
clf = svm.SVC(**svc_options)
clf.fit(X, y)

In [None]:
# Regression-style error metrics for `y_pred` against `y_valid_reg`.
# NOTE(review): `y_valid_reg` is not defined anywhere in the visible part of
# this notebook, and `y_pred` last holds class labels from the classifier
# cells - confirm both come from a regression step before trusting these.
r2 = r2_score(y_valid_reg, y_pred)
print("R^2 : ", r2)

mae = mean_absolute_error(y_valid_reg, y_pred)
print("MAE :", mae)

rmse = np.sqrt(mean_squared_error(y_valid_reg, y_pred))
print("RMSE:", rmse)