# Timeseries and met data

In [None]:
import json

import altair as alt
import httpx
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from shapely import geometry
from shapely.ops import unary_union
import os
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

In [None]:
from statsmodels.tsa.stattools import adfuller

# Stationarity test for a time series (Augmented Dickey-Fuller)
def adf_test(timeseries):
    """Run an Augmented Dickey-Fuller test on ``timeseries`` and print a summary.

    ``adfuller`` is called with ``autolag='AIC'``. The printed summary holds
    the test statistic, the p-value, the number of lags used, the number of
    observations used, and one "Critical Value (...)" entry per key of the
    critical-value mapping returned by ``adfuller``.

    Args:
        timeseries: the observations, passed unchanged to ``adfuller``.

    Returns:
        None. The summary is written to stdout.
    """
    print('Results of Dickey-Fuller Test:')
    outcome = adfuller(timeseries, autolag='AIC')
    headline_labels = [
        'Test Statistic',
        'p-value',
        '#Lags Used',
        'Number of Observations Used',
    ]
    summary = pd.Series(outcome[:4], index=headline_labels)
    # outcome[4] is the mapping of critical values; add one row per entry
    for level, critical_value in outcome[4].items():
        summary[f'Critical Value ({level})'] = critical_value
    print(summary)

In [None]:
# AOI ids for the Tete study area, listed as strings.
# tete_aga holds the agricultural areas; tete_wb presumably the water bodies (TODO confirm).
tete_wb = ["1953", "1954", "1955", "1956", "1975", "1966", "1967", "1968", "1981", "1969", "1970", "1971", "1972", "1973", "1974", "1988"]
tete_aga = ["2363", "2189", "2190", "2191", "2202", "2192", "2193", "2194", "2203", "2195", "2196", "2197", "2198", "2199", "2200", "2201", "2204", "2205", "2206", "2208", "2209", "2207", "2210", "2211", "2212", "2213", "2214", "2215", "2216", "2359", "2360", "2361", "2362", "2364", "2365", "2366", "2367", "2368", "2369", "2370", "2371", "2372", "2409", "2410", "2411", "2412", "2413", "2414", "2415", "2454", "2416", "2452", "2453", "2455", "2456", "2459", "2457", "2458", "2460", "2461", "2462", "2463", "2464"]

# The rest of the notebook works with integer ids.
tete_wb = [int(aoi_id) for aoi_id in tete_wb]
tete_aga = [int(aoi_id) for aoi_id in tete_aga]

# API Auth

In [None]:
base_url = "https://api.oxfordeo.com/"

In [None]:
client = httpx.Client(base_url=base_url)

In [None]:
r = client.post(
    "auth/token",
    data={"username": os.environ["API_USER"], "password": os.environ["API_PASS"]},
)

In [None]:
# Check the HTTP status first: a rejected login then fails with a clear
# HTTP error instead of a KeyError on "access_token" or a JSON decode error.
r.raise_for_status()
token = r.json()["access_token"]
# Bearer header sent with every later API request.
headers = {"Authorization": f"Bearer {token}"}

# Get agricultural areas geoms from the API

In [None]:
# Collect one shapely geometry per agricultural AOI.
# Fetching stays best-effort: an AOI that cannot be retrieved or parsed is
# reported and skipped, so the cell still finishes with the geometries it got.
polygons = []
for wb_id in tete_aga:
    try:
        r = client.get("aoi/", params=dict(id=wb_id), headers=headers)
        r.raise_for_status()
        res = json.loads(r.text)
        # The response is read as a feature collection; only the first feature is used.
        polygons.append(geometry.shape(res["features"][0]["geometry"]))
    except Exception as err:
        # Not a bare `except:` -- Ctrl-C / kernel interrupt must still stop the loop.
        print(f"Skipping AOI {wb_id}: {err!r}")

In [None]:
unary_union(polygons)

In [None]:
# get the bbox for all the ag areas
box = unary_union(polygons).bounds


# Get Events from DB 
- Date range from 2019-01-01 to 2021-12-31 (three full years: 2019, 2020 and 2021)

In [None]:
start_datetime = "2019-01-01"
end_datetime = "2021-12-31"

In [None]:
def _fetch_events(aoi_ids):
    """Return the events of every AOI in ``aoi_ids`` as one flat list.

    One request is sent to the ``events/`` endpoint per AOI, restricted to
    the notebook-level ``start_datetime`` / ``end_datetime`` window, and the
    ``"events"`` lists of the responses are concatenated in request order.

    Raises:
        httpx.HTTPStatusError: if the API answers with an error status.
    """
    events = []
    for aoi in aoi_ids:
        r = client.get(
            "events/",
            params=dict(
                aoi_id=aoi,
                start_datetime=start_datetime,
                end_datetime=end_datetime,
                # NOTE(review): presumably a per-request cap -- an AOI with
                # more events than this would be truncated; confirm with the API.
                limit=10000,
            ),
            headers=headers,
            timeout=60,
        )
        # Surface API errors here rather than as a KeyError on "events".
        r.raise_for_status()
        events.extend(r.json()["events"])
    return events


aga_results = _fetch_events(tete_aga)
wb_results = _fetch_events(tete_wb)

In [None]:
def get_keyed_values(results, keyed_value, new_col):
    """Turn a list of event records into a tidy DataFrame with one value column.

    Args:
        results: list of event dicts; each is read for ``aoi_id``,
            ``datetime``, ``labels`` and ``keyed_values``.
        keyed_value: key to look up in every event's ``keyed_values`` dict.
        new_col: name of the column that receives the looked-up value.

    Returns:
        DataFrame in which ``labels`` is reduced to its first element,
        ``new_col`` holds the looked-up value, only the first row of each
        (aoi_id, datetime) pair is kept, rows containing any missing value
        are dropped, and ``datetime`` is converted with ``pd.to_datetime``.
    """
    def first_label(labels):
        return labels[0]

    def lookup(values):
        # .get() yields None for events that lack the key; those rows are
        # removed by the dropna() below.
        return values.get(keyed_value)

    frame = pd.DataFrame(results)
    frame["labels"] = frame.labels.map(first_label)
    frame[new_col] = frame.keyed_values.apply(lookup)

    # De-duplicate first, then drop incomplete rows (same order as before).
    deduplicated = frame.drop_duplicates(subset=["aoi_id", "datetime"])
    complete = deduplicated.dropna()
    complete["datetime"] = pd.to_datetime(complete.datetime)
    return complete
    

In [None]:
aga_df = get_keyed_values(aga_results, "mean_value", "ndvi_mean")
wb_df = get_keyed_values(wb_results, "water_pixels", "water_pixels")

In [None]:
# TODO: We don't have water pixels for all the dates, we should run the predictions. 
# I'm not using water pixels predictions from now on to avoid problems.
aga_df.datetime.min(), aga_df.datetime.max(), wb_df.datetime.min(), wb_df.datetime.max()

# Get NDVI for given daterange
- Filter results by date
- Fill NA values using forward fill (we don't have an image for every single day)

In [None]:
# Restrict to the study window, then average NDVI over all AOIs per datetime.
# The column is selected *before* aggregating: averaging the whole frame would
# also try to average the non-numeric columns (e.g. labels, keyed_values),
# which pandas >= 2.0 rejects with a TypeError.
in_window = (aga_df.datetime >= start_datetime) & (aga_df.datetime <= end_datetime)
ndvi = aga_df[in_window].groupby("datetime")["ndvi_mean"].mean()

In [None]:
# Put the series on a continuous daily index and carry the last observation
# forward over days without an image. Days before the first observation stay NaN.
idx = pd.date_range(start_datetime, end_datetime)
# Series.ffill() replaces fillna(method="ffill"), which is deprecated in pandas >= 2.1.
ndvi = ndvi.reindex(idx).ffill()

In [None]:
ndvi.plot()

# Get precipitation data from bucket
- Data is stored in zarr with dimensions: latitude, longitude, step, member and time
- step is a synonym of "forecast day" (up to 215 days ≈ 7 months). Values are accumulated total precipitation (TP); if we want per-day amounts instead, we can use `diff`
- members are different forecast models (up to 50). We can average them
- time has one entry per month



In [None]:
import gcsfs
import xarray as xr
import matplotlib.pyplot as plt
url = 'gs://oxeo-seasonal/tp'
zx = xr.open_zarr(gcsfs.GCSMap(url)) 

In [None]:
min_x, min_y, max_x, max_y = box

In [None]:
from datetime import datetime

# Precipitation variable of the seasonal-forecast store.
tp = zx['tp']

# Study window as datetime objects for the label-based time slice.
time_window = slice(
    datetime.strptime(start_datetime, "%Y-%m-%d"),
    datetime.strptime(end_datetime, "%Y-%m-%d"),
)
# Bounding box rounded to whole degrees. The latitude slice runs from max to
# min -- presumably the store keeps latitude in descending order; TODO confirm.
latitude_window = slice(round(max_y), round(min_y))
longitude_window = slice(round(min_x), round(max_x))

data = tp.sel(
    time=time_window,
    latitude=latitude_window,
    longitude=longitude_window,
)

In [None]:
data

## Getting daily tp data
We have monthly measures but for each month we have 215 days of forecast data. 
If we want to have a measure per day we have to:
- query the forecast data for the days of each month (as many forecast steps as the month has days).
- Get the average of all "members"
- apply `diff` to the series so we get per-day amounts instead of the accumulated TP

In [None]:
def get_daily_tp(data, date_from, date_to):
    """Build a daily precipitation series from monthly forecast runs.

    For every forecast run that starts on the first day of a month inside
    [date_from, date_to], the first ``days_in_month`` forecast steps are
    averaged over latitude/longitude and then over members, and the
    accumulated values are differenced into per-day amounts. The monthly
    pieces are chained and cut to the requested range.

    Args:
        data: xarray object with ``time``, ``step``, ``member``, ``latitude``
            and ``longitude`` dimensions holding accumulated precipitation.
        date_from: first day wanted (anything ``pd.date_range`` accepts).
        date_to: last day wanted, inclusive.

    Returns:
        pd.Series of daily values indexed by calendar day. Unlike a fixed
        ``date_range(date_from, date_to)`` index, each month carries its own
        dates, so ranges that do not cover whole months no longer fail with
        a length mismatch.

    Raises:
        KeyError: raised by ``data.sel`` when a month start inside the range
            has no forecast run in ``data``.
    """
    month_starts = pd.date_range(date_from, date_to, freq="D")
    month_starts = month_starts[month_starts.day == 1]

    # Select the runs by label once, so that position i below always refers
    # to month_starts[i] -- even when ``data`` starts before ``date_from`` or
    # also holds runs that do not start on the first of a month.
    monthly_runs = data.sel(time=month_starts)

    pieces = []
    for i, run_start in enumerate(pd.DatetimeIndex(monthly_runs.time.values)):
        # Cut the steps first (cheaper), then average space, then members.
        cum_values = (
            monthly_runs.isel(time=i)
            .isel(step=slice(run_start.days_in_month))
            .mean(dim=["latitude", "longitude"])
            .mean(dim="member")
            .values
        )
        # NOTE(review): assumes the k-th step is the k-th day counted from the
        # run start (step 0 -> first of the month) -- confirm against the store.
        cum_tp = pd.Series(
            cum_values,
            index=pd.date_range(run_start, periods=len(cum_values), freq="D"),
        )
        # diff() leaves the first day NaN; its daily amount is the accumulated
        # value itself, which fillna() puts back.
        pieces.append(cum_tp.diff().fillna(cum_tp))

    if not pieces:
        # No month start inside the range: nothing to report.
        return pd.Series(dtype="float64", index=pd.DatetimeIndex([]))

    daily = pd.concat(pieces)
    return daily.loc[pd.Timestamp(date_from):pd.Timestamp(date_to)]


    


In [None]:
start_datetime

In [None]:
tp_per_day = get_daily_tp(data, start_datetime, end_datetime)
tp_per_day.plot()

# Timeseries models
We can start using some simple ARIMA models and try to predict NDVI. 
We start only using ndvi data and later we'll add weather data as an exogenous variable.

Models:
- Rolling 30D NDVI model with SARIMA
- NDVI + Weather 30D using RF

## Train/Test split

In [None]:
# Hold out the last 215 daily values as the test set
# (presumably chosen to match the 215-day forecast length; TODO confirm).
days = len(ndvi) - 215
train, test = ndvi.iloc[:days], ndvi.iloc[days:]

print(train.shape, test.shape)

In [None]:
def _first_of_month_rolling_mean(series):
    """Return the 30-day rolling mean of ``series``, kept only on the 1st of each month."""
    smoothed = series.rolling("30D").mean()
    return smoothed[smoothed.index.day == 1]


# One NDVI value per month; the last 7 months are held out for testing.
rolling_ndvi = _first_of_month_rolling_mean(ndvi)
rolling_train, rolling_test = rolling_ndvi[:-7], rolling_ndvi[-7:]

# Same treatment for the daily precipitation series.
rolling_tp = _first_of_month_rolling_mean(tp_per_day)
rolling_tp_train, rolling_tp_test = rolling_tp[:-7], rolling_tp[-7:]


## NDVI Model with SARIMA

In [None]:
# Requires the scalecast package: pip install scalecast
from scalecast.Forecaster import Forecaster

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(14,7)})


# Forecaster over the monthly rolling NDVI series.
# NOTE(review): rolling_tp is passed as `exog`; whether scalecast actually uses
# it as a regressor when given this way is not visible here -- confirm in its docs.
f = Forecaster(y=rolling_ndvi,exog=rolling_tp,current_dates=rolling_ndvi.index)

f.generate_future_dates(7) # forecast horizon: 7 future periods (the series has one value per month)
f.set_test_length(.2) # 20% test set

model_name = "arima"
f.set_estimator(model_name) # select the ARIMA estimator


# Fit and forecast with fixed orders; the last element of seasonal_order (12)
# is the seasonal period, i.e. one year on this monthly series.
f.manual_forecast(order=(1,1,0),seasonal_order=(1,1,1,12),call_me=model_name)

# View test results
f.plot_test_set(ci=True,models=model_name)
plt.title('ARIMA Test-Set Performance',size=14)
plt.show()

# View forecast results
f.plot(ci=True,models=model_name)
plt.title('ARIMA Forecast Performance',size=14)
plt.show()

# See summary stats
f.regr.summary()

## Random Forest

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Forecast horizon expressed as a negative shift, in months
# (-1 is a one-month-ahead target, -2 two months ahead, ...).
shift = -7

# Target: the NDVI value |shift| rows (months) after each row.
target_1m = rolling_ndvi.shift(shift)

# Features (NDVI, precipitation) and target side by side; rows that miss any
# of them -- including the last rows, which have no future target -- are dropped.
data = rolling_ndvi.to_frame().assign(tp=rolling_tp, y=target_1m).dropna()

# The last 7 remaining rows form the test set.
data_train, data_test = data.iloc[:-7], data.iloc[-7:]

In [None]:
# Baseline random forest: predict the shifted NDVI target from current NDVI alone.
cols_to_use = ['ndvi_mean']

X_train, y_train = data_train[cols_to_use], data_train["y"]
model = RandomForestRegressor(n_estimators=1000, random_state=42)
model.fit(X_train, y_train)

# Predict every held-out row, then draw the predictions (dashed) over the full target series.
yhat = model.predict(data_test[cols_to_use])
predicted = pd.Series(yhat, index=data_test.index)
plt.plot(predicted, '--')
plt.plot(data["y"])

In [None]:
# Random forest using both current NDVI and precipitation as features.
cols_to_use = ['ndvi_mean', 'tp']

X_train, y_train = data_train[cols_to_use], data_train["y"]
model = RandomForestRegressor(n_estimators=1000, random_state=42)
model.fit(X_train, y_train)

# Predict every held-out row, then draw the predictions (dashed) over the full target series.
yhat = model.predict(data_test[cols_to_use])
predicted = pd.Series(yhat, index=data_test.index)
plt.plot(predicted, '--')
plt.plot(data["y"])