In [None]:
import json
import seaborn as sns
import altair as alt
import httpx
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from shapely import geometry
from shapely.ops import unary_union
import os
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

In [None]:
from dateutil.relativedelta import relativedelta


def get_keyed_values(results, label, keyed_value, new_col):
    """Extract one keyed value from a list of event records into a DataFrame.

    Parameters
    ----------
    results : list of dict
        Event records. The code reads the fields ``labels`` (a sequence, of
        which only the first entry is used), ``keyed_values`` (a mapping),
        ``aoi_id`` and ``datetime``.
    label : str
        Only events whose first label equals this value are kept.
    keyed_value : str
        Key looked up in each event's ``keyed_values`` mapping.
    new_col : str
        Name of the column that receives the extracted value.

    Returns
    -------
    pandas.DataFrame
        * ``"FORECAST_SPI"``: the extracted sequence is expanded into columns
          ``{new_col}_1 .. {new_col}_N`` and the frame is indexed by datetime.
        * ``"mean_value"`` / ``"N_PIXELS"``: numeric columns averaged per
          datetime and then resampled to month-start (``"MS"``) means.
        * anything else: the filtered frame indexed by datetime.
    """
    df = pd.DataFrame(results)
    df["labels"] = df["labels"].map(lambda x: x[0])
    # .copy() so the assignments below work on an independent frame instead of
    # a slice of the unfiltered one (avoids SettingWithCopy problems).
    df = df[df["labels"] == label].copy()
    df[new_col] = df["keyed_values"].apply(lambda x: x.get(keyed_value))
    # One row per (aoi, datetime); rows where any field is missing are dropped.
    df = df.drop_duplicates(subset=["aoi_id", "datetime"]).dropna()
    df["datetime"] = pd.to_datetime(df["datetime"])

    if keyed_value == "FORECAST_SPI":
        # One column per element of the forecast sequence, numbered from 1.
        months_df = df[new_col].apply(pd.Series)
        months_df.columns = [
            f"{new_col}_{i}" for i in range(1, months_df.shape[1] + 1)
        ]
        df = pd.concat([df.drop(columns=[new_col]), months_df], axis=1)
        df.index = df["datetime"]

    elif keyed_value in ("mean_value", "N_PIXELS"):
        # numeric_only=True: the frame still holds object columns (labels,
        # keyed_values) which cannot be averaged and make recent pandas raise.
        df = (
            df.groupby("datetime")
            .mean(numeric_only=True)
            .resample("MS")
            .mean()
        )

    else:
        df.index = df["datetime"]

    return df

2179, 2416, 2199, 2359, 2411, 2453, 2459, 2462, 2213, 2182, 2233, 2185, 2181, 2187, 2384, 2219, 2269, 2270, 2221, 2275, 2344, 2150, 2137, 2168, 2358, 2411, 2413, 2415, 2207, 2370


2179, 2411, 2453, 2182, 2185, 2181, 2187, 2384, 2344

In [None]:
# AOI ids of the agricultural areas used throughout the notebook.
# Every id is listed exactly once: extract_features() issues one events
# request per entry, so a repeated id (2411 appeared twice before) would add
# the same rows to the training frame twice.
tete_aga = [
    2179, 2416, 2199, 2359, 2411, 2453, 2459, 2462, 2213, 2182,
    2233, 2185, 2181, 2187, 2384, 2219, 2269, 2270, 2221, 2275,
    2344, 2150, 2137, 2168, 2358, 2413, 2415, 2207, 2370,
]

# API Auth

In [None]:
# Connection settings are read from the environment (load_dotenv() at the top
# of the notebook populates it from a local .env file), so no credentials are
# stored in the notebook itself. Required variables: API_USERNAME and
# API_PASSWORD; API_BASE_URL is optional and defaults to the local server.
base_url = os.environ.get("API_BASE_URL", "http://localhost:8081/")
client = httpx.Client(base_url=base_url)
r = client.post(
    "auth/token",
    data={
        "username": os.environ["API_USERNAME"],
        "password": os.environ["API_PASSWORD"],
    },
)
# Surface a failed login here instead of as a KeyError on the missing token.
r.raise_for_status()
token = json.loads(r.text)["access_token"]
# Bearer header reused by every authenticated request below.
headers = {"Authorization": f"Bearer {token}"}

# Get agricultural areas geoms from the API

In [None]:

# Request the AOI features for the selected agricultural areas.
r = client.get("aoi/", headers=headers, params={"id": tete_aga})
res = json.loads(r.text)
# Build a shapely geometry from the first feature of the response.
first_feature = res["features"][0]
polygons = geometry.shape(first_feature["geometry"])
# Bounding box of that geometry, as returned by shapely's ``bounds``.
box = polygons.bounds

In [None]:
# Bare expression: the notebook shows the AOI geometry as this cell's output.
polygons

# Get Events from DB
- Date range: 1985-01-02 to 2022-08-31 (set by `start_datetime` / `end_datetime` in the next cell)

In [None]:
# Time window requested from the events endpoint.
start_datetime, end_datetime = "1985-01-02", "2022-08-31"

# Column groups used when assembling the feature matrix.
ndvi_cols = ["month_ndvi_mean"]
sm_cols = ["month_sm_mean"]
chirps_cols = ["chirps_actual"]
# "mixed_spi" followed by the six forecast columns forecast_spi_1 .. forecast_spi_6.
forecast_spi_cols = ["mixed_spi"] + [f"forecast_spi_{month}" for month in range(1, 7)]


In [None]:
def add_season_features(df,season,feature,model_forecast):
    """Add lagged "previous season" features derived from ``month_{feature}_mean``.

    The source column is shifted by ``season + model_forecast`` rows (the
    centre lag) and by one row more / one row less (the left / right
    neighbours). Each shifted series is passed through ``interpolate()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``month_{feature}_mean``. It is modified
        in place and also returned.
    season : int
        Number of rows to look back for the seasonal lag.
    feature : str
        Feature tag used in the source and generated column names (e.g. "sm").
    model_forecast : int
        Forecast horizon in rows; added to every lag.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The same frame with nine new ``past_{season}_{feature}*`` columns, and
        the names of those columns.

    Notes
    -----
    * The centre lag is read from ``df``; the previous version read it from a
      global named ``sm``, which fails (or silently uses stale data) unless
      such a variable happens to exist in the kernel.
    * ``*_diff_left`` / ``*_diff_right`` are computed as neighbour minus
      centre, mirroring the ``*_ratio_*`` columns (neighbour / centre). The
      previous version stored an unchanged copy of the centre column under
      both names.
    """
    source = df[f"month_{feature}_mean"]
    prefix = f"past_{season}_{feature}"
    lag = season + model_forecast

    df[prefix] = source.shift(lag).interpolate()
    df[f"{prefix}_adj_left"] = source.shift(lag + 1).interpolate()
    df[f"{prefix}_adj_right"] = source.shift(lag - 1).interpolate()

    df[f"{prefix}_diff_left"] = df[f"{prefix}_adj_left"] - df[prefix]
    df[f"{prefix}_diff_right"] = df[f"{prefix}_adj_right"] - df[prefix]

    df[f"{prefix}_ratio_left"] = df[f"{prefix}_adj_left"] / df[prefix]
    df[f"{prefix}_ratio_right"] = df[f"{prefix}_adj_right"] / df[prefix]

    df[f"{prefix}_adj_sum"] = (
        df[prefix] + df[f"{prefix}_adj_left"] + df[f"{prefix}_adj_right"]
    )
    df[f"{prefix}_adj_mean"] = df[f"{prefix}_adj_sum"] / 3

    new_cols = [
        prefix,
        f"{prefix}_adj_left",
        f"{prefix}_adj_right",
        f"{prefix}_diff_left",
        f"{prefix}_diff_right",
        f"{prefix}_ratio_left",
        f"{prefix}_ratio_right",
        f"{prefix}_adj_sum",
        f"{prefix}_adj_mean",
    ]
    return df, new_cols
              

In [None]:

def extract_features(start_datetime, end_datetime, ag_areas, model_forecast = 1, season=7):
    """Download events per agricultural area and assemble the modelling frame.

    For every id in ``ag_areas`` the events endpoint is queried through the
    notebook-level ``client`` / ``headers``; NDVI, soil moisture, CHIRPS and
    SPI series are extracted with ``get_keyed_values`` and joined on their
    index. ``target_ndvi`` is ``month_ndvi_mean`` shifted ``model_forecast``
    rows into the future; ``chirps_actual`` is shifted one row into the past.

    Parameters
    ----------
    start_datetime, end_datetime : str
        Bounds forwarded to the events endpoint.
    ag_areas : iterable
        AOI ids, one request each.
    model_forecast : int, default 1
        Forecast horizon. Also selects the SPI column
        ``forecast_spi_cols[model_forecast - 1]``.
    season : int, default 7
        Seasonal lag passed to ``add_season_features``; a falsy value skips
        the seasonal soil-moisture features.

    Returns
    -------
    (pandas.DataFrame, list of list of str)
        The index-sorted frame restricted to the feature columns plus
        ``aoi_id`` and ``target_ndvi``, and the three feature groups
        ``[soil-moisture columns, chirps columns, [spi column]]``.

    Raises
    ------
    RuntimeError
        If no area could be processed (previously this surfaced as a KeyError
        on the empty result frame).

    Notes
    -----
    Areas that raise are reported with ``print`` and skipped (best effort), as
    before. Relies on the globals ``client``, ``headers``, ``sm_cols``,
    ``chirps_cols`` and ``forecast_spi_cols``.
    """
    # Function-scope import: in this notebook numpy is only imported at
    # module level in a later cell.
    import numpy as np

    spi_col = forecast_spi_cols[model_forecast - 1]
    new_sm_cols = list(sm_cols)
    cols_to_use = sm_cols + chirps_cols + [spi_col]
    frames = []
    failed = []

    for ag_area in ag_areas:
        try:
            r = client.get(
                "events/",
                params=dict(
                    aoi_id=ag_area,
                    start_datetime=start_datetime,
                    end_datetime=end_datetime,
                    limit=10000,
                ),
                headers=headers,
                timeout=60,
            )
            # Report HTTP errors as such rather than as a missing "events" key.
            r.raise_for_status()
            aga_results = json.loads(r.text)["events"]

            ndvi = get_keyed_values(aga_results, "ndvi", "mean_value", "month_ndvi_mean")
            sm = get_keyed_values(aga_results, "soil_moisture", "mean_value", "month_sm_mean")
            chirps_actual = get_keyed_values(aga_results, "total_precipitation", "CHIRPS_SPI_actual", "chirps_actual")
            forecast_spi = get_keyed_values(aga_results, "total_precipitation", "FORECAST_SPI", "forecast_spi")
            mixed_spi = get_keyed_values(aga_results, "total_precipitation", "MIXED_SPI", "mixed_spi")

            if season:
                sm, season_cols = add_season_features(sm, season, "sm", model_forecast)
                # dict.fromkeys de-duplicates while keeping a stable order
                # (set() made the column order vary between runs).
                new_sm_cols = list(dict.fromkeys(sm_cols + season_cols))
                cols_to_use = list(dict.fromkeys(cols_to_use + new_sm_cols))

            final_df = ndvi.join(sm, lsuffix="", rsuffix="_r")
            final_df = final_df.join(forecast_spi, lsuffix="", rsuffix="_r")
            final_df = final_df.join(mixed_spi, lsuffix="", rsuffix="_r")
            final_df = final_df.join(chirps_actual, lsuffix="", rsuffix="_r")
            final_df["chirps_actual"] = final_df.chirps_actual.shift(1)
            final_df["target_ndvi"] = final_df.month_ndvi_mean.shift(-model_forecast)

            # Infinite values (e.g. from the ratio features) are treated as missing.
            final_df = final_df.replace([np.inf, -np.inf], np.nan)
            final_df = final_df.drop(columns=["month_ndvi_mean"]).dropna(
                subset=cols_to_use + ["target_ndvi"]
            )
            print(final_df.shape)

            frames.append(final_df)
        except Exception as e:
            print(ag_area, f" failed with exception {e}.")
            failed.append(ag_area)

    if not frames:
        raise RuntimeError(
            f"No agricultural area could be processed; failed ids: {failed}"
        )

    # Single concat instead of growing the frame inside the loop.
    all_df = pd.concat(frames, axis=0).sort_index()
    all_df = all_df[cols_to_use + ["aoi_id", "target_ndvi"]]
    return all_df, [new_sm_cols, chirps_cols, [spi_col]]
    

In [None]:
# Function modified from https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html



import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Show up to 30 columns when a DataFrame is displayed.
pd.options.display.max_columns = 30
import os
import re
from colorama import Fore, Back, Style
import seaborn as sns
import plotly.express as px
import matplotlib
from matplotlib.patches import Patch
from matplotlib import pyplot as plt
# 0 disables matplotlib's warning about too many open figures.
plt.rcParams.update({'figure.max_open_warning': 0})
plt.style.use('fivethirtyeight')
# Colormaps; cmap_cv is the one plot_cv_indices uses for train/test markers.
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from pandas/sklearn.
warnings.filterwarnings('ignore')



def plot_cv_indices(cv, n_splits, X, y, date_col = None):
    """Create a sample plot for indices of a cross-validation object.

    One horizontal band is drawn per split; samples are coloured by whether
    they fall in the training set (0) or the testing set (1) of that split.

    Parameters
    ----------
    cv : object
        Cross-validator exposing ``split(X=..., y=...)``.
    n_splits : int
        Kept for backward compatibility; not used by the plot.
    X, y : array-like
        Data handed to ``cv.split``; ``len(X)`` sets the x-axis extent.
    date_col : pandas.Series, optional
        When given, each x tick that maps to a valid sample position is
        labelled with the position and the corresponding value of
        ``date_col``; other ticks get a blank date.
    """
    fig, ax = plt.subplots(1, 1, figsize = (11, 7))
    n_samples = len(X)

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y)):
        # NaN = sample not used in this split, 0 = training, 1 = testing
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(n_samples), [ii + .5] * n_samples,
                   c=indices, marker='_', lw=10, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    # Formatting
    if date_col is not None:
        tick_locations = ax.get_xticks()
        # Ticks are floats and can lie outside the data range; only positions
        # that index into date_col receive a date label.
        positions = [int(loc) for loc in tick_locations]
        valid = [pos for pos in positions if 0 <= pos < len(date_col)]
        date_by_pos = dict(zip(valid, date_col.iloc[valid].astype(str).tolist()))
        new_labels = ['\n\n'.join((str(pos), date_by_pos.get(pos, " ")))
                      for pos in positions]
        ax.set_xticks(tick_locations)
        ax.set_xticklabels(new_labels)

    ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))],
              ['Testing set', 'Training set'], loc=(1.02, .8))
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    

In [None]:
from sklearn.metrics import mean_squared_error
def train_flaml(subset, X_train, y_train, X_test, y_test, budget):
    """Fit a FLAML AutoML regressor on one feature subset and score it.

    Parameters
    ----------
    subset : iterable of list of str
        Groups of column names; they are flattened into the feature list.
    X_train, y_train : pandas.DataFrame, pandas.Series
        Training data; only the selected columns of ``X_train`` are used.
    X_test, y_test : pandas.DataFrame, pandas.Series
        Hold-out data. ``X_test`` must also contain ``aoi_id``.
    budget : int or float
        Passed to ``AutoML`` as ``time_budget``.

    Returns
    -------
    dict
        ``model`` (``automl.best_estimator``), ``cols`` (selected features),
        ``mse`` on the hold-out data, ``y_true``, ``y_pred`` and the
        hold-out ``aoi_id`` column.

    Notes
    -----
    The hold-out fold is also passed to ``fit`` as the validation set, so the
    reported ``mse`` is measured on data used for model selection.
    """
    selected_features = list(chain.from_iterable(subset))

    # Use the same columns for training, validation and prediction; the
    # validation frame used to be passed with all columns (incl. aoi_id).
    X_train_sel = X_train[selected_features]
    X_test_sel = X_test[selected_features]

    automl = AutoML(time_budget=budget, seed=42,
                    estimator_list=["lgbm", "xgboost"], starting_points = "data",
                    task="regression", verbose=0, n_jobs=-1, metric="mse", limit_train_time=3)

    automl.fit(X_train_sel, y_train, X_val=X_test_sel, y_val=y_test)

    preds = automl.predict(X_test_sel)
    y_true = y_test

    return {"model": automl.best_estimator,
            "cols": selected_features,
            "mse": mean_squared_error(y_true,preds),
            "y_true": y_true,
            "y_pred": preds,
            "aoi_id": X_test["aoi_id"]
            }
    
 

In [None]:
from sklearn.model_selection import TimeSeriesSplit


from flaml import AutoML
from itertools import chain, combinations
from operator import add

def all_subsets(ss):
    """Return an iterator over every combination of ``ss``, from size 0 up to ``len(ss)``.

    Combinations are yielded in order of increasing size; the first item is
    the empty tuple.
    """
    by_size = [combinations(ss, size) for size in range(len(ss) + 1)]
    return chain.from_iterable(by_size)
    
def _mean_mse_by_aoi(fold_results):
    """Average, over CV folds, the per-AOI MSE of one feature subset.

    ``fold_results`` holds one row per fold with the fields ``aoi_id``,
    ``y_true`` and ``y_pred`` as returned by ``train_flaml``. The result is a
    Series indexed by ``aoi_id``; it is empty when there are no rows.
    """
    per_fold = []
    for _, row in fold_results.iterrows():
        fold_df = pd.DataFrame(
            {"aoi_id": row["aoi_id"], "y_true": row["y_true"], "y_pred": row["y_pred"]}
        )
        per_fold.append(
            fold_df.groupby("aoi_id")[["y_true", "y_pred"]].apply(
                lambda g: mean_squared_error(g["y_true"], g["y_pred"])
            )
        )
    if not per_fold:
        return pd.Series(dtype=float)
    return pd.concat(per_fold).groupby(level=0).mean()


all_res = []
for model_forecast in range(1,8):
    print(f"RUNNING MODEL {model_forecast}")
    all_df, cols_to_use = extract_features(start_datetime, end_datetime, tete_aga, model_forecast = model_forecast)

    X = all_df.drop(["target_ndvi"],axis=1)
    y = all_df["target_ndvi"]

    n_splits = 5
    tscv = TimeSeriesSplit(n_splits,test_size=int(X.shape[0]*0.15))

    # Feature groups returned by extract_features; every non-empty
    # combination of groups is evaluated (the first subset is the empty one).
    all_cols = cols_to_use
    feature_subsets = list(all_subsets(all_cols))[1:]

    res = []
    for fold, (train_index, test_index) in enumerate(tscv.split(X)):
        # Several AOIs share a date, so a positional split can cut through a
        # date; drop test rows whose date also occurs in the training part.
        train_dates = X.index[train_index]
        overlaps_train = X.index[test_index].isin(train_dates)
        test_index = test_index[~overlaps_train]

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        for subset in feature_subsets:
            res.append(train_flaml(subset, X_train, y_train, X_test, y_test, budget=3))

    final_res = pd.DataFrame(res)
    final_res["cols"] = final_res.cols.apply(tuple)

    # Mean MSE per feature subset, best first. Only the numeric "mse" column
    # is averaged: the other columns hold arrays/Series and cannot be.
    mse_by_cols = final_res.groupby("cols")[["mse"]].mean().sort_values(by="mse")
    all_res.append(mse_by_cols)

    best_cols = mse_by_cols.index[0]
    base_cols = ("chirps_actual", forecast_spi_cols[model_forecast-1])
    best_df = final_res[final_res["cols"].map(lambda c: c == best_cols)]
    base_df = final_res[final_res["cols"].map(lambda c: c == base_cols)]

    # Computed independently of each other; previously the baseline average
    # was taken over an accumulator that still held the best model's rows.
    best_by_aoi = _mean_mse_by_aoi(best_df)
    base_by_aoi = _mean_mse_by_aoi(base_df)

    final_by_aoi = pd.concat([best_by_aoi, base_by_aoi], axis=1)
    final_by_aoi.columns = ["mse_sm", "mse_spi"]
    final_by_aoi.to_csv(f'cv_mse_by_aoi_{model_forecast}.csv')