# Modeling Agricultural Variables
## Python modules

In [9]:
import warnings
import time
import os

import dask
from dask.distributed import Client

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors

import geopandas as gpd
from shapely.geometry import Polygon
import pyarrow
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning
from scipy.stats import pearsonr
from hilbertcurve.hilbertcurve import HilbertCurve

import math
import seaborn as sns

ModuleNotFoundError: No module named 'hilbert'

This notebook will be used to prepare our feature and ground-truth data for our modeling process. At this point, we have a directory of .feather files that contain our features (see this notebook for feature generation). We want to concatentate these together, and perform the necessary operations to achieve a dataframe where a row is one observation unit; its columns are the associated features and the ground truth data that we want to train the model on. Broadly, we want our dataframe to take on the following form:

| observation_unit |  outcome_1 | outcome_2 | feature_1| feature_2 | feature_3
| ----|  ---- | ---- | -- | -- | -- |
| 1   |  $y_{1}$ | $y_{2}$ | $x_{1}$ | $x_{2}$ | $x_{3}$
| 2   |  $y_{1}$ | $y_{2}$ | $x_{1}$| $x_{2}$ | $x_{3}$

Where we can run regress `outcome_1` and `outcome_2` on our features, using the following form:

$y_{1}$ = $\beta_{1}$$x_{1}$ + $\beta_{2}$$x_{2}$ + $\beta_{3}$$x_{3}$ + $\beta_{n}$$x_{n}$ 

In our case, an observation is one SEA/year. We have ground truth data that summarizes the total crop yield, total crop loss, as well as other pertinent agricultural variables. We want our dataframe described above to look like this:

| SEA | Year | Observed Yield (Tonnes) | Observed Loss (Tonnes) | feature1| feature2 | feature3
| ----| ----| ---- | ---- | -- | -- | -- |
| 1   | 2016 | 72 | 13 | 1.23 | 3.25 | 0.123
| 2   | 2016  | 50 | 7.5 | 0.78| 1.2 | 2.4

## Mild Preprocessing
### 1. Load in the data

First, we load in the feature data. This data was aggregated in the ___ notebook

In [10]:
# Let's read in the new concatenated features:
features = gpd.read_feather("/capstone/mosaiks/repos/modeling/data/sentinel_rgb_features_sea_save_2023_04_24.feather")


In [11]:
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,995,996,997,998,999,lon,lat,year,month,geometry
0,0.001058,0.0,0.005181,1.014728,0.098902,0.0,0.033511,1.81028,0.0,0.0,...,3.478022,4.774719,0.011104,0.840888,4.5e-05,27.47466,-16.339357,2015,7,POINT (27.47466 -16.33936)
1,0.00201,2.2e-05,0.003418,1.05819,0.125574,0.000291,0.039863,1.858072,0.0,2.9e-05,...,3.526972,4.882292,0.024313,0.873838,4.7e-05,27.46466,-16.339357,2015,7,POINT (27.46466 -16.33936)
2,0.004124,0.0,0.0,1.069269,0.127892,0.0,0.03624,1.871935,0.0,5.4e-05,...,3.552448,4.921965,0.022708,0.881511,0.0,27.46466,-16.329357,2015,7,POINT (27.46466 -16.32936)
3,0.001559,4e-06,0.0,1.103106,0.134511,0.00023,0.057822,1.941573,0.0,0.0,...,3.57198,4.965928,0.021992,0.857326,0.000266,27.45466,-16.339357,2015,7,POINT (27.45466 -16.33936)
4,0.001731,2e-06,0.00551,1.056985,0.122743,0.000167,0.034343,1.86024,0.0,0.0,...,3.535786,4.864718,0.009189,0.852267,0.0,27.47466,-16.349357,2015,7,POINT (27.47466 -16.34936)


### Ground-Truth Data
Next, we read in our ground truth data, which was processed in the preprocessing notebook

In [12]:
# Name list because gpd doesn't read in column names correctly
names = [
    "sea_unq",
    "year",
    "total_area_planted_ha",
    "total_area_harv_ha",
    "total_area_lost_ha",
    "total_harv_kg",
    "yield_kgha",
    "frac_area_harv",
    "frac_area_loss",
    "area_lost_fire",
    "maize",
    "groundnuts",
    "mixed_beans",
    "popcorn",
    "sorghum",
    "soybeans",
    "sweet_potatoes",
    "bunding",
    "frac_loss_drought",
    "frac_loss_flood",
    "frac_loss_animal",
    "frac_loss_pests",
    "frac_loss_soil",
    "frac_loss_fert",
    "prop_till_plough",
    "prop_till_ridge",
    "prop_notill",
    "prop_hand",
    "log_maize",
    "log_sweetpotatoes",
    "log_groundnuts",
    "log_soybeans",
    "loss_ind",
    "drought_loss_ind",
    "flood_loss_ind",
    "animal_loss_ind",
    "pest_loss_ind",
    "geometry"
]

# Read in the survey data
country_sea = gpd.read_file('/capstone/mosaiks/repos/preprocessing/data/ground_data_spatial/updated_data.shp')
country_sea.columns = names
country_sea.head()

Unnamed: 0,sea_unq,year,total_area_planted_ha,total_area_harv_ha,total_area_lost_ha,total_harv_kg,yield_kgha,frac_area_harv,frac_area_loss,area_lost_fire,...,log_maize,log_sweetpotatoes,log_groundnuts,log_soybeans,loss_ind,drought_loss_ind,flood_loss_ind,animal_loss_ind,pest_loss_ind,geometry
0,1,2009.0,34.9725,34.9725,0.0,57563.0,1645.95039,1.0,0.0,0.0,...,7.419992,8.455318,7.276696,7.107469,0.0,0.0,0.0,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
1,1,2010.0,32.215,26.7625,5.4525,69925.0,2612.797758,0.830747,0.169253,0.0,...,7.70416,9.021598,8.309147,7.665441,1.0,0.0,0.0,0.0,1.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
2,1,2011.0,60.4075,59.7525,0.655,120614.0,2018.559893,0.989157,0.010843,0.0,...,7.576247,8.759924,8.066608,-inf,0.0,0.0,0.0,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
3,1,2012.0,84.6175,74.6025,10.015,151890.0,2035.990751,0.881644,0.118356,0.0,...,7.550251,8.836374,8.067501,6.476972,1.0,1.0,0.0,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
4,1,2013.0,325.5,282.75,42.75,1975.0,6.984969,0.868664,0.131336,0.0,...,0.355723,-inf,4.382027,-inf,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."


We're going to make another object `sea_unq_join` which contains the spatial information and a unique key for each SEA. This will be handy later, when we need to join the features to the ground-truth data.

In [13]:
# Filter country_sea for unique values of 'seq_unq' and 'geometry'
sea_unq_join = country_sea[['sea_unq', 'geometry']].drop_duplicates()

# Display the filtered DataFrame
print(sea_unq_join)


      sea_unq                                           geometry
0           1  POLYGON ((27.82327 -13.65772, 27.82294 -13.657...
10          2  POLYGON ((27.99349 -13.46497, 27.99352 -13.464...
20          3  POLYGON ((28.09909 -13.51864, 28.09867 -13.516...
29          4  POLYGON ((28.31924 -13.42915, 28.31911 -13.426...
38          5  POLYGON ((28.39982 -13.51544, 28.40012 -13.514...
...       ...                                                ...
3571      388  POLYGON ((25.07771 -14.63920, 25.07732 -14.638...
3578      389  POLYGON ((22.74142 -14.00343, 22.73856 -14.002...
3585      390  POLYGON ((23.08604 -14.20026, 23.08957 -14.202...
3592      391  POLYGON ((24.36764 -16.62208, 24.36564 -16.621...
3599      392  POLYGON ((23.23962 -16.31204, 23.23876 -16.312...

[392 rows x 2 columns]


### 2. Organize the features by growing season



In [14]:
# Organize the features by growing season
# Carry months October, November, and December over to the following year's data
# These months represent the start of the growing season for the following year's maize yield
year_end = 2022

features['year'] = np.where(
    features['month'].isin([10, 11, 12]),
    features['year'] + 1, 
    features['year'])

features_gs = features[features['year'] <= year_end]

features_gs.sort_values(['year', 'month'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_gs.sort_values(['year', 'month'], inplace=True)


### 3. Convert the features into a geo dataframe

We first create a GeoDataFrame (`features_new_gdf`) from the DataFrame features, using the longitude and latitude columns to create the geometry column (geospatial points) and specifying the CRS (Coordinate Reference System) as EPSG:4326.

In [15]:
# Create a geodataframe of the new features
features_new_gdf = gpd.GeoDataFrame(
    features_gs, 
    geometry = gpd.points_from_xy(x = features_gs.lon, y = features_gs.lat), 
    crs='EPSG:4326'
)

In [16]:
# Notes: Have to change the year, get an error rn. Also, check to make sure the number of cells is correct
features_new_gdf = features_new_gdf.replace([np.inf, -np.inf], np.nan)
features_new_gdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,995,996,997,998,999,lon,lat,year,month,geometry
0,0.001058,0.0,0.005181,1.014728,0.098902,0.0,0.033511,1.81028,0.0,0.0,...,3.478022,4.774719,0.011104,0.840888,4.5e-05,27.47466,-16.339357,2015,7,POINT (27.47466 -16.33936)
1,0.00201,2.2e-05,0.003418,1.05819,0.125574,0.000291,0.039863,1.858072,0.0,2.9e-05,...,3.526972,4.882292,0.024313,0.873838,4.7e-05,27.46466,-16.339357,2015,7,POINT (27.46466 -16.33936)
2,0.004124,0.0,0.0,1.069269,0.127892,0.0,0.03624,1.871935,0.0,5.4e-05,...,3.552448,4.921965,0.022708,0.881511,0.0,27.46466,-16.329357,2015,7,POINT (27.46466 -16.32936)
3,0.001559,4e-06,0.0,1.103106,0.134511,0.00023,0.057822,1.941573,0.0,0.0,...,3.57198,4.965928,0.021992,0.857326,0.000266,27.45466,-16.339357,2015,7,POINT (27.45466 -16.33936)
4,0.001731,2e-06,0.00551,1.056985,0.122743,0.000167,0.034343,1.86024,0.0,0.0,...,3.535786,4.864718,0.009189,0.852267,0.0,27.47466,-16.349357,2015,7,POINT (27.47466 -16.34936)


## Pivot Wider by months

Since our ground-truth data has an annual resolution, we need to pivot our monthly features wider so that each observation contains all of the features for the full year. The main goal in this next step is to aggregate the features data into years while preserving the monthly information as separate columns. To achieve this, the code performs the following steps:

1. Separate the 'geometry' column temporarily to avoid duplicating it during unstacking.
2. Create a multi-level index using 'lon', 'lat', 'year', and 'month' in the DataFrame.
3. Unstack the DataFrame, transforming the innermost index level ('month') into new columns, creating a wide format DataFrame where each row represents a unique combination of 'lon', 'lat', and 'year', and the columns contain the values of the original features for each month.
4. Merge the 'geometry' column back into the DataFrame.

The resulting DataFrame contains the features aggregated by year with separate columns for each month's data. The naming convention will be `featurenumber_monthnumber`, so `587_12` corresponds to the 587th feature generated in December for that year 

In [17]:
# Store the 'geometry' column separately before unstacking
geometry_col = features_new_gdf[['lon', 'lat', 'geometry']].drop_duplicates(subset=['lon', 'lat'])

# Perform the unstacking operation without the 'geometry' column
features_gs_no_geometry = features_gs.drop(columns=['geometry'])
features = features_gs_no_geometry.set_index(['lon', 'lat', 'year', 'month']).unstack()
features.columns = features.columns.map(lambda x: '{}_{}'.format(*x))

# Merge the 'geometry' column back into the features DataFrame
features = features.reset_index().merge(geometry_col, on=['lon', 'lat'])


In [18]:

# Convert the 'features' DataFrame to a GeoDataFrame
features_gdf = gpd.GeoDataFrame(features, geometry=features['geometry'], crs='EPSG:4326')

In [21]:
features_gdf.columns

Index(['lon', 'lat', 'year', '0_1', '0_2', '0_3', '0_4', '0_5', '0_6', '0_7',
       ...
       '999_4', '999_5', '999_6', '999_7', '999_8', '999_9', '999_10',
       '999_11', '999_12', 'geometry'],
      dtype='object', length=12004)

In [22]:
impute_manual = True

def hilbert_distance(x1, y1, x2, y2, p=15, N=2):
    hilbert_curve = HilbertCurve(p, N)
    
    def get_hilbert_index(x, y):
        distances = np.arange(2 ** (p * N))
        coordinates = np.array([hilbert_curve.coordinates_from_distance(d) for d in distances])
        squared_diff = np.square(coordinates - np.array([x, y]))
        sum_squared_diff = np.sum(squared_diff, axis=1)
        return np.argmin(sum_squared_diff)
    
    index1 = get_hilbert_index(x1, y1)
    index2 = get_hilbert_index(x2, y2)
    return abs(index1 - index2)

def find_closest_point(row, feature, month, threshold=100000):  # threshold in meters
    current_year = row['year']
    current_point = row['geometry']

    mask = (features_gdf['year'] == current_year) & (~features_gdf[f'{feature}_{month}'].isna())
    non_na_points = features_gdf.loc[mask, ['geometry', f'{feature}_{month}']]

    if non_na_points.empty:
        return np.nan

    non_na_points['distance'] = non_na_points['geometry'].apply(lambda x: hilbert_distance(current_point.x, current_point.y, x.x, x.y))
    non_na_points = non_na_points[non_na_points['distance'] <= threshold]

    if non_na_points.empty:
        return np.nan

    closest_point = non_na_points.loc[non_na_points['distance'].idxmin()]
    return closest_point[f'{feature}_{month}']

# Modify the imputation step
if impute_manual:
    for feature in range(1000):  # We have features 0-999
        for month in range(1, 13):  # We have months 1-12
            mask = features_gdf[f'{feature}_{month}'].isna()
            features_gdf.loc[mask, f'{feature}_{month}'] = features_gdf[mask].apply(lambda row: find_closest_point(row, feature, month), axis=1)



AttributeError: 'HilbertCurve' object has no attribute 'coordinates_from_distance'

In [1]:
features_gdf.head()

NameError: name 'features_gdf' is not defined

In [None]:
impute_manual = True
# Extract the size of the features dataframe
rows, cols = features_new_gdf.shape

# compute the number of feature cells in the features dataframe
num_cells = rows * cols
num_cells

In [None]:
class bcolors:
    BL = '\x1b[1;34m' #GREEN
    GR = '\x1b[1;36m' #GREEN
    YL = '\x1b[1;33m' #YELLOW
    RD = '\x1b[1;31m' #RED
    RESET = '\033[0m' #RESET COLOR

In [None]:
if impute_manual:
    ln_ft = len(features_new_gdf)
    ln_na = len(features_new_gdf.dropna())
    print(f'Starting total row count: {bcolors.BL}{ln_ft}{bcolors.RESET}',
          f'\nPre-Impute NaN row count: {bcolors.RD}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPre-Impute NaN row %: {bcolors.RD}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPre-Impute NaN cell %: {bcolors.RD}{(features_new_gdf.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 1: Filling NaN values by month, year, and district group average')
    features_new_gdf = (
        features_new_gdf
        .fillna(features_new_gdf
                .groupby(['year', 'sea_unq'], as_index=False) 
                .transform('mean')
               )
    )
    ln_ft = len(features_new_gdf)
    ln_na = len(features_new_gdf.dropna())
    print(f'Post step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 1 NaN cell %: {bcolors.YL}{(features_new_gdf.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by month and district across group average')
    features_new_gdf = (
        features_new_gdf
        .fillna(features_new_gdf
                .groupby(['sea_unq'], as_index=False)
                .transform('mean')
               )
    )
    ln_ft = len(features_new_gdf)
    ln_na = len(features_new_gdf.dropna())
    print(f'Post step 2 NaN row count: {bcolors.GR}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 2 NaN row %: {bcolors.GR}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.GR}{(features_new_gdf.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 3: Drop remaining NaN values\n')
    features_new_gdf = features_new_gdf.dropna(axis=0)
    print(f'Ending total row count: {bcolors.BL}{len(features_new_gdf)}{bcolors.RESET}')

## Join features to ground data

In [11]:
# Now lets combine the sea data 
spatial_join = gpd.sjoin(features_gdf, sea_unq_join, how='right', predicate = 'within')

In [12]:
spatial_join.head()

Unnamed: 0,index_left,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,...,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12,sea_unq,geometry
0,,,,,,,,,,,...,,,,,,,,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
10,240165.0,28.000588,-13.503257,2020.0,,0.0,1e-06,3e-06,6.8e-05,0.000102,...,0.003168,0.002198,0.001368,0.000273,0.00031,0.001201,0.001103,0.003984,2,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."
10,240160.0,28.000588,-13.503257,2015.0,,,,,,,...,,,,,0.001402,,,,2,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."
10,240166.0,28.000588,-13.503257,2021.0,,,,0.0,0.0,5e-06,...,0.003553,0.002816,0.002079,0.0007,0.000221,0.001191,0.0,0.003701,2,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."
10,240161.0,28.000588,-13.503257,2016.0,0.0,,,0.0,0.0,1.9e-05,...,0.003695,0.002374,0.001509,0.001603,0.001815,,,,2,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."


In [21]:
features_join = spatial_join.merge(country_sea, on=['year', 'sea_unq'], how='inner')

In [22]:
# Drop the redundant independent lon and lat columns because now that they are in a separate geometry column
features_join = features_join.drop(['geometry_x', 'index_left', 'lat', 'lon'], axis = 1)

In [23]:
features_join.head()

Unnamed: 0,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,...,log_maize,log_sweetpotatoes,log_groundnuts,log_soybeans,loss_ind,drought_loss_ind,flood_loss_ind,animal_loss_ind,pest_loss_ind,geometry_y
0,2020.0,,0.0,1e-06,3e-06,6.8e-05,0.000102,0.000206,0.000232,0.000642,...,7.496787,-inf,7.944166,7.426549,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."
1,2020.0,,4e-05,7.1e-05,3.4e-05,8.4e-05,8.8e-05,0.000143,0.001001,0.001676,...,7.496787,-inf,7.944166,7.426549,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."
2,2020.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5e-06,...,7.496787,-inf,7.944166,7.426549,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."
3,2020.0,,0.0,0.00215,0.0,0.0,0.000108,0.000199,0.000517,0.00094,...,7.496787,-inf,7.944166,7.426549,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."
4,2020.0,,0.0,0.0,0.0,0.0,1e-06,6e-06,0.00031,0.000438,...,7.496787,-inf,7.944166,7.426549,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."


In [40]:
features_join.head()


Unnamed: 0,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,...,log_sweetpotatoes,log_groundnuts,log_soybeans,loss_ind,drought_loss_ind,flood_loss_ind,animal_loss_ind,pest_loss_ind,geometry_y,geometry
0,2020.0,,0.0,1e-06,3e-06,6.8e-05,0.000102,0.000206,0.000232,0.000642,...,-inf,7.944166,7.426549,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464...","POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."
1,2020.0,,4e-05,7.1e-05,3.4e-05,8.4e-05,8.8e-05,0.000143,0.001001,0.001676,...,-inf,7.944166,7.426549,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464...","POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."
2,2020.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5e-06,...,-inf,7.944166,7.426549,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464...","POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."
3,2020.0,,0.0,0.00215,0.0,0.0,0.000108,0.000199,0.000517,0.00094,...,-inf,7.944166,7.426549,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464...","POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."
4,2020.0,,0.0,0.0,0.0,0.0,1e-06,6e-06,0.00031,0.000438,...,-inf,7.944166,7.426549,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.99349 -13.46497, 27.99352 -13.464...","POLYGON ((27.99349 -13.46497, 27.99352 -13.464..."
