# Modeling Agricultural Variables
## Python modules

In [71]:
import warnings
import time
import os

import dask
from dask.distributed import Client

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors

import geopandas as gpd

import pyarrow
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, confusion_matrix, r2_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingRegressor
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning
from scipy.stats import pearsonr
from sklearn.utils import check_random_state

import math
import seaborn as sns

## Read in Data

We first read in the aggregated features and ground-truth data that were joined in `feature_preprocessing.ipynb`.

In [72]:
# Load the joined feature / ground-truth table written by feature_preprocessing.ipynb.
grouped_features = pd.read_csv("/capstone/mosaiks/repos/preprocessing/data/features_join_imputed_2023_05_02.csv")

In [73]:
# Preview the joined table (pandas truncates the rendered rows and columns).
grouped_features

Unnamed: 0,year,sea_unq,index_left,lon,lat,0_1,0_2,0_3,0_4,0_5,...,prop_mix,log_maize,log_sweetpotatoes,log_groundnuts,log_soybeans,loss_ind,drought_loss_ind,flood_loss_ind,animal_loss_ind,pest_loss_ind
0,2016.0,1,46302.000000,27.807993,-13.659357,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,4.058626,5.269229,5.811191,5.222801,0.0,0.0,0.0,0.0,0.0
1,2016.0,7,51611.666667,28.634660,-13.772690,0.001141,0.000329,0.000329,0.000329,0.000000,...,0.181102,3.387211,0.689155,6.227928,7.113191,1.0,1.0,0.0,0.0,0.0
2,2016.0,9,44806.714286,27.406446,-12.905428,0.000006,0.000006,0.000006,0.000006,0.000004,...,0.069018,2.703935,6.548992,-1.408767,5.584247,1.0,0.0,0.0,0.0,0.0
3,2016.0,10,44644.411765,27.381719,-12.962298,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,3.714757,2.525729,3.354421,5.332026,1.0,0.0,0.0,0.0,0.0
4,2016.0,12,47769.000000,28.014660,-12.889357,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,2.786884,6.409848,2.852125,0.798508,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
861,2021.0,347,22038.000000,25.204660,-14.879357,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,7.952872,8.294050,8.079163,7.021973,1.0,1.0,0.0,0.0,0.0
862,2021.0,348,19562.000000,24.774660,-14.799357,0.000000,0.000000,0.000018,0.000039,0.000071,...,0.000000,7.642350,8.070906,8.429997,8.006368,1.0,1.0,0.0,0.0,0.0
863,2021.0,355,15659.538462,24.260045,-14.563972,0.000038,0.000038,0.001014,0.000033,0.000028,...,0.000000,7.508878,7.665441,8.211719,5.238174,1.0,0.0,0.0,0.0,0.0
864,2021.0,356,19411.000000,24.752993,-14.764357,0.000158,0.000158,0.000246,0.000040,0.000038,...,0.000000,7.608263,9.042113,8.224773,8.028346,1.0,0.0,0.0,0.0,0.0


In [74]:
# Positional slice of columns 5..12004 (12,000 columns); in the preview below they are
# named `0_1` … `999_12`.
# NOTE(review): presumably 1000 features x 12 monthly values — confirm against preprocessing.
features = grouped_features.iloc[:,5:12005]
features.head()

Unnamed: 0,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,0_10,...,999_3,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.157999e-06,0.000207,0.0,...,1.0,1.0,0.274676,1.0,0.115388,0.002708,0.001319,1.0,1.0,1.0
1,0.001141,0.000329,0.000329,0.000329,0.0,0.0,0.0,0.001008277,0.00136,0.002211,...,0.006789,0.006789,1.0,1.0,1.0,0.000517,0.000343,0.000396,0.000396,0.000396
2,6e-06,6e-06,6e-06,6e-06,4e-06,1e-05,1.4e-05,2.590917e-05,0.00011,0.00011,...,0.005561,0.005561,0.006391,0.004212,0.003235,0.001937,0.001683,0.001683,0.001683,0.001683
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.113844e-07,1.2e-05,1.2e-05,...,0.00557,0.00557,0.006739,0.003991,0.002857,0.001979,0.001435,0.001435,0.001435,0.001435
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.7e-06,0.000186,0.000186,...,1.0,1.0,1.0,1.0,0.00269,0.001603,0.00082,0.00082,0.00082,0.00082


In [75]:
# Binary loss indicators that should be treated as categorical rather than numeric.
loss_indicator_columns = [
    "loss_ind",
    "drought_loss_ind",
    "pest_loss_ind",
    "animal_loss_ind",
    "flood_loss_ind",
]

# Outcome (ground-truth) columns: everything from positional column 12006 onward.
# NOTE(review): the feature slice ends at column 12004, so column 12005 belongs to
# neither frame — confirm that skipping it is intentional.
#
# `astype` returns a new object instead of modifying the column, so the converted
# frame has to be assigned; calling it and discarding the result changes nothing.
outcomes = grouped_features.iloc[:, 12006:].astype(
    {column: "category" for column in loss_indicator_columns}
)

list(outcomes.columns)

['total_area_harv_ha',
 'total_area_lost_ha',
 'total_harv_kg',
 'yield_kgha',
 'frac_area_harv',
 'frac_area_loss',
 'area_lost_fire',
 'maize',
 'groundnuts',
 'mixed_beans',
 'popcorn',
 'sorghum',
 'soybeans',
 'sweet_potatoes',
 'bunding',
 'monocrop',
 'mixture',
 'frac_loss_drought',
 'frac_loss_flood',
 'frac_loss_animal',
 'frac_loss_pests',
 'frac_loss_soil',
 'frac_loss_fert',
 'prop_till_plough',
 'prop_till_ridge',
 'prop_notill',
 'prop_hand',
 'prop_mono',
 'prop_mix',
 'log_maize',
 'log_sweetpotatoes',
 'log_groundnuts',
 'log_soybeans',
 'loss_ind',
 'drought_loss_ind',
 'flood_loss_ind',
 'animal_loss_ind',
 'pest_loss_ind']

## Model

We define a model to predict each of our outcome variables on our features for each SEA/year. The `train_and_evaluate_models` function trains and evaluates Ridge Linear Regression models for each target variable specified in the `target_columns` parameter. It handles both categorical and continuous target variables and provides the option to block sample on SEAs (Survey Enumeration Areas): passing `n_seas_to_hold_out` holds out that many randomly selected SEAs, so that every row belonging to a held-out SEA is excluded from training.

The function works as follows:

1. Read the grouped features and outcomes from a CSV file.
2. Define a helper function `block_sampling` to perform block sampling based on the provided SEA IDs.
3. For each target variable in `target_columns`, select the corresponding target variable data.
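4. If `n_seas_to_hold_out` is provided and greater than zero, perform block sampling using the `block_sampling` helper function. Otherwise, use `train_test_split` to split the data into training and testing sets. In both cases the training portion is then split once more to create a validation set.
5. Train a Ridge Linear Regression model using RidgeCV over a range of alpha values, with 5-fold cross-validation by default or with the `BlockSamplingCV` splitter when `bootstrap=True`.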
6. If the target variable is categorical, calculate and print the false positive rate and AUC-ROC. If the target variable is continuous, calculate and print the estimated regularization parameter, training R2 performance, validation R2 performance, and Pearson's correlation coefficient.

### Helper Function for Confusion Matrix for Categorical Variables
`calculate_confusion_matrix`:
This function calculates the confusion matrix for binary classification problems based on the given true labels (`y_true`), predicted values (`y_pred`), and a decision boundary (`decision_boundary`). The decision boundary is used to threshold the predicted values to obtain binary predictions.

Inputs:

`y_true`: The true labels of the target variable (a pandas Series or numpy array).

`y_pred`: The predicted values of the target variable (a numpy array).

`decision_boundary`: A float value that serves as the threshold for classifying the predicted values into two classes (0 or 1).


The function performs the following steps:
1. It adjusts the predicted values by setting them to 1 if they are greater than or equal to the decision boundary, and 0 otherwise.
2. It calculates the confusion matrix using the true labels and adjusted predicted values.
3. Depending on the shape of the confusion matrix, it extracts the true negatives (tn), false positives (fp), false negatives (fn), and true positives (tp).
4. If the shape of the confusion matrix is not (1, 1) or (2, 2), it raises an error.

Output: The function returns the values of tn, fp, fn, and tp.

In [76]:
def calculate_confusion_matrix(y_true, y_pred, decision_boundary):
    """Threshold continuous predictions and return binary confusion-matrix counts.

    Parameters
    ----------
    y_true : pandas Series or array-like
        True 0/1 labels.
    y_pred : pandas Series or array-like
        Continuous predicted values.
    decision_boundary : float
        Predictions greater than or equal to this value are classified as 1,
        all others as 0.

    Returns
    -------
    tuple
        ``(tn, fp, fn, tp)``.

    Raises
    ------
    ValueError
        If the confusion matrix is neither 1x1 nor 2x2.
    """
    # np.asarray lets lists, arrays and Series all be thresholded the same way.
    y_pred_adj = np.where(np.asarray(y_pred) >= decision_boundary, 1, 0)
    cm = confusion_matrix(y_true, y_pred_adj)

    if cm.shape == (2, 2):
        tn, fp, fn, tp = cm.ravel()
    elif cm.shape == (1, 1):
        # A 1x1 matrix means truth and thresholded predictions contain one and the
        # same label, so every sample is either a true negative or a true positive.
        # Positional access via numpy works for both Series and plain arrays
        # (`.iloc` only exists on pandas objects).
        only_label = np.asarray(y_true).ravel()[0]
        if only_label == 0:
            tn, fp, fn, tp = cm[0, 0], 0, 0, 0
        else:
            tn, fp, fn, tp = 0, 0, 0, cm[0, 0]
    else:
        print("Unexpected confusion matrix:")
        print(cm)
        raise ValueError('Unexpected confusion matrix shape.')
    return tn, fp, fn, tp

### Helper Function for Block Sampling on SEAs

`randomly_select_seas`:
This function randomly selects a specified number of unique SEA IDs from the `grouped_features` DataFrame.

Inputs:

1. n: The number of unique SEA IDs to select.

2. `grouped_features`: A DataFrame containing the feature data with a column '`sea_unq`' that stores the unique SEA IDs.

The function performs the following steps:

1. It extracts the unique SEA IDs from the '`sea_unq`' column of the `grouped_features` DataFrame.

2. It randomly selects n SEA IDs from the unique SEA IDs without replacement.

Output: The function returns a numpy array of the randomly selected SEA IDs.

These helper functions are used in the main model as follows:

`calculate_confusion_matrix` is used to calculate the confusion matrix for the categorical target variables. It is called in the `train_and_evaluate_models` function to compute the false positive rate and AUC-ROC for different decision boundaries.

`randomly_select_seas` is not used in the current implementation of the main model. However, it can be used to randomly select SEA IDs if you want to implement a custom sampling strategy.

In [77]:
def randomly_select_seas(n, grouped_features):
    """Draw `n` distinct SEA IDs at random from the `sea_unq` column.

    Sampling is without replacement and uses NumPy's global random state.
    Returns a numpy array holding the selected IDs.
    """
    candidate_ids = grouped_features['sea_unq'].unique()
    return np.random.choice(candidate_ids, n, replace=False)

## Cross-Validator Custom Class

This custom cross-validator class, BlockSamplingCV, inherits from the BaseCrossValidator class in scikit-learn. It is designed to perform block sampling for cross-validation, holding out specific groups of observations (in this case, SEA IDs) during each split. This ensures that all observations with the same SEA ID are either in the training set or the test set, but not both.

Here's a detailed explanation of the class:

`__init__(self, n_splits=5, n_seas_to_hold_out=40, sea_ids=None, random_state=None)`:
The constructor takes the following arguments:

- `n_splits`: The number of cross-validation splits (default is 5).
- `n_seas_to_hold_out`: The number of SEAs to hold out in each cross-validation split (default is 40).
- `sea_ids`: A list or array of SEA IDs corresponding to the rows of the dataset (default is None).
- `random_state`: An integer seed or a RandomState instance for reproducible results (default is None).

The constructor initializes the class with these arguments.

`_iter_test_indices(self, X=None, y=None, groups=None)`:
This method generates test indices for each cross-validation split. It takes the following optional arguments:

X: Feature matrix (not used in this method but included for compatibility with scikit-learn).
y: Target variable (not used in this method but included for compatibility with scikit-learn).
groups: Group labels for the samples used to ensure that each group is either entirely in the training or test set (not used in this method but included for compatibility with scikit-learn).
The method performs the following steps:

a. It calculates the total number of samples and extracts the unique SEA IDs from the sea_ids attribute.
b. It initializes a random number generator with the specified random_state.
c. For each split, it randomly selects a set of n_seas_to_hold_out SEA IDs without replacement.
d. It finds the indices of the observations with the selected SEA IDs and yields them as test indices for the current split.

The BlockSamplingCV class is designed to perform cross-validation with block sampling, where groups of observations (in this case, SEA IDs) are held out together during each split. This is useful because it ensures that all observations with the same SEA ID are either in the training set or the test set, but not both. This can help prevent leakage of information between the training and test sets when observations with the same SEA ID are strongly correlated.

In [78]:
from sklearn.model_selection import BaseCrossValidator

class BlockSamplingCV(BaseCrossValidator):
    """Cross-validator that holds out whole SEAs (blocks of rows) in each split.

    For every split a random set of `n_seas_to_hold_out` distinct SEA IDs is drawn
    and all rows carrying one of those IDs form the test fold, so rows of one SEA
    never appear on both sides of a split. Splits are drawn independently of each
    other, so test folds of different splits may overlap.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits to generate.
    n_seas_to_hold_out : int, default=40
        Number of distinct SEA IDs held out per split.
    sea_ids : array-like, default=None
        SEA ID of every row, in the same order as the rows of the data passed to
        `split`. Required before splitting.
    random_state : int, RandomState instance or None, default=None
        Seed or generator passed to `check_random_state`.
    """

    def __init__(self, n_splits=5, n_seas_to_hold_out=40, sea_ids=None, random_state=None):
        self.n_splits = n_splits
        self.n_seas_to_hold_out = n_seas_to_hold_out
        self.sea_ids = sea_ids
        self.random_state = random_state

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Yield, for each split, the positional indices of the held-out rows.

        `y` and `groups` are accepted for scikit-learn compatibility and ignored.

        Raises
        ------
        ValueError
            If `sea_ids` was not provided, if its length differs from the number
            of rows in `X`, or if `n_seas_to_hold_out` is not smaller than the
            number of distinct SEA IDs (which would leave no training rows).
        """
        if self.sea_ids is None:
            raise ValueError("BlockSamplingCV requires `sea_ids` (one SEA ID per row).")

        # Work on a plain array so the yielded indices are positional, regardless
        # of any pandas index carried by `sea_ids`.
        sea_ids = np.asarray(self.sea_ids)

        if X is not None:
            n_rows = X.shape[0] if hasattr(X, "shape") else len(X)
            if n_rows != len(sea_ids):
                raise ValueError(
                    f"`sea_ids` has {len(sea_ids)} entries but X has {n_rows} rows."
                )

        unique_sea_ids = np.unique(sea_ids)
        if self.n_seas_to_hold_out is not None and self.n_seas_to_hold_out >= len(unique_sea_ids):
            raise ValueError(
                f"Cannot hold out {self.n_seas_to_hold_out} SEAs: only "
                f"{len(unique_sea_ids)} distinct SEA IDs are available."
            )

        rng = check_random_state(self.random_state)

        for _ in range(self.n_splits):
            held_out_sea_ids = rng.choice(unique_sea_ids, self.n_seas_to_hold_out, replace=False)
            yield np.where(np.isin(sea_ids, held_out_sea_ids))[0]

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits; all arguments are ignored."""
        return self.n_splits


In [79]:
def train_and_evaluate_models(target_columns, test_size, categorical_columns, decision_boundaries, n_seas_to_hold_out=None, bootstrap=False, n_bootstrap_samples=100, sea_ids=None, validation_size=0.2, data_path="/capstone/mosaiks/repos/preprocessing/data/features_join_imputed_2023_05_02.csv"):
    """Train one RidgeCV model per target column and report validation metrics.

    The joined feature/outcome table is read from `data_path`. For every target
    the rows are split into a training portion and a held-out test portion, the
    training portion is split again into training and validation rows, a RidgeCV
    model is fitted on the training rows and evaluated on the validation rows.
    The held-out test portion is not used inside this function.

    Parameters
    ----------
    target_columns : list of str
        Outcome columns to model.
    test_size : float
        Test fraction passed to `train_test_split` when block sampling is off.
    categorical_columns : collection of str
        Targets evaluated as binary indicators (false positive rate, AUC-ROC).
    decision_boundaries : iterable of float
        Candidate thresholds for turning predictions into 0/1 labels.
    n_seas_to_hold_out : int, optional
        If greater than zero, that many randomly chosen SEAs are held out as the
        test portion (block sampling). Also passed to `BlockSamplingCV` when
        `bootstrap` is true.
    bootstrap : bool, default=False
        Use `BlockSamplingCV` with `n_bootstrap_samples` splits instead of 5-fold
        cross-validation to choose the regularization strength.
    n_bootstrap_samples : int, default=100
        Number of `BlockSamplingCV` splits when `bootstrap` is true.
    sea_ids : pandas Series or array, optional
        SEA ID of every row of the table; required when `bootstrap` is true.
    validation_size : float, default=0.2
        Fraction of the training portion set aside for validation.
    data_path : str
        CSV file holding the joined features and outcomes.

    Returns
    -------
    predictions_df : DataFrame
        Validation predictions, one column per target (default integer index).
    groundtruth_df : DataFrame
        Validation ground truth, one column per target (indexed by the original
        row labels, so it is not index-aligned with `predictions_df`).
    best_decisionboundary : dict
        Selected decision boundary per categorical target.
    r2_train : dict
        Training R2 per continuous target.
    models : dict
        Fitted RidgeCV model per target.

    Raises
    ------
    ValueError
        If `bootstrap` is true and `sea_ids` is not provided.
    """
    if bootstrap and sea_ids is None:
        raise ValueError("`sea_ids` is required when `bootstrap=True`.")

    grouped_features = pd.read_csv(data_path)

    features = grouped_features.iloc[:, 5:12005]
    outcomes = grouped_features.iloc[:, 12006:]

    # Helper function to perform block sampling based on SEA IDs
    def block_sampling(features, outcomes, n_seas_to_hold_out):
        """Split `features`/`outcomes` so that whole SEAs are held out together."""
        unique_seas = grouped_features['sea_unq'].unique()
        # A private generator seeded with 42 draws the same SEAs as seeding the
        # global NumPy generator would, without altering global random state.
        rng = np.random.RandomState(42)
        held_out_seas = rng.choice(unique_seas, size=n_seas_to_hold_out, replace=False)
        is_held_out = grouped_features['sea_unq'].isin(held_out_seas)

        return (
            features.loc[~is_held_out],
            features.loc[is_held_out],
            outcomes.loc[~is_held_out],
            outcomes.loc[is_held_out],
        )

    # Validation predictions / ground truth, one column per target
    predictions_df = pd.DataFrame()
    groundtruth_df = pd.DataFrame()
    # Selected decision boundary per categorical target
    best_decisionboundary = {}
    r2_train = {}
    models = {}

    # Candidate regularization strengths: 1e-8 ... 1e8, one per decade
    alphas = np.logspace(-8, 8, base=10, num=17)

    for target_column in target_columns:
        # Select the target variable
        y = outcomes[target_column]

        # Split the data into training and testing sets; the test portion is only
        # set aside here and is not evaluated in this function.
        if n_seas_to_hold_out and n_seas_to_hold_out > 0:
            X_train, _X_test, y_train, _y_test = block_sampling(features, y, n_seas_to_hold_out)
        else:
            X_train, _X_test, y_train, _y_test = train_test_split(features, y, test_size=test_size, random_state=42)

        # Split the training data again to create a validation set
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_size, random_state=42)

        # Train the model
        if bootstrap:
            # SEA IDs of the remaining training rows, needed by the block splitter.
            # Only looked up here so that `sea_ids=None` works when bootstrap is off.
            sea_ids_train = sea_ids[X_train.index]
            cv = BlockSamplingCV(n_splits=n_bootstrap_samples, n_seas_to_hold_out=n_seas_to_hold_out, sea_ids=sea_ids_train, random_state=42)
            ridge_cv = RidgeCV(cv=cv, alphas=alphas)
        else:
            ridge_cv = RidgeCV(cv=5, alphas=alphas)

        ridge_cv.fit(X_train, y_train)
        models[target_column] = ridge_cv

        # Make predictions on the validation data
        y_val_pred = ridge_cv.predict(X_val)

        # Store the validation predictions and the matching ground truth
        predictions_df[target_column] = y_val_pred
        groundtruth_df[target_column] = y_val

        if target_column in categorical_columns:
            min_false_positive_rate = float('inf')
            selected_decision_boundary = None

            # AUC-ROC does not depend on the decision boundary, so compute it once.
            # It is undefined when the validation labels contain a single class.
            try:
                auc_roc = roc_auc_score(y_val, y_val_pred)
            except ValueError:
                auc_roc = float('nan')

            for decision_boundary in decision_boundaries:
                # Calculate confusion matrix
                tn, fp, fn, tp = calculate_confusion_matrix(y_val, y_val_pred, decision_boundary)

                # False positive rate; undefined (NaN) without any true negatives
                # or false positives, i.e. when there are no actual negatives.
                n_negatives = fp + tn
                false_positive_rate = fp / n_negatives if n_negatives > 0 else float('nan')

                # Keep the boundary with the lowest false positive rate. A zero
                # rate always wins (the latest such boundary is kept) and is also
                # recorded as the minimum so a later, worse boundary cannot
                # replace it.
                if false_positive_rate == 0 or false_positive_rate < min_false_positive_rate:
                    min_false_positive_rate = false_positive_rate
                    selected_decision_boundary = decision_boundary

                print(f"Target variable: {target_column} (Categorical)")
                print(f"Decision boundary: {decision_boundary}")
                print(f"False positive rate: {false_positive_rate:0.2f}")
                print(f"AUC-ROC: {auc_roc:0.2f}")
                print()

            print(f"Selected decision boundary: {selected_decision_boundary}")
            best_decisionboundary[target_column] = selected_decision_boundary

        else:
            # Calculate Pearson's correlation coefficient
            pearson_coeff, _ = pearsonr(y_val, y_val_pred)

            # Calculate training R squared
            train_r_squared = ridge_cv.score(X_train, y_train)
            r2_train[target_column] = train_r_squared

            # Calculate validation R squared
            val_r_squared = ridge_cv.score(X_val, y_val)

            print(f"Target variable: {target_column}")
            print(f"Estimated regularization parameter: {ridge_cv.alpha_}")
            print(f"Training R2 performance: {train_r_squared:0.2f}")
            print(f"Validation R2 performance: {val_r_squared:0.2f}")
            print(f"Pearson's correlation coefficient: {pearson_coeff:0.2f}")
            print()

    return predictions_df, groundtruth_df, best_decisionboundary, r2_train, models

In [122]:
# Silence every warning raised during the training run below
# (`warnings` is imported in the first cell of the notebook).
warnings.filterwarnings('ignore')

# Targets to model, and the outcome columns that are binary indicators
target_columns = ['maize', 'log_maize']
categorical_columns = ['loss_ind', 'drought_loss_ind', 'flood_loss_ind', 'animal_loss_ind', 'pest_loss_ind']

# Split / cross-validation settings
test_size = 0.2
n_seas_to_hold_out = 40
bootstrap = True
n_bootstrap_samples = 10
sea_ids = grouped_features['sea_unq']

(
    predictions_df,
    groundtruth_df,
    best_decisionboundary,
    r2_train,
    models,
) = train_and_evaluate_models(
    target_columns,
    test_size,
    categorical_columns,
    decision_boundaries=[0.3, 0.5, 0.7],
    n_seas_to_hold_out=n_seas_to_hold_out,
    bootstrap=bootstrap,
    n_bootstrap_samples=n_bootstrap_samples,
    sea_ids=sea_ids,
)

Target variable: maize
Estimated regularization parameter: 1.0
Training R2 performance: 0.78
Validation R2 performance: 0.40
Pearson's correlation coefficient: 0.68

Target variable: log_maize
Estimated regularization parameter: 1.0
Training R2 performance: 0.82
Validation R2 performance: 0.52
Pearson's correlation coefficient: 0.74



In [27]:
# predictions_df

In [79]:
# groundtruth_df

In [80]:
# best_decisionboundary

In [81]:
# r2_train

In [91]:
#models

array([0.75530778, 0.76102397, 0.73510188, ..., 0.43702278, 0.72701432,
       1.05170245])

### Validation set (held out from the training data)

In [82]:
import matplotlib.pyplot as plt
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
from IPython.display import clear_output

# output area the figure is rendered into; created before `update_plot` is defined
# so the function can never run without it
plot_output = widgets.Output()

# create the dropdown widget
variable_dropdown = widgets.Dropdown(options=target_columns,
                                     value=target_columns[0],
                                     description='Variable:')


def update_plot(variable):
    """Redraw the figure for `variable` inside `plot_output`.

    `predictions_df` and `groundtruth_df` hold the validation-set predictions and
    ground truth returned by `train_and_evaluate_models`. Categorical variables
    are shown as a confusion-matrix heatmap at their selected decision boundary
    (0.5 if none was stored); continuous variables as a predicted-vs-ground-truth
    scatter plot with a 1:1 reference line.
    """
    with plot_output:
        clear_output(wait=True)

        # select the validation predictions / ground truth of the chosen variable
        predictions_filtered = predictions_df[variable]
        groundtruth_filtered = groundtruth_df[variable]

        fig, ax = plt.subplots()

        if variable in categorical_columns:
            decision_boundary = best_decisionboundary.get(variable, 0.5)
            tn, fp, fn, tp = calculate_confusion_matrix(groundtruth_filtered, predictions_filtered, decision_boundary)

            # rows are true labels, columns are predicted labels
            conf_matrix = np.array([[tn, fp], [fn, tp]])

            # plot the confusion matrix heatmap
            sns.heatmap(conf_matrix, annot=True, fmt='.1f', cmap='PuBu', ax=ax)
            ax.set(xlabel='Predicted label', ylabel='True label',
                   title=f"Decision boundary for {variable}: {decision_boundary}")

        else:
            # The plotted points are validation rows, so report the R2 of exactly
            # these points rather than the training R2 stored in `r2_train`.
            val_r2 = r2_score(groundtruth_filtered, predictions_filtered)

            # create the scatter plot
            ax.scatter(predictions_filtered, groundtruth_filtered, alpha=1, s=4)
            ax.set_xlabel("Predicted", fontsize=15, x=.3)
            ax.set_ylabel("Ground Truth", fontsize=15)
            fig.suptitle(f"Variable: {variable}", fontsize=15, y=1.02)
            ax.set_title(f"Model applied to validation data n = {len(predictions_filtered)}, R$^2$ = {val_r2:0.2f}",
                         fontsize=12, y=1.01)
            ax.tick_params(axis='both', labelsize=14)

            # 1:1 line: points on it are predicted perfectly
            ax.axline([0, 0], [1, 1], c="k")

            ax.spines.right.set_visible(False)
            ax.spines.top.set_visible(False)

        plt.show()


# set up the interaction between the dropdown and the plot
def dropdown_eventhandler(change):
    """Redraw the plot when the dropdown selection changes."""
    update_plot(change.new)


variable_dropdown.observe(dropdown_eventhandler, names='value')

# draw the plot for the initial dropdown value
update_plot(variable_dropdown.value)

# container widget holding the dropdown and the plot
container = widgets.VBox(children=[variable_dropdown, plot_output])

# display the dropdown and the plot
display(container)

VBox(children=(Dropdown(description='Variable:', options=('frac_area_harv',), value='frac_area_harv'), Output(…

### Test set

### Use the trained model to predict crop yields over all years from 1km grid-cell resolution features 

Recall that after we executed imputation on all feature years in the dataframe `features`, we copied the dataframe and named it `features_all_years`. Now we can plug that into the model to visualize how our model performs over time.

In [83]:
# Load the Zambia-wide feature table that the trained models are applied to.
# NOTE(review): the "10sample" suffix suggests a subsample of grid cells — confirm.
features_zambia_all = pd.read_feather('/capstone/mosaiks/repos/modeling/data/zambia_features_10sample.feather')
features_zambia_all.head()

Unnamed: 0,year,ward_id,lon,lat,0_1,0_2,0_3,0_4,0_5,0_6,...,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12,geometry
0,2015.0,0,22.799338,-15.095757,0.001077,0.0007,0.000706,0.000395,0.000172,0.000183,...,0.075707,0.056214,0.056737,0.036262,0.004133,0.000831,0.002924,0.004093,0.040344,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\xa0\x00...
1,2015.0,1,22.660588,-14.993257,0.001077,0.0007,0.000706,0.000395,0.000172,0.000183,...,0.075707,0.056214,0.056737,0.036262,0.004133,0.0,0.002924,0.004093,0.040344,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00X\x00\x0...
2,2015.0,2,22.465033,-15.005479,0.001077,0.0007,0.000706,0.000395,0.000172,0.000183,...,0.075707,0.056214,0.056737,0.036262,0.004133,0.000186,0.002924,0.004093,0.040344,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\xad\x00...
3,2015.0,3,22.559819,-15.041718,0.001077,0.0007,0.000706,0.000395,0.000172,0.000183,...,0.075707,0.056214,0.056737,0.036262,0.004133,0.00087,0.002924,0.004093,0.040344,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\xf5\x00...
4,2015.0,4,22.718088,-15.097007,0.001077,0.0007,0.000706,0.000395,0.000172,0.000183,...,0.075707,0.056214,0.056737,0.036262,0.004133,0.000792,0.002924,0.004093,0.040344,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\xa4\x00...


In the following chunk, we drop certain columns from `features_zambia_all` because we only need to feed the feature data into the model to generate predictions. Using the argument `axis = 1`, we specify that we are dropping columns rather than rows. 

In [84]:
# Identifier, coordinate and geometry columns are not model inputs
non_feature_columns = ['year', 'geometry', 'ward_id', 'lat', 'lon']

# axis=1 drops columns (not rows), leaving only the feature columns
x_all = features_zambia_all.drop(non_feature_columns, axis=1)
x_all

Unnamed: 0,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,0_10,...,999_3,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12
0,0.001077,0.000700,0.000706,0.000395,0.000172,0.000183,0.000276,0.000605,0.002596,0.001419,...,0.058021,0.075707,0.056214,0.056737,0.036262,0.004133,0.000831,0.002924,0.004093,0.040344
1,0.001077,0.000700,0.000706,0.000395,0.000172,0.000183,0.000276,0.000605,0.001739,0.001419,...,0.058021,0.075707,0.056214,0.056737,0.036262,0.004133,0.000000,0.002924,0.004093,0.040344
2,0.001077,0.000700,0.000706,0.000395,0.000172,0.000183,0.000276,0.000605,0.005177,0.001419,...,0.058021,0.075707,0.056214,0.056737,0.036262,0.004133,0.000186,0.002924,0.004093,0.040344
3,0.001077,0.000700,0.000706,0.000395,0.000172,0.000183,0.000276,0.000605,0.003367,0.001419,...,0.058021,0.075707,0.056214,0.056737,0.036262,0.004133,0.000870,0.002924,0.004093,0.040344
4,0.001077,0.000700,0.000706,0.000395,0.000172,0.000183,0.000276,0.000605,0.003582,0.001419,...,0.058021,0.075707,0.056214,0.056737,0.036262,0.004133,0.000792,0.002924,0.004093,0.040344
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9729,0.001077,0.000700,0.000706,0.000395,0.000329,0.000534,0.000766,0.001134,0.001992,0.002714,...,0.058021,0.075707,0.003742,0.002590,0.001876,0.001346,0.000456,0.000331,0.001712,0.001382
9730,0.001077,0.000700,0.000138,0.000395,0.000246,0.000291,0.001470,0.001112,0.002212,0.003687,...,0.008110,0.075707,0.005396,0.003475,0.002716,0.002813,0.002238,0.002020,0.002028,0.003645
9731,0.001077,0.000700,0.000706,0.000395,0.000020,0.000028,0.000039,0.000096,0.000124,0.001915,...,0.058021,0.075707,0.102218,0.053286,0.005945,0.006825,0.004489,0.000866,0.003640,0.031428
9732,0.001077,0.000700,0.000027,0.000458,0.000054,0.000057,0.000070,0.000214,0.000744,0.001419,...,0.020116,0.021653,0.007164,0.006104,0.005544,0.003283,0.001262,0.002924,0.004093,0.039828


In [123]:
# Apply every trained model to the full feature table: one prediction column per target
zambia_predictions = pd.DataFrame(
    {target: fitted_model.predict(x_all) for target, fitted_model in models.items()}
)

In [124]:
# Preview the predictions: one column per trained target, one row per row of `x_all`.
zambia_predictions

Unnamed: 0,maize,log_maize
0,1747.962435,5.927935
1,1858.251920,6.066826
2,2122.042037,6.084168
3,1805.951429,5.979300
4,1691.299302,5.737664
...,...,...
9729,2891.583327,8.521031
9730,2449.779175,8.112095
9731,-403.486220,0.933380
9732,1602.783661,5.601063


In [125]:
# Attach the coordinates to the predictions. `concat(axis=1)` aligns rows on the index;
# NOTE(review): this assumes both frames carry the same default integer index — confirm.
zambia_predictions_maize = pd.concat([zambia_predictions, features_zambia_all[['lat','lon']]], axis = 1)

In [126]:
# Preview the predictions together with their latitude / longitude.
zambia_predictions_maize

Unnamed: 0,maize,log_maize,lat,lon
0,1747.962435,5.927935,-15.095757,22.799338
1,1858.251920,6.066826,-14.993257,22.660588
2,2122.042037,6.084168,-15.005479,22.465033
3,1805.951429,5.979300,-15.041718,22.559819
4,1691.299302,5.737664,-15.097007,22.718088
...,...,...,...,...
9729,2891.583327,8.521031,-14.934616,28.359715
9730,2449.779175,8.112095,-15.581035,30.348366
9731,-403.486220,0.933380,-9.254870,28.704943
9732,1602.783661,5.601063,-9.768893,29.503497


In [127]:
# Write the predictions and their coordinates to a Feather file.
output_path = '/capstone/mosaiks/repos/modeling/data/zambia_maize_predictions_10sample.feather'
zambia_predictions_maize.to_feather(output_path)

The dataframe is already a geodataframe, so we do not have to convert it to one before mapping predictions. However, we do need to replace all the zero value crop percentage areas with `NA`. We do this by applying the `mask()` function. This function is similar to an if-else statement. If the value of the `crop_perc` is equal to 0, that value is replaced by the value of the second argument, which is `NA`. If the value of `crop_perc` is _not_ equal to zero, we retain the current value. The argument `inplace = True` executes this replacement in the same cell. 

In [None]:
# Replace predictions with NaN wherever the crop percentage is zero.
# The masked column is assigned back explicitly: `inplace=True` on a column
# selection is a chained operation that may act on a temporary copy (and never
# updates the frame under pandas copy-on-write).
features_all_years['yield_prediction'] = features_all_years['yield_prediction'].mask(
    features_all_years['crop_perc'] == 0, np.nan
)

Recall that this dataframe has a geometry column, with latitude and longitude together. In order to map the predicted features, we separate this geometry column into separate `lon` and `lat` columns. 

In [None]:
# extract the longitude and latitude from the geometry column, and make them into independent columns
point_geometry = features_all_years.geometry
features_all_years['lon'] = point_geometry.x
features_all_years['lat'] = point_geometry.y

Plot the predicted features for each year:

In [None]:
def scatter(x, y, c, **kwargs):
    """Draw one facet of the prediction map: points at (x, y) coloured by `c`.

    Intended to be passed to `FacetGrid.map`, which always supplies a `color`
    keyword. That keyword is discarded because matplotlib rejects `color`
    together with `c`; any remaining keywords (e.g. `cmap`, `vmin`, `vmax`)
    are forwarded to `plt.scatter`.
    """
    kwargs.pop("color", None)
    plt.scatter(x, y, c=c, s=1.25, **kwargs)


# Use one colour scale for every facet so that the years can be compared with
# each other; without shared limits each facet is normalised to its own range.
prediction_min = features_all_years["yield_prediction"].min()
prediction_max = features_all_years["yield_prediction"].max()

g = sns.FacetGrid(
    features_all_years,
    col="year",
    col_wrap=4,
    height=5,
    aspect=1
)
g.map(
    scatter, "lon", "lat", "yield_prediction",
    cmap="viridis", vmin=prediction_min, vmax=prediction_max,
)
# the axes show coordinates; the mapped quantity is named in the figure title
g.set_axis_labels("Longitude", "Latitude")
g.fig.suptitle("Yield Prediction", y=1.02)
# save the figure and name the file so that it represents the model parameters that created the predictions
# plt.savefig(f'images/{feature_file_name}_all_predictions.jpg', dpi=300)

Plot the model's predicted features summarized to district level. In this visualization, we choose a specific year to examine rather than visualizing all years in one figure. Visualizing the features summarized to district level is interesting because the crop data resolution provided by Zambia Statistics Agency is at the district level, and therefore it is easier to compare our model results to those ground-truth values when they are summarized to district level as well. Furthermore, our model's crop predictions for the years 2020 and 2021 might be more valuable when summarized to district level if Zambian governments, policy-makers, farmers, and researchers wish to use this data to determine crop imports, exports, and storage according to district summaries.

In [None]:
# mean predicted yield for every (district, year) pair, one row per pair
district_year_means = features_all_years.groupby(
    by=['district', 'year'], as_index=False
)['yield_prediction'].mean()

# index by district so the district geometries can be joined on in the next cell
features_all_years_summary = district_year_means.set_index('district')

In [None]:
# join Zambia's shapefile to the summarized features to map the districts
# reset the index so it is a properly formatted dataframe
# Joining a GeoDataFrame onto a plain DataFrame returns a plain DataFrame, so the
# result is wrapped in a GeoDataFrame; otherwise `.plot(column=...)` in the
# following cells is the pandas plot method and cannot draw the district polygons.
# NOTE: this cell overwrites its own input, so re-run the previous cell before re-running it.
features_all_years_summary = gpd.GeoDataFrame(
    features_all_years_summary.join(country_shp).reset_index(),
    geometry="geometry",
)

Now that the point geometries have been replaced by district geometries, the geometries are polygons. There is still a row for each district for each year.

In order to change the year visualized, simply change the year in the following code and re-run the chunk.

In [None]:
# change `year_to_map` to visualize a different year
year_to_map = 2020
is_selected_year = features_all_years_summary['year'] == year_to_map
features_all_years_summary.loc[is_selected_year].plot(column="yield_prediction")

Plot a boxplot for each year to visualize the range and quantile distribution of each year's crop predictions, summarized to district level. This enables us to identify years with exceptional disparities between the predicted yields by district. It also allows us to identify years that have many outliers.

In [None]:
# distribution of the district-level predictions, one box per year
plt.figure(figsize=(10, 5))
box_ax = sns.boxplot(
    data=features_all_years_summary,
    x="year",
    y="yield_prediction",
)
box_ax.set_xlabel("Year", fontsize=15)
box_ax.set_ylabel("Predicted Yield", fontsize=15)

Visualize the total crop yield predictions by year. This bar chart shows the sum of all the district crop yields.

In [None]:
# bar height is the sum of the district-level predictions within each year
plt.figure(figsize=(10, 5))
sns.barplot(
    data=features_all_years_summary,
    x="year",
    y="yield_prediction",
    estimator=sum,
)

## Yield and Residual Plots

Create a dataframe of residuals called `residual_df` from the `features_summary` dataframe. Note that we are _not_ using the predicted crop yields for _all_ years for these residuals, but rather the ground-truth crop yields for just the years through 2018.

The residuals give us an idea of the amount of uncertainty that is present in our model. By demeaning the yields and predictions over space (subtracting each district's mean), we are able to remove the uncertainty over space and better determine our model performance over time and our uncertainty over time.

In [None]:
# feature matrix for every district-year that has ground truth
x_all = features_summary.drop(drop_cols, axis = 1)

# create empty dataframe to then populate with columns
residual_df = pd.DataFrame()

# Every column is taken as a plain array so that rows are matched by position.
# Assigning `features_summary.year` / `.district` as Series would align on the
# index instead and fill NaN wherever `features_summary` lacks a default RangeIndex.
residual_df["yield_mt"] = features_summary.yield_mt.to_numpy()
residual_df["log_yield"] = np.log10(features_summary.yield_mt.to_numpy() + 1)
# predictions are clipped at zero; the residual below compares them with log_yield,
# so the model is presumably fit on the log10(yield + 1) scale -- confirm
residual_df["prediction"] = np.maximum(ridge_cv_random.predict(x_all), 0)
residual_df["residual"] = residual_df["log_yield"] - residual_df["prediction"]
residual_df["year"] = features_summary.year.to_numpy()
residual_df["district"] = features_summary.district.to_numpy()
# join the district geometries
residual_df = residual_df.join(country_shp, how = "left", on = "district")

# demean by location so we can analyze the data over time
residual_df["district_yield_mean"] = residual_df.groupby('district')['log_yield'].transform('mean')
residual_df["district_prediction_mean"] = residual_df.groupby('district')['prediction'].transform('mean')
residual_df["demean_yield"] = residual_df["log_yield"] - residual_df["district_yield_mean"]
residual_df["demean_prediction"] = residual_df["prediction"] - residual_df["district_prediction_mean"]
# geopandas is imported as `gpd` in this notebook; the bare name `geopandas` is not defined
residual_gdf = gpd.GeoDataFrame(residual_df)

residual_gdf.head(3)

Visualize the log-transformed ground-truth crop yields through 2018 with a boxplot.

In [None]:
# distribution of the log-transformed observed yields, one box per year
plt.figure(figsize=(6, 5))
log_yield_ax = sns.boxplot(data=residual_df, x="year", y="log_yield")
log_yield_ax.set_xlabel("Year", fontsize=15)
log_yield_ax.set_ylabel("Log Yield", fontsize=15)

Visualize the sum of the log-transformed crop yields by year with a bar plot.

In [None]:
# bar height is the sum of log_yield over all districts within each year
plt.figure(figsize=(6, 5))
sns.barplot(
    data=residual_df,
    x="year",
    y="log_yield",
    estimator=sum,
)

Visualize the ground-truth crop yields by year as a histogram to determine how they are distributed.

In [None]:
# one histogram panel per year for the `yield_mt` column
g = sns.FacetGrid(data=residual_gdf, col="year", height=4, aspect=1)
g.map(sns.histplot, "yield_mt", bins=20)
g.set_axis_labels(x_var="Yield (MT)")

Visualize the log-transformed crop yields by year as a histogram to compare how they are distributed after the transformation.

In [None]:
# one histogram panel per year for the `log_yield` column
g = sns.FacetGrid(data=residual_gdf, col="year", height=4, aspect=1)
g.map(sns.histplot, "log_yield", bins=20)
g.set_axis_labels(x_var=r"$\log_{10}(1 + Crop Yield)$")

#### Crop prediction histogram

In [None]:
# one histogram panel per year for the `prediction` column
g = sns.FacetGrid(data=residual_gdf, col="year", height=4, aspect=1)
g.map(sns.histplot, "prediction", bins=20)
g.set_axis_labels(x_var=r"Crop yield predictions")

#### Residual histogram

In [None]:
# one histogram panel per year for the `residual` column
g = sns.FacetGrid(data=residual_gdf, col="year", height=4, aspect=1)
g.map(sns.histplot, "residual", bins=20)
g.set_axis_labels(x_var=r"Residuals")

In [None]:
# smallest residual (log_yield minus prediction) across all districts and years
residual_gdf["residual"].min()

In [None]:
# largest residual (log_yield minus prediction) across all districts and years
residual_gdf["residual"].max()

#### Log crop yield vs residuals

In [None]:
# residual plotted against the log-transformed observed yield, one panel per year
g = sns.FacetGrid(data=residual_gdf, col="year", height=4, aspect=1)
g.map(sns.scatterplot, "log_yield", "residual")
g.set_axis_labels(x_var=r"$\log_{10}(1 + Crop Yield)$")

#### District residuals 

In [None]:
def plot_residual_maps(years, figsize):
    """Draw one district residual map per year, side by side, and return the figure.

    Every panel uses the same diverging colour scale (BrBG, limits -0.4 to 0.4)
    so that the panels can be compared with each other.
    """
    fig, axes = plt.subplots(nrows=1, ncols=len(years), figsize=figsize)
    # np.atleast_1d keeps the loop valid when a single year yields a bare Axes
    for ax, yr in zip(np.atleast_1d(axes), years):
        residual_gdf[residual_gdf.year == yr].plot(
            ax=ax,
            column="residual",
            legend=True,
            norm=colors.Normalize(vmin=-0.4, vmax=0.4),
            cmap="BrBG",
        )
        ax.set_title(f"{yr} Residuals")
    return fig


# 2014 and 2015 are only drawn for the Landsat 8 feature set
if satellite == 'landsat-8-c2-l2':
    fig = plot_residual_maps([2014, 2015], figsize=(13, 5))
fig = plot_residual_maps([2016, 2017, 2018], figsize=(20, 5))

# the caption is attached to the last figure created (2016-2018)
caption = "A positive value is an underestimated prediction (the prediction is lower than the actual yield), a negative value is an over estimated prediction"
fig.text(0.5, 0.01, caption, wrap=True, horizontalalignment='center', fontsize=12)


#### Difference from the mean

In [None]:
# demeaned prediction against demeaned observed yield, one panel per year
g = sns.FacetGrid(data=residual_gdf, col="year", height=4, aspect=1)
g.map(sns.scatterplot, "demean_yield", "demean_prediction")
g.set_axis_labels(
    x_var='Difference from Yield Mean',
    y_var='Difference from Prediction Mean',
)

In [None]:
fig, ax = plt.subplots(figsize=(6, 5))

observed_deviation = residual_gdf["demean_yield"]
predicted_deviation = residual_gdf["demean_prediction"]

# black 1:1 reference line through (-0.2, -0.2) and (0.2, 0.2)
ax.axline([-.2, -.2], [.2, .2], c="k")
ax.scatter(observed_deviation, predicted_deviation)
ax.set_title("Demeaned truth and predictions by district")
ax.set_xlabel('Difference from Yield Mean')
ax.set_ylabel('Difference from Predictions Mean')

# annotate the plot with the R^2 of the demeaned predictions against the demeaned truth
r_squared = r2_score(observed_deviation, predicted_deviation)
ax.text(
    -0.2,
    .18,
    s=f"Demeaned R$^2$ = {r_squared:0.2f}",
    fontsize=15,
    fontweight="bold",
)
fig.savefig(f'images/{feature_file_name}_demean.jpg', dpi=300)

In [None]:
# Ground truth runs through 2018. `range` excludes its stop value, so the stop
# must be one past the last year; `range(year_start+1, 2018)` skipped 2018.
last_truth_year = 2018

for yr in range(year_start + 1, last_truth_year + 1):
    # select the year's rows once instead of re-filtering for every metric
    yearly = residual_gdf[residual_gdf.year == yr]
    r_squared = r2_score(yearly["demean_yield"], yearly["demean_prediction"])
    pearson_r = pearsonr(yearly["demean_yield"], yearly["demean_prediction"])

    print(yr, f"    R^2: {r_squared:.2f}\n",
          f"Pearson's r: {pearson_r[0]:.2f}\n",
          sep = "")

# metrics over all years together; `pearson_r` is reused by the next cell
r_squared = r2_score(residual_gdf["demean_yield"], residual_gdf["demean_prediction"])
pearson_r = pearsonr(residual_gdf["demean_yield"], residual_gdf["demean_prediction"])
print(f"All     R^2: {r_squared:.2f}\n",
      f"Pearson's r: {pearson_r[0]:.2f}", sep = "")

In [None]:
# squared Pearson correlation over all years, rounded to two decimals
overall_pearson = pearson_r[0]
r2 = round(overall_pearson ** 2, 2)
r2

#### Join residuals to the features for _all_ years to visualize the residuals of the features before they were summarized to district level.

In [None]:
# both tables are keyed by (district, year) so that rows can be matched on that pair
join_keys = ['district', 'year']

# the geometry column is dropped from the residuals before joining
residuals_keyed = residual_df.drop(columns='geometry').set_index(join_keys)
predictions_keyed = features_all_years_summary.set_index(join_keys)

complete_df = predictions_keyed.join(residuals_keyed).reset_index()

complete_df.head(3)

In [None]:
fig, ax1 = plt.subplots(figsize=(10, 5))

# Only the two plotted columns are melted. Melting every column would stack
# district names and geometries into the same value column as the numbers.
plot_vars = ['yield_prediction', 'log_yield']
tidy = (
    complete_df
    .melt(id_vars='year', value_vars=plot_vars)
    .rename(columns=str.title)
)

# hue_order pins the bar order so it always matches the legend labels assigned below
sns.barplot(x='Year', y='Value', hue='Variable', hue_order=plot_vars, data=tidy, ax=ax1, ci = None)
sns.despine(fig)

# replace the column names in the legend with readable labels (same order as plot_vars)
handles, _labels = ax1.get_legend_handles_labels()
ax1.legend(handles, ['Predicted Yield', 'Observed Yield'],loc='lower left')

plt.savefig(f'images/{feature_file_name}_yield_pred.jpg', dpi=300)

In [None]:
# bar height is the sum of yield_prediction over all districts within each year
plt.figure(figsize=(10, 5))
sns.barplot(
    data=complete_df,
    x="year",
    y="yield_prediction",
    estimator=sum,
)

### Congratulations on completing this analysis!