# Modeling Agricultural Variables
## Python modules

In [35]:
import warnings
import time
import os
import random

import dask
from dask.distributed import Client

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import ipywidgets as widgets
from IPython.display import clear_output

import geopandas as gpd
import pyarrow

from IPython.display import display
from joblib import Parallel, delayed
from matplotlib.axes import Axes
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import mean_squared_error, confusion_matrix, r2_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingRegressor
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning
from scipy.stats import pearsonr
from sklearn.utils import check_random_state, resample


import math
import seaborn as sns

## Read in Data

We first read in the aggregated features and ground-truth data joined in  feature_preprocessing.ipynb. We separate this aggregated file into two distinct dataframes, *features* and *outcomes*, where features contain aggregated features and outcomes contain our ground-truth data.

In [2]:
grouped_features = pd.read_csv("/capstone/mosaiks/repos/modeling/data/model_directory/SEA_averaged_features_manual_impute_bfill_modeltrain.csv")

In [3]:
grouped_features.head()

Unnamed: 0,year,sea_unq,index_left,lon,lat,0_1,0_2,0_3,0_4,0_5,...,prop_mix,log_maize,log_sweetpotatoes,log_groundnuts,log_soybeans,loss_ind,drought_loss_ind,flood_loss_ind,animal_loss_ind,pest_loss_ind
0,2016.0,1,46302.0,27.807993,-13.659357,0.0,0.0,0.0,0.0,0.0,...,0.0,4.058626,5.269229,7.640386,6.97709,0.0,0.0,0.0,0.0,0.0
1,2016.0,7,51611.666667,28.63466,-13.77269,0.001141,0.000329,0.000329,0.000329,0.0,...,0.181102,3.387211,0.689155,7.707512,7.113191,1.0,1.0,0.0,0.0,0.0
2,2016.0,9,44806.714286,27.406446,-12.905428,6e-06,6e-06,6e-06,6e-06,4e-06,...,0.069018,2.703935,8.486127,-1.408767,7.14137,1.0,0.0,0.0,0.0,0.0
3,2016.0,10,44644.411765,27.381719,-12.962298,0.0,0.0,0.0,0.0,0.0,...,0.0,3.714757,2.525729,3.354421,6.929734,1.0,0.0,0.0,0.0,0.0
4,2016.0,12,47769.0,28.01466,-12.889357,0.0,0.0,0.0,0.0,0.0,...,0.0,2.786884,8.509161,2.852125,0.798508,1.0,0.0,0.0,0.0,0.0


In [6]:
features = grouped_features.iloc[:,5:12005]
features.head()

Unnamed: 0,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,0_10,...,999_3,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.157999e-06,0.000207,0.0,...,1.0,1.0,0.274676,1.0,0.115388,0.002708,0.001319,1.0,1.0,1.0
1,0.001141,0.000329,0.000329,0.000329,0.0,0.0,0.0,0.001008277,0.00136,0.002211,...,0.006789,0.006789,1.0,1.0,1.0,0.000517,0.000343,0.000396,0.000327,0.004724
2,6e-06,6e-06,6e-06,6e-06,4e-06,1e-05,1.4e-05,2.590917e-05,0.00011,0.000109,...,0.005561,0.005561,0.006391,0.004212,0.003235,0.001937,0.001683,0.00197,0.00234,0.005251
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.113844e-07,1.2e-05,0.0,...,0.00557,0.00557,0.006739,0.003991,0.002857,0.001979,0.001435,0.001284,0.001814,0.00754
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.7e-06,0.000186,0.000166,...,1.0,1.0,1.0,1.0,0.00269,0.001603,0.00082,0.001269,0.001692,0.018616


In [7]:
outcomes = grouped_features.iloc[:,12006:]

outcomes["loss_ind"].astype('category')
outcomes["drought_loss_ind"].astype('category')
outcomes['pest_loss_ind'].astype('category')
outcomes['animal_loss_ind'].astype('category')
outcomes['flood_loss_ind'].astype('category')
outcomes.head()

print(outcomes.columns)

Index(['total_area_harv_ha', 'total_area_lost_ha', 'total_harv_kg',
       'yield_kgha', 'frac_area_harv', 'frac_area_loss', 'area_lost_fire',
       'maize', 'groundnuts', 'mixed_beans', 'popcorn', 'sorghum', 'soybeans',
       'sweet_potatoes', 'bunding', 'monocrop', 'mixture', 'frac_loss_drought',
       'frac_loss_flood', 'frac_loss_animal', 'frac_loss_pests',
       'frac_loss_soil', 'frac_loss_fert', 'prop_till_plough',
       'prop_till_ridge', 'prop_notill', 'prop_hand', 'prop_mono', 'prop_mix',
       'log_maize', 'log_sweetpotatoes', 'log_groundnuts', 'log_soybeans',
       'loss_ind', 'drought_loss_ind', 'flood_loss_ind', 'animal_loss_ind',
       'pest_loss_ind'],
      dtype='object')


## Model

We define a model to predict each of our outcome variables on our features for each survey enumeration area (SEA)/year. The `train_and_evaluate_models` function trains and evaluates Ridge Linear Regression models for each target variable specified in the `target_columns` parameter. It handles both categorical and continuous target variables.

The function works as follows:

1. Read the grouped features and outcomes from a CSV file.
2. For each target variable in `target_columns`, select the corresponding target variable data.
3. Use `train_test_split` to split the data into training and testing sets.
5. Train a Ridge Linear Regression model using RidgeCV with 5-fold cross-validation and a range of alpha values.
6. If the target variable is categorical, calculate and print the false positive rate and AUC-ROC. If the target variable is continuous, calculate and print the estimated regularization parameter, training R2 performance, validation R2 performance, and Pearson's correlation coefficient.

### Helper Function for Confusion Matrix for Categorical Variables
`calculate_confusion_matrix`:
This function calculates the confusion matrix for binary classification problems based on the given true labels (`y_true`), predicted values (`y_pred`), and a decision boundary (`decision_boundary`). The decision boundary is used to threshold the predicted values to obtain binary predictions.

Inputs:

`y_true`: The true labels of the target variable (a pandas Series or numpy array).

`y_pred`: The predicted values of the target variable (a numpy array).

`decision_boundary`: A float value that serves as the threshold for classifying the predicted values into two classes (0 or 1).


The function performs the following steps:
1. It adjusts the predicted values by setting them to 1 if they are greater than or equal to the decision boundary, and 0 otherwise.
2. It calculates the confusion matrix using the true labels and adjusted predicted values.
3. Depending on the shape of the confusion matrix, it extracts the true negatives (tn), false positives (fp), false negatives (fn), and true positives (tp).
4. If the shape of the confusion matrix is not (1, 1) or (2, 2), it raises an error.

Output: The function returns the values of tn, fp, fn, and tp.

In [8]:
def calculate_confusion_matrix(y_true, y_pred, decision_boundary):
    y_pred_adj = np.where(y_pred >= decision_boundary, 1, 0)
    cm = confusion_matrix(y_true, y_pred_adj)
    if cm.shape == (1, 1):
        if y_true.iloc[0] == 0:
            tn, fp, fn, tp = cm[0, 0], 0, 0, 0
        else:
            tn, fp, fn, tp = 0, 0, 0, cm[0, 0]
    elif cm.shape == (2, 2):
        tn, fp, fn, tp = cm.ravel()
    else:
        print("Unexpected confusion matrix:")
        print(cm)
        raise ValueError('Unexpected confusion matrix shape.')
    return tn, fp, fn, tp

In [9]:
def randomly_select_seas(n, grouped_features):
    unique_seas = grouped_features['sea_unq'].unique()
    selected_seas = np.random.choice(unique_seas, n, replace=False)
    return selected_seas

### Model Implementation

In [10]:
# Prepare the arguments as a dictionary
args = {
    'target_columns': ['total_area_harv_ha', 'total_area_lost_ha', 'total_harv_kg',
       'yield_kgha', 'frac_area_harv', 'frac_area_loss', 'area_lost_fire',
       'maize', 'groundnuts', 'mixed_beans', 'popcorn', 'sorghum', 'soybeans',
       'sweet_potatoes', 'bunding', 'monocrop', 'mixture', 'frac_loss_drought',
       'frac_loss_flood', 'frac_loss_animal', 'frac_loss_pests',
       'frac_loss_soil', 'frac_loss_fert', 'prop_till_plough',
       'prop_till_ridge', 'prop_notill', 'prop_hand', 'prop_mono', 'prop_mix',
       'log_maize', 'log_sweetpotatoes', 'log_groundnuts', 'log_soybeans',
       'loss_ind', 'drought_loss_ind', 'flood_loss_ind', 'animal_loss_ind',
       'pest_loss_ind'],
    'test_size': 0.1,
    'categorical_columns':['loss_ind','drought_loss_ind', 'flood_loss_ind','animal_loss_ind','pest_loss_ind'],
    'decision_boundaries': [0.3,0.5,0.7],
    'sea_ids': grouped_features['sea_unq'],
    'validation_size' : 0.1,
    'random_state': 50
}

In [None]:
def train_and_evaluate_models(args):
    # Extracting input parameters
    target_columns = args['target_columns']
    test_size = args.get('test_size', 0.1)
    categorical_columns = args['categorical_columns']
    decision_boundaries = args['decision_boundaries']
    sea_ids = args['sea_ids']
    validation_size = args.get('validation_size', 0.1)
    random_state = args.get('random_state', False)
    
    # Read the grouped features from a CSV file
    grouped_features = pd.read_csv("/capstone/mosaiks/repos/modeling/data/model_directory/SEA_averaged_features_manual_impute_bfill_modeltrain.csv")

    # Extract the relevant features, outcomes, and year columns
    features = grouped_features.iloc[:, 5:12005]
    outcomes = grouped_features.iloc[:, 12006:]
    year = grouped_features.iloc[:, 0]
    
    # Initialize data structures to store metrics and results
    metrics_df = pd.DataFrame(columns=['target_column', 'train_score', 'val_score', 'pearson_coeff'])
    models = {}
    X_trains = {}
    X_tests = {}
    y_trains = pd.DataFrame()
    y_tests = pd.DataFrame()
    y_year = pd.DataFrame()
    
    # Print the model parameters
    print(f"\nRunning model with the following parameters:")
    print(f"Target columns: {target_columns}")
    print(f"Test size: {test_size}", f"Validation size: {validation_size}")
    print(f"Random State: {random_state}")

    # Iterate over each target column
    for target_column in target_columns:
        
        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(features, outcomes[target_column], test_size=test_size, random_state = random_state)
        
        # Split the training data again to create a validation set
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_size, random_state = random_state)
        
        # Store the training and testing data for each target column
        X_trains[target_column] = X_train
        X_tests[target_column] = X_test
        y_trains[target_column] = y_train
        y_tests[target_column] = y_test
        y_year[target_column] = year.loc[y_trains.index]

        # Train a RidgeCV model with cross-validation
        cv = 5
        ridge_cv = RidgeCV(cv=cv, alphas=np.logspace(-8, 8, base=10, num=75))
        ridge_cv.fit(X_train, y_train)
        
        # Store the trained model for each target column
        models[target_column] = ridge_cv
        
        # Make predictions on the training and validation data
        y_val_pred = ridge_cv.predict(X_val)
        y_train_pred = ridge_cv.predict(X_train)

        # Perform evaluation for categorical target columns
        if target_column in categorical_columns:
            for decision_boundary in decision_boundaries:
                # Calculate confusion matrix
                tn, fp, fn, tp = calculate_confusion_matrix(y_val, y_val_pred, decision_boundary)

                # Calculate the false positive rate
                false_positive_rate = fp / (fp + tn)

                # Calculate AUC-ROC
                auc_roc = roc_auc_score(y_val, y_val_pred)

                # Print evaluation metrics for categorical columns
                print(f"Target variable: {target_column} (Categorical)")
                print(f"Decision boundary: {decision_boundary}")
                print(f"False positive rate: {false_positive_rate:0.2f}")
                print(f"AUC-ROC: {auc_roc:0.2f}")
                print()
        else:
            # Calculate Pearson's correlation coefficient
            pearson_coeff, _ = pearsonr(y_val, y_val_pred)

            # Calculate training R squared
            train_r_squared = ridge_cv.score(X_train, y_train)

            # Calculate validation R squared
            val_r_squared = ridge_cv.score(X_val, y_val)
            
            # Append metrics to the metrics DataFrame
            metrics_df = metrics_df.append({
                'target_column': target_column,
                'train_score': train_r_squared,
                'val_score': val_r_squared,
                'pearson_coeff': pearson_coeff}, ignore_index=True)
                
            # Print evaluation metrics for non-categorical columns
            print()
            print(f"Target variable: {target_column}")
            print(f"Estimated regularization parameter: {ridge_cv.alpha_}")
            print(f"Training R2 performance: {train_r_squared:0.2f}")
            print(f"Validation R2 performance: {val_r_squared:0.2f}")
            print(f"Pearson's correlation coefficient: {pearson_coeff:0.2f}")
            print()

    # Return the collected data and results
    return X_trains, X_tests, y_trains, y_tests, metrics_df, models, y_year


In [13]:
X_trains, X_tests, y_trains, y_tests, metrics_df, models, y_year  = train_and_evaluate_models(args)


Running model with the following parameters:
Target columns: ['total_area_harv_ha', 'total_area_lost_ha', 'total_harv_kg', 'yield_kgha', 'frac_area_harv', 'frac_area_loss', 'area_lost_fire', 'maize', 'groundnuts', 'mixed_beans', 'popcorn', 'sorghum', 'soybeans', 'sweet_potatoes', 'bunding', 'monocrop', 'mixture', 'frac_loss_drought', 'frac_loss_flood', 'frac_loss_animal', 'frac_loss_pests', 'frac_loss_soil', 'frac_loss_fert', 'prop_till_plough', 'prop_till_ridge', 'prop_notill', 'prop_hand', 'prop_mono', 'prop_mix', 'log_maize', 'log_sweetpotatoes', 'log_groundnuts', 'log_soybeans', 'loss_ind', 'drought_loss_ind', 'flood_loss_ind', 'animal_loss_ind', 'pest_loss_ind']
Test size: 0.1 Validation size: 0.1
Random State: 50


  metrics_df = metrics_df.append({



Target variable: total_area_harv_ha
Estimated regularization parameter: 4.45295850994266
Training R2 performance: 0.71
Validation R2 performance: 0.46
Pearson's correlation coefficient: 0.69



  metrics_df = metrics_df.append({



Target variable: total_area_lost_ha
Estimated regularization parameter: 2.706652070033247
Training R2 performance: 0.75
Validation R2 performance: 0.50
Pearson's correlation coefficient: 0.72



  metrics_df = metrics_df.append({



Target variable: total_harv_kg
Estimated regularization parameter: 0.22456979955397763
Training R2 performance: 0.86
Validation R2 performance: 0.45
Pearson's correlation coefficient: 0.71



  metrics_df = metrics_df.append({



Target variable: yield_kgha
Estimated regularization parameter: 1.6451905877536674
Training R2 performance: 0.74
Validation R2 performance: 0.62
Pearson's correlation coefficient: 0.80



  metrics_df = metrics_df.append({



Target variable: frac_area_harv
Estimated regularization parameter: 4.45295850994266
Training R2 performance: 0.64
Validation R2 performance: 0.46
Pearson's correlation coefficient: 0.71



  metrics_df = metrics_df.append({



Target variable: frac_area_loss
Estimated regularization parameter: 4.45295850994266
Training R2 performance: 0.64
Validation R2 performance: 0.46
Pearson's correlation coefficient: 0.71



  metrics_df = metrics_df.append({



Target variable: area_lost_fire
Estimated regularization parameter: 100000000.0
Training R2 performance: 0.00
Validation R2 performance: 0.00
Pearson's correlation coefficient: nan



  metrics_df = metrics_df.append({



Target variable: maize
Estimated regularization parameter: 1.0
Training R2 performance: 0.77
Validation R2 performance: 0.61
Pearson's correlation coefficient: 0.79



  metrics_df = metrics_df.append({



Target variable: groundnuts
Estimated regularization parameter: 32.62222009711673
Training R2 performance: 0.52
Validation R2 performance: 0.44
Pearson's correlation coefficient: 0.67



  metrics_df = metrics_df.append({



Target variable: mixed_beans
Estimated regularization parameter: 53.66976945540476
Training R2 performance: 0.34
Validation R2 performance: 0.22
Pearson's correlation coefficient: 0.49



  metrics_df = metrics_df.append({



Target variable: popcorn
Estimated regularization parameter: 646.8607661546321
Training R2 performance: 0.13
Validation R2 performance: -0.04
Pearson's correlation coefficient: 0.08



  metrics_df = metrics_df.append({



Target variable: sorghum
Estimated regularization parameter: 393.18287557057704
Training R2 performance: 0.12
Validation R2 performance: -0.01
Pearson's correlation coefficient: 0.10



  metrics_df = metrics_df.append({



Target variable: soybeans
Estimated regularization parameter: 19.828839491270752
Training R2 performance: 0.42
Validation R2 performance: 0.17
Pearson's correlation coefficient: 0.44



  metrics_df = metrics_df.append({



Target variable: sweet_potatoes
Estimated regularization parameter: 32.62222009711673
Training R2 performance: 0.46
Validation R2 performance: 0.30
Pearson's correlation coefficient: 0.55



  metrics_df = metrics_df.append({



Target variable: bunding
Estimated regularization parameter: 100000000.0
Training R2 performance: 0.00
Validation R2 performance: -0.02
Pearson's correlation coefficient: 0.10



  metrics_df = metrics_df.append({



Target variable: monocrop
Estimated regularization parameter: 12.052609368708413
Training R2 performance: 0.59
Validation R2 performance: 0.51
Pearson's correlation coefficient: 0.73



  metrics_df = metrics_df.append({



Target variable: mixture
Estimated regularization parameter: 1064.2092440647268
Training R2 performance: 0.08
Validation R2 performance: 0.06
Pearson's correlation coefficient: 0.29



  metrics_df = metrics_df.append({



Target variable: frac_loss_drought
Estimated regularization parameter: 238.98925662310526
Training R2 performance: 0.42
Validation R2 performance: 0.38
Pearson's correlation coefficient: 0.62



  metrics_df = metrics_df.append({



Target variable: frac_loss_flood
Estimated regularization parameter: 688395.206964551
Training R2 performance: 0.00
Validation R2 performance: -0.00
Pearson's correlation coefficient: -0.04



  metrics_df = metrics_df.append({



Target variable: frac_loss_animal
Estimated regularization parameter: 21102.034285685964
Training R2 performance: 0.10
Validation R2 performance: -0.12
Pearson's correlation coefficient: -0.10



  metrics_df = metrics_df.append({



Target variable: frac_loss_pests
Estimated regularization parameter: 100000000.0
Training R2 performance: 0.00
Validation R2 performance: -0.03
Pearson's correlation coefficient: -0.01



  metrics_df = metrics_df.append({



Target variable: frac_loss_soil
Estimated regularization parameter: 7796.360130405253
Training R2 performance: 0.07
Validation R2 performance: 0.01
Pearson's correlation coefficient: 0.12



  metrics_df = metrics_df.append({



Target variable: frac_loss_fert
Estimated regularization parameter: 254334.57613046587
Training R2 performance: 0.03
Validation R2 performance: 0.01
Pearson's correlation coefficient: 0.29



  metrics_df = metrics_df.append({



Target variable: prop_till_plough
Estimated regularization parameter: 7.3259654282152304
Training R2 performance: 0.78
Validation R2 performance: 0.71
Pearson's correlation coefficient: 0.85



  metrics_df = metrics_df.append({



Target variable: prop_till_ridge
Estimated regularization parameter: 0.6078323128297236
Training R2 performance: 0.77
Validation R2 performance: 0.54
Pearson's correlation coefficient: 0.74



  metrics_df = metrics_df.append({



Target variable: prop_notill
Estimated regularization parameter: 100000000.0
Training R2 performance: 0.00
Validation R2 performance: -0.75
Pearson's correlation coefficient: 0.05



  metrics_df = metrics_df.append({



Target variable: prop_hand
Estimated regularization parameter: 2.706652070033247
Training R2 performance: 0.60
Validation R2 performance: 0.33
Pearson's correlation coefficient: 0.58



  metrics_df = metrics_df.append({



Target variable: prop_mono
Estimated regularization parameter: 0.369460120519931
Training R2 performance: 0.90
Validation R2 performance: 0.56
Pearson's correlation coefficient: 0.76



  metrics_df = metrics_df.append({



Target variable: prop_mix
Estimated regularization parameter: 154592.77364194783
Training R2 performance: 0.02
Validation R2 performance: -0.09
Pearson's correlation coefficient: -0.02



  metrics_df = metrics_df.append({



Target variable: log_maize
Estimated regularization parameter: 1.6451905877536674
Training R2 performance: 0.77
Validation R2 performance: 0.71
Pearson's correlation coefficient: 0.84



  metrics_df = metrics_df.append({



Target variable: log_sweetpotatoes
Estimated regularization parameter: 7.3259654282152304
Training R2 performance: 0.51
Validation R2 performance: 0.36
Pearson's correlation coefficient: 0.60



  metrics_df = metrics_df.append({



Target variable: log_groundnuts
Estimated regularization parameter: 7.3259654282152304
Training R2 performance: 0.58
Validation R2 performance: 0.42
Pearson's correlation coefficient: 0.66



  metrics_df = metrics_df.append({



Target variable: log_soybeans
Estimated regularization parameter: 12.052609368708413
Training R2 performance: 0.40
Validation R2 performance: -0.07
Pearson's correlation coefficient: 0.20

Target variable: loss_ind (Categorical)
Decision boundary: 0.3
False positive rate: 0.83
AUC-ROC: 0.84

Target variable: loss_ind (Categorical)
Decision boundary: 0.5
False positive rate: 0.33
AUC-ROC: 0.84

Target variable: loss_ind (Categorical)
Decision boundary: 0.7
False positive rate: 0.00
AUC-ROC: 0.84

Target variable: drought_loss_ind (Categorical)
Decision boundary: 0.3
False positive rate: 0.33
AUC-ROC: 0.79

Target variable: drought_loss_ind (Categorical)
Decision boundary: 0.5
False positive rate: 0.03
AUC-ROC: 0.79

Target variable: drought_loss_ind (Categorical)
Decision boundary: 0.7
False positive rate: 0.00
AUC-ROC: 0.79

Target variable: flood_loss_ind (Categorical)
Decision boundary: 0.3
False positive rate: 0.00
AUC-ROC: 0.46

Target variable: flood_loss_ind (Categorical)
Decisi

### Train Set

After training models for each specified target variable in `target_columns`, we employ these models to create and store predictions and R^2 scores for each target column on our training data. Our training data has been aggregated by survey enumeration area (SEA) and year, which means that each of the 436 rows of `y_pred_train` represents a prediction made for a particular SEA during a particular year. 

In [31]:
# Initialize empty dataframes for storing the predicted values and R2 scores
y_pred_train = pd.DataFrame()
r2_train = pd.DataFrame()

# Iterate over the keys in models dictionary
for target_column in models.keys():
    # Get the corresponding trained model for the target column
    model = models[target_column]
    
    # Get the training data for the target column
    X_train_column = X_trains[target_column]
    y_train_column = y_trains[target_column]
    
    # Make predictions for the target column
    y_pred_train_column = np.maximum(model.predict(X_train_column), 0)
    
    # Compute the R2 score for the target column
    r2_train_column = r2_score(y_train_column, y_pred_train_column)
    
    # Store the predicted values and R2 score in their respective dictionaries
    y_pred_train[target_column] = y_pred_train_column
    r2_train[target_column] = [r2_train_column]

In [33]:
y_pred_train.head()

Unnamed: 0,total_area_harv_ha,total_area_lost_ha,total_harv_kg,yield_kgha,frac_area_harv,frac_area_loss,area_lost_fire,maize,groundnuts,mixed_beans,...,prop_mix,log_maize,log_sweetpotatoes,log_groundnuts,log_soybeans,loss_ind,drought_loss_ind,flood_loss_ind,animal_loss_ind,pest_loss_ind
0,935.813979,1289.657304,35053.226702,156.162266,0.443569,0.556431,0.013532,218.023574,82.50281,238.919371,...,0.03048,3.209652,4.319458,3.250281,4.029624,0.95353,0.30702,0.038384,0.02721,0.045872
1,1233.538655,2082.036541,47873.34003,38.363936,0.372473,0.627527,0.013496,0.0,1064.982263,531.659898,...,0.037534,3.106022,3.847343,2.9925,5.859174,1.007499,0.084682,0.031714,0.012468,0.045877
2,621.75201,500.072322,107092.754235,1373.119866,0.676181,0.323819,0.013581,1628.506074,2000.871383,518.657605,...,0.019694,6.483269,6.436786,5.588523,5.917981,0.882286,0.688257,0.044518,0.0,0.045868
3,616.20043,900.484352,82401.302522,2307.667465,0.758817,0.241183,0.013554,2230.649639,3670.513302,906.540039,...,0.022664,6.507477,6.202475,7.400402,7.839794,0.523251,0.253419,0.038917,0.005693,0.045877
4,0.0,0.0,125761.251107,2429.729313,1.012649,0.0,0.013501,2417.996095,3546.945204,1197.419294,...,0.035116,8.397025,9.730469,9.24645,8.069045,0.404031,0.0,0.03446,0.031894,0.045869


### Visualize Performance of Train Set 

We visualize performances of the training set through scatterplots of our predicted values versus ground-truthed values. These scatterplots include a regression line, and display the R^2 value for the selected variable. 

In [36]:
# Create a list of variable names from the dataframes
variable_names = list(y_pred_train.columns)

# Create the dropdown widget
variable_dropdown = widgets.Dropdown(options=variable_names, description='Variable:')

# create a container widget to hold the dropdown and the plot
container = widgets.VBox(children=[variable_dropdown])

# Create an output widget to display the plot
plot_output = widgets.Output()

# Define a function to update the plot based on the selected variable
def update_plot_train(variable):
    with plot_output:
        clear_output(wait=True)
        # Create the scatterplot
        fig, ax = plt.subplots()
        ax.scatter(y_pred_train[variable], y_trains[variable])
        ax.axline([0, 0], [1, 1], c="k")

        # Extract the R2 value from the r2_train dataframe
        r2_value = r2_train[variable]
        r2_value = round(r2_value, 2)

        # Set the title with the current title as a subtitle and the new title as "Variable: [variable]"
        sub_title = f"Model applied to train data n = {len(y_trains)}, R$^2$ = {r2_value}"
        title = f"Variable: {variable}"
        plt.title(sub_title, fontsize=12, y=1.0, loc='left')
        plt.title(title, fontsize=14, y=1.15, loc='center')

        # Set x and y axis labels
        ax.set_xlabel("Predicted", fontsize=15)
        ax.set_ylabel("Ground Truth", fontsize=15)

        # Display the plot
        plt.show()

# Define a function to update the dropdown options when the variable names change
def update_dropdown_options(change):
    variable_dropdown.options = variable_names

# Call the update_plot_train function with the initial value of the dropdown
update_plot_train(variable_dropdown.value)

# Register the event handler to update the dropdown options
variable_dropdown.observe(update_dropdown_options, 'options')

# Set up the interaction between the dropdown and the plot
def dropdown_eventhandler(change):
    variable = change.new
    update_plot_train(variable)

variable_dropdown.observe(dropdown_eventhandler, 'value')

# Display the dropdown and the plot
display(widgets.VBox([variable_dropdown, plot_output]))

VBox(children=(Dropdown(description='Variable:', options=('total_area_harv_ha', 'total_area_lost_ha', 'total_h…

### Test Set 

Next, we employ these models to create and store predictions and R^2 scores for each target column on our testing data. Again, our testing data has been aggregated by survey enumeration area (SEA) and year, which means that each of the 436 rows of `y_pred_test` represents a prediction made for a particular SEA during a particular year. 

In [37]:
# Initialize empty dictionaries for storing the predicted values and R2 scores
y_pred_test = pd.DataFrame()
r2_test = pd.DataFrame()

# Iterate over the keys in models dictionary
for target_column in models.keys():
    # Get the corresponding trained model for the target column
    model = models[target_column]
    
    # Get the training data for the target column
    X_test_column = X_tests[target_column]
    y_test_column = y_tests[target_column]
    
    # Make predictions for the target column
    y_pred_test_column = np.maximum(model.predict(X_test_column), 0)
    
    # Compute the R2 score for the target column
    r2_test_column = r2_score(y_test_column, y_pred_test_column)
    
    # Store the predicted values and R2 score in their respective dictionaries
    y_pred_test[target_column] = y_pred_test_column
    r2_test[target_column] = [r2_test_column]

In [39]:
y_pred_test

Unnamed: 0,total_area_harv_ha,total_area_lost_ha,total_harv_kg,yield_kgha,frac_area_harv,frac_area_loss,area_lost_fire,maize,groundnuts,mixed_beans,...,prop_mix,log_maize,log_sweetpotatoes,log_groundnuts,log_soybeans,loss_ind,drought_loss_ind,flood_loss_ind,animal_loss_ind,pest_loss_ind
0,917.813698,0.0,111435.011634,341.454985,0.976549,0.023451,0.013659,377.65178,0.0,0.0,...,0.000404,4.555314,6.258815,3.184263,4.502035,1.226299,1.027636,0.048856,0.0,0.045864
1,791.052135,11.192504,45102.609459,0.0,0.683158,0.316842,0.013655,0.0,0.0,0.0,...,0.0,2.951948,5.282578,3.860021,5.214353,1.272964,1.01956,0.04943,0.010031,0.04587
2,391.878285,254.997234,102768.847047,1233.565136,0.658221,0.341779,0.013583,1362.083833,1948.804512,376.208639,...,0.018646,6.484091,7.141421,6.32203,5.899069,0.915167,0.657961,0.044257,0.0,0.045868
3,915.230064,1257.836838,52782.728788,1007.351629,0.545082,0.454918,0.013512,1035.826396,1341.86001,620.285469,...,0.034321,3.937893,4.197342,3.719265,5.937764,0.760854,0.097803,0.035037,0.023696,0.045877
4,782.274013,939.409659,88759.321211,604.816086,0.50427,0.49573,0.013592,663.225058,970.482632,10.045635,...,0.01572,5.068544,7.06918,5.034843,5.354388,1.140324,0.812087,0.046157,0.0,0.045871
5,1012.046607,1648.383404,25370.726266,360.449625,0.526944,0.473056,0.013562,518.124264,17.648074,9.207263,...,0.023304,3.477883,5.547287,2.951858,5.661399,0.812376,0.366215,0.039943,0.001973,0.045873
6,477.66082,680.724448,106965.723669,2283.333963,0.764828,0.235172,0.013498,2649.313718,2604.863987,1012.033003,...,0.036449,7.163687,6.449258,7.386272,6.430003,0.633261,0.0,0.031394,0.012443,0.045878
7,2406.583182,3806.406288,8009.244938,0.0,0.323951,0.676049,0.013583,0.0,0.0,270.207776,...,0.020733,0.6865,3.225395,1.285908,3.208963,1.155907,1.01851,0.043463,0.0,0.045871
8,0.0,0.0,124372.652236,3315.658773,0.957808,0.042192,0.013533,3789.423892,3797.084231,1320.336113,...,0.031279,9.312883,8.896121,9.544446,8.501795,0.558353,0.283116,0.038429,0.003849,0.04587
9,1023.940828,1303.896108,15499.150907,42.101692,0.610987,0.389013,0.013489,0.0,804.269856,420.579618,...,0.038884,3.313278,5.392915,5.632121,5.263136,0.976424,0.032197,0.032195,0.034917,0.045875


### Visualize Performance of Test Set 

In [30]:
# Create a list of variable names from the dataframes
variable_names = list(y_pred_test.columns)

# create a container widget to hold the dropdown and the plot
container = widgets.VBox(children=[variable_dropdown])

# Create the dropdown widget
variable_dropdown = widgets.Dropdown(options=variable_names, description='Variable:')

# Create an output widget to display the plot
plot_output = widgets.Output()


# Define a function to update the plot based on the selected variable
def update_plot_test(variable):
    with plot_output:
        clear_output(wait=True)
        # Create the scatterplot
        fig, ax = plt.subplots()
        ax.scatter(y_pred_test[variable], y_tests[variable])
        ax.axline([0, 0], [1, 1], c="k")

        # Extract the R2 value from the r2_train dataframe
        r2_value = r2_test[variable]
        r2_value = round(r2_value, 2)

        # Set the title with the current title as a subtitle and the new title as "Variable: [variable]"
        sub_title = f"Model applied to test data n = {len(y_tests)}, R$^2$ = {r2_value}"
        title = f"Variable: {variable}"
        plt.title(sub_title, fontsize=12, y=1.0, loc='left')
        plt.title(title, fontsize=14, y=1.15, loc='center')

        # Set x and y axis labels
        ax.set_xlabel("Predicted", fontsize=15)
        ax.set_ylabel("Ground Truth", fontsize=15)

        # Display the plot
        plt.show()

# Define a function to update the dropdown options when the variable names change
def update_dropdown_options(change):
    variable_dropdown.options = variable_names

# Call the update_plot_train function with the initial value of the dropdown
update_plot_test(variable_dropdown.value)

# Register the event handler to update the dropdown options
variable_dropdown.observe(update_dropdown_options, 'options')

# Set up the interaction between the dropdown and the plot
def dropdown_eventhandler(change):
    variable = change.new
    update_plot_test(variable)

variable_dropdown.observe(dropdown_eventhandler, 'value')

# Display the dropdown and the plot
display(widgets.VBox([variable_dropdown, plot_output]))

VBox(children=(Dropdown(description='Variable:', options=('total_area_harv_ha', 'total_area_lost_ha', 'total_h…

### Apply Model to Ungrouped SEA Features

In [7]:
features_sea_ungrouped = pd.read_feather("/capstone/mosaiks/repos/modeling/data/model_directory/SEA_ungroup_features_simple_impute_mean.feather")

In [153]:
features_sea = features_sea_ungrouped.iloc[:, 2:12002]

Unnamed: 0,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,0_10,...,999_3,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12
0,0.001143,0.000751,0.000000,0.000346,0.000000,0.000000,0.000000,0.000000,0.000000,0.001020,...,0.034016,0.071989,0.532948,0.469076,0.007786,0.006779,0.004811,0.001675,0.029891,0.033437
1,0.001143,0.000000,0.000776,0.000000,0.000000,0.000000,0.000000,0.000000,0.000002,0.000308,...,0.043530,0.494736,0.474246,0.417571,0.135569,0.003355,0.004876,0.003185,0.187867,0.156783
2,0.001143,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000234,0.000039,...,0.021665,0.324015,0.490193,0.392582,0.325692,0.004587,0.002309,0.002191,0.002333,0.033437
3,0.001143,0.000751,0.000776,0.000000,0.000000,0.000000,0.000000,0.000000,0.000657,0.000894,...,0.043530,0.718919,0.645601,0.673485,0.631725,0.004959,0.001359,0.001627,0.052683,0.033437
4,0.000000,0.000751,0.000776,0.000000,0.000000,0.000000,0.000000,0.000000,0.000002,0.001360,...,0.043530,1.000000,0.679722,1.000000,0.324600,0.006641,0.003481,0.003071,0.004119,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72278,0.001143,0.003581,0.003867,0.001019,0.001521,0.000000,0.002637,0.003194,0.004948,0.005988,...,0.008950,0.013373,0.009622,0.008655,0.003418,0.001569,0.000234,0.000054,0.000039,0.000276
72279,0.000000,0.000000,0.000000,0.000000,0.000000,0.002137,0.003117,0.004004,0.004732,0.000000,...,0.013883,0.014116,0.009862,0.002694,0.000432,0.000132,0.000086,0.000063,0.000060,0.033437
72280,0.002399,0.000751,0.000000,0.000000,0.001860,0.002130,0.002149,0.000000,0.004094,0.006669,...,0.012160,0.011552,0.013037,0.007480,0.002309,0.000083,0.000194,0.000100,0.000016,0.004205
72281,0.003386,0.000000,0.000000,0.000000,0.000923,0.002777,0.003057,0.004031,0.000000,0.000000,...,0.005391,0.004631,0.007071,0.001832,0.001039,0.000223,0.000081,0.000324,0.001030,0.004801


In [155]:
# Initialize empty dictionaries for storing the predicted values and R2 scores
y_pred_sea = pd.DataFrame()

# Iterate over the keys in models dictionary
for target_column in models.keys():
    # Get the corresponding trained model for the target column
    model = models[target_column]
    
    # Make predictions for the target column
    y_pred_sea_column = np.maximum(model.predict(features_sea), 0)
    
    # Store the predicted values and R2 score in their respective dictionaries
    y_pred_sea[target_column] = y_pred_sea_column

In [162]:
# Select the columns from features
selected_columns_sea = features_ungrouped[['lat', 'lon', 'year']]

# Concatenate selected_columns with y_preds
sea_preds = pd.concat([selected_columns_sea, y_pred_sea], axis=1)

# Display the combined dataframe
sea_preds

In [None]:
sea_preds.to_feather("/capstone/mosaiks/repos/modeling/data/predictions/SEA_predictions_ungrouped.feather")

## Apply Model to Zambia 10% Data


In [163]:
zambia = pd.read_feather("/capstone/mosaiks/repos/modeling/data/model_directory/zambia_10percent_features_simple_impute_modelpredict.feather")

In [165]:
zambia_features = zambia.iloc[:,2:12002]
zambia_features.head()

Unnamed: 0,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,0_10,...,999_3,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12
0,0.002994,0.003749,0.002417,0.001449,0.001208,0.002577,0.002151,0.003231,0.004019,0.001838,...,0.000653,0.00141,0.001619,0.000461,0.000611,0.000226,0.000138,0.000562,0.000503,0.000406
1,0.00203,0.000743,0.0,0.0,0.0,7e-06,3.5e-05,0.000189,0.000664,0.002471,...,0.01228,0.009524,0.006202,0.004043,0.003652,0.002408,0.001475,0.000435,0.000302,0.005276
2,0.001111,0.003541,0.003555,0.001752,0.001398,0.001469,0.002361,0.002198,0.003063,0.005263,...,0.001252,0.004579,0.00331,0.002417,0.001392,0.001687,0.000812,0.000241,0.000365,0.000645
3,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.00064,0.001207,0.004661,...,0.055707,0.075393,0.056347,0.056281,0.036438,0.00406,0.008108,0.000633,0.000496,0.000598
4,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.00064,0.004295,0.001444,...,0.055707,0.075393,0.056347,0.056281,0.036438,0.00406,0.000187,0.002911,0.004108,0.035491


In [166]:
# Initialize empty dictionaries for storing the predicted values and R2 scores
y_pred_zambia = pd.DataFrame()

# Iterate over the keys in models dictionary
for target_column in models.keys():
    # Get the corresponding trained model for the target column
    model = models[target_column]
    
    # Make predictions for the target column
    y_pred_zambia_column = np.maximum(model.predict(zambia_features), 0)
    
    # Store the predicted values and R2 score in their respective dictionaries
    y_pred_zambia[target_column] = y_pred_zambia_column

In [167]:
y_pred_zambia.head()

Unnamed: 0,total_area_harv_ha,total_area_lost_ha,yield_kgha,frac_area_harv,frac_area_loss,maize,frac_loss_drought,prop_till_plough,prop_mono
0,0.0,0.0,1735.397458,0.813706,0.186294,1722.112146,0.0,0.0,1.07367
1,447.061644,256.229862,2331.179534,0.935772,0.064228,2405.568184,0.046058,0.550284,0.400076
2,1206.135845,2076.177727,1970.026901,0.587526,0.412474,1764.996532,0.053027,0.100866,0.471011
3,670.181614,898.956082,1155.632228,0.622778,0.377222,1093.192949,0.096518,0.632547,0.714045
4,588.095553,878.12345,1066.901627,0.603163,0.396837,1051.265307,0.05254,0.513613,0.712723


In [168]:
# Select the columns from features
selected_columns_zambia = zambia[['lat', 'lon', 'year']]

# Concatenate selected_columns with y_preds
zambia_preds = pd.concat([selected_columns_zambia, y_pred_zambia], axis=1)

# Display the combined dataframe
zambia_preds

Unnamed: 0,lat,lon,year,total_area_harv_ha,total_area_lost_ha,yield_kgha,frac_area_harv,frac_area_loss,maize,frac_loss_drought,prop_till_plough,prop_mono
0,-15.053257,22.730588,2019.0,0.000000,0.000000,1735.397458,0.813706,0.186294,1722.112146,0.000000,0.000000,1.073670
1,-15.053257,22.730588,2018.0,447.061644,256.229862,2331.179534,0.935772,0.064228,2405.568184,0.046058,0.550284,0.400076
2,-15.053257,22.730588,2022.0,1206.135845,2076.177727,1970.026901,0.587526,0.412474,1764.996532,0.053027,0.100866,0.471011
3,-15.053257,22.730588,2023.0,670.181614,898.956082,1155.632228,0.622778,0.377222,1093.192949,0.096518,0.632547,0.714045
4,-15.053257,22.730588,2015.0,588.095553,878.123450,1066.901627,0.603163,0.396837,1051.265307,0.052540,0.513613,0.712723
...,...,...,...,...,...,...,...,...,...,...,...,...
680651,-17.473257,26.080588,2016.0,0.000000,0.000000,596.193086,1.145336,0.000000,352.100772,0.165099,1.075864,0.000000
680652,-17.473257,26.080588,2015.0,102.839855,0.000000,3497.057246,0.813705,0.186295,4081.543865,0.027710,0.556807,1.361831
680653,-17.473257,26.080588,2015.0,102.839855,0.000000,3497.057246,0.813705,0.186295,4081.543865,0.027710,0.556807,1.361831
680654,-17.473257,26.080588,2018.0,768.755230,983.689961,987.401820,0.665632,0.334368,1079.121809,0.144083,0.831311,0.631516


In [169]:
zambia_preds.to_feather("/capstone/mosaiks/repos/modeling/data/predictions/zambia_10perc_predictions.feather")

In [None]:
sea_preds = pd.read_csv('capstone/mosaiks/repos/modeling/data/predictions/SEA_predictions_ungrouped.csv')


### Congratulations on completing this analysis!