# Modeling Agricultural Variables - Feature Preprocessing
## Python modules

In [15]:
import warnings
import time
import os

import dask
from dask.distributed import Client

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors

import geopandas as gpd

import pyarrow
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning
from scipy.stats import pearsonr

import math
import seaborn as sns
import random

## Joining Features and Ground Truth Data - Preprocessing
This notebook will be used to prepare our feature and ground-truth data for our modeling process. At this point, we have a directory of .feather files that contain our features (see this notebook for feature generation). We want to concatentate these together, and perform the necessary operations to achieve a dataframe where a row is one observation unit; its columns are the associated features and the ground truth data that we want to train the model on. Broadly, we want our dataframe to take on the following form:

| observation_unit |  outcome_1 | outcome_2 | feature_1| feature_2 | feature_3
| ----|  ---- | ---- | -- | -- | -- |
| 1   |  $y_{1}$ | $y_{2}$ | $x_{1}$ | $x_{2}$ | $x_{3}$
| 2   |  $y_{1}$ | $y_{2}$ | $x_{1}$| $x_{2}$ | $x_{3}$

Where we can run regress `outcome_1` and `outcome_2` on our features, using the following form:

$y_{1}$ = $\beta_{1}$$x_{1}$ + $\beta_{2}$$x_{2}$ + $\beta_{3}$$x_{3}$ + $\beta_{n}$$x_{n}$ 

In our case, an observation is one SEA/year. We have ground truth data that summarizes the total crop yield, total crop loss, as well as other pertinent agricultural variables. We want our dataframe described above to look like this:

| SEA | Year | Observed Yield (Tonnes) | Observed Loss (Tonnes) | feature1| feature2 | feature3
| ----| ----| ---- | ---- | -- | -- | -- |
| 1   | 2016 | 72 | 13 | 1.23 | 3.25 | 0.123
| 2   | 2016  | 50 | 7.5 | 0.78| 1.2 | 2.4 

## 1. Load in the Data

### Read in the features

First, load in the concatenated feature feather file. This file should be what was produced from the ______ notebook file.

In [2]:
# First, define the filepaths where you saved your concatenated features and your cleaned ground-truth data, respectively

features_filepath = "/capstone/mosaiks/repos/modeling/data/features_raw"
features_filename = "sentinel_rgb_features_sea.feather"
#features_filename = "sentinel_rgb8_features_sea_save_2023_05_10.feather"

ground_truth_filepath = "/capstone/mosaiks/repos/preprocessing/data/ground_data_spatial"
ground_truth_filename = "updated_data.shp"


In [7]:
# Read in the new concatenated features feather file:
features = gpd.read_feather(f"{features_filepath}/{features_filename}")
#features = pd.read_feather(f"{features_filepath}/{features_filename}")
features = features.reset_index()
features = features.drop(columns=['index'])

# We can print the features dataframe and what years are present to check if they are correct
print(pd.unique(features['year']))
features


[2015 2016 2017 2018 2019 2020 2021 2022]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,995,996,997,998,999,lon,lat,year,month,geometry
0,0.001058,0.000000,0.005181,1.014728,0.098902,0.000000,0.033511,1.810280,0.0,0.000000,...,3.478022,4.774719,0.011104,0.840888,0.000045,27.47466,-16.339357,2015,7,POINT (27.47466 -16.33936)
1,0.002010,0.000022,0.003418,1.058190,0.125574,0.000291,0.039863,1.858072,0.0,0.000029,...,3.526972,4.882292,0.024313,0.873838,0.000047,27.46466,-16.339357,2015,7,POINT (27.46466 -16.33936)
2,0.004124,0.000000,0.000000,1.069269,0.127892,0.000000,0.036240,1.871935,0.0,0.000054,...,3.552448,4.921965,0.022708,0.881511,0.000000,27.46466,-16.329357,2015,7,POINT (27.46466 -16.32936)
3,0.001559,0.000004,0.000000,1.103106,0.134511,0.000230,0.057822,1.941573,0.0,0.000000,...,3.571980,4.965928,0.021992,0.857326,0.000266,27.45466,-16.339357,2015,7,POINT (27.45466 -16.33936)
4,0.001731,0.000002,0.005510,1.056985,0.122743,0.000167,0.034343,1.860240,0.0,0.000000,...,3.535786,4.864718,0.009189,0.852267,0.000000,27.47466,-16.349357,2015,7,POINT (27.47466 -16.34936)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549298,0.004385,0.000510,0.008147,1.396553,0.238291,0.000534,0.236429,2.404210,0.0,0.000024,...,3.850011,5.434838,0.012184,0.761620,0.000007,27.96466,-16.739357,2022,12,POINT (27.96466 -16.73936)
549299,0.000493,0.001652,0.002549,1.860167,0.405460,0.001686,0.382340,3.069240,0.0,0.000000,...,4.408813,6.329021,0.021479,0.742347,0.000000,27.97466,-16.739357,2022,12,POINT (27.97466 -16.73936)
549300,0.006834,0.000185,0.006003,1.653710,0.388718,0.000715,0.292500,2.731598,0.0,0.000031,...,4.198204,5.937481,0.013647,0.769534,0.000002,27.97466,-16.749357,2022,12,POINT (27.97466 -16.74936)
549301,0.003454,0.000005,0.001523,1.121913,0.219986,0.000052,0.136004,1.987867,0.0,0.000000,...,3.586843,4.973966,0.024207,0.821185,0.000817,27.97466,-16.759357,2022,12,POINT (27.97466 -16.75936)


### Read in Ground-Truth Data

Next, we read in our ground truth data, which should be cleaned and processed beforehand. The code we used to clean our ground-truth data will not be shared publicly due to the level of detail it contains about our raw data from the Zambian Ministry of Agriculture.

In [8]:
# If you have many variables or columns you will want to model, it is a good idea to make sure you save the names to the dataframe columns
names = ["sea_unq", "year", "total_area_planted_ha", "total_area_harv_ha", "total_area_lost_ha", "total_harv_kg", "yield_kgha", "frac_area_harv", "frac_area_loss", "area_lost_fire", "maize", "groundnuts", "mixed_beans",  "popcorn", "sorghum", "soybeans", "sweet_potatoes", "bunding", "monocrop", "mixture", "frac_loss_drought",  "frac_loss_flood", "frac_loss_animal", "frac_loss_pests", "frac_loss_soil", "frac_loss_fert", "prop_till_plough",  "prop_till_ridge", "prop_notill", "prop_hand", "prop_mono", "prop_mix", "log_maize",  "log_sweetpotatoes", "log_groundnuts", "log_soybeans", "loss_ind", "drought_loss_ind", "flood_loss_ind", "animal_loss_ind", "pest_loss_ind", "geometry"]

# Now read in your cleaned and formatted ground-truth data, it should contain geometry and is likely a shapefile
# If the below code line does not work, make sure to format your data as outlined in the Preprocessing repository, AND if it is a .shp file, make sure it has all additional files needed to read it in.
country_sea = gpd.read_file(f"{ground_truth_filepath}/{ground_truth_filename}")

# This is iptional if geopandas did not name your columns correctly, rename the columns of your data with the names list
country_sea.columns = names

# Check the years and print the information/columns stored in your data to check that everything was read in correctly
print(pd.unique(country_sea['year']))
country_sea


[2009. 2010. 2011. 2012. 2013. 2014. 2016. 2017. 2020. 2021.]


Unnamed: 0,sea_unq,year,total_area_planted_ha,total_area_harv_ha,total_area_lost_ha,total_harv_kg,yield_kgha,frac_area_harv,frac_area_loss,area_lost_fire,...,log_maize,log_sweetpotatoes,log_groundnuts,log_soybeans,loss_ind,drought_loss_ind,flood_loss_ind,animal_loss_ind,pest_loss_ind,geometry
0,1,2009.0,34.9725,34.972500,0.000000,57563.0,1645.950390,1.000000,0.000000,0.0,...,7.419992,8.455318,7.276696,7.107469,0.0,0.0,0.0,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
1,1,2010.0,32.2150,26.762500,5.452500,69925.0,2170.572715,0.830747,0.169253,0.0,...,7.479710,9.021598,8.309147,7.377759,1.0,0.0,0.0,0.0,1.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
2,1,2011.0,60.4075,59.752500,0.655000,120614.0,1996.672599,0.989157,0.010843,0.0,...,7.563515,8.759924,8.066608,-inf,0.0,0.0,0.0,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
3,1,2012.0,64.6175,57.602500,7.015000,105990.0,1640.267729,0.891438,0.108562,0.0,...,7.248387,8.836374,8.067501,6.476972,1.0,1.0,0.0,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
4,1,2013.0,325.5000,282.750000,42.750000,1975.0,6.067588,0.868664,0.131336,0.0,...,0.213193,-inf,4.264244,-inf,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3600,392,2014.0,485.6942,116.705696,368.988504,358.0,0.737089,0.240286,0.759714,0.0,...,-0.866062,-inf,9.433484,-inf,1.0,1.0,0.0,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
3601,392,2016.0,5193.0000,3272.000000,1921.000000,26125.0,5.030811,0.630079,0.369921,0.0,...,1.821687,2.074326,1.408068,-inf,1.0,1.0,0.0,1.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
3602,392,2017.0,2652.7500,1261.000000,1391.750000,17880.0,6.740175,0.475356,0.524644,0.0,...,2.306288,-inf,1.781542,-inf,1.0,1.0,0.0,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
3603,392,2020.0,103.5250,66.501000,37.024000,35620.0,344.071480,0.642367,0.357633,0.0,...,5.648800,-inf,6.830794,-inf,1.0,1.0,0.0,0.0,1.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."


In [4]:
# This code chunk is for loading the 10% sampled zambian features

features_filepath = "/capstone/mosaiks/repos/modeling/data"
features_filename = "features_zmb_save.feather"

ground_truth_filepath = "/capstone/mosaiks/repos/modeling/data/zambian_ward_boundries"
ground_truth_filename = "popward.shp"

features = pd.read_feather(f"{features_filepath}/{features_filename}")

country_ward = gpd.read_file(f"{ground_truth_filepath}/{ground_truth_filename}")

In [5]:
len(pd.unique(country_ward['WARD_NAME'])) # 1278
country_ward

Unnamed: 0,ID,WARD_NAME,PROV_CODE,PROVINCENA,DISTRICT_C,DISTRICTNA,CONST_CODE,WARD_CODE,perimeter,Shape_Leng,Shape_Area,geometry
0,100113408,Mapungu,1.0,Western,1001.0,Kalabo,134.0,08,59.835569,0.547729,0.010040,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155..."
1,100113409,Luanginga,1.0,Western,1001.0,Kalabo,134.0,09,17.512021,0.160195,0.001180,"POLYGON ((22.69203 -14.98920, 22.69243 -14.989..."
2,100113410,Liumba,1.0,Western,1001.0,Kalabo,134.0,10,54.226736,0.507828,0.006990,"POLYGON ((22.50557 -14.97359, 22.50855 -14.974..."
3,100113411,Yuka,1.0,Western,1001.0,Kalabo,134.0,11,90.136126,0.827044,0.018422,"POLYGON ((22.65805 -14.98798, 22.65975 -14.988..."
4,100113412,Buleya,1.0,Western,1001.0,Kalabo,134.0,12,61.102504,0.559915,0.008911,"POLYGON ((22.70868 -15.13386, 22.70826 -15.133..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1411,80611011,Tuvwanganai,8.0,North-Western,806.0,Solwezi,110.0,11,17.304878,0.153453,0.000803,"POLYGON ((26.39001 -12.17563, 26.39083 -12.175..."
1412,40606605,Kashikishi,4.0,Luapula,702.0,Nchelenge,66.0,05,167.699521,1.413525,0.069055,"POLYGON ((28.85148 -9.18543, 28.85182 -9.18544..."
1413,40205809,Mulundu,4.0,Luapula,402.0,Kawambwa,58.0,09,120.613286,1.085770,0.049460,"POLYGON ((29.50465 -9.61324, 29.50846 -9.61495..."
1414,50407607,Libala,5.0,Lusaka,504.0,Lusaka,76.0,07,15.840070,0.119842,0.000551,"POLYGON ((28.31243 -15.47706, 28.31198 -15.477..."


### Define the spatial boundaries

We're going to make another object `sea_unq_join` which contains the spatial information and a unique key for each SEA. This will be handy later, when we need to join the features to the ground-truth data.

In [9]:
# Filter country_sea for unique values of 'seq_unq' and 'geometry'
sea_unq_join = country_sea[['sea_unq', 'geometry']].drop_duplicates()
sea_unq_join

# Display the filtered DataFrame
print(sea_unq_join)


      sea_unq                                           geometry
0           1  POLYGON ((27.82327 -13.65772, 27.82294 -13.657...
10          2  POLYGON ((27.99349 -13.46497, 27.99352 -13.464...
20          3  POLYGON ((28.09909 -13.51864, 28.09867 -13.516...
29          4  POLYGON ((28.31924 -13.42915, 28.31911 -13.426...
38          5  POLYGON ((28.39982 -13.51544, 28.40012 -13.514...
...       ...                                                ...
3571      388  POLYGON ((25.07771 -14.63920, 25.07732 -14.638...
3578      389  POLYGON ((22.74142 -14.00343, 22.73856 -14.002...
3585      390  POLYGON ((23.08604 -14.20026, 23.08957 -14.202...
3592      391  POLYGON ((24.36764 -16.62208, 24.36564 -16.621...
3599      392  POLYGON ((23.23962 -16.31204, 23.23876 -16.312...

[392 rows x 2 columns]


In [7]:
# For getting the features at the ward level, filter them for unique wards
ward_unq_join = country_ward[['geometry']].drop_duplicates()

# Assign a unique numerical identifier to each geometry
ward_unq_join['ward_id'] = np.arange(len(ward_unq_join))
# Display the filtered DataFrame
print(ward_unq_join)

                                               geometry  ward_id
0     POLYGON ((22.80895 -15.15215, 22.80747 -15.155...        0
1     POLYGON ((22.69203 -14.98920, 22.69243 -14.989...        1
2     POLYGON ((22.50557 -14.97359, 22.50855 -14.974...        2
3     POLYGON ((22.65805 -14.98798, 22.65975 -14.988...        3
4     POLYGON ((22.70868 -15.13386, 22.70826 -15.133...        4
...                                                 ...      ...
1411  POLYGON ((26.39001 -12.17563, 26.39083 -12.175...     1411
1412  POLYGON ((28.85148 -9.18543, 28.85182 -9.18544...     1412
1413  POLYGON ((29.50465 -9.61324, 29.50846 -9.61495...     1413
1414  POLYGON ((28.31243 -15.47706, 28.31198 -15.477...     1414
1415  POLYGON ((26.09249 -17.47196, 26.08855 -17.487...     1415

[1416 rows x 2 columns]


## 2. Organize the features by growing season

**Important Note: Be sure to only run the below code ONCE, as it can cause issues if run multiple times (without restarting the kernel, of course).**

If you are looking at crop yields, then it is best to carry months over into the next year according to the crops' growing season(s). The below code is for maize's growing season. For maize, we carry the months October, November, and December over to the following year's data. These months represent the start of the growing season for the following year's maize yield. 


In [10]:
# IMPORTANT: Only run this chunk once after reading in your features initially
# Define the last year you have features for
year_end = 2023

features['year'] = np.where(
    features['month'].isin([10, 11, 12]),
    features['year'] + 1, 
    features['year'])

features_gs = features[features['year'] <= year_end]

features_gs.sort_values(['year', 'month'], inplace=True)

## 3. Pivot Wider by months

Since we want each row to represent one location per year, we can use the .unstack() function to pivot wider all rows with the same lat/lon and year. This results in a dataframe with 12,000 columns (1,000 columns for each month). 

In [11]:
features_gs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,995,996,997,998,999,lon,lat,year,month,geometry
0,0.001058,0.000000,0.005181,1.014728,0.098902,0.000000,0.033511,1.810280,0.0,0.000000,...,3.478022,4.774719,0.011104,0.840888,0.000045,27.47466,-16.339357,2015,7,POINT (27.47466 -16.33936)
1,0.002010,0.000022,0.003418,1.058190,0.125574,0.000291,0.039863,1.858072,0.0,0.000029,...,3.526972,4.882292,0.024313,0.873838,0.000047,27.46466,-16.339357,2015,7,POINT (27.46466 -16.33936)
2,0.004124,0.000000,0.000000,1.069269,0.127892,0.000000,0.036240,1.871935,0.0,0.000054,...,3.552448,4.921965,0.022708,0.881511,0.000000,27.46466,-16.329357,2015,7,POINT (27.46466 -16.32936)
3,0.001559,0.000004,0.000000,1.103106,0.134511,0.000230,0.057822,1.941573,0.0,0.000000,...,3.571980,4.965928,0.021992,0.857326,0.000266,27.45466,-16.339357,2015,7,POINT (27.45466 -16.33936)
4,0.001731,0.000002,0.005510,1.056985,0.122743,0.000167,0.034343,1.860240,0.0,0.000000,...,3.535786,4.864718,0.009189,0.852267,0.000000,27.47466,-16.349357,2015,7,POINT (27.47466 -16.34936)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549298,0.004385,0.000510,0.008147,1.396553,0.238291,0.000534,0.236429,2.404210,0.0,0.000024,...,3.850011,5.434838,0.012184,0.761620,0.000007,27.96466,-16.739357,2023,12,POINT (27.96466 -16.73936)
549299,0.000493,0.001652,0.002549,1.860167,0.405460,0.001686,0.382340,3.069240,0.0,0.000000,...,4.408813,6.329021,0.021479,0.742347,0.000000,27.97466,-16.739357,2023,12,POINT (27.97466 -16.73936)
549300,0.006834,0.000185,0.006003,1.653710,0.388718,0.000715,0.292500,2.731598,0.0,0.000031,...,4.198204,5.937481,0.013647,0.769534,0.000002,27.97466,-16.749357,2023,12,POINT (27.97466 -16.74936)
549301,0.003454,0.000005,0.001523,1.121913,0.219986,0.000052,0.136004,1.987867,0.0,0.000000,...,3.586843,4.973966,0.024207,0.821185,0.000817,27.97466,-16.759357,2023,12,POINT (27.97466 -16.75936)


In [12]:
# Store the geometry column so it doesn't get pivoted
# Save the geometry separately
geometry_col = features_gs['geometry']

# Drop the geometry column to avoid pivoting it
features_gs = features_gs.drop(columns=['geometry'])

In [13]:
# Use the unstack() function to pivot wider the rows with the same lat/lon 
features = features_gs.set_index(['lon', 'lat', 'year', 'month']).unstack()

# Apply a transformation to the columns' names
features.columns = features.columns.map(lambda x: '{}_{}'.format(*x))

In [14]:
# Since some of our features contain infinite values, it is important to replace those with NaN values
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features = features.reset_index()

# Check the dataframe to see what it looks like
features.iloc[:, :]


Unnamed: 0,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,...,999_3,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12
0,22.00466,-16.189357,2015,,,,,,,,...,,,,,,,0.000210,,,
1,22.00466,-16.189357,2016,,0.000038,,0.000000,0.000000e+00,5.606620e-05,0.000000,...,,1.000000,1.000000,0.000197,1.000000,0.000144,0.000061,0.000102,0.000338,0.000273
2,22.00466,-16.189357,2017,,,,0.000000,3.640897e-07,8.515847e-06,0.000068,...,,0.004659,0.001663,0.001083,0.000475,0.000215,0.000102,0.000039,0.021058,
3,22.00466,-16.189357,2018,0.000032,,,0.000000,0.000000e+00,9.116117e-07,0.000044,...,,0.003729,0.002599,0.001340,0.000328,0.000404,0.000182,0.000114,0.001221,
4,22.00466,-16.189357,2019,0.000080,0.000025,0.000026,0.000078,3.502390e-05,1.300048e-04,0.000079,...,0.001926,0.001003,0.000488,0.000445,0.000310,0.000058,0.000035,0.000064,0.000242,0.000298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72513,33.50466,-10.209357,2019,0.005556,,,0.000067,4.927176e-05,5.067072e-05,0.000145,...,,0.005145,0.003170,0.002521,0.000776,0.000474,0.000011,0.000004,0.000041,0.000078
72514,33.50466,-10.209357,2020,,,,0.000080,1.206588e-04,,0.000283,...,,0.004205,0.007104,,0.001003,0.000172,0.000003,0.000015,0.000032,0.000130
72515,33.50466,-10.209357,2021,,,0.000100,,7.305521e-05,1.146089e-04,0.000278,...,0.004744,,0.004004,0.002973,0.001367,0.000131,0.000033,0.000021,0.000219,
72516,33.50466,-10.209357,2022,,,,,2.063466e-04,1.241788e-04,0.000497,...,,,0.004903,0.003632,0.001904,0.000701,0.000214,,0.000204,0.000255


### 4. Convert the features into a Geo Dataframe

This step allows us to join the features with our clean, ground-truth survey data based on the geometries. 

In [18]:
# Create a geodataframe of the new features
features_gdf = gpd.GeoDataFrame(
    features, 
    geometry = gpd.points_from_xy(x = features.lon, y = features.lat), # Use the lon and lat values to define the points/geometry
    crs='EPSG:4326' # Define that coordinate reference system that your features and data use
)

In [19]:
# Check what this geodataframe looks like, note the many different geometry columns
features_gdf.iloc[:, :]
#features_gdf.iloc[0:5,9000:]

Unnamed: 0,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,...,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12,geometry
0,22.00466,-16.189357,2015,,,,,,,,...,,,,,,0.000210,,,,POINT (22.00466 -16.18936)
1,22.00466,-16.189357,2016,,0.000038,,0.000000,0.000000e+00,5.606620e-05,0.000000,...,1.000000,1.000000,0.000197,1.000000,0.000144,0.000061,0.000102,0.000338,0.000273,POINT (22.00466 -16.18936)
2,22.00466,-16.189357,2017,,,,0.000000,3.640897e-07,8.515847e-06,0.000068,...,0.004659,0.001663,0.001083,0.000475,0.000215,0.000102,0.000039,0.021058,,POINT (22.00466 -16.18936)
3,22.00466,-16.189357,2018,0.000032,,,0.000000,0.000000e+00,9.116117e-07,0.000044,...,0.003729,0.002599,0.001340,0.000328,0.000404,0.000182,0.000114,0.001221,,POINT (22.00466 -16.18936)
4,22.00466,-16.189357,2019,0.000080,0.000025,0.000026,0.000078,3.502390e-05,1.300048e-04,0.000079,...,0.001003,0.000488,0.000445,0.000310,0.000058,0.000035,0.000064,0.000242,0.000298,POINT (22.00466 -16.18936)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72513,33.50466,-10.209357,2019,0.005556,,,0.000067,4.927176e-05,5.067072e-05,0.000145,...,0.005145,0.003170,0.002521,0.000776,0.000474,0.000011,0.000004,0.000041,0.000078,POINT (33.50466 -10.20936)
72514,33.50466,-10.209357,2020,,,,0.000080,1.206588e-04,,0.000283,...,0.004205,0.007104,,0.001003,0.000172,0.000003,0.000015,0.000032,0.000130,POINT (33.50466 -10.20936)
72515,33.50466,-10.209357,2021,,,0.000100,,7.305521e-05,1.146089e-04,0.000278,...,,0.004004,0.002973,0.001367,0.000131,0.000033,0.000021,0.000219,,POINT (33.50466 -10.20936)
72516,33.50466,-10.209357,2022,,,,,2.063466e-04,1.241788e-04,0.000497,...,,0.004903,0.003632,0.001904,0.000701,0.000214,,0.000204,0.000255,POINT (33.50466 -10.20936)


In [19]:
# Drop the redundant independent geometry columns because now that they are in a separate geometry column (if needed)
# Check what it looks like now
features_gdf.iloc[:, 9000:]
#features_gdf.geometry

Unnamed: 0,749_10,749_11,749_12,750_1,750_2,750_3,750_4,750_5,750_6,750_7,...,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12,geometry
0,,,,,,,,,,,...,,,,,,0.000210,,,,POINT (22.00466 -16.18936)
1,0.029813,0.078275,0.010708,,0.232216,,1.000000,1.000000,0.095275,1.000000,...,1.000000,1.000000,0.000197,1.000000,0.000144,0.000061,0.000102,0.000338,0.000273,POINT (22.00466 -16.18936)
2,0.022654,0.186308,,,,,0.418081,0.215358,0.183773,0.066819,...,0.004659,0.001663,0.001083,0.000475,0.000215,0.000102,0.000039,0.021058,,POINT (22.00466 -16.18936)
3,0.011619,0.108658,,0.221533,,,0.371180,0.229619,0.179176,0.104843,...,0.003729,0.002599,0.001340,0.000328,0.000404,0.000182,0.000114,0.001221,,POINT (22.00466 -16.18936)
4,0.012925,0.056511,0.049056,0.205820,0.284523,0.132658,0.146269,0.098466,0.060138,0.104179,...,0.001003,0.000488,0.000445,0.000310,0.000058,0.000035,0.000064,0.000242,0.000298,POINT (22.00466 -16.18936)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72513,0.049009,0.033550,0.026155,0.264853,,,0.385118,0.233107,0.212048,0.080460,...,0.005145,0.003170,0.002521,0.000776,0.000474,0.000011,0.000004,0.000041,0.000078,POINT (33.50466 -10.20936)
72514,0.018789,0.048263,0.013669,,,,0.396309,0.263039,,0.096227,...,0.004205,0.007104,,0.001003,0.000172,0.000003,0.000015,0.000032,0.000130,POINT (33.50466 -10.20936)
72515,0.045091,0.097623,,,,0.473596,,0.253704,0.199283,0.121137,...,,0.004004,0.002973,0.001367,0.000131,0.000033,0.000021,0.000219,,POINT (33.50466 -10.20936)
72516,,0.022850,0.055518,,,,,0.353117,0.244127,0.117323,...,,0.004903,0.003632,0.001904,0.000701,0.000214,,0.000204,0.000255,POINT (33.50466 -10.20936)


## 5. Join features to ground data

This is an important step, since this is how we can use both the features and the data to use in training our models. It allows us to spatially connect the ground-truth data and the features.

In [20]:
# Now lets combine the ground-truth data that is grouped by unique SEAs with our features
spatial_join = gpd.sjoin(features_gdf, sea_unq_join, how='right', predicate = 'within')


In [21]:
spatial_join

Unnamed: 0,index_left,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,...,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12,sea_unq,geometry
0,46330.0,27.81466,-13.669357,2017.0,,,0.000000,,0.000000,0.000000,...,0.532948,0.469076,0.007786,0.006779,0.004811,0.001675,0.029891,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
0,46333.0,27.81466,-13.669357,2020.0,,0.000000,,0.000000,0.000000,0.000000,...,0.474246,0.417571,0.135569,0.003355,0.004876,0.003185,0.187867,0.156783,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
0,46332.0,27.81466,-13.669357,2019.0,,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.490193,0.392582,0.325692,0.004587,0.002309,0.002191,0.002333,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
0,46334.0,27.81466,-13.669357,2021.0,,,,0.000000,0.000000,0.000000,...,0.645601,0.673485,0.631725,0.004959,0.001359,0.001627,0.052683,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
0,46329.0,27.81466,-13.669357,2016.0,0.000000,,,0.000000,0.000000,0.000000,...,0.679722,1.000000,0.324600,0.006641,0.003481,,,1.000000,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3599,7316.0,23.24466,-16.269357,2022.0,,0.003581,0.003867,0.001019,0.001521,0.000000,...,0.009622,0.008655,0.003418,0.001569,0.000234,0.000054,0.000039,0.000276,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
3599,7315.0,23.24466,-16.269357,2021.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.002137,...,0.009862,0.002694,0.000432,0.000132,0.000086,0.000063,0.000060,,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
3599,7314.0,23.24466,-16.269357,2020.0,0.002399,,0.000000,0.000000,0.001860,0.002130,...,0.013037,0.007480,0.002309,0.000083,0.000194,0.000100,0.000016,0.004205,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
3599,7313.0,23.24466,-16.269357,2019.0,0.003386,0.000000,0.000000,0.000000,0.000923,0.002777,...,0.007071,0.001832,0.001039,0.000223,0.000081,0.000324,0.001030,0.004801,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."


In [22]:
# We can check this joined dataframe
spatial_join = spatial_join.reset_index()
spatial_join.iloc[:, :]

Unnamed: 0,index,index_left,lon,lat,year,0_1,0_2,0_3,0_4,0_5,...,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12,sea_unq,geometry
0,0,46330.0,27.81466,-13.669357,2017.0,,,0.000000,,0.000000,...,0.532948,0.469076,0.007786,0.006779,0.004811,0.001675,0.029891,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
1,0,46333.0,27.81466,-13.669357,2020.0,,0.000000,,0.000000,0.000000,...,0.474246,0.417571,0.135569,0.003355,0.004876,0.003185,0.187867,0.156783,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
2,0,46332.0,27.81466,-13.669357,2019.0,,0.000000,0.000000,0.000000,0.000000,...,0.490193,0.392582,0.325692,0.004587,0.002309,0.002191,0.002333,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
3,0,46334.0,27.81466,-13.669357,2021.0,,,,0.000000,0.000000,...,0.645601,0.673485,0.631725,0.004959,0.001359,0.001627,0.052683,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
4,0,46329.0,27.81466,-13.669357,2016.0,0.000000,,,0.000000,0.000000,...,0.679722,1.000000,0.324600,0.006641,0.003481,,,1.000000,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72278,3599,7316.0,23.24466,-16.269357,2022.0,,0.003581,0.003867,0.001019,0.001521,...,0.009622,0.008655,0.003418,0.001569,0.000234,0.000054,0.000039,0.000276,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
72279,3599,7315.0,23.24466,-16.269357,2021.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.009862,0.002694,0.000432,0.000132,0.000086,0.000063,0.000060,,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
72280,3599,7314.0,23.24466,-16.269357,2020.0,0.002399,,0.000000,0.000000,0.001860,...,0.013037,0.007480,0.002309,0.000083,0.000194,0.000100,0.000016,0.004205,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
72281,3599,7313.0,23.24466,-16.269357,2019.0,0.003386,0.000000,0.000000,0.000000,0.000923,...,0.007071,0.001832,0.001039,0.000223,0.000081,0.000324,0.001030,0.004801,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."


In [23]:
# Drop the redundant indexing columns
features_join = spatial_join.drop(columns=['index', 'index_left'])

In [24]:
# Now we need to do an inner join of the spatial_join geodataframe with the ground-truth data
features_join = features_join.merge(country_sea, on=['year', 'sea_unq'], how='inner')


In [26]:
# Check what this final joined features and ground-truth geodataframe
features_join.iloc[:, 12000:]

Unnamed: 0,999_10,999_11,999_12,sea_unq,geometry_x,total_area_planted_ha,total_area_harv_ha,total_area_lost_ha,total_harv_kg,yield_kgha,...,log_maize,log_sweetpotatoes,log_groundnuts,log_soybeans,loss_ind,drought_loss_ind,flood_loss_ind,animal_loss_ind,pest_loss_ind,geometry_y
0,0.001675,0.029891,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657...",5329.234136,1662.551496,3666.68264,37750.0,7.083569,...,2.028847,2.333044,1.937424,1.586514,1.0,1.0,0.0,0.0,1.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
1,0.000931,0.002581,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657...",5329.234136,1662.551496,3666.68264,37750.0,7.083569,...,2.028847,2.333044,1.937424,1.586514,1.0,1.0,0.0,0.0,1.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
2,0.000466,0.000509,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657...",5329.234136,1662.551496,3666.68264,37750.0,7.083569,...,2.028847,2.333044,1.937424,1.586514,1.0,1.0,0.0,0.0,1.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
3,0.003185,0.187867,0.156783,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657...",75.812500,62.377500,13.43500,139700.0,1842.704040,...,7.597093,8.205414,7.640386,7.104800,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
4,0.001024,0.002458,0.003444,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657...",75.812500,62.377500,13.43500,139700.0,1842.704040,...,7.597093,8.205414,7.640386,7.104800,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32926,0.001083,0.001000,0.004963,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312...",103.525000,66.501000,37.02400,35620.0,344.071480,...,5.648800,-inf,6.830794,-inf,1.0,1.0,0.0,0.0,1.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
32927,0.002277,0.002202,0.003774,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312...",103.525000,66.501000,37.02400,35620.0,344.071480,...,5.648800,-inf,6.830794,-inf,1.0,1.0,0.0,0.0,1.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
32928,0.001737,0.001547,0.006698,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312...",103.525000,66.501000,37.02400,35620.0,344.071480,...,5.648800,-inf,6.830794,-inf,1.0,1.0,0.0,0.0,1.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
32929,0.001590,0.001557,0.003505,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312...",103.525000,66.501000,37.02400,35620.0,344.071480,...,5.648800,-inf,6.830794,-inf,1.0,1.0,0.0,0.0,1.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."


In [27]:
# Drop the redundant independent lon and lat columns because now that they are in a separate geometry column
# Will drop the extra geometry column in this case
features_join = features_join.drop(['geometry_x'], axis = 1)

In [28]:
# Check what this final joined features and ground-truth geodataframe
features_join.iloc[:, 12000:]

Unnamed: 0,999_10,999_11,999_12,sea_unq,total_area_planted_ha,total_area_harv_ha,total_area_lost_ha,total_harv_kg,yield_kgha,frac_area_harv,...,log_maize,log_sweetpotatoes,log_groundnuts,log_soybeans,loss_ind,drought_loss_ind,flood_loss_ind,animal_loss_ind,pest_loss_ind,geometry_y
0,0.001675,0.029891,,1,5329.234136,1662.551496,3666.68264,37750.0,7.083569,0.311968,...,2.028847,2.333044,1.937424,1.586514,1.0,1.0,0.0,0.0,1.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
1,0.000931,0.002581,,1,5329.234136,1662.551496,3666.68264,37750.0,7.083569,0.311968,...,2.028847,2.333044,1.937424,1.586514,1.0,1.0,0.0,0.0,1.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
2,0.000466,0.000509,,1,5329.234136,1662.551496,3666.68264,37750.0,7.083569,0.311968,...,2.028847,2.333044,1.937424,1.586514,1.0,1.0,0.0,0.0,1.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
3,0.003185,0.187867,0.156783,1,75.812500,62.377500,13.43500,139700.0,1842.704040,0.822786,...,7.597093,8.205414,7.640386,7.104800,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
4,0.001024,0.002458,0.003444,1,75.812500,62.377500,13.43500,139700.0,1842.704040,0.822786,...,7.597093,8.205414,7.640386,7.104800,1.0,0.0,0.0,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32926,0.001083,0.001000,0.004963,392,103.525000,66.501000,37.02400,35620.0,344.071480,0.642367,...,5.648800,-inf,6.830794,-inf,1.0,1.0,0.0,0.0,1.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
32927,0.002277,0.002202,0.003774,392,103.525000,66.501000,37.02400,35620.0,344.071480,0.642367,...,5.648800,-inf,6.830794,-inf,1.0,1.0,0.0,0.0,1.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
32928,0.001737,0.001547,0.006698,392,103.525000,66.501000,37.02400,35620.0,344.071480,0.642367,...,5.648800,-inf,6.830794,-inf,1.0,1.0,0.0,0.0,1.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
32929,0.001590,0.001557,0.003505,392,103.525000,66.501000,37.02400,35620.0,344.071480,0.642367,...,5.648800,-inf,6.830794,-inf,1.0,1.0,0.0,0.0,1.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."


## ---Ignore starting here ---

In [16]:
# CODE FOR Ward LEVEL FEATURES
# Now lets combine the ground-truth data that is grouped by unique SEAs with our features
spatial_join = gpd.sjoin(features_gdf, ward_unq_join, how='right', predicate = 'within')
# Drop the redundant independent lon and lat columns because now that they are in a separate geometry column
#spatial_join = spatial_join.drop(['index_left'], axis = 1)
    # Store the geometry column separately
geometry_col = spatial_join['geometry']
    # Remove the geometry column from the DataFrame
#features_join = spatial_join.drop(columns=['geometry'])
features_join = spatial_join.replace([np.inf, -np.inf], np.nan)
#features_join = features_join.set_index(['year', 'ward_id'])

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84" ...

  spatial_join = gpd.sjoin(features_gdf, ward_unq_join, how='right', predicate = 'within')


In [None]:
sea_unq_join = sea_unq_join.set_geometry('geometry_sea')
sea_unq_join.geometry

0       POLYGON ((27.82327 -13.65772, 27.82294 -13.657...
10      POLYGON ((27.99349 -13.46497, 27.99352 -13.464...
20      POLYGON ((28.09909 -13.51864, 28.09867 -13.516...
29      POLYGON ((28.31924 -13.42915, 28.31911 -13.426...
38      POLYGON ((28.39982 -13.51544, 28.40012 -13.514...
                              ...                        
3571    POLYGON ((25.07771 -14.63920, 25.07732 -14.638...
3578    POLYGON ((22.74142 -14.00343, 22.73856 -14.002...
3585    POLYGON ((23.08604 -14.20026, 23.08957 -14.202...
3592    POLYGON ((24.36764 -16.62208, 24.36564 -16.621...
3599    POLYGON ((23.23962 -16.31204, 23.23876 -16.312...
Name: geometry_sea, Length: 392, dtype: geometry

In [28]:
joined_df = gpd.sjoin(features_join, sea_unq_join, how='left', predicate='intersects')


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84" ...
Right CRS: EPSG:4326

  joined_df = gpd.sjoin(features_join, sea_unq_join, how='left', predicate='intersects')


In [29]:
joined_df

Unnamed: 0,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,...,999_7,999_8,999_9,999_10,999_11,999_12,geometry,ward_id,index_right,sea_unq
0,22.730588,-15.053257,2019.0,0.002994,0.003749,0.002417,0.001449,0.001208,0.002577,0.002151,...,0.000611,0.000226,0.000138,0.000562,0.000503,0.000406,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",0,,
0,22.730588,-15.053257,2018.0,0.002030,,0.000000,0.000000,0.000000,0.000007,0.000035,...,0.003652,0.002408,0.001475,0.000435,0.000302,0.005276,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",0,,
0,22.730588,-15.053257,2022.0,,0.003541,0.003555,0.001752,0.001398,0.001469,0.002361,...,0.001392,0.001687,0.000812,0.000241,0.000365,0.000645,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",0,,
0,22.730588,-15.053257,2023.0,,,,,,,,...,,,,0.000633,0.000496,0.000598,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",0,,
0,22.730588,-15.053257,2015.0,,,,,,,,...,,,0.000187,,,,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1415,26.080588,-17.473257,2016.0,0.000000,,,0.000000,0.000138,0.000842,0.001018,...,0.000068,0.000001,0.000006,,,1.000000,"POLYGON ((26.09249 -17.47196, 26.08855 -17.487...",1415,2724.0,284.0
1415,26.080588,-17.473257,2015.0,,,,,,,,...,,,0.000000,,,,"POLYGON ((26.09249 -17.47196, 26.08855 -17.487...",1415,2774.0,289.0
1415,26.080588,-17.473257,2015.0,,,,,,,,...,,,0.000000,,,,"POLYGON ((26.09249 -17.47196, 26.08855 -17.487...",1415,2724.0,284.0
1415,26.080588,-17.473257,2018.0,0.000123,,,0.000009,0.000011,0.000005,0.000019,...,0.001817,0.000828,0.000038,0.000023,0.000624,,"POLYGON ((26.09249 -17.47196, 26.08855 -17.487...",1415,2774.0,289.0


In [30]:
print(pd.unique(joined_df['sea_unq']))

[ nan 334. 335. 336. 343. 337. 367. 373. 338.  54.  52. 342. 339. 363.
 340. 341. 344. 388. 347. 345. 346. 348. 380. 349. 351. 350. 355. 353.
 357. 352. 354. 361. 356. 358. 389. 366. 359. 362. 360. 390. 375. 368.
 364. 325. 365. 369. 370. 371. 387. 372. 374. 376. 327. 326. 392. 377.
 378. 379. 391. 381. 382. 383. 384. 281. 386. 385.  31. 330. 328. 329.
 331. 332. 333. 221.   8.  28. 132.  34.  27.  32.  29.  30.  36.  33.
  11.  35.  40.  37.  38.  39.  41.  42.  43.  44.  45.  60.  46.  48.
  47.  16.  12.  10.   9.  15.  13.  14.  56.  18.  17.  55.  19.  25.
  21.  20.  59.  53.  51.  22.  23.  24.  26.   1.   2.   3.   5.   4.
   6.   7.  50.  49.  58.  57. 173. 169. 172. 168. 154. 171. 170. 174.
 175. 180. 209. 177. 146. 212.  84.  85.  86.  87. 117. 123. 122.  75.
  76.  77.  93.  92.  95.  94. 307.  80.  81.  82.  83.  62.  68.  63.
  64.  65.  66.  67.  70.  69.  71. 113. 114. 115. 116. 118. 294. 108.
 109. 119. 120. 121.  97.  98. 104.  99. 100.  72.  73.  74.  78.  79.
  61. 

In [2]:

#joined_df.to_feather('/capstone/mosaiks/repos/modeling/data/zambia_10perc_joined_2023_05_17.feather')

In [32]:
#features_join = features_join.reset_index()
features_join = joined_df

In [16]:
### WARD LEVEL JOINED FEATURES - NOT IMPUTED
### Save the features so we can skip all the above steps to make this process faster
#features_join.to_feather('/capstone/mosaiks/repos/modeling/data/zambia_10perc_joined_2023_05_17.feather')

In [2]:
### Load in the ward level features
features_join = gpd.read_feather('/capstone/mosaiks/repos/modeling/data/zambia_10perc_joined_2023_05_17.feather')
features_join

Unnamed: 0,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,...,999_7,999_8,999_9,999_10,999_11,999_12,geometry,ward_id,index_right,sea_unq
0,22.730588,-15.053257,2019.0,0.002994,0.003749,0.002417,0.001449,0.001208,0.002577,0.002151,...,0.000611,0.000226,0.000138,0.000562,0.000503,0.000406,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",0,,
0,22.730588,-15.053257,2018.0,0.002030,,0.000000,0.000000,0.000000,0.000007,0.000035,...,0.003652,0.002408,0.001475,0.000435,0.000302,0.005276,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",0,,
0,22.730588,-15.053257,2022.0,,0.003541,0.003555,0.001752,0.001398,0.001469,0.002361,...,0.001392,0.001687,0.000812,0.000241,0.000365,0.000645,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",0,,
0,22.730588,-15.053257,2023.0,,,,,,,,...,,,,0.000633,0.000496,0.000598,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",0,,
0,22.730588,-15.053257,2015.0,,,,,,,,...,,,0.000187,,,,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1415,26.080588,-17.473257,2016.0,0.000000,,,0.000000,0.000138,0.000842,0.001018,...,0.000068,0.000001,0.000006,,,1.000000,"POLYGON ((26.09249 -17.47196, 26.08855 -17.487...",1415,2724.0,284.0
1415,26.080588,-17.473257,2015.0,,,,,,,,...,,,0.000000,,,,"POLYGON ((26.09249 -17.47196, 26.08855 -17.487...",1415,2774.0,289.0
1415,26.080588,-17.473257,2015.0,,,,,,,,...,,,0.000000,,,,"POLYGON ((26.09249 -17.47196, 26.08855 -17.487...",1415,2724.0,284.0
1415,26.080588,-17.473257,2018.0,0.000123,,,0.000009,0.000011,0.000005,0.000019,...,0.001817,0.000828,0.000038,0.000023,0.000624,,"POLYGON ((26.09249 -17.47196, 26.08855 -17.487...",1415,2774.0,289.0


In [3]:
#features_join = features_load.set_index(['year', 'ward_id'])
#features_join

## --- Stop Ignoring ---

### 6. Impute missing values --- 

This "manual" imputation method involves grouping the feature_months into their respective features (i.e. columns 1_1, 1_2, and so on to 1_12 will all be grouped as feature 1). Then, since they are arranged as columns in order from months 1-12, we use the backfill (bfill) method of filling NA values. This means that the NAs are filled (within the feature groups) by the next available value to the right of the column. It then goes through and uses forward fill (ffill) to get any remaining NAs in the 12 month column or other NAs that could only be filled with ffill. 

Next, the imputation uses "cascading" steps by descending group levels, decreasing the proportion of imputed values with each step. First, the NA values are imputed at by both `year` and `sea_unq`, which should yield imputed values that most closely match the feature values that would be present in the data if there was no clouds obscuring the satellite images. Next, the remaining NA values that could not be imputed by both `year` and `sea_unq` are imputed by only `sea_unq`. Lastly, the remaining NA vlaues that could not be imputed by both `year` and `sea_unq` or by just `sea_unq` are imputed by `year` only. This option gives the user more control and transparency over how the imputation is executed.

Imputing using `scikit learn`'s simple imputer executes standard imputation, the details of which can be found in the `scikitlearn` documentation [here.](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)

The imputation approach depends on the selection made below for `impute_manual`.

In [28]:
# Imputing method
impute_manual = False
#impute_manual = True

In [29]:
# Extract the size of the features dataframe
rows, cols = features_join.shape

# Compute the number of cells in the joined dataframe
num_cells = rows * (cols-5) # (cols-5) removes the 5 columns that shouldn't have NAs (index_left, lon, lat, year, and geometry)
num_cells

# Let's check how many cells do not have NA values:
nan_count = features_join.isna().sum().sum()

print(f"Number of cells = {num_cells}")
print(f"Number of NA cells = {nan_count}")
print(f"Number of rows = {rows}")
print(f"Number of columns = {cols}")


Number of cells = 867396000
Number of NA cells = 320011111
Number of rows = 72283
Number of columns = 12005


In [30]:
# This code should always be run
#features_join = features_join.replace([np.inf, -np.inf], np.nan)
features_join.iloc[:, :]

Unnamed: 0,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,...,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12,sea_unq,geometry
0,27.81466,-13.669357,2017.0,,,0.000000,,0.000000,0.000000,0.000000,...,0.532948,0.469076,0.007786,0.006779,0.004811,0.001675,0.029891,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
1,27.81466,-13.669357,2020.0,,0.000000,,0.000000,0.000000,0.000000,0.000000,...,0.474246,0.417571,0.135569,0.003355,0.004876,0.003185,0.187867,0.156783,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
2,27.81466,-13.669357,2019.0,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.490193,0.392582,0.325692,0.004587,0.002309,0.002191,0.002333,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
3,27.81466,-13.669357,2021.0,,,,0.000000,0.000000,0.000000,0.000000,...,0.645601,0.673485,0.631725,0.004959,0.001359,0.001627,0.052683,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
4,27.81466,-13.669357,2016.0,0.000000,,,0.000000,0.000000,0.000000,0.000000,...,0.679722,1.000000,0.324600,0.006641,0.003481,,,1.000000,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72278,23.24466,-16.269357,2022.0,,0.003581,0.003867,0.001019,0.001521,0.000000,0.002637,...,0.009622,0.008655,0.003418,0.001569,0.000234,0.000054,0.000039,0.000276,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
72279,23.24466,-16.269357,2021.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.002137,0.003117,...,0.009862,0.002694,0.000432,0.000132,0.000086,0.000063,0.000060,,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
72280,23.24466,-16.269357,2020.0,0.002399,,0.000000,0.000000,0.001860,0.002130,0.002149,...,0.013037,0.007480,0.002309,0.000083,0.000194,0.000100,0.000016,0.004205,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
72281,23.24466,-16.269357,2019.0,0.003386,0.000000,0.000000,0.000000,0.000923,0.002777,0.003057,...,0.007071,0.001832,0.001039,0.000223,0.000081,0.000324,0.001030,0.004801,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."


In [31]:
# This code chunk will help us keep track of the imputation process by defining colors for the messages.
class bcolors:
    BL = '\x1b[1;34m' #GREEN
    GR = '\x1b[1;36m' #GREEN
    YL = '\x1b[1;33m' #YELLOW
    RD = '\x1b[1;31m' #RED
    RESET = '\033[0m' #RESET COLOR

### Important note: 
The below code chunk is for splitting the features_join dataframe into sepecific sections to do "parallel processing".
The first chunk is for `sentinel_rgb8_features_sea_save_2023_05_10.feather` file.
The second chunk is for `features_zmb_save.feather` file (10% Zambia).

# ---- Impute Manual Bfill technique ----

## The code chunk below is for the SEA sampled features, since they need to be grouped by SEA.

In [32]:
features_join

Unnamed: 0,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,...,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12,sea_unq,geometry
0,27.81466,-13.669357,2017.0,,,0.000000,,0.000000,0.000000,0.000000,...,0.532948,0.469076,0.007786,0.006779,0.004811,0.001675,0.029891,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
1,27.81466,-13.669357,2020.0,,0.000000,,0.000000,0.000000,0.000000,0.000000,...,0.474246,0.417571,0.135569,0.003355,0.004876,0.003185,0.187867,0.156783,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
2,27.81466,-13.669357,2019.0,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.490193,0.392582,0.325692,0.004587,0.002309,0.002191,0.002333,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
3,27.81466,-13.669357,2021.0,,,,0.000000,0.000000,0.000000,0.000000,...,0.645601,0.673485,0.631725,0.004959,0.001359,0.001627,0.052683,,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
4,27.81466,-13.669357,2016.0,0.000000,,,0.000000,0.000000,0.000000,0.000000,...,0.679722,1.000000,0.324600,0.006641,0.003481,,,1.000000,1,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72278,23.24466,-16.269357,2022.0,,0.003581,0.003867,0.001019,0.001521,0.000000,0.002637,...,0.009622,0.008655,0.003418,0.001569,0.000234,0.000054,0.000039,0.000276,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
72279,23.24466,-16.269357,2021.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.002137,0.003117,...,0.009862,0.002694,0.000432,0.000132,0.000086,0.000063,0.000060,,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
72280,23.24466,-16.269357,2020.0,0.002399,,0.000000,0.000000,0.001860,0.002130,0.002149,...,0.013037,0.007480,0.002309,0.000083,0.000194,0.000100,0.000016,0.004205,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
72281,23.24466,-16.269357,2019.0,0.003386,0.000000,0.000000,0.000000,0.000923,0.002777,0.003057,...,0.007071,0.001832,0.001039,0.000223,0.000081,0.000324,0.001030,0.004801,392,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."


In [33]:
%%time
if impute_manual:
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 1 NaN cell %: {bcolors.YL}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by year and SEA across group average')
    # First, save the features groups by the column name 
    #(i.e. we want only feature 1 values to be imputed for feature 1)
    # These groups are for the feature files with 1000 features
    group_names = [f"{i}_{j}" for i in range(1000) for j in range(1, 13)]
    groups = [group_names[i:i+12] for i in range(0, len(group_names), 12)]
    
    # This is for the feature file with 750 features
    # group_names = [f"{i}_{j}" for i in range(750) for j in range(1, 13)]
    # groups = [group_names[i:i+12] for i in range(0, len(group_names), 12)]

    # Then use a for loop to basically make sure the columns are in the feature form. 
    # We don't want to impute features based on survey columns.
    last_group_processed = False
    for col in features_join.columns:
        col_str = str(col)
        if col_str in group_names:
            split_col = col_str.split("_")
            for group in groups:
                features_join[group] = features_join[group].fillna(method="bfill", axis=1)
                print(f"Filled NAs for {group}")
                if group == groups[-1]:
                    last_group_processed = True
            if last_group_processed:
                for group in groups:
                    if features_join[group].isnull().values.any():
                        features_join[group] = features_join[group].fillna(method="ffill", axis=1)
                        print(f"Filled NAs with ffill for {group}")
                break
                
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 1 NaN cell %: {bcolors.YL}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by year and SEA across group average')
    features_join = (
        features_join
        .fillna(features_join
                .groupby(['year', 'sea_unq'], as_index=False) 
                .transform('mean')
               )
    )
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 2 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 2 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.YL}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by year and SEA across group average')
    features_join = (
        features_join
        .fillna(features_join
                .groupby(['sea_unq'], as_index=False)
                .transform('mean'), 
               )
    )
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 2 NaN row count: {bcolors.GR}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 2 NaN row %: {bcolors.GR}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.GR}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 3: Drop remaining NaN values\n')
    features_join = features_join.dropna(axis=0)
    print(f'Ending total row count: {bcolors.BL}{len(features_join)}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.GR}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}')
    
# The else section is a basic simple imputation
else: 
    # Store the geometry column separately
    #geometry_col = features_join['geometry_y']
    geometry_col = features_join['geometry']
    # Store the lat, lon, index_right, and sea_unq columns 
    lat_col = features_join['lat']
    lon_col = features_join['lon']
    
    # Remove the geometry column from the DataFrame
    features_join = features_join.drop(columns=['geometry','lat','lon'])
    #features_join = features_join.drop(columns=['lat','lon','index_left'])
    
    # Set the index and impute
    features_join = features_join.set_index(['year', 'sea_unq'])
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer.fit_transform(features_join)
    features_join[:] = imputer.transform(features_join)
    features_join = features_join.reset_index()
    
    # Add the geometryand other columns back to the DataFrame
    features_join['geometry'] = geometry_col
    features_join['lat'] = lat_col
    features_join['lon'] = lon_col


CPU times: user 38.3 s, sys: 18 s, total: 56.3 s
Wall time: 56.3 s


In [37]:
features_join = features_join.drop(columns=['geometry'])
#grouped_features = features_join.groupby(['year', 'sea_unq']).mean()
#features_join.set_geometry('geometry')
features_join.to_feather('/capstone/mosaiks/repos/modeling/data/model_directory/SEA_ungroup_features_simple_impute.feather')

#features_join.drop(columns=['geometry'])
#features_join.to_feather('/capstone/mosaiks/repos/modeling/data/model_directory/SEA_ungroup_features_simple_impute.csv', index=True)



## The code chunk below is for the 10% Zambia features, since they need to be grouped differently.

In [7]:
features_join.columns

Index(['lon', 'lat', 'year', '0_1', '0_2', '0_3', '0_4', '0_5', '0_6', '0_7',
       ...
       '999_7', '999_8', '999_9', '999_10', '999_11', '999_12', 'geometry',
       'ward_id', 'index_right', 'sea_unq'],
      dtype='object', length=12007)

In [8]:
features_join = features_join.set_index(['year', 'ward_id'])

In [9]:
features_join

Unnamed: 0_level_0,Unnamed: 1_level_0,lon,lat,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,...,999_6,999_7,999_8,999_9,999_10,999_11,999_12,geometry,index_right,sea_unq
year,ward_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2019.0,0,22.730588,-15.053257,0.002994,0.003749,0.002417,0.001449,0.001208,0.002577,0.002151,0.003231,...,0.000461,0.000611,0.000226,0.000138,0.000562,0.000503,0.000406,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",,
2018.0,0,22.730588,-15.053257,0.002030,,0.000000,0.000000,0.000000,0.000007,0.000035,0.000189,...,0.004043,0.003652,0.002408,0.001475,0.000435,0.000302,0.005276,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",,
2022.0,0,22.730588,-15.053257,,0.003541,0.003555,0.001752,0.001398,0.001469,0.002361,0.002198,...,0.002417,0.001392,0.001687,0.000812,0.000241,0.000365,0.000645,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",,
2023.0,0,22.730588,-15.053257,,,,,,,,,...,,,,,0.000633,0.000496,0.000598,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",,
2015.0,0,22.730588,-15.053257,,,,,,,,,...,,,,0.000187,,,,"POLYGON ((22.80895 -15.15215, 22.80747 -15.155...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016.0,1415,26.080588,-17.473257,0.000000,,,0.000000,0.000138,0.000842,0.001018,0.003062,...,0.000246,0.000068,0.000001,0.000006,,,1.000000,"POLYGON ((26.09249 -17.47196, 26.08855 -17.487...",2724.0,284.0
2015.0,1415,26.080588,-17.473257,,,,,,,,,...,,,,0.000000,,,,"POLYGON ((26.09249 -17.47196, 26.08855 -17.487...",2774.0,289.0
2015.0,1415,26.080588,-17.473257,,,,,,,,,...,,,,0.000000,,,,"POLYGON ((26.09249 -17.47196, 26.08855 -17.487...",2724.0,284.0
2018.0,1415,26.080588,-17.473257,0.000123,,,0.000009,0.000011,0.000005,0.000019,0.000096,...,0.003698,0.001817,0.000828,0.000038,0.000023,0.000624,,"POLYGON ((26.09249 -17.47196, 26.08855 -17.487...",2774.0,289.0


In [10]:
%%time
if impute_manual:
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 1 NaN cell %: {bcolors.YL}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by year and SEA across group average')
    # First, save the features groups by the column name 
    #(i.e. we want only feature 1 values to be imputed for feature 1)
    group_names = [f"{i}_{j}" for i in range(1000) for j in range(1, 13)]
    groups = [group_names[i:i+12] for i in range(0, len(group_names), 12)]

    # Then use a for loop to basically make sure the columns are in the feature form. 
    # We don't want to impute features based on survey columns.
    last_group_processed = False
    for col in features_join.columns:
        col_str = str(col)
        if col_str in group_names:
            split_col = col_str.split("_")
            for group in groups:
                features_join[group] = features_join[group].fillna(method="bfill", axis=1)
                print(f"Filled NAs for {group}")
                if group == groups[-1]:
                    last_group_processed = True
            if last_group_processed:
                for group in groups:
                    if features_join[group].isnull().values.any():
                        features_join[group] = features_join[group].fillna(method="ffill", axis=1)
                        print(f"Filled NAs with ffill for {group}")
                break
                
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 1 NaN cell %: {bcolors.YL}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by year and SEA across group average')
    features_join = (
        features_join
        .fillna(features_join
                .groupby(['year', 'ward_id'], as_index=False) 
                .transform('mean')
               )
    )
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 2 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 2 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.YL}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by year and SEA across group average')
    features_join = (
        features_join
        .fillna(features_join
                .groupby(['ward_id'], as_index=False)
                .transform('mean'), 
               )
    )
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 2 NaN row count: {bcolors.GR}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 2 NaN row %: {bcolors.GR}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.GR}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 3: Drop remaining NaN values\n')
    features_join = features_join.dropna(axis=0)
    print(f'Ending total row count: {bcolors.BL}{len(features_join)}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.GR}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}')
    
# The else section is a basic simple imputation
else: 
    # Store the geometry column separately
    geometry_col = features_join['geometry']
    # Store the lat, lon, index_right, and sea_unq columns 
    lat_col = features_join['lat']
    lon_col = features_join['lon']
    index_col = features_join['index_right']
    sea_id = features_join['sea_unq']
    
    # Remove the geometry column from the DataFrame
    features_join = features_join.drop(columns=['geometry','lat','lon','index_right','sea_unq'])
    
    #features_join = features_join.set_index(['year', 'ward_id'])
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer.fit_transform(features_join)
    features_join[:] = imputer.transform(features_join)
    #features_join = features_join.reset_index()
    
    
    # Add the columns back to the DataFrame
    features_join['lat'] = lat_col
    features_join['lon'] = lon_col
    features_join['sea_unq'] = sea_id
    features_join = features_join.reset_index()

CPU times: user 7min 38s, sys: 16min 28s, total: 24min 6s
Wall time: 24min 8s


In [13]:
#features_join.drop(columns=['geometry'])
#grouped_features = features_join.groupby(['year', 'ward_id']).mean()
features_join.to_csv('/capstone/mosaiks/repos/modeling/data/zambia_10perc_imputed_ward_sea.csv', index=True)

KeyboardInterrupt: 

In [15]:
features_join.to_feather('/capstone/mosaiks/repos/modeling/data/zambia_10perc_imputed_ward_sea.feather')

In [12]:
grouped_features

Unnamed: 0_level_0,Unnamed: 1_level_0,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,0_10,...,999_6,999_7,999_8,999_9,999_10,999_11,999_12,lat,lon,sea_unq
year,ward_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2015.0,0,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.00064,0.002596,0.001444,...,0.056281,0.036438,0.00406,0.000831,0.002911,0.004108,0.035491,-15.095757,22.799338,
2015.0,1,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.00064,0.001739,0.001444,...,0.056281,0.036438,0.00406,0.000000,0.002911,0.004108,0.035491,-14.993257,22.660588,334.0
2015.0,2,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.00064,0.005177,0.001444,...,0.056281,0.036438,0.00406,0.000186,0.002911,0.004108,0.035491,-15.005479,22.465033,335.0
2015.0,3,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.00064,0.003367,0.001444,...,0.056281,0.036438,0.00406,0.000870,0.002911,0.004108,0.035491,-15.041718,22.559819,334.5
2015.0,4,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.00064,0.003582,0.001444,...,0.056281,0.036438,0.00406,0.000792,0.002911,0.004108,0.035491,-15.097007,22.718088,335.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023.0,1409,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.00064,0.001207,0.003790,...,0.056281,0.036438,0.00406,0.008108,0.000144,0.001352,0.001694,-14.934616,28.359715,
2023.0,1410,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.00064,0.001207,0.003838,...,0.056281,0.036438,0.00406,0.008108,0.001569,0.003000,0.006356,-15.581035,30.348366,
2023.0,1412,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.00064,0.001207,0.002240,...,0.056281,0.036438,0.00406,0.008108,0.001253,0.002817,0.035491,-9.254870,28.704943,91.0
2023.0,1413,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.00064,0.001207,0.002133,...,0.056281,0.036438,0.00406,0.008108,0.002616,0.004108,0.035491,-9.768893,29.503497,69.5


In [12]:
grouped_features.to_csv('/capstone/mosaiks/repos/modeling/data/zambia_10perc_imputed_2023_05_16.csv', index=True)

In [None]:
features_join.iloc[:1000, 505:755]

In [29]:
# features_new = features_join.copy()
# features_new.drop(columns=['geometry_y'])
# grouped_features = features_new.groupby(['year', 'sea_unq']).mean()
# grouped_features.to_csv('/Users/hveirs/Python/mosaiks_repos/grouped_features.csv', index=True)

In [30]:
features_join.drop(columns=['geometry_y'])
grouped_features = features_join.groupby(['year', 'sea_unq']).mean()
grouped_features.to_csv('/capstone/mosaiks/repos/preprocessing/data/features_join_imputed_2023_05_02.csv', index=True)

  grouped_features = features_join.groupby(['year', 'sea_unq']).mean()


In [3]:
features_imputed = pd.read_feather('/capstone/mosaiks/repos/modeling/data/zambia_10perc_imputed_ward_sea.feather')
features_imputed

Unnamed: 0,year,ward_id,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,...,999_6,999_7,999_8,999_9,999_10,999_11,999_12,lat,lon,sea_unq
0,2019.0,0,0.002994,0.003749,0.002417,0.001449,0.001208,0.002577,0.002151,0.003231,...,0.000461,0.000611,0.000226,0.000138,0.000562,0.000503,0.000406,-15.053257,22.730588,
1,2018.0,0,0.002030,0.000743,0.000000,0.000000,0.000000,0.000007,0.000035,0.000189,...,0.004043,0.003652,0.002408,0.001475,0.000435,0.000302,0.005276,-15.053257,22.730588,
2,2022.0,0,0.001111,0.003541,0.003555,0.001752,0.001398,0.001469,0.002361,0.002198,...,0.002417,0.001392,0.001687,0.000812,0.000241,0.000365,0.000645,-15.053257,22.730588,
3,2023.0,0,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.000640,...,0.056281,0.036438,0.004060,0.008108,0.000633,0.000496,0.000598,-15.053257,22.730588,
4,2015.0,0,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.000640,...,0.056281,0.036438,0.004060,0.000187,0.002911,0.004108,0.035491,-15.053257,22.730588,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
680651,2016.0,1415,0.000000,0.000743,0.000706,0.000000,0.000138,0.000842,0.001018,0.003062,...,0.000246,0.000068,0.000001,0.000006,0.002911,0.004108,1.000000,-17.473257,26.080588,284.0
680652,2015.0,1415,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.000640,...,0.056281,0.036438,0.004060,0.000000,0.002911,0.004108,0.035491,-17.473257,26.080588,289.0
680653,2015.0,1415,0.001111,0.000743,0.000706,0.000395,0.000185,0.000202,0.000298,0.000640,...,0.056281,0.036438,0.004060,0.000000,0.002911,0.004108,0.035491,-17.473257,26.080588,284.0
680654,2018.0,1415,0.000123,0.000743,0.000706,0.000009,0.000011,0.000005,0.000019,0.000096,...,0.003698,0.001817,0.000828,0.000038,0.000023,0.000624,0.035491,-17.473257,26.080588,289.0


# ---- Imputation Manual Testing Done ----

Now it would be ideal to get the above manual imputation parallel processed, but this is as far as I have done.
Below is my attempt at parallelizing the process:
# ---- Parallel Imputation attempt ----

In [57]:
features_test = features_join.iloc[:200, :].copy()

In [58]:
#features_test.iloc[:, 505:755]
features_test.iloc[:, 4:16]

Unnamed: 0,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,0_10,0_11,0_12
0,,,0.00000,,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.001020,0.003210,
1,,,0.00000,,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,
2,,,0.00003,,0.000000,0.000000,1.270084e-05,0.000231,0.000857,0.000321,0.000098,
3,,0.0,,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000002,0.000308,0.000071,0.000000
4,,0.0,,0.000000,0.000000,0.000000,0.000000e+00,0.000039,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
195,,,,0.000000,0.000000,0.000014,6.807013e-05,0.000778,0.000443,0.002385,0.000342,0.000169
196,,,,0.000000,0.000000,0.000000,3.505405e-06,0.000877,0.000299,0.000410,0.000017,0.000000
197,,,,0.000000,0.000000,0.000000,0.000000e+00,0.000235,0.000000,0.000557,0.000037,
198,,,,0.000075,0.000038,0.000068,7.956613e-05,0.001070,0.000777,0.002619,0.000994,


In [59]:
features_test.iloc[:, 505:755]

Unnamed: 0,41_10,41_11,41_12,42_1,42_2,42_3,42_4,42_5,42_6,42_7,...,61_10,61_11,61_12,62_1,62_2,62_3,62_4,62_5,62_6,62_7
0,1.141906,0.968697,,,,0.0,,0.0,0.0,0.0,...,1.595295,1.352799,,,,0.0,,0.0,0.0,0.0
1,1.340785,1.161160,,,,0.0,,0.0,0.0,0.0,...,1.347891,1.097611,,,,0.0,,0.0,0.0,0.0
2,1.194933,1.089161,,,,0.0,,0.0,0.0,0.0,...,1.099367,1.023362,,,,0.0,,0.0,0.0,0.0
3,0.591394,0.325480,0.090580,,0.0,,0.0,0.0,0.0,0.0,...,1.080549,0.488113,0.219602,,0.0,,0.0,0.0,0.0,0.0
4,1.315837,1.325033,0.645732,,0.0,,0.0,0.0,0.0,0.0,...,1.304484,1.365465,1.089622,,0.0,,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.721828,0.872550,0.391995,,,,0.0,0.0,0.0,0.0,...,1.590503,0.946108,0.476877,,,,0.0,0.0,0.0,0.0
196,1.274809,0.556130,0.106422,,,,0.0,0.0,0.0,0.0,...,1.318411,0.780179,0.263294,,,,0.0,0.0,0.0,0.0
197,2.037889,0.716165,,,,,0.0,0.0,0.0,0.0,...,2.143823,1.059128,,,,,0.0,0.0,0.0,0.0
198,1.433008,0.694575,,,,,0.0,0.0,0.0,0.0,...,1.596994,1.017926,,,,,0.0,0.0,0.0,0.0


In [29]:
import multiprocessing

In [48]:
# def fillna_for_group(df, group):
#     df[group] = df[group].fillna(method="bfill", axis=1)
#     print(f"Filled NAs for {group}")
#     if df[group].isnull().values.any():
#         df[group] = df[group].fillna(method="ffill", axis=1)
#         print(f"Filled NAs with ffill for {group}")
#     return df
def fillna_for_group(args):
    df, group = args
    df[group] = df[group].fillna(method="bfill", axis=1)
    print(f"Filled NAs for {group}")
    if df[group].isnull().values.any():
        df[group] = df[group].fillna(method="ffill", axis=1)
        print(f"Filled NAs with ffill for {group}")
    return df

if __name__ == '__main__':
    group_names = [f"{i}_{j}" for i in range(1000) for j in range(1, 13)]
    groups = [group_names[i:i+12] for i in range(0, len(group_names), 12)]
    #num_processes = multiprocessing.cpu_count() - 30
    pool = multiprocessing.Pool(processes = 5)
    output = pool.map(fillna_for_group, [(features_test.copy(), group) for group in groups])
    pool.close()
    pool.join()
    features_test = pd.concat(output, axis = 1)

Filled NAs for ['0_1', '0_2', '0_3', '0_4', '0_5', '0_6', '0_7', '0_8', '0_9', '0_10', '0_11', '0_12']
Filled NAs with ffill for ['0_1', '0_2', '0_3', '0_4', '0_5', '0_6', '0_7', '0_8', '0_9', '0_10', '0_11', '0_12']
Filled NAs for ['1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9', '1_10', '1_11', '1_12']
Filled NAs with ffill for ['1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9', '1_10', '1_11', '1_12']
Filled NAs for ['2_1', '2_2', '2_3', '2_4', '2_5', '2_6', '2_7', '2_8', '2_9', '2_10', '2_11', '2_12']
Filled NAs with ffill for ['2_1', '2_2', '2_3', '2_4', '2_5', '2_6', '2_7', '2_8', '2_9', '2_10', '2_11', '2_12']
Filled NAs for ['3_1', '3_2', '3_3', '3_4', '3_5', '3_6', '3_7', '3_8', '3_9', '3_10', '3_11', '3_12']
Filled NAs with ffill for ['3_1', '3_2', '3_3', '3_4', '3_5', '3_6', '3_7', '3_8', '3_9', '3_10', '3_11', '3_12']
Filled NAs for ['4_1', '4_2', '4_3', '4_4', '4_5', '4_6', '4_7', '4_8', '4_9', '4_10', '4_11', '4_12']
Filled NAs with ffill for ['4

In [56]:
features_test.iloc[:, 505:755]
#features_test.iloc[:, 4:16]

Unnamed: 0,41_10,41_11,41_12,42_1,42_2,42_3,42_4,42_5,42_6,42_7,...,61_10,61_11,61_12,62_1,62_2,62_3,62_4,62_5,62_6,62_7
0,1.141906,0.968697,,,,0.0,,0.0,0.0,0.0,...,1.595295,1.352799,,,,0.0,,0.0,0.0,0.0
1,1.340785,1.161160,,,,0.0,,0.0,0.0,0.0,...,1.347891,1.097611,,,,0.0,,0.0,0.0,0.0
2,1.194933,1.089161,,,,0.0,,0.0,0.0,0.0,...,1.099367,1.023362,,,,0.0,,0.0,0.0,0.0
3,0.591394,0.325480,0.090580,,0.0,,0.0,0.0,0.0,0.0,...,1.080549,0.488113,0.219602,,0.0,,0.0,0.0,0.0,0.0
4,1.315837,1.325033,0.645732,,0.0,,0.0,0.0,0.0,0.0,...,1.304484,1.365465,1.089622,,0.0,,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.721828,0.872550,0.391995,,,,0.0,0.0,0.0,0.0,...,1.590503,0.946108,0.476877,,,,0.0,0.0,0.0,0.0
196,1.274809,0.556130,0.106422,,,,0.0,0.0,0.0,0.0,...,1.318411,0.780179,0.263294,,,,0.0,0.0,0.0,0.0
197,2.037889,0.716165,,,,,0.0,0.0,0.0,0.0,...,2.143823,1.059128,,,,,0.0,0.0,0.0,0.0
198,1.433008,0.694575,,,,,0.0,0.0,0.0,0.0,...,1.596994,1.017926,,,,,0.0,0.0,0.0,0.0


In [54]:
def fillna_for_group(df, group):
    df[group] = df[group].fillna(method="bfill", axis=1) 
    print(f"Filled NAs for {group}")
    if df[group].isnull().values.any():
        df[group] = df[group].fillna(method="ffill", axis=1)
        print(f"Filled NAs with ffill for {group}")
    return df

if __name__ == '__main__':
    group_names = [f"{i}_{j}" for i in range(1000) for j in range(1, 13)]
    groups = [group_names[i:i+12] for i in range(0, len(group_names), 12)]
    #num_processes = multiprocessing.cpu_count() - 30
    pool = multiprocessing.Pool(processes = 5)
    results = [pool.apply_async(fillna_for_group, args=(features_test.copy(), group)) for group in groups]
    output = [r.get() for r in results]
    pool.close()
    pool.join()
    features_test = pd.concat(output, axis=1)


Filled NAs for ['0_1', '0_2', '0_3', '0_4', '0_5', '0_6', '0_7', '0_8', '0_9', '0_10', '0_11', '0_12']
Filled NAs with ffill for ['0_1', '0_2', '0_3', '0_4', '0_5', '0_6', '0_7', '0_8', '0_9', '0_10', '0_11', '0_12']
Filled NAs for ['1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9', '1_10', '1_11', '1_12']
Filled NAs with ffill for ['1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9', '1_10', '1_11', '1_12']
Filled NAs for ['2_1', '2_2', '2_3', '2_4', '2_5', '2_6', '2_7', '2_8', '2_9', '2_10', '2_11', '2_12']
Filled NAs with ffill for ['2_1', '2_2', '2_3', '2_4', '2_5', '2_6', '2_7', '2_8', '2_9', '2_10', '2_11', '2_12']
Filled NAs for ['3_1', '3_2', '3_3', '3_4', '3_5', '3_6', '3_7', '3_8', '3_9', '3_10', '3_11', '3_12']
Filled NAs for ['4_1', '4_2', '4_3', '4_4', '4_5', '4_6', '4_7', '4_8', '4_9', '4_10', '4_11', '4_12']
Filled NAs with ffill for ['3_1', '3_2', '3_3', '3_4', '3_5', '3_6', '3_7', '3_8', '3_9', '3_10', '3_11', '3_12']
Filled NAs with ffill for ['4

In [None]:
last_group_processed = False
    for col in features_join.columns:
        col_str = str(col)
        if col_str in group_names:
            split_col = col_str.split("_")
            for group in groups:
                features_join[group] = features_join[group].fillna(method="bfill", axis=1)
                print(f"Filled NAs for {group}")
                if group == groups[-1]:
                    last_group_processed = True
            if last_group_processed:
                for group in groups:
                    if features_join[group].isnull().values.any():
                        features_join[group] = features_join[group].fillna(method="ffill", axis=1)
                        print(f"Filled NAs with ffill for {group}")
                break

In [26]:
%%time
if impute_manual:
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 1 NaN cell %: {bcolors.YL}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by year and SEA across group average')
    # First, save the features groups by the column name 
    #(i.e. we want only feature 1 values to be imputed for feature 1)
    def fillna_for_group(df, group):
        df[group] = df[group].fillna(method="bfill", axis=1)
        print(f"Filled NAs for {group}")
        if df[group].isnull().values.any():
            df[group] = df[group].fillna(method="ffill", axis=1)
            print(f"Filled NAs with ffill for {group}")
        return df

    if __name__ == '__main__':
        group_names = [f"{i}_{j}" for i in range(1000) for j in range(1, 13)]
        groups = [group_names[i:i+12] for i in range(0, len(group_names), 12)]
        num_processes = multiprocessing.cpu_count() - 30
        pool = multiprocessing.Pool(processes=num_processes)
        results = [pool.apply_async(fillna_for_group, args=(features_test.copy(), group)) for group in groups]
        output = [r.get() for r in results]
        pool.close()
        pool.join()
        features_test = pd.concat(output, axis=1)
                
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 1 NaN cell %: {bcolors.YL}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by year and SEA across group average')
    features_test = (
        features_test
        .fillna(features_test
                .groupby(['year', 'sea_unq'], as_index=False) 
                .transform('mean')
               )
    )
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 2 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 2 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.YL}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by year and SEA across group average')
    features_test = (
        features_test
        .fillna(features_test
                .groupby(['sea_unq'], as_index=False)
                .transform('mean'), 
               )
    )
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 2 NaN row count: {bcolors.GR}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 2 NaN row %: {bcolors.GR}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.GR}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 3: Drop remaining NaN values\n')
    features_test = features_test.dropna(axis=0)
    print(f'Ending total row count: {bcolors.BL}{len(features_join)}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.GR}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}')
    
# The else section is a basic simple imputation
else: 
    # Store the geometry column separately
    geometry_col = features_join['geometry_y']
    # Remove the geometry column from the DataFrame
    features_join = features_join.drop(columns=['geometry_y'])
    features_join = features_join.set_index(['year', 'sea_unq'])
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer.fit_transform(features_join)
    features_join[:] = imputer.transform(features_join)
    features_join = features_join.reset_index()
    # Add the geometry column back to the DataFrame
    features_join['geometry'] = geometry_col

Post step 1 NaN row count: [1;33m32888[0m 
Post step 1 NaN row %: [1;33m99.87[0m 
Post step 1 NaN cell %: [1;33m29.70[0m 

Step 2: Filling NaN values by year and SEA across group average
Filled NAs for ['0_1', '0_2', '0_3', '0_4', '0_5', '0_6', '0_7', '0_8', '0_9', '0_10', '0_11', '0_12']
Filled NAs with ffill for ['0_1', '0_2', '0_3', '0_4', '0_5', '0_6', '0_7', '0_8', '0_9', '0_10', '0_11', '0_12']
Filled NAs for ['1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9', '1_10', '1_11', '1_12']
Filled NAs with ffill for ['1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9', '1_10', '1_11', '1_12']
Filled NAs for ['2_1', '2_2', '2_3', '2_4', '2_5', '2_6', '2_7', '2_8', '2_9', '2_10', '2_11', '2_12']
Filled NAs with ffill for ['2_1', '2_2', '2_3', '2_4', '2_5', '2_6', '2_7', '2_8', '2_9', '2_10', '2_11', '2_12']
Filled NAs for ['3_1', '3_2', '3_3', '3_4', '3_5', '3_6', '3_7', '3_8', '3_9', '3_10', '3_11', '3_12']
Filled NAs with ffill for ['3_1', '3_2', '3_3', '3_4'

ValueError: Grouper for 'year' not 1-dimensional

In [None]:
if impute_manual:
    ln_ft = len(features_join) # This saves the total length of the features_join dataframe 
    ln_na = len(features_join.dropna()) # This saves the length of the features_join dataframe without NA values 
    # (which is none since every row has at least 1 missing value)
    
    # This print statement simply helps us keep track of the number of rows 
    # and what we are currently on before starting the process.
    print(f'Starting total row count: {bcolors.BL}{ln_ft}{bcolors.RESET}',
          f'\nPre-Impute NaN row count: {bcolors.RD}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPre-Impute NaN row %: {bcolors.RD}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPre-Impute NaN cell %: {bcolors.RD}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 1: Filling NaN values by month and feature') 
    # We need to make a for loop so that the backfill method can iterate for each feature and not mess up between transitions of cols
    group_names = [f"{i}_{j}" for i in range(1000) for j in range(1, 13)]
    groups = [group_names[i:i+12] for i in range(0, len(group_names), 12)]

    # # Then use a for loop to basically make sure the columns are in the feature form. 
    # # We don't want to impute features based on survey columns.
    # for col in features_join.columns:
    #     col_str = str(col)
    #     if col_str in group_names:
    #         split_col = col_str.split("_")
    #         for group in groups:
    #             features_join[group] = features_join[group].fillna(method="bfill", axis=1)
    #             #print(f"Filled NAs for {group}")
    def fillna_for_group(df, group):
        df[group] = df[group].fillna(method="bfill", axis = "columns")
        print(f"Filled NAs for {group}")
        return df

    # Define the number of processes to use
    num_processes = multiprocessing.cpu_count() - 35

    # Split the group names into chunks for each process
    group_chunks = [group_names[i:i+num_processes] for i in range(0, len(group_names), num_processes)]

    # Create a process pool and map the function over each chunk of group names
    with multiprocessing.Pool(processes=num_processes) as pool:
        for group_chunk in group_chunks:
            results = [pool.apply_async(fillna_for_group, args=(features_join, group)) for group in group_chunk]
            output = [r.get() for r in results]
            
            
    ln_na = len(features_join.dropna())
    print(f'Post step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 1 NaN cell %: {bcolors.YL}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by year and SEA across group average')
    features_join = (
        features_join
        .fillna(features_join
                .groupby(['year', 'sea_unq'], as_index=False) 
                .transform('median')
               )
    )
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 2 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 2 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.YL}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by year and SEA across group average')
    features_join = (
        features_join
        .fillna(features_join
                .groupby(['sea_unq'], as_index=False)
                .transform('median'), 
               )
    )
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 2 NaN row count: {bcolors.GR}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 2 NaN row %: {bcolors.GR}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.GR}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 3: Drop remaining NaN values\n')
    features_join = features_join.dropna(axis=0)
    print(f'Ending total row count: {bcolors.BL}{len(features_join)}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.GR}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}')
    
# The else section is a basic simple imputation
else: 
    # Store the geometry column separately
    geometry_col = features_join['geometry_y']
    # Remove the geometry column from the DataFrame
    features_join = features_join.drop(columns=['geometry_y'])
    features_join = features_join.set_index(['year', 'sea_unq'])
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer.fit_transform(features_join)
    features_join[:] = imputer.transform(features_join)
    features_join = features_join.reset_index()
    # Add the geometry column back to the DataFrame
    features_join['geometry'] = geometry_col

### Congratulations on completing this analysis!