# Merge Labels

In [1]:
import warnings
import time
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import geopandas
import pyarrow

from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning

# Zambia

In [None]:
# Read the district boundary shapefile and keep only the district name
# (stored as NAME_2 in the file) together with its geometry.
zambia_shp = (
    geopandas.read_file('data/gadm36_ZMB_shp/gadm36_ZMB_2.shp')
    .rename(columns={'NAME_2': 'district'})
    [['district', 'geometry']]
)

# Normalise the capitalisation of one district name before any matching is done.
zambia_shp['district'] = zambia_shp['district'].replace("MPongwe", 'Mpongwe', regex=True)

# Sorted list of unique district names, captured while 'district' is still a column.
zambia_districts = zambia_shp['district'].sort_values().unique().tolist()

# Index by district so other tables can be joined on the district name.
zambia_shp = zambia_shp.set_index('district')
zambia_shp.shape
# Optional visual check:
# zambia_shp.plot(column = 'district',linewidth = 1, edgecolor = 'black' )

# Crops

In [None]:
# Spelling corrections applied to the crop table's district names.
# The keys are used as regular-expression patterns (regex=True below).
district_name_fixes = {
    "Itezhi-tezhi": 'Itezhi-Tezhi',
    "Kapiri-Mposhi": 'Kapiri Mposhi',
    "Shang'ombo": 'Shangombo',
    "Chienge": 'Chiengi',
}

crop_df = pd.read_csv('data/cfs_maize_districts_zambia_2009_2018.csv')
crop_df['district'] = crop_df['district'].replace(district_name_fixes, regex=True)

# Sorted list of unique district names, taken after the spelling fixes.
crop_districts = crop_df['district'].sort_values().unique().tolist()

# Keep only the columns used downstream and index by district for joining.
# (To restrict the years, filter on crop_df.year before set_index.)
crop_df = crop_df[['district', 'year', 'yield_mt']].set_index('district')

In [None]:
# Districts that appear in the crop table but not in the shapefile.
# Filtering the already sorted, de-duplicated crop_districts list (instead of
# listing a set difference) keeps the displayed order stable across kernel
# restarts; set iteration order for strings varies with hash randomisation.
zambia_district_set = set(zambia_districts)
[name for name in crop_districts if name not in zambia_district_set]

In [None]:
# Districts that appear in the shapefile but not in the crop table.
# Filtering the already sorted, de-duplicated zambia_districts list (instead of
# listing a set difference) keeps the displayed order stable across kernel
# restarts; set iteration order for strings varies with hash randomisation.
crop_district_set = set(crop_districts)
[name for name in zambia_districts if name not in crop_district_set]

In [None]:
# Attach each district's geometry to its yield rows. Both frames are indexed
# by district, so DataFrame.join performs a left join on that index; the result
# is then wrapped as a GeoDataFrame so it can take part in spatial joins.
zambia_crop = geopandas.GeoDataFrame(crop_df.join(zambia_shp))
# Optional visual check:
# zambia_crop.plot(linewidth = 1, edgecolor = 'black')

# Features

In [None]:
# Build one point-feature table per year, tag every point with the district
# polygon (and that year's yield) it falls within, then stack the years.
#
# The per-year frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies the accumulated frame on every iteration.
yearly_features = []
for yr in range(2016, 2019):
    print("Opening:", yr)
    features_x = pd.read_feather(f'data/zambia/features_{yr}.feather')
    # Turn the lon/lat columns into point geometries, then drop the raw columns.
    features_x = geopandas.GeoDataFrame(
        features_x,
        geometry=geopandas.points_from_xy(x=features_x.lon, y=features_x.lat),
        crs='EPSG:4326',
    ).drop(['lon', 'lat'], axis=1)

    print("Joining:", yr)
    # Only this year's district rows, so each point matches at most one yield value
    # per district polygon.
    zambia_x = zambia_crop[zambia_crop.year == yr]
    # Left join keeps points that fall outside every district (their joined
    # columns are left missing).
    adm_features = features_x.sjoin(zambia_x, how='left', predicate='within')

    print("Appending:", yr)
    yearly_features.append(adm_features)

# Original row labels are kept (no ignore_index), matching what append produced.
# set_crs with allow_override=True mirrors the original unconditional CRS assignment.
features = geopandas.GeoDataFrame(pd.concat(yearly_features)).set_crs(
    'EPSG:4326', allow_override=True
)

In [None]:
# Keep only the points that were matched to a district, drop the geometry and
# the duplicate year column from the right side of the spatial join, and give
# the remaining join columns plain names.
#
# The spatial join stores the matched right-hand index (the district name) in a
# column called 'index_right' on older geopandas releases; geopandas >= 1.0
# reuses the right index's own name ('district') instead. Detect whichever is
# present so the cell works on both.
joined_district_col = 'index_right' if 'index_right' in features.columns else 'district'

adm_features = (
    features
    .dropna(subset=[joined_district_col])          # unmatched points have no district
    .drop(['geometry', 'year_right'], axis=1)
    .rename(columns={joined_district_col: "district", "year_left": "year"})
)

In [None]:
# Collapse the point-level rows to one row per (district, year) by averaging
# every other column. "yield_mt" is deliberately not a grouping key, so it is
# averaged along with the remaining columns.
group_keys = ['district', "year"]
features_summary = adm_features.groupby(group_keys, as_index=False).mean()
features_summary