# Intercomparison

**Author:**

**Last updated:**

**Description:** Runs intercomparison for [Country Year]

## 1. Setup

In [None]:
# !earthengine authenticate

In [None]:
!git clone https://github.com/nasaharvest/crop-mask.git

In [None]:
!pip install cartopy -qq
!pip install rasterio -qq
!pip install dvc[gs] -qq

In [None]:
import ee
import geemap
import sys
import pandas as pd
import numpy as np

import geopandas as gpd
from pathlib import Path

ee.Authenticate()
ee.Initialize(project="bsos-geog-harvest1")

sys.path.append("../..")

from src.compare_covermaps import TARGETS, filter_by_bounds, generate_report, CLASS_COL, COUNTRY_COL, get_ensemble_area
from src.compare_covermaps import TEST_COUNTRIES, TEST_CODE

## 2. Read in evaluation set

In [None]:
# Country to evaluate. It must be a key of both TEST_CODE and TEST_COUNTRIES
# (imported from src/compare_covermaps.py); otherwise `country_code` is left
# undefined and the cells below cannot run.
country = "<COUNTRY STRING GOES HERE>"

code_known = country in TEST_CODE
dataset_known = country in TEST_COUNTRIES

if not code_known:
    print(f"WARNING: {country} not found in TEST_CODE in src/compare_covermaps.py")
if not dataset_known:
    print(f"WARNING: {country} not found in TEST_COUNTRIES in src/compare_covermaps.py")

if code_known and dataset_known:
    country_code = TEST_CODE[country]
    # dataset_path = "../" + TEST_COUNTRIES[country]
else:
    print("Please update src/compare_covermaps.py and restart the notebook.")

In [None]:
# !dvc pull data/datasets

In [None]:
# Locations of the two CEO reference sample CSVs. Both are handed to
# reference_sample_agree(), which loads them with pd.read_csv.
# Replace the placeholders before running the notebook.
ceo_set1 = "<PATH TO CEO REFERENCE SAMPLE SET1>"
ceo_set2 = "<PATH TO CEO REFERENCE SAMPLE SET2>"

In [None]:
def reference_sample_agree(ceo_ref1, ceo_ref2):
    """Return the reference points on which two CEO labelling sets agree.

    Both inputs are loaded with ``pd.read_csv``. The label question is taken
    to be the last column, which must carry the same name in both files.
    Samples whose answers match are kept, turned into EPSG:4326 points from
    their ``lon``/``lat`` columns and given a binary crop class.

    Args:
        ceo_ref1: Path (or any ``pd.read_csv`` source) of the first label set.
        ceo_ref2: Path (or any ``pd.read_csv`` source) of the second label set.

    Returns:
        GeoDataFrame with columns ``lat``, ``lon``, ``CLASS_COL`` (1 for
        crop/cropland/planted, 0 for non-crop/non-cropland/not planted),
        ``COUNTRY_COL`` and ``geometry``.

    Raises:
        AssertionError: If the label columns are named differently, or the
            agreed samples do not contain exactly two distinct responses.

    Note:
        ``COUNTRY_COL`` is filled from the notebook-level ``country`` variable,
        which therefore has to be defined before this function is called.
    """
    ceo_ref1 = pd.read_csv(ceo_ref1)
    ceo_ref2 = pd.read_csv(ceo_ref2)

    assert (
        ceo_ref1.columns[-1] == ceo_ref2.columns[-1]
    ), "The last (label) column must have the same name in both reference sets."

    label_question = ceo_ref1.columns[-1]

    print(f"Number of NANs/ missing answers in set 1: {ceo_ref1[label_question].isna().sum()}")
    print(f"Number of NANs/ missing answers in set 2: {ceo_ref2[label_question].isna().sum()}")

    if ceo_ref1.shape[0] != ceo_ref2.shape[0]:
        print("The number of rows in the reference sets are not equal.")
        print("Checking for duplictes on 'plotid'..")
        print(
            " Number of duplicated in set 1: %s" % ceo_ref1[ceo_ref1.plotid.duplicated()].shape[0]
        )
        print(
            " Number of duplicated in set 2: %s" % ceo_ref2[ceo_ref2.plotid.duplicated()].shape[0]
        )
        print("Removing duplicates and keeping the first...")
        ceo_ref1 = ceo_ref1.drop_duplicates(subset="plotid", keep="first").set_index("plotid")
        ceo_ref2 = ceo_ref2.drop_duplicates(subset="plotid", keep="first").set_index("plotid")

        # pandas refuses to compare Series whose indexes differ, so restrict
        # both sets to the plots they share (in set 1's order) before comparing.
        shared_ids = ceo_ref1.index.intersection(ceo_ref2.index)
        only_in_1 = ceo_ref1.shape[0] - len(shared_ids)
        only_in_2 = ceo_ref2.shape[0] - len(shared_ids)
        if only_in_1 or only_in_2:
            print(
                " Ignoring plots present in only one set: %d in set 1, %d in set 2"
                % (only_in_1, only_in_2)
            )
        ceo_ref1 = ceo_ref1.loc[shared_ids]
        ceo_ref2 = ceo_ref2.loc[shared_ids]
    else:
        # NOTE(review): rows are compared by position here; this assumes both
        # files list the same plots in the same order — confirm for new data.
        print("The number of rows in the reference sets are equal.")

    # NaN never equals NaN, so unanswered samples are excluded automatically.
    agree_mask = ceo_ref1[label_question] == ceo_ref2[label_question]
    ceo_agree = ceo_ref1[agree_mask].copy()

    print(
        "Number of samples that are in agreement: %d out of %d (%.2f%%)"
        % (
            ceo_agree.shape[0],
            ceo_ref1.shape[0],
            ceo_agree.shape[0] / ceo_ref1.shape[0] * 100,
        )
    )
    ceo_agree_geom = gpd.GeoDataFrame(
        ceo_agree,
        geometry=gpd.points_from_xy(ceo_agree.lon, ceo_agree.lat),
        crs="EPSG:4326",
    )

    label_responses = ceo_agree_geom[label_question].unique()
    assert (
        len(label_responses) == 2
    ), f"Expected exactly two label responses, found: {list(label_responses)}"

    # Case-insensitive lookup from the accepted answer wordings to the class.
    label_to_class = {
        "crop": 1,
        "cropland": 1,
        "planted": 1,
        "non-crop": 0,
        "non-cropland": 0,
        "not planted": 0,
    }
    normalized = ceo_agree_geom[label_question].astype(str).str.lower()
    classes = normalized.map(label_to_class)

    # Answers outside the lookup cannot be classified; drop them visibly
    # instead of letting NaN reach the integer cast below.
    unrecognized = classes.isna()
    if unrecognized.any():
        print(
            "WARNING: dropping %d samples with unrecognized labels: %s"
            % (unrecognized.sum(), sorted(normalized[unrecognized].unique()))
        )
    ceo_agree_geom = ceo_agree_geom[~unrecognized].copy()

    ceo_agree_geom[CLASS_COL] = classes[~unrecognized].astype(int)
    ceo_agree_geom[COUNTRY_COL] = country
    ceo_agree_geom = ceo_agree_geom[["lat", "lon", CLASS_COL, COUNTRY_COL, "geometry"]]

    return ceo_agree_geom

In [None]:
# Build the evaluation points from the samples both CEO sets agree on, then
# pass them through filter_by_bounds for the selected country code.
agreed_samples = reference_sample_agree(ceo_set1, ceo_set2)
gdf = filter_by_bounds(gdf=agreed_samples, country_code=country_code)

In [None]:
# if not Path(dataset_path).exists():
#     print(f"WARNING: Dataset: {dataset_path} not found, run `dvc pull data/datasets` from root.")
# else:
#     df = pd.read_csv(dataset_path)[["lat", "lon", "class_probability", "subset"]]
#     df = df[(df["class_probability"] != 0.5)].copy()
#     # use only test data because validation points used for harvest-dev map
#     df = df[df["subset"] == "testing"].copy()
#     df[CLASS_COL] = (df["class_probability"] > 0.5).astype(int)
#     df[COUNTRY_COL] = country

#     gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat), crs="epsg:4326")
#     gdf = filter_by_bounds(country_code=country_code, gdf=gdf)

## 3. Run intercomparison

In [None]:
# Preview the first rows of the evaluation points before sampling the maps.
gdf.head()

In [None]:
# Work on a shallow copy of the TARGETS mapping. The map objects themselves
# are shared, so the year assignment below mutates them in place.
TARGETS = dict(TARGETS)
for cropmap in TARGETS.values():
    # Maps without a fixed year take the collection year listed for this country.
    if country in cropmap.countries and cropmap.year is None:
        cropmap.year = cropmap.collection_years[cropmap.countries.index(country)]

In [None]:
# Year of the reference samples; replace the placeholder with an integer.
reference_year = "<YEAR INTEGER GOES HERE>"

# Keep only the maps dated within one year of the reference year.
accepted_years = [reference_year - 1, reference_year, reference_year + 1]
TARGETS = {title: cropmap for title, cropmap in TARGETS.items() if cropmap.year in accepted_years}

In [None]:
# Sample every map that covers the country at the evaluation points and
# attach the sampled values to gdf, joining on the coordinates.
for cropmap in TARGETS.values():
    if country in cropmap.countries:
        print(f"[{country}] sampling {cropmap.title}...")
        sampled_points = cropmap.extract_test(gdf).copy()
        gdf = pd.merge(gdf, sampled_points, on=["lat", "lon"], how="left")
        gdf.drop_duplicates(inplace=True)  # TODO find why points get duplicated

In [None]:
# Area counts per map, keyed by map title; only maps covering the country
# are processed. compute_map_area is called with export=True.
a_j = {}
for cropmap in TARGETS.values():
    if country in cropmap.countries:
        print(f"[{country}] calculating pixel area for {cropmap.title}...")
        map_area = cropmap.compute_map_area(country, export=True, dataset_name=cropmap.title)
        a_j[cropmap.title] = map_area.copy()

In [None]:
# Update a_j values with exported values.
# For each map, look for its exported area CSV in the working directory and,
# when present, replace the entry in a_j with [noncrop_sum, crop_sum].
for map_title in list(a_j):
    # The file name follows the selected country instead of a fixed one.
    export_path = f"./Crop_NonCrop_Area_Sum_Export-{country}-{map_title}.csv"
    try:
        area_df = pd.read_csv(export_path)
    except FileNotFoundError:
        # No export available for this map: keep the value already in a_j.
        print(f"No exported area file found for {map_title} ({export_path}); keeping current value.")
        continue
    crop_area = int(area_df["crop_sum"].iloc[0])
    noncrop_area = int(area_df["noncrop_sum"].iloc[0])
    a_j[map_title] = np.array([noncrop_area, crop_area])

In [None]:
# Update a_j values with exported values.
# For each map, look for its exported area CSV in the working directory and,
# when present, replace the entry in a_j with [noncrop_sum, crop_sum].
for map_title in list(a_j):
    # The file name follows the selected country instead of a fixed one.
    export_path = f"./Crop_NonCrop_Area_Sum_Export-{country}-{map_title}.csv"
    try:
        area_df = pd.read_csv(export_path)
    except FileNotFoundError:
        # No export available for this map: keep the value already in a_j.
        print(f"No exported area file found for {map_title} ({export_path}); keeping current value.")
        continue
    crop_area = int(area_df["crop_sum"].iloc[0])
    noncrop_area = int(area_df["noncrop_sum"].iloc[0])
    a_j[map_title] = np.array([noncrop_area, crop_area])

In [None]:
# Change None to nan: any entry containing None becomes a [nan, nan] pair,
# every other entry is kept as it is.
normalized_areas = {}
for map_title, area_counts in a_j.items():
    # `== None` is intentional: it compares element-wise on arrays.
    if np.any(area_counts == None):  # noqa: E711
        normalized_areas[map_title] = np.array([np.nan, np.nan])
    else:
        normalized_areas[map_title] = area_counts
a_j = normalized_areas

In [None]:
from src.area_utils import compute_area_estimate, compute_area_error_matrix, compute_std_p_i
from sklearn.metrics import confusion_matrix

In [None]:
# compute area estimate for each map
def compute_area_estimate(dataset, true, pred, a_j, resolution):
    """Estimate the area of class index 1 for one map, with its error margin.

    This definition replaces the ``compute_area_estimate`` imported from
    ``src.area_utils`` earlier in the notebook.

    Args:
        dataset: Label stored in the ``dataset`` column of the result.
        true: Reference class labels.
        pred: Map class labels for the same samples.
        a_j: Array of per-class counts for the map; its sum is used as the
            total and its shares as the class weights.
        resolution: Pixel edge length (presumably metres, given the
            conversion to hectares below — TODO confirm).

    Returns:
        One-row DataFrame with columns ``dataset``, ``area_ha`` and
        ``err_ha``, rounded to two decimals.
    """
    error_counts = confusion_matrix(true, pred)
    n_pixels = a_j.sum()
    map_weights = a_j / n_pixels

    area_matrix = compute_area_error_matrix(error_counts, map_weights)
    class_shares = area_matrix.sum(axis=1)
    # 1.96 scales the standard deviation (presumably a 95% interval — TODO confirm).
    share_margin = 1.96 * compute_std_p_i(map_weights, area_matrix, error_counts)

    area_px = n_pixels * class_shares
    margin_px = share_margin * n_pixels

    # Index 1 is reported; squared resolution over 100**2 rescales to hectares.
    estimate = {
        "dataset": dataset,
        "area_ha": area_px[1] * (resolution**2) / (100**2),
        "err_ha": margin_px[1] * (resolution**2) / (100**2),
    }
    return pd.DataFrame(data=estimate, index=[0]).round(2)

In [None]:
# Score every sampled map against the reference labels and collect one
# report row and one area-estimate row per map.
report_frames = []
area_frames = []
for target in TARGETS.values():
    title, resolution = target.title, target.resolution
    if title not in gdf.columns:
        continue
    # Drop points that lack either the reference label or this map's value.
    paired = gdf[[CLASS_COL, title]].dropna()
    y_true, y_pred = paired[CLASS_COL], paired[title]
    area = compute_area_estimate(title, y_true, y_pred, a_j[title], resolution)
    comparison = generate_report(title, country, y_true, y_pred, a_j[title], area_weighted=True)
    report_frames.append(comparison)
    area_frames.append(area)

comparisons = pd.concat(report_frames).set_index(["dataset"])
area_est = pd.concat(area_frames).set_index(["dataset"])

# Join the accuracy report and the area estimates on the dataset name.
results = comparisons.merge(area_est, on="dataset")

In [None]:
# Write the combined results table to results.csv in the working directory.
results.to_csv('results.csv')

In [None]:
# Show the headline metrics and the area estimate for every map.
summary_columns = [
    "crop_f1",
    "accuracy",
    "std_acc",
    "crop_recall_pa",
    "std_crop_pa",
    "crop_precision_ua",
    "std_crop_ua",
    "area_ha",
    "err_ha",
]
results[summary_columns]

## 4. Plot area estimate and error

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the estimated area per map, with err_ha drawn as
# error bars.
fig, ax = plt.subplots()

# One colour per map, spread evenly over the viridis colormap.
n = len(results)
colors = plt.cm.viridis(np.linspace(0, 1, n))

ax.barh(
    results.index,
    results["area_ha"],
    xerr=results["err_ha"],
    align="center",
    alpha=0.5,
    ecolor="black",
    capsize=10,
    color=colors,
)

# Annotate each bar with its value and error at the bar's end.
for i, (value, err) in enumerate(zip(results["area_ha"], results["err_ha"])):
    ax.text(value, i, f"{value} ± {err}", ha="center", va="bottom")

# barh draws the values along the x axis, so the area label belongs there.
ax.set_xlabel("Area (ha)")
ax.set_title("Area of cropland")
ax.spines["right"].set_visible(False)
plt.show()