# Prepare Dataset for Publishing

<a target="_blank" href="https://colab.research.google.com/github/nasaharvest/helmets-kenya/blob/main/notebooks/3_Kenya_dataset_publish.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

**Author**: Ivan Zvonkov

**Last Updated**: May 19, 2025

**Description**
1. Load and clean dataset
2. Display dataset statistics
3. Prepare dataset for publishing to Zenodo

In [None]:
from google.colab import drive
from shapely.geometry import Point
from tqdm import tqdm

import pandas as pd
import geopandas as gpd

## 1. Load and clean dataset

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount("/content/drive")

In [None]:
# Read each batch CSV from the project folder on Drive
folder = "/content/drive/MyDrive/[PROJECT] Helmets Kenya/Data/CSVs/"
batch_files = [
    "Kenya_2021_batch202404.csv",
    "Kenya_2021_batch202407.csv",
    "Kenya_2022_batch202407.csv",
    "Kenya_2021_batch202408.csv",
    "Kenya_2022_batch202408.csv",
    "Kenya_2021_batch202409.csv",
    "Kenya_2021_batch202410.csv",
    "Kenya_2021_batch202411.csv",
    "Kenya_2021_batch202502.csv",
    "Kenya_2021_batch202503.csv",
    "Kenya_2021_batch202505.csv",
]
# Unpack in list order so df1..df11 map onto the files above one-to-one
df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11 = (
    pd.read_csv(folder + file_name) for file_name in batch_files
)

In [None]:
# Tag every batch with its year (stored as a string), grouped by year
for batch in (df1, df2, df4, df6, df7, df8, df9, df10, df11):
    batch["year"] = "2021"
for batch in (df3, df5):
    batch["year"] = "2022"

In [None]:
# Correct to GADM admin zones
# Level-2 GADM 4.1 boundaries for Kenya, loaded once and reused by every call below.
gdf_gadm2 = gpd.read_file("https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_KEN_2.json")

def correct_to_gadm(df):
    """Set ``adm1``/``adm2`` from the GADM polygon that contains each point.

    Args:
        df: DataFrame with ``longitude`` and ``latitude`` columns; the points
            are declared as EPSG:4326 for the spatial join.

    Returns:
        The same ``df`` object with ``adm1`` taken from GADM ``NAME_1`` and
        ``adm2`` from GADM ``NAME_2``. Points that fall within no polygon get
        missing values (left join).
    """
    geometry = [Point(xy) for xy in zip(df["longitude"], df["latitude"])]
    gdf_points = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")
    gdf_points_gadm2 = gpd.sjoin(gdf_points, gdf_gadm2, how='left', predicate="within")

    # A left join emits one row per match, so a point lying inside overlapping
    # polygons shows up more than once under the same index label. Keep the
    # first match per point; otherwise the index-aligned assignments below
    # fail on the duplicate labels.
    gdf_points_gadm2 = gdf_points_gadm2[~gdf_points_gadm2.index.duplicated(keep="first")]

    df["adm1"] = gdf_points_gadm2["NAME_1"]
    df["adm2"] = gdf_points_gadm2["NAME_2"]

    return df

# Only df1 and df2 are corrected here; the other batches keep the values they were loaded with.
df1 = correct_to_gadm(df1)
df2 = correct_to_gadm(df2)

In [None]:
# Stack all batches into one frame with a fresh 0..n-1 index
all_batches = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11]
df = pd.concat(all_batches).reset_index(drop=True)

# Normalise crop labels: strip zero-width spaces, then lowercase
crop_labels = df["crop_type"].str.replace("\u200b", "")
df["crop_type"] = crop_labels.str.lower()
maize_mask = df["crop_type"] == "maize"
df["is_maize"] = maize_mask.astype(int)

# Split into crop rows (is_crop == 1) and non-crop rows (is_crop == 0)
crop_mask = df["is_crop"] == 1
road_mask = df["is_crop"] == 0
df_crops = df.loc[crop_mask].copy()
df_road = df.loc[road_mask].copy()
print(f"Total crops: {len(df_crops)}")

In [None]:
# Resolve duplicates: within each group keep only the last row per capture_info
df_crops = df_crops.drop_duplicates(subset="capture_info", keep="last")
df_road = df_road.drop_duplicates(subset="capture_info", keep="last")

# Recombine the two de-duplicated groups under a fresh index
deduped_groups = [df_crops, df_road]
df_deduped = pd.concat(deduped_groups).reset_index(drop=True)
print(f"Total crops: {len(df_crops)}")

In [None]:
# Remove discarded points from Final Review Round
# Each list below holds capture_info values to drop; the list name records the
# reason the point was discarded.

# Discard reason: people in photo
people_in_photo = [
    "2022_07_13_Nakuru_3-*-G0198444",
    "2022_07_13_Nakuru_3-*-G0198447",
    "2022_07_13_Nakuru_3-*-G0198459",
]

# Inclusive capture_info range, selected by string (lexicographic) comparison
start_condition = (df_deduped["capture_info"] >=  "2021-07-05-T1-GPAK2671")
end_condition = (df_deduped["capture_info"] <=  "2021-07-05-T1-GPAK2934")

# Discard reason: tree boundary.
# Individually listed IDs plus every crop row (is_crop == 1) in the range above.
tree_boundary = [
    "2022_07_13_Nakuru_3-*-G0182336",
    "2022_07_13_Nakuru_2-G0075290",
    "2022_07_13_Nakuru_2-G0075880",
    "2022_07_13_Nakuru_2-G0076337",
    "2021-07-11-T1-GPJK5584",
    "2021-07-11-T1-GPJK5586",

# Tree boundary issue range from Chris
] + df_deduped[start_condition & end_condition & (df_deduped["is_crop"] == 1)]["capture_info"].tolist()

# Discard reason: on or near a building
on_or_near_building = [
    "2022_07_13_Nakuru_3-*-G0182315",
    "2022_07_13_Nakuru_2-G0086524",
    "2022_07_13_Nakuru_2-G0075035",
    "2022_07_13_Nakuru_2-G0075251",
]

# Discard reason: wrong field
wrong_field = [
    "2022_07_13_Nakuru_2-G0064095",
    "2022_07_13_Nakuru_2-G0064096",
    "2022_07_13_Nakuru_2-G0064097",
    "2022_07_13_Nakuru_2-G0075912",
]

# Discard reason: outside or on the edge of the field
outside_or_edge_of_field = [
    "2022_07_13_Nakuru_2-G0064636",
    "2022_07_13_Nakuru_2-G0064759",
    "2022_07_13_Nakuru_2-G0074874",
    "2022_07_13_Nakuru_2-G0075124",
    "2022_07_13_Nakuru_2-G0075427",
    "2022_07_13_Nakuru_2-G0075422",
    "2022_07_13_Nakuru_2-G0075418",
    "2022_07_13_Nakuru_2-G0075429",
    "2022_07_13_Nakuru_2-G0075868",
    "2022_07_13_Nakuru_2-G0076004",
    "2021-08-02-T1-GPAH3578",
    "2021-08-02-T1-GPAG3424",
    "2021-08-02-T1-GPAE3358",
    "2021-08-02-T1-GPAL3884",
    "2021-08-02-T1-GPAL3886",
    "2021-08-02-T1-GPAL3892"
]

# Discard reason: wrong side
wrong_side = [
    "2022_07_13_Nakuru_2-G0096544",
    "2021-07-11-T1-GPJS6173",
    "2021-07-11-T1-GPJQ5855",
    "2021-07-11-T1-GPJK5585",
    "2021-07-11-T1-GPJK5580",
    "2021-07-11-T1-GPJG5481",
]

# Discard reason: fallow
fallow = [
    "2021-08-02-T1-GPAL3874",
    "2021-08-02-T1-GPAF3397"
]

In [None]:
# Gather every capture_info flagged for removal, across all discard reasons
capture_infos = people_in_photo + on_or_near_building + wrong_field + outside_or_edge_of_field + fallow + wrong_side + tree_boundary

# Sanity check that the flagged IDs are actually present in the dataset.
# NOTE(review): the `// 2` presumably reflects each flagged capture_info
# matching two rows in df_deduped - confirm.
n_expected = len(capture_infos)
n_matched = df_deduped["capture_info"].isin(capture_infos).sum() // 2
if n_expected == n_matched:
    print(f"Found all capture infos to remove: {len(capture_infos)}")
else:
    # Report a mismatch explicitly; silence would look the same as a skipped cell
    print(f"WARNING: expected {n_expected} capture infos to remove but matched {n_matched}")

In [None]:
# Drop every row whose capture_info was flagged for removal
df_clean = df_deduped[~df_deduped["capture_info"].isin(capture_infos)].copy()

# Resolve asterisk issue - has to be after dedupe
# regex=False forces literal matching. Read as regular expressions, "*-" is an
# invalid pattern and "/*" matches any run of slashes (stripping every "/"),
# and pandas releases before 2.0 treat the pattern as a regex by default.
df_clean["capture_info"] = df_clean["capture_info"].str.replace("*-", "", regex=False)
df_clean["image_url"] = df_clean["image_url"].str.replace("/*", "", regex=False)

## 2. Dataset Statistics

In [None]:
# Crop rows (is_crop == 1) that remain after cleaning
is_crop_row = df_clean["is_crop"] == 1
df_crops = df_clean.loc[is_crop_row]
print(f"Total crops: {len(df_crops)}")

In [None]:
# Crop type amounts: number of cleaned crop rows per crop_type value
df_crops["crop_type"].value_counts()

In [None]:
# nunique() skips missing values, so rows without an adm1 are not counted as a county
print("Number of counties: ", df_crops["adm1"].nunique())

In [None]:
# Crop row counts for each (year, adm1) combination
df_crops.value_counts(subset=["year", "adm1"])

## 3. Prepare dataset for publishing

In [None]:
# Authenticate the gcloud CLI (interactive prompt) before running the storage commands
!gcloud auth login

In [None]:
# Get images already public
!gcloud storage ls gs://street2sat-public/KENYA_v2/** >> gcloud_storage_uris.txt

In [None]:
# Load the bucket listing, one gs:// URI per line
with open('/content/gcloud_storage_uris.txt') as uri_file:
    uri_listing = uri_file.read()
gs_uris = uri_listing.splitlines()
len(gs_uris)

In [None]:
# Derive the public-bucket URL for each image
public_urls = df_clean["image_url"].str.replace("uploaded", "public")
df_clean["public_url"] = public_urls

# Convert to gs:// form and flag the images already present in the bucket listing
public_gs_uris = public_urls.str.replace("https://storage.cloud.google.com/", "gs://")
df_clean["is_public"] = public_gs_uris.isin(gs_uris)
df_clean["is_public"].value_counts()

In [None]:
# Copy over images into public bucket
df_not_public = df_clean[~df_clean["is_public"]].copy()
if len(df_not_public) == 0:
    print("All images are already in public bucket")
else:
    for image_url in tqdm(df_not_public["image_url"].unique()):
        src = image_url.replace("https://storage.cloud.google.com/", "gs://")
        dest = src.replace("uploaded", "public")
        !gsutil cp -n $src $dest

In [None]:
# Current columns (inspect before dropping the helper columns)
df_clean.columns

In [None]:
# Remove columns that should not be part of the published dataset
unpublished_columns = ["is_public", "public_url", "gcloud_folder"]
df_clean = df_clean.drop(columns=unpublished_columns)

In [None]:
# Rewrite image_url by swapping "uploaded" for "public", then spot-check one value
public_image_urls = df_clean["image_url"].str.replace("uploaded", "public")
df_clean["image_url"] = public_image_urls
public_image_urls.iloc[-10]

In [None]:
# Bucket-relative path: the public URL with the bucket prefix removed
image_paths = df_clean["image_url"].str.replace("https://storage.cloud.google.com/street2sat-public/", "")
df_clean["image_path"] = image_paths
image_paths.iloc[-10]

In [None]:
# Write the cleaned dataset to CSV in the working directory, without the index column
df_clean.to_csv("Helmets_Kenya_v2.csv", index=False)