# Approved Crop KMZ to CSV

<a target="_blank" href="https://colab.research.google.com/github/nasaharvest/street2sat/blob/main/notebooks/CropKMZtoCSV.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

**Author**: Ivan Zvonkov

**Last Modified**: Feb 24, 2025

**Description**: Converts the approved KMZ files into a single CSV file (for upload to Google Earth Engine).


In [1]:
!pip install simplekml -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for simplekml (setup.py) ... [?25l[?25hdone


In [2]:
from bs4 import BeautifulSoup
from google.colab import drive
from pathlib import Path
from tqdm import tqdm

import json
import pandas as pd
import simplekml
import xml.etree.ElementTree as ET
import zipfile


# Mount Google Drive; the KMZ source folder and the CSV destination used
# below both live under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


## 1. Convert each KMZ into a DataFrame

In [3]:
# Google Drive folder that holds the KMZ files to convert.
SRC_KMZ_FOLDER = "/content/drive/MyDrive/SatLabel Squad/Kenya/Reviewed 2025-02-24 (KMZ)"
# List the folder contents as a quick check that the path resolves.
!ls "{SRC_KMZ_FOLDER}"

AnjaliKENYA_v2_2021_07_18_T2_Busia_Kakamega_bg96_300_400.kmz
driveMyDriveKENYA_v2_2021_07_14_T2_Bungoma_TransNzoia_bg95_100_200_imansmith.kmz
KENYA_v2_2021_07_09_T2__Bungoma_bg99_bg99_100_154.kmz
KENYA_v2_2021_07_10_T2_110GOPRO__Bungoma_bg95_0_29.kmz
KENYA_v2_2021_07_10_T2_111GOPRO__Bungoma_bg97_0_80.kmz
KENYA_v2_2021_07_13_T2_100GOPRO__TransNzoia_bg99_0_100.kmz
KENYA_v2_2021_07_14_T2_Bungoma_TransNzoia_bg95_0_100.kmz
KENYA_v2_2021_07_14_T2_Bungoma_TransNzoia_bg95_200_300.kmz
KENYA_v2_2021_07_14_T2_Bungoma_TransNzoia_bg95_300_315.kmz
KENYA_v2_2021_07_17_T2_Nandi_Vihiga_bg95_0_100.kmz
KENYA_v2_2021_07_17_T2_Nandi_Vihiga_bg95_100_200.kmz
KENYA_v2_2021_07_18_T2_Busia_Kakamega_bg96_0_100.kmz
KENYA_v2_2021_07_18_T2_Busia_Kakamega_bg96_100_200.kmz
KENYA_v2_2021_07_18_T2_Busia_Kakamega_bg96_200_300_AnjaliPaliyam_imansmith.kmz
KENYA_v2_2021_07_18_T2_Busia_Kakamega_bg96_200_300_PLoh.kmz
KENYA_v2_2021_07_18_T2_Busia_Kakamega_bg96_300_400.kmz
KENYA_v2_2021_07_18_T2_Busia_Kakamega_bg96_400_500.kmz

In [4]:
# Sort the matches: glob yields entries in a filesystem-dependent order, and
# the row order of the merged CSV follows the order of this list.
SRC_KMZ_FILES = sorted(Path(SRC_KMZ_FOLDER).glob("*.kmz"))

In [19]:
def get_points_from_kmz(kmz_file_path):
    """Read every Placemark from a KMZ archive.

    Args:
        kmz_file_path: Path (str or Path) of a .kmz file, i.e. a zip archive
            that contains a .kml document.

    Returns:
        A list with one dict per Placemark found in the first .kml entry of
        the archive. Each dict maps the tag name (KML 2.2 namespace prefix
        removed) of the Placemark element and of each of its descendants to
        that element's text. If a tag occurs more than once inside a
        Placemark, the text of the last occurrence is kept.

    Raises:
        ValueError: If the archive contains no entry ending in '.kml'.
    """
    kml_namespace = 'http://www.opengis.net/kml/2.2'

    with zipfile.ZipFile(kmz_file_path, 'r') as kmz:
        kml_filenames = [name for name in kmz.namelist() if name.endswith('.kml')]
        if not kml_filenames:
            # Name the offending file instead of failing with a bare IndexError.
            raise ValueError(f"No .kml document found inside {kmz_file_path}")
        kml_data = kmz.read(kml_filenames[0])

    # ElementTree reports namespaced tags as '{namespace}Tag'; strip that
    # prefix so the dict keys are plain tag names.
    tag_prefix = f'{{{kml_namespace}}}'

    root = ET.fromstring(kml_data)
    kmz_points = []
    for placemark in root.findall('.//kml:Placemark', {'kml': kml_namespace}):
        # iter() yields the Placemark element itself followed by all descendants.
        kmz_points.append({
            child.tag.replace(tag_prefix, ''): child.text
            for child in placemark.iter()
        })

    return kmz_points

# Columns (and dtypes) of the frame returned for an empty input, listed in
# the same order the non-empty path produces them.
_EMPTY_POINT_DTYPES = {
    "latitude": "float64",
    "longitude": "float64",
    "is_crop": "int64",
    "is_maize": "int64",
    "crop_type": "object",
    "capture_info": "object",
    "capture_time": "object",
    "image_url": "object",
    "driving_northing": "float64",
    "driving_easting": "float64",
    "is_right_hand_drive": "bool",
    "adm1": "object",
    "adm2": "object",
    "gcloud_folder": "object",
}


def _field_value(tag):
    """Return the part of a 'Label: value' element's text after the first ': '."""
    # maxsplit=1 keeps values intact even when they contain ': ' themselves.
    return tag.text.split(": ", 1)[1]


def _parse_lat_lon(tag):
    """Parse the value of a 'Label: (a, b)' / 'Label: [a, b]' element into a list."""
    lat_lon = _field_value(tag)
    for bracket in "()[]":
        lat_lon = lat_lon.replace(bracket, "")
    return json.loads(f"[{lat_lon}]")


def kmz_points_to_dataframe(kmz_points):
    """Turn parsed KMZ placemarks into a DataFrame of crop / non-crop points.

    Each placemark's "description" is parsed as HTML. As read by this
    function, it contains three <h2> headings:
      * the first heading's text becomes ``capture_info``;
      * the second is followed by <p> elements giving adm1, adm2, the road
        lat/lon and the field lat/lon (in that order);
      * the third is followed by <p> elements giving the driving northing,
        driving easting and the right-hand-drive flag (in that order).
    The first <p> of the description supplies ``capture_time`` and the first
    <a> supplies ``image_url``. Every <p> is expected to look like
    'Label: value'.

    Every placemark yields two rows that share the metadata above:
      * a crop row (``is_crop`` = 1) at the field coordinates, whose
        ``crop_type`` is the placemark name stripped, lower-cased and with
        zero-width spaces removed;
      * a non-crop row (``is_crop`` = 0, empty ``crop_type``) at the road
        coordinates.

    Args:
        kmz_points: List of dicts with at least the keys "description" and
            "name" (the shape returned by ``get_points_from_kmz``).

    Returns:
        A DataFrame with two rows per placemark plus a ``gcloud_folder``
        column extracted from ``image_url`` (the two path components that
        follow 'street2sat-uploaded/'). For an empty ``kmz_points`` an empty
        DataFrame with the same columns is returned.
    """
    if not kmz_points:
        # Without this guard the "image_url" lookup below raises KeyError on
        # a column-less frame, so one placemark-free KMZ would abort a batch.
        return pd.DataFrame({
            column: pd.Series(dtype=dtype)
            for column, dtype in _EMPTY_POINT_DTYPES.items()
        })

    points = []

    for kmz_point in kmz_points:
        soup = BeautifulSoup(kmz_point["description"], 'html.parser')
        headings = soup.find_all('h2')

        # next_siblings runs to the end of the parent, so each list may also
        # hold <p> elements of later sections; only the leading ones are used.
        location_data = [item for item in headings[1].next_siblings if item.name == 'p']
        direction_data = [item for item in headings[2].next_siblings if item.name == 'p']

        # Metadata shared by the crop and the non-crop row of this placemark.
        row = {
            "capture_info": headings[0].text,
            "capture_time": _field_value(soup.find_all('p')[0]),
            "image_url": soup.find('a')['href'],
            "driving_northing": float(_field_value(direction_data[0])),
            "driving_easting": float(_field_value(direction_data[1])),
            "is_right_hand_drive": _field_value(direction_data[2]) == 'True',
            "adm1": _field_value(location_data[0]),
            "adm2": _field_value(location_data[1]),
        }

        road_lat_lon = _parse_lat_lon(location_data[2])
        field_lat_lon = _parse_lat_lon(location_data[3])

        crop_type = kmz_point["name"].strip().lower().replace("\u200b", "")

        points.append({
            "latitude": field_lat_lon[0],
            "longitude": field_lat_lon[1],
            "is_crop": 1,
            "is_maize": int(crop_type == "maize"),
            "crop_type": crop_type,
            **row,
        })
        points.append({
            "latitude": road_lat_lon[0],
            "longitude": road_lat_lon[1],
            "is_crop": 0,
            "is_maize": 0,
            "crop_type": "",
            **row,
        })

    df = pd.DataFrame(points)
    df["gcloud_folder"] = df["image_url"].str.extract(r'street2sat-uploaded/([^/]+/[^/]+)')
    return df


In [20]:
# Parse every KMZ file into its own DataFrame; tqdm reports progress per file.
dfs = [
    kmz_points_to_dataframe(get_points_from_kmz(kmz_file_path))
    for kmz_file_path in tqdm(SRC_KMZ_FILES)
]

100%|██████████| 20/20 [00:00<00:00, 23.83it/s]


## 2. Merge the DataFrames and export a single CSV file

In [21]:
# Stack the per-file frames into one; ignore_index renumbers rows 0..n-1.
df = pd.concat(dfs, ignore_index=True)

In [22]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,latitude,longitude,is_crop,is_maize,crop_type,capture_info,capture_time,image_url,driving_northing,driving_easting,is_right_hand_drive,adm1,adm2,gcloud_folder
0,0.235733,34.535798,1,1,maize,street2sat-uploaded/KENYA_v2/2021_07_18_T2/103...,2021-07-18 10:30:29,https://storage.cloud.google.com/street2sat-up...,-8.515203,-11.865159,False,Kakamega,Butere,KENYA_v2/2021_07_18_T2
1,0.235835,34.535683,0,0,,street2sat-uploaded/KENYA_v2/2021_07_18_T2/103...,2021-07-18 10:30:29,https://storage.cloud.google.com/street2sat-up...,-8.515203,-11.865159,False,Kakamega,Butere,KENYA_v2/2021_07_18_T2
2,0.237993,34.540919,1,1,maize,street2sat-uploaded/KENYA_v2/2021_07_18_T2/103...,2021-07-18 10:29:19,https://storage.cloud.google.com/street2sat-up...,-2.34595,-16.630129,False,Kakamega,Butere,KENYA_v2/2021_07_18_T2
3,0.238145,34.540893,0,0,,street2sat-uploaded/KENYA_v2/2021_07_18_T2/103...,2021-07-18 10:29:19,https://storage.cloud.google.com/street2sat-up...,-2.34595,-16.630129,False,Kakamega,Butere,KENYA_v2/2021_07_18_T2
4,0.237993,34.541008,1,1,maize,street2sat-uploaded/KENYA_v2/2021_07_18_T2/103...,2021-07-18 10:29:18,https://storage.cloud.google.com/street2sat-up...,-3.186295,-16.685696,False,Kakamega,Butere,KENYA_v2/2021_07_18_T2


In [23]:
# Number of crop rows contributed by each gcloud folder.
df.loc[df["is_crop"] == 1, "gcloud_folder"].value_counts()

Unnamed: 0_level_0,count
gcloud_folder,Unnamed: 1_level_1
KENYA_v2/2021_07_18_T2,637
KENYA_v2/2021_07_14_T2,200
KENYA_v2/2021_07_17_T2,96
KENYA_v2/2021_07_10_T2,68
KENYA_v2/2021_07_13_T2,54
KENYA_v2/2021_07_09_T2,25


In [25]:
# Split rows by the year that appears in gcloud_folder.
# na=False: gcloud_folder is NaN when the folder pattern was not found in
# image_url; without it the mask contains NaN and boolean indexing raises.
df_2021 = df[df["gcloud_folder"].str.contains("2021", na=False)]
df_2022 = df[df["gcloud_folder"].str.contains("2022", na=False)]

In [26]:
# Row counts per (is_crop, crop_type) for the 2021 subset; misspelled
# crop_type labels show up here as separate entries.
df_2021[["is_crop", "crop_type"]].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
is_crop,crop_type,Unnamed: 2_level_1
0,,1080
1,maize,872
1,sugarcane,187
1,banana,19
1,rice,1
1,suagrcane,1


In [13]:
# Row counts per (is_crop, crop_type) for the 2022 subset.
df_2022[["is_crop", "crop_type"]].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
is_crop,crop_type,Unnamed: 2_level_1


In [27]:
# Write the 2021 subset to Drive; index=False leaves the row index out of the CSV.
df_2021.to_csv("/content/drive/MyDrive/Helmets/Kenya_2021_batch202502.csv", index=False)