# Approved Crop KMZ to CSV

<a target="_blank" href="https://colab.research.google.com/github/nasaharvest/street2sat/blob/main/notebooks/CropKMZtoCSV.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

**Author**: Ivan Zvonkov

**Last Modified**: Feb 24, 2025

**Description**: Converts approved KMZ files to CSV (for upload to Google Earth Engine)


In [None]:
from bs4 import BeautifulSoup
from google.colab import drive
from pathlib import Path
from tqdm import tqdm

import json
import pandas as pd
import xml.etree.ElementTree as ET
import zipfile


# Mount Google Drive at /content/drive so the Drive paths used below resolve.
drive.mount('/content/drive')

## 1. Convert each KMZ into a DataFrame

In [None]:
# Google Drive folder containing the reviewed KMZ files to convert.
SRC_KMZ_FOLDER = "/content/drive/MyDrive/SatLabel Squad/Kenya/Reviewed 2025-02-24 (KMZ)"
# List the folder contents to confirm the path resolves and see which files are present.
!ls "{SRC_KMZ_FOLDER}"

In [None]:
# Every .kmz file directly inside the source folder. Sorted because glob yields
# paths in arbitrary order, which would make the merged row order vary between runs.
SRC_KMZ_FILES = sorted(Path(SRC_KMZ_FOLDER).glob("*.kmz"))

In [None]:
def get_points_from_kmz(kmz_file_path):
    """Read a KMZ archive and return one flat dict per KML Placemark.

    Args:
        kmz_file_path: Path to a .kmz file (a zip archive holding a KML document).

    Returns:
        A list with one dict per ``<Placemark>`` found anywhere in the document.
        Each dict maps the tag name (KML namespace prefix removed) of the
        placemark element itself and of every descendant element to that
        element's text. When a tag occurs more than once inside a placemark,
        the last occurrence wins.

    Raises:
        ValueError: If the archive contains no ``.kml`` member.
    """
    with zipfile.ZipFile(kmz_file_path, 'r') as kmz:
        # Use the first KML member; match the extension case-insensitively so
        # archives containing e.g. "DOC.KML" are accepted too.
        kml_filename = next(
            (name for name in kmz.namelist() if name.lower().endswith('.kml')),
            None,
        )
        if kml_filename is None:
            raise ValueError(f"No .kml file found in KMZ archive: {kmz_file_path}")
        kml_data = kmz.read(kml_filename)

    kml_uri = 'http://www.opengis.net/kml/2.2'
    namespace = {'kml': kml_uri}
    # ElementTree spells namespaced tags as "{uri}tag"; this is the part to drop.
    tag_prefix = f'{{{kml_uri}}}'

    root = ET.fromstring(kml_data)
    kmz_points = []
    for placemark in root.findall('.//kml:Placemark', namespace):
        kmz_points.append(
            {child.tag.replace(tag_prefix, ''): child.text for child in placemark.iter()}
        )

    return kmz_points

# Columns (and dtypes) of the frame built by kmz_points_to_dataframe; used to
# give an empty result the same schema as a populated one.
_POINT_COLUMN_DTYPES = {
    "latitude": "float64",
    "longitude": "float64",
    "is_crop": "int64",
    "is_maize": "int64",
    "crop_type": "object",
    "capture_info": "object",
    "capture_time": "object",
    "image_url": "object",
    "driving_northing": "float64",
    "driving_easting": "float64",
    "is_right_hand_drive": "bool",
    "adm1": "object",
    "adm2": "object",
    "gcloud_folder": "object",
}


def _value_after_label(tag):
    """Return the value part of a ``Label: value`` tag, keeping any later ``": "``."""
    return tag.text.split(": ", 1)[1]


def _paragraphs_after(heading):
    """Return the ``<p>`` siblings that follow *heading*, in document order."""
    return [item for item in heading.next_siblings if item.name == 'p']


def _parse_lat_lon(tag):
    """Parse a ``Label: (a, b)`` or ``Label: [a, b]`` tag into the list ``[a, b]``."""
    coords = _value_after_label(tag)
    for bracket in "()[]":
        coords = coords.replace(bracket, '')
    return json.loads(f"[{coords}]")


def kmz_points_to_dataframe(kmz_points):
    """Build a table with one crop row and one non-crop row per placemark.

    The HTML in each placemark's ``description`` is expected to contain three
    ``<h2>`` sections, each followed by ``Label: value`` paragraphs, which the
    code reads in this order: capture info, location details, driving
    direction details. The fourth location paragraph supplies the crop point
    (labeled with the placemark ``name``) and the third supplies the non-crop
    point.

    Args:
        kmz_points: Iterable of dicts with at least ``"name"`` and
            ``"description"`` keys, e.g. the output of ``get_points_from_kmz``.

    Returns:
        A DataFrame with two rows per placemark and the columns listed in
        ``_POINT_COLUMN_DTYPES``. ``gcloud_folder`` is the two path segments
        following ``street2sat-uploaded/`` in ``image_url`` (NaN when the URL
        does not match). Empty input yields an empty frame with the same
        columns instead of failing on the missing ``image_url`` column.
    """
    points = []

    for kmz_point in kmz_points:
        soup = BeautifulSoup(kmz_point["description"], 'html.parser')
        headings = soup.find_all('h2')

        # Siblings after headings[1] also include the later section's
        # paragraphs, but only the first four entries are read below.
        location_data = _paragraphs_after(headings[1])
        direction_data = _paragraphs_after(headings[2])

        # Fields shared by the crop and the non-crop row of this placemark.
        row = {
            "capture_info": headings[0].text,
            "capture_time": _value_after_label(soup.find_all('p')[0]),
            "image_url": soup.find('a')['href'],
            "driving_northing": float(_value_after_label(direction_data[0])),
            "driving_easting": float(_value_after_label(direction_data[1])),
            "is_right_hand_drive": _value_after_label(direction_data[2]) == 'True',
            "adm1": _value_after_label(location_data[0]),
            "adm2": _value_after_label(location_data[1]),
        }

        road_lat_lon = _parse_lat_lon(location_data[2])
        field_lat_lon = _parse_lat_lon(location_data[3])

        # Drop zero-width spaces BEFORE stripping: U+200B is not whitespace to
        # str.strip(), so "maize \u200b" would otherwise keep a trailing space
        # and fail the "maize" comparison.
        crop_type = kmz_point["name"].replace("\u200b", "").strip().lower()

        points.append({
            "latitude": field_lat_lon[0],
            "longitude": field_lat_lon[1],
            "is_crop": 1,
            "is_maize": int(crop_type == "maize"),
            "crop_type": crop_type,
            **row,
        })
        points.append({
            "latitude": road_lat_lon[0],
            "longitude": road_lat_lon[1],
            "is_crop": 0,
            "is_maize": 0,
            "crop_type": "",
            **row,
        })

    if not points:
        # No placemarks: return an empty frame that still has every column.
        return pd.DataFrame(
            {column: pd.Series(dtype=dtype) for column, dtype in _POINT_COLUMN_DTYPES.items()}
        )

    df = pd.DataFrame(points)
    # expand=False returns a Series for the single capture group.
    df["gcloud_folder"] = df["image_url"].str.extract(
        r'street2sat-uploaded/([^/]+/[^/]+)', expand=False
    )
    return df


In [None]:
# Parse each KMZ file into its own DataFrame, with a progress bar over the files.
dfs = [
    kmz_points_to_dataframe(get_points_from_kmz(kmz_path))
    for kmz_path in tqdm(SRC_KMZ_FILES)
]

## 2. Merge DataFrames and export to CSV

In [None]:
# Stack the per-file DataFrames into one table with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Preview the first rows of the merged table.
df.head()

In [None]:
# Count the crop rows (is_crop == 1) per gcloud_folder value.
is_crop_row = df["is_crop"] == 1
df.loc[is_crop_row, "gcloud_folder"].value_counts()

In [None]:
# Split the merged rows by the year that appears in gcloud_folder.
# na=False treats rows with a missing gcloud_folder (image_url did not match
# the extraction pattern) as non-matching; without it the mask contains NaN
# and the boolean indexing raises.
df_2021 = df[df["gcloud_folder"].str.contains("2021", na=False)]
df_2022 = df[df["gcloud_folder"].str.contains("2022", na=False)]

In [None]:
# Row counts for each (is_crop, crop_type) combination in the 2021 subset.
df_2021.value_counts(subset=["is_crop", "crop_type"])

In [None]:
# Row counts for each (is_crop, crop_type) combination in the 2022 subset.
df_2022.value_counts(subset=["is_crop", "crop_type"])

In [None]:
# Write the 2021 subset to Drive as CSV, without the index column.
# NOTE(review): df_2022 is computed in this notebook but never written out;
# confirm whether it should be exported as well.
df_2021.to_csv("/content/drive/MyDrive/Helmets/Kenya_2021_batch202502.csv", index=False)