# Combining China Datasets

**Author:** Adam Yang (ayang115@umd.edu)

**Description:** Notebook for combining two labeled datasets into a single CSV, averaging the crop labels of points that share the same (lat, lon).

In [4]:
import os
import numpy as np
import pandas as pd

In [None]:
# Directory associated with the China agreement label files.
# Named DATA_DIR rather than `dir` so the builtin dir() is not shadowed for
# the rest of the notebook session.
# NOTE(review): the reads and the write below use paths relative to the
# working directory, not this directory -- confirm which one is intended.
DATA_DIR = "data/China_Agreement"

# Input label sets and the combined output file
SET_1_CSV = "ceo-China-Provinces-Apr-Nov-2019-(Set-1)-sample-data-2022-06-02.csv"
SET_2_CSV = "ceo-China-Provinces-Apr-Nov-2019-(Set-2)-sample-data-2022-06-02.csv"
COMBINED_CSV = "ceo-China-Provinces-apr-Nov-2019-combined.csv"

# Read in files
df1 = pd.read_csv(SET_1_CSV)
df2 = pd.read_csv(SET_2_CSV)

# Set column names
CROP_PROB = "Does this point fall within active cropland?"
LAT = "lat"
LON = "lon"

# Numeric value assigned to each label
LABEL_VALUES = {"Crop": 1, "Non-crop": 0}

# Combine csv and clean; ignore_index avoids duplicate row labels coming from
# the two input files
df = pd.concat([df1, df2], ignore_index=True)
df = df.dropna(subset=[LON, LAT, CROP_PROB])
# Round so coordinates that differ only beyond 8 decimal places group together
df = df.round({LON: 8, LAT: 8})

# Fail early, with the offending values, instead of letting an unexpected
# label reach the mean aggregation below
unknown_labels = set(df[CROP_PROB].unique()) - set(LABEL_VALUES)
if unknown_labels:
    raise ValueError(
        f"Unexpected values in column {CROP_PROB!r}: {sorted(map(str, unknown_labels))}"
    )

# map() produces a numeric column directly, without relying on the implicit
# object-dtype downcasting that replace() would need
df[CROP_PROB] = df[CROP_PROB].map(LABEL_VALUES)

# Group matching (lat, lon) and take the mean of the crop probabilities.
# as_index=False keeps lon/lat as regular columns with a fresh 0..n-1 index,
# and sort=False keeps groups in order of first appearance.
df = df.groupby([LON, LAT], as_index=False, sort=False).agg({CROP_PROB: "mean"})

# Output
df.to_csv(COMBINED_CSV, index=False)