# Distance Metrics of Datasets
<hr style="border:0.01px solid gray"> </hr>

**Author**: Maryann Vazhapilly (marvaz@umd.edu)

**Description**: The purpose of this notebook is to display the distance metrics of all the datasets of the data/processed directory in the crop-mask repository. Distance metrics collected also reflect the average distance(mean), min, and max between training, testing, and validation sets, the density, and total number of points in every dataset. Data is collected and displayed in a tabular format.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
%matplotlib inline

In [2]:
# Get all processed datasets in crop-mask repo
import glob
import os
datasets = []
names = []
for file_name in glob.glob('/Users/spiderman/Downloads/GitHub/crop-mask/data/processed'+'/*.csv'):
    x = pd.read_csv(file_name)
    datasets.append(x)
    base=os.path.basename(file_name)
    names.append(os.path.splitext(base)[0])
print("Number of datasets: ", len(datasets))
print(names)

Number of datasets:  24
['Ethiopia', 'open_buildings', 'geowiki_landcover_2017', 'Mali_lower_CEO_2019', 'digitalearthafrica_sahel', 'Malawi_FAO_corrected', 'Togo', 'Malawi_CEO_2019', 'Mali', 'Tanzania_CEO_2019', 'Mali_upper_CEO_2019', 'Malawi_CEO_2020', 'Uganda', 'Malawi_FAO', 'Zambia_CEO_2019', 'one_acre_fund', 'Ethiopia_Tigray_2020', 'Ethiopia_Tigray_2021', 'Ethiopia_Bure_Jimma_2020', 'digitalearthafrica_eastern', 'Kenya', 'Rwanda', 'Argentina_Buenos_Aires', 'Ethiopia_Bure_Jimma_2019']


In [3]:
# Function to calculate distance
# https://gist.github.com/rochacbruno/2883505
def distance(origin, destination):
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371 # km

    dlat = math.radians(lat2-lat1)
    dlon = math.radians(lon2-lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = radius * c
    return d

In [4]:
# Function to calculate average, min, and max distance
# Adapted from notebook Distance_between_points.ipynb
def distance_metrics(dataframe1, dataframe2):
    total_sum = 0
    min_distance = math.inf
    max_distance = 0
    current_distance = 0
    distances = []
    for row in dataframe1.itertuples():
        for row2 in dataframe2.itertuples():
            current_distance = distance((row.lat, row.lon),(row2.lat, row2.lon))
            total_sum += current_distance
            distances.append(current_distance)
            if current_distance < min_distance and not (row.lat == row2.lat and row.lon == row2.lon):
                min_distance = current_distance
            if current_distance > max_distance:
                max_distance = current_distance
    avg_distance = total_sum / (len(dataframe1)*len(dataframe2))
    return avg_distance, min_distance, max_distance

In [5]:
# COMMENT FOR THE FUTURE: make a function to convert between units

In [6]:
# Function to return a number converted from degrees to km
def convert_degrees_to_km(num):
    return num * 111

In [7]:
# Function to create bounding box(area(km)) given a dataset
def bounding_box_area(df):
    minX = min(df["lon"])
    maxX = max(df["lon"])
    minY = min(df["lat"])
    maxY = max(df["lat"])
    area = abs(convert_degrees_to_km(maxX - minX)) * abs(convert_degrees_to_km(maxY - minY))
    return area

In [8]:
# Function to return density per 100km^2 given area and dataset
def density(area, df):
    return (len(df)/area)*100

In [14]:
# Get information for number of points, area, density, and
# average, min, and max distances for all datasets and put
# into dictionary
dict_data = {}
for i in range(0, len(datasets)):
    df = datasets[i]
    df = df[['lon','lat','subset']]
    if i == 2:
        continue
    unique_groups = df.subset.unique()
    grouped = df.groupby(df.subset)
    
    # Get distance metrics for training, testing, and validation subsets
    if 'training' in unique_groups:
        train = grouped.get_group("training")
        train = train.drop(['subset'], axis=1)
        train_avg, train_min, train_max = distance_metrics(train, train)
        train_avg, train_min, train_max = round(train_avg,3), round(train_min,3), round(train_max,3)
    else:
        train_avg = train_min = train_max = -1
    if 'testing' in unique_groups:
        test = grouped.get_group("testing")
        test = test.drop(['subset'], axis=1)
        test_avg, test_min, test_max = distance_metrics(test, test)
        test_avg, test_min, test_max = round(test_avg,3), round(test_min,3), round(test_max,3)
    else:
        test_avg = test_min = test_max = -1
    if 'validation' in unique_groups:
        val = grouped.get_group("validation")
        val = val.drop(['subset'], axis=1)
        val_avg, val_min, val_max = distance_metrics(val, val)
        val_avg, val_min, val_max = round(val_avg,3), round(val_min,3), round(val_max,3)
    else:
        val_avg = val_min = val_max = -1
    if train_avg != -1 and val_avg != -1:
        train_val_avg, train_val_min, train_val_max = distance_metrics(train, val)
        train_val_avg, train_val_min, train_val_max = round(train_val_avg,3), round(train_val_min,3), round(train_val_max,3)
    else:
        train_val_avg = train_val_min = train_val_max = " "
    if test_avg != -1 and val_avg != -1:
        test_val_avg, test_val_min, test_val_max = distance_metrics(test, val)
        test_val_avg, test_val_min, test_val_max = round(test_val_avg,3), round(test_val_min,3), round(test_val_max,3)
    else:
        test_val_avg = test_val_min = test_val_max = " "
    if train_avg != -1 and test_avg != -1:
        train_test_avg, train_test_min, train_test_max = distance_metrics(train, test)
        train_test_avg, train_test_min, train_test_max = round(train_test_avg,3), round(train_test_min,3), round(train_test_max,3)
    else:
        train_test_avg = train_test_min = train_test_max = " "
    
    # Add total number of points, area, and density to dictionary
    area = bounding_box_area(df)
    dens = density(area, df)
    dict_data[names[i]] = [[len(df), round(area,3), round(dens,3)]]
    
    # Add train, test, and val metrics to dictionary
    if train_avg == -1:
        train_avg = train_min = train_max = " "
        dict_data[names[i]].append([" ", " ", train_avg, train_min, train_max])
    else:
        train_area = bounding_box_area(train)
        train_dens = density(train_area, train)
        dict_data[names[i]].append([len(train), round(train_dens,3), train_avg, train_min, train_max])
    if test_avg == -1:
        test_avg = test_min = test_max = " "
        dict_data[names[i]].append([" ", " ", test_avg, test_min, test_max])
    else:
        test_area = bounding_box_area(test)
        test_dens = density(test_area, test)
        dict_data[names[i]].append([len(test), round(test_dens,3), test_avg, test_min, test_max])
    if val_avg == -1:
        val_avg = val_min = val_max = " "
        dict_data[names[i]].append([" ", " ", val_avg, val_min, val_max])
    else:
        val_area = bounding_box_area(val)
        val_dens = density(val_area, val)
        dict_data[names[i]].append([len(val), round(val_dens,3), val_avg, val_min, val_max])
    
    # Add train+val, test+val, and train+test metrics to dictionary
    dict_data[names[i]].append([train_val_avg, train_val_min, train_val_max])
    dict_data[names[i]].append([test_val_avg, test_val_min, test_val_max])
    dict_data[names[i]].append([train_test_avg, train_test_min, train_test_max])
print(dict_data)

{'Ethiopia': [[4513, 43760.095, 10.313], [4513, 10.313, 61.35, 0.0, 274.766], [' ', ' ', ' ', ' ', ' '], [' ', ' ', ' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' ']], 'open_buildings': [[8121, 267412.276, 3.037], [8121, 3.037, 223.55, 0.005, 721.971], [' ', ' ', ' ', ' ', ' '], [' ', ' ', ' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' ']], 'Mali_lower_CEO_2019': [[621, 96533.793, 0.643], [' ', ' ', ' ', ' ', ' '], [312, 0.323, 171.721, 8.005, 450.712], [309, 0.32, 171.51, 8.004, 449.523], [' ', ' ', ' '], [171.964, 8.004, 457.609], [' ', ' ', ' ']], 'digitalearthafrica_sahel': [[1683, 21500211.207, 0.008], [1683, 0.008, 2350.942, 0.0, 7309.416], [' ', ' ', ' ', ' ', ' '], [' ', ' ', ' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' ']], 'Malawi_FAO_corrected': [[511, 221470.775, 0.231], [511, 0.231, 283.74, 0.029, 786.829], [' ', ' ', ' ', ' ', ' '], [' ', ' ', ' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' ']], 'Togo': [[16

In [18]:
# Write information to table in 'test.csv'
# Can add more columns to table if needed by adding value
# to each writerow statement
import csv
with open('test.csv','w') as f:
    writer = csv.writer(f)
    writer.writerow([" ", " ", " ", " ",
                     "TRAIN", " ", " ", " ", " ",
                     "TEST", " ", " ", " ", " ",
                     "VAL", " ", " ", " ", " ",
                     "TRAIN+VAL", " ", " ",
                     "TEST+VAL", " ", " ",
                     "TRAIN+TEST", " ", " "])
    writer.writerow(["NAME", "TOTAL NUM OF POINTS", "AREA(km^2)", "DENSITY(100km^2)",
                     "TRAIN NUM OF POINTS", "TRAIN DENSITY(100km^2)", "AVG(km)","MIN(km)","MAX(km)",
                     "TEST NUM OF POINTS", "TEST DENSITY(100km^2)", "AVG(km)","MIN(km)","MAX(km)",
                     "VAL NUM OF POINTS", "VAL DENSITY(100km^2)", "AVG(km)","MIN(km)","MAX(km)",
                     "AVG(km)","MIN(km)","MAX(km)",
                     "AVG(km)","MIN(km)","MAX(km)",
                     "AVG(km)","MIN(km)","MAX(km)"])
    for k, v in dict_data.items():
        writer.writerow([k, v[0][0], v[0][1], v[0][2],
                        v[1][0], v[1][1], v[1][2], v[1][3], v[1][4],
                        v[2][0], v[2][1], v[2][2], v[2][3], v[2][4],
                        v[3][0], v[3][1], v[3][2], v[3][3], v[3][4],
                        v[4][0], v[4][1], v[4][2],
                        v[5][0], v[5][1], v[5][2],
                        v[6][0], v[6][1], v[6][2]])
print("done")

done
