The purpose of this notebook is to test the spatial auto-corrleation between different locations in the Togo dataset. This includes assessing how far away different points are from each other on average, and also within the training and test sets. Distance metrics collected also reflect the average distance(mean), min, and max.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import math
%matplotlib inline

In [4]:
df = pd.read_csv("/Users/spiderman/Downloads/GitHub/crop-mask/data/processed/Togo.csv")

In [5]:
df.head(10)

Unnamed: 0,lon,lat,start_date,end_date,source,crop_probability,num_labelers,subset,country,dataset,dest_tif,filename
0,-0.166017,11.161904,2019-04-22,2020-04-16,noncrop_merged_v2,0.0,1,training,Togo,Togo,noncrop_merged_v2/0_2019-04-22_2020-04-16.tif,lat=11.16190392_lon=-0.1660175_date=2019-04-22...
1,-0.163856,11.163926,2019-04-22,2020-04-16,crop_merged_v2,1.0,1,training,Togo,Togo,crop_merged_v2/1_2019-04-22_2020-04-16.tif,lat=11.16392591_lon=-0.16385638_date=2019-04-2...
2,-0.157025,11.136826,2019-04-22,2020-04-16,noncrop_merged_v2,0.0,1,training,Togo,Togo,noncrop_merged_v2/2_2019-04-22_2020-04-16.tif,lat=11.13682622_lon=-0.15702498_date=2019-04-2...
3,-0.155683,11.15934,2019-04-22,2020-04-16,crop_merged_v2,1.0,1,training,Togo,Togo,crop_merged_v2/3_2019-04-22_2020-04-16.tif,lat=11.1593399_lon=-0.15568345_date=2019-04-22...
4,-0.155525,11.136067,2019-04-22,2020-04-16,noncrop_merged_v2,0.0,1,training,Togo,Togo,noncrop_merged_v2/4_2019-04-22_2020-04-16.tif,lat=11.13606681_lon=-0.15552516_date=2019-04-2...
5,-0.154908,11.136198,2019-04-22,2020-04-16,crop_merged_v2,1.0,1,training,Togo,Togo,crop_merged_v2/5_2019-04-22_2020-04-16.tif,lat=11.13619805_lon=-0.15490834_date=2019-04-2...
6,-0.154001,11.136584,2019-04-22,2020-04-16,crop_merged_v2,1.0,1,training,Togo,Togo,crop_merged_v2/6_2019-04-22_2020-04-16.tif,lat=11.13658379_lon=-0.15400141_date=2019-04-2...
7,-0.142175,11.127015,2019-04-22,2020-04-16,noncrop_merged_v2,0.0,1,training,Togo,Togo,noncrop_merged_v2/7_2019-04-22_2020-04-16.tif,lat=11.12701457_lon=-0.14217493_date=2019-04-2...
8,-0.139218,11.133168,2019-04-22,2020-04-16,noncrop_merged_v2,0.0,1,validation,Togo,Togo,noncrop_merged_v2/8_2019-04-22_2020-04-16.tif,lat=11.13316826_lon=-0.13921799_date=2019-04-2...
9,-0.132295,11.125922,2019-04-22,2020-04-16,noncrop_merged_v2,0.0,1,validation,Togo,Togo,noncrop_merged_v2/9_2019-04-22_2020-04-16.tif,lat=11.12592151_lon=-0.13229529_date=2019-04-2...


In [6]:
df = df[['lon','lat','subset']]

In [15]:
grouped = df.groupby(df.subset)
train = grouped.get_group("training")
train = train.drop(['subset'], axis=1)
train.head(10)

Unnamed: 0,lon,lat
0,-0.166017,11.161904
1,-0.163856,11.163926
2,-0.157025,11.136826
3,-0.155683,11.15934
4,-0.155525,11.136067
5,-0.154908,11.136198
6,-0.154001,11.136584
7,-0.142175,11.127015
11,-0.129835,11.130126
13,-0.113144,11.098402


In [16]:
train.describe()

Unnamed: 0,lon,lat
count,999.0,999.0
mean,0.965939,8.948017
std,0.545832,1.555376
min,-0.166017,6.219475
25%,0.532095,7.503816
50%,1.134147,9.006904
75%,1.409445,10.603915
max,1.744784,11.163926


In [17]:
test = grouped.get_group("validation")
test = test.drop(['subset'], axis=1)
test.head(10)

Unnamed: 0,lon,lat
8,-0.139218,11.133168
9,-0.132295,11.125922
10,-0.130442,11.1308
12,-0.119111,11.100147
20,-0.103843,11.067593
28,-0.096283,10.756754
31,-0.093871,10.740389
33,-0.093056,10.741461
42,-0.067654,11.118322
43,-0.051642,10.893121


In [18]:
test.describe()

Unnamed: 0,lon,lat
count,277.0,277.0
mean,0.912382,9.003764
std,0.548913,1.594306
min,-0.139218,6.221596
25%,0.454662,7.528455
50%,1.062726,9.154662
75%,1.383338,10.719697
max,1.682789,11.133168


In [None]:
# list of distances between all the points
# on average what the distance between, min and max distances
# calculate all distances between two geodataframe of points in geopandas
# distance matrix
# same dataset

In [29]:
# Function to calculate distance
# https://gist.github.com/rochacbruno/2883505
def distance(origin, destination):
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371 # km

    dlat = math.radians(lat2-lat1)
    dlon = math.radians(lon2-lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = radius * c
    return d

In [99]:
# Compute average distance for training set
train_sum = 0
min_distance_train = math.inf
max_distance_train = 0
current_distance = 0
for row in train.itertuples():
    for row2 in train.itertuples():
        current_distance = distance((row.lat, row.lon),(row2.lat, row2.lon))
        train_sum += current_distance
        if current_distance < min_distance_train and not (row.lat == row2.lat and row.lon == row2.lon):
            min_distance_train = current_distance
        if current_distance > max_distance_train:
            max_distance_train = current_distance
print("done")

done


In [100]:
avg_distance_train = train_sum / (len(train)*len(train))
print("Average distance between training set: ", avg_distance_train)
print("Minimum distance: ", min_distance_train)
print("Maximum distance: ", max_distance_train)

Average distance between training set:  215.45258998228815
Minimum distance:  0.02357211575617554
Maximum distance:  581.9591011484724


In [101]:
# Compute average distance for test set
test_sum = 0
min_distance_test = math.inf
max_distance_test = 0
current_distance = 0
for row in test.itertuples():
    for row2 in test.itertuples():
        current_distance = distance((row.lat, row.lon),(row2.lat, row2.lon))
        test_sum += current_distance
        if current_distance < min_distance_test and not (row.lat == row2.lat and row.lon == row2.lon):
            min_distance_test = current_distance
        if current_distance > max_distance_test:
            max_distance_test = current_distance
print("done")

done


In [105]:
avg_distance_test = test_sum / (len(test)*len(test))
print("Average distance between test set: ", avg_distance_test)
print("Minimum distance: ", min_distance_test)
print("Maximum distance: ", max_distance_test)

Average distance between test set:  219.40271154984913
Minimum distance:  0.03737098178074144
Maximum distance:  571.628431172596


In [106]:
# Compute average distance for both sets
total_sum = 0
min_distance_total = math.inf
max_distance_total = 0
current_distance = 0
for row in train.itertuples():
    for row2 in test.itertuples():
        current_distance = distance((row.lat, row.lon),(row2.lat, row2.lon))
        total_sum += current_distance
        if current_distance < min_distance_total and not (row.lat == row2.lat and row.lon == row2.lon):
            min_distance_total = current_distance
        if current_distance > max_distance_total:
            max_distance_total = current_distance
print("done")

done


In [107]:
avg_distance_total = total_sum / (len(train)*len(test))
print("Average distance between both sets: ", avg_distance_total)
print("Minimum distance: ", min_distance_total)
print("Maximum distance: ", max_distance_total)

Average distance between both sets:  217.68491174174483
Minimum distance:  0.0026029899987407294
Maximum distance:  577.8031902540911


In [None]:
# Test set points seem to be closer to each other relative to the training set