## Calculating distance between locations

Using the Haversine formula to calculate distance between coordinates

In [None]:
import pandas as pd
import numpy as np

from math import cos, asin, sqrt, pi
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform

import warnings
warnings.filterwarnings('ignore')

EARTHRADIUS = 6371.0  # mean Earth radius, in kilometres

def get_distance(row1, row2):
    """
    Great-circle distance between two points using the Haversine formula.

    Parameters
    ----------
    row1, row2 : indexable
        Each holds a latitude at position 0 and a longitude at
        position 1, both in decimal degrees (e.g. a tuple, list or
        one row of a NumPy array as passed by ``scipy``'s ``pdist``).

    Returns
    -------
    float
        Distance in kilometres (the unit of ``EARTHRADIUS``). NaN is
        returned when any of the four coordinates is NaN.
    """
    lat1, lon1 = row1[0], row1[1]
    lat2, lon2 = row2[0], row2[1]

    deg_to_rad = pi / 180

    # Cosine form of the haversine: hav(x) = (1 - cos(x)) / 2, so
    # a = hav(dlat) + cos(lat1) * cos(lat2) * hav(dlon).
    a = (
        0.5 - cos((lat2 - lat1) * deg_to_rad) / 2
        + cos(lat1 * deg_to_rad) * cos(lat2 * deg_to_rad)
        * (1 - cos((lon2 - lon1) * deg_to_rad)) / 2
    )

    # Rounding error can leave `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/asin raise a
    # "math domain error". Plain comparisons are used instead of
    # min()/max() so that a NaN value passes through unchanged.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    km = 2 * EARTHRADIUS * asin(sqrt(a))
    return km

In [2]:
# data = pd.read_csv('data/merged_dataset.csv')

In [3]:
# data_new = data.drop_duplicates(subset=['LATITUDE', 'LONGITUDE'], keep="first")

In [4]:
# dist_matrix = pd.DataFrame(squareform(pdist(data_new[['LATITUDE', 'LONGITUDE']].values, lambda u, v: get_distance(u, v))), index=data_new[['LATITUDE', 'LONGITUDE']].apply(tuple, axis=1), columns=data_new[['LATITUDE', 'LONGITUDE']].apply(tuple, axis=1))

### Using Coordinates as Column Header

Rows of the matrix are appended to the corresponding row in the protest dataset (using coordinates or location name)

In [8]:
# data_new['Coordinates'] = data_new[['LATITUDE', 'LONGITUDE']].apply(tuple, axis=1)

In [239]:
# dist_matrix = dist_matrix.reset_index()

In [240]:
# df_merged = pd.merge(data_new, dist_matrix, left_on='Coordinates', right_on = 'index', how='outer')

### Using Location as Column Header
Currently only Aguadilla, PR is included from the non-protest locations (the last row with NaN coordinates is kept and the rest are dropped as duplicates) - we likely need all of these locations included with their coordinates

In [242]:
# data_loc = data.drop_duplicates(subset=['LATITUDE', 'LONGITUDE'], keep="first")

In [244]:
# dist_matrix_loc = pd.DataFrame(squareform(pdist(data_loc[['LATITUDE', 'LONGITUDE']].values, lambda u, v: get_distance(u, v))), 
#                                index=data_loc['Full Location'], columns=data_loc['Full Location'])

In [245]:
# df_merged_loc = pd.merge(data_loc, dist_matrix_loc, on='Full Location', how='outer')

In [247]:
# df_merged_loc.to_csv('data/merged_data_distances.csv')

### Using GEOID as Column Header

In [9]:
# Load the weekly protest table; the cells below read its INTPTLAT10,
# INTPTLON10 and GEOID10 columns.
# NOTE(review): no dtype is given, so pandas infers GEOID10's type - if
# GEOIDs are zero-padded codes, leading zeros may be lost; confirm.
weekly = pd.read_csv('data/weekly_protests.csv')

In [5]:
# One row per distinct (INTPTLAT10, INTPTLON10) pair; when a pair repeats,
# the earliest row in `weekly` is the one retained.
coords = weekly.drop_duplicates(
    subset=['INTPTLAT10', 'INTPTLON10'],
    keep="first",
)

In [22]:
# Square matrix of pairwise distances between every pair of rows in `coords`:
# pdist evaluates get_distance on each pair of (lat, lon) rows and squareform
# expands the condensed result. Both axes are labelled with GEOID10.
dist_matrix = pd.DataFrame(
    squareform(
        pdist(coords[['INTPTLAT10', 'INTPTLON10']].values, metric=get_distance)
    ),
    index=coords['GEOID10'],
    columns=coords['GEOID10'],
)

In [23]:
# Discard every row and every column whose GEOID label is missing,
# filtering both axes in a single selection.
dist_matrix = dist_matrix.loc[
    dist_matrix.index.notna(),
    dist_matrix.columns.notna(),
]

In [24]:
# Write the GEOID-labelled distance matrix to disk. The index is written
# too (pandas' default), so the row labels become the first CSV column.
dist_matrix.to_csv('data/dist_matrix_geoid.csv')