In [68]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt
import itertools

In [69]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 3956
    return c * r

# 1.0 Getting Lookup file together

In [9]:
sportsdata = pd.read_pickle('/home/michael/Documents/Projects/divvydataproject/data/sporteventdata.pkl')

In [10]:
sportsdata.head()

Unnamed: 0,attend,datetime,games,location,team,gpslocation
1,,2013-10-31 19:00:00,2,unitedcenter,bulls,"41.8749965, -87.671163982"
4,,2013-11-08 19:00:00,5,unitedcenter,bulls,"41.8749965, -87.671163982"
5,,2013-11-11 19:00:00,6,unitedcenter,bulls,"41.8749965, -87.671163982"
7,,2013-11-16 19:00:00,8,unitedcenter,bulls,"41.8749965, -87.671163982"
8,,2013-11-18 19:00:00,9,unitedcenter,bulls,"41.8749965, -87.671163982"


In [32]:
sportsdatalookup = sportsdata[['location','gpslocation']].copy(deep = True)
sportsdatalookup.columns = ['id','gpslocation']
sportsdatalookup = sportsdatalookup.drop_duplicates()
sportsdatalookup['latitude'] = [float(str(x).split(',')[0]) for x in sportsdatalookup['gpslocation']]
sportsdatalookup['longitude'] = [float(str(x).split(',')[1]) for x in sportsdatalookup['gpslocation']]
sportsdatalookup = sportsdatalookup[['id','latitude','longitude']]

In [33]:
sportsdatalookup.head()

Unnamed: 0,id,latitude,longitude
1,unitedcenter,41.874997,-87.671164
6,wrigley,41.948437,-87.655334
0,uscellular,41.824663,-87.633664
1,soldierfield,41.857497,-87.616664


In [34]:
sportsdatalookup.dtypes

id            object
latitude     float64
longitude    float64
dtype: object

In [35]:
distlookup = pd.read_pickle('/home/michael/Documents/Projects/divvydataproject/data/distlookup.pkl')

In [36]:
distlookup.head()

Unnamed: 0,id,latitude,longitude
0,2,41.876393,-87.620328
1,3,41.867226,-87.615355
2,4,41.856268,-87.613348
3,5,41.874053,-87.627716
4,6,41.885041,-87.612794


In [37]:
locationdist = pd.concat([sportsdatalookup,distlookup])

In [39]:
locationdist = locationdist.reset_index(drop = True)

In [43]:
locationdist

Unnamed: 0,id,latitude,longitude
0,unitedcenter,41.874997,-87.671164
1,wrigley,41.948437,-87.655334
2,uscellular,41.824663,-87.633664
3,soldierfield,41.857497,-87.616664
4,2,41.876393,-87.620328
5,3,41.867226,-87.615355
6,4,41.856268,-87.613348
7,5,41.874053,-87.627716
8,6,41.885041,-87.612794
9,7,41.886349,-87.617517


In [77]:
ids = [x for x in locationdist['id']]

In [78]:
combinations = []
for id1 in locationdist['id']:
    for id2 in locationdist['id']:
        combinations.append([id1,id2])

In [79]:
len(combinations)

1359556

In [84]:
disttable = pd.DataFrame(combinations, columns = ['tid','fid'])
disttable = pd.merge(disttable, locationdist, how = 'left', left_on = 'tid', right_on ='id')
disttable = pd.merge(disttable, locationdist, how = 'left', left_on = 'fid', right_on ='id')
disttable.columns = ['tid','fid','idx','flat','flong','idy','tlat','tlong']
disttable = disttable[['tid','fid','flat','flong','tlat','tlong']]
disttable.tail()

Unnamed: 0,tid,fid,flat,flong,tlat,tlong
1359551,p1037,p425,41.898851,-87.639625,41.929808,-87.653653
1359552,p1037,p1004,41.898851,-87.639625,41.718413,-87.56457
1359553,p1037,p277,41.898851,-87.639625,41.798063,-87.593081
1359554,p1037,p167,41.898851,-87.639625,42.004004,-87.704652
1359555,p1037,p1037,41.898851,-87.639625,41.898851,-87.639625


In [85]:
disttable['distance'] = np.vectorize(haversine)(disttable['flat'], disttable['flong'], disttable['tlat'], disttable['tlong'])

In [86]:
disttable.to_pickle('/home/michael/Documents/Projects/divvydataproject/data/distancetable.pkl')

In [92]:
wrigley = disttable[disttable['tid'] == 'wrigley']
wrigley = wrigley[wrigley['fid'].str.slice(0,1) != 'p']
wrigley.sort_values('distance').head(6)

Unnamed: 0,tid,fid,flat,flong,tlat,tlong,distance
1167,wrigley,wrigley,41.948437,-87.655334,41.948437,-87.655334,0.0
1453,wrigley,306,41.948437,-87.655334,41.958494,-87.654966,0.038115
1270,wrigley,114,41.948437,-87.655334,41.949399,-87.654529,0.055648
1381,wrigley,231,41.948437,-87.655334,41.96167,-87.65464,0.060776
1389,wrigley,240,41.948437,-87.655334,41.954245,-87.654406,0.066142
1470,wrigley,323,41.948437,-87.655334,41.969517,-87.654691,0.07428
