In [1]:
import sys
import pandas as pd
import numpy as np

# Loading Data Sets and Cleaning

Locations from the Uber 2014 data

In [2]:
file_path = './ride_by_location.csv'

locations_df = pd.read_csv(file_path)
locations_df

Unnamed: 0,Lat,Lon,Ride.Count
0,39.6569,-74.2258,1
1,40.7435,-73.9560,1
2,40.7435,-73.9568,1
3,40.7435,-73.9573,1
4,40.7435,-73.9712,1
...,...,...,...
574553,40.7741,-73.8726,1921
574554,40.6449,-73.7822,1947
574555,40.6448,-73.7820,2079
574556,40.7685,-73.8625,2257


Points of interest data from NYC Open Data 2017: https://data.cityofnewyork.us/City-Government/Points-Of-Interest/rxuy-2muj

In [3]:
file_path = './Point_Of_Interest.csv'

points_df = pd.read_csv(file_path)
points_df

Unnamed: 0,the_geom,SEGMENTID,COMPLEXID,SAFTYPE,SOS,PLACEID,FACI_DOM,BIN,BOROUGH,CREATED,MODIFIED,FACILITY_T,SOURCE,B7SC,PRI_ADD,NAME
0,POINT (-74.00701717096757 40.724634757833414),31895,0,N,1.0,567,9,0,1.0,05/14/2009 12:00:00 AM,11/18/2011 12:00:00 AM,6,DoITT,19743001.0,0,HOLLAND
1,POINT (-73.82661642130311 40.797182526598505),306303,3378,N,2.0,568,8,0,4.0,05/14/2009 12:00:00 AM,01/09/2017 12:00:00 AM,6,DoITT,49731001.0,0,WHITESTONE
2,POINT (-73.99395441100663 40.70384707235758),144842,3960,N,2.0,576,8,0,3.0,05/14/2009 12:00:00 AM,01/22/2018 12:00:00 AM,6,DoITT,39734001.0,0,BROOKLYN
3,POINT (-73.9919414213091 40.70960010711745),162664,0,N,1.0,580,8,0,1.0,05/14/2009 12:00:00 AM,05/11/2011 12:00:00 AM,6,DoITT,19795001.0,0,MANHATTAN
4,POINT (-73.9526609766105 40.73906602249743),157362,0,N,1.0,582,8,0,3.0,05/14/2009 12:00:00 AM,03/03/2017 12:00:00 AM,6,DoITT,39740001.0,0,PULASKI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20563,POINT (-73.83068969920059 40.71630413026696),89776,0,,1.0,1037675,4,0,4.0,01/06/2022 12:00:00 AM,,13,OTHER,,0,ILSE METZGER SITTING AREA
20564,POINT (-74.20348266269197 40.51217219763163),193675,5498,G,2.0,1037715,1,0,5.0,02/08/2022 12:00:00 AM,,4,OTHER,51133501.0,0,LEMON CREEK PARK
20565,POINT (-73.83215882936936 40.76487503425277),91100,0,,2.0,1037755,4,4618575,4.0,02/28/2022 12:00:00 AM,,7,OTHER,,5204883,FOUR POINTS BY SHERATON
20566,POINT (-74.03872240103044 40.63208693434625),16678,0,,2.0,1037796,1,3400244,3.0,03/30/2022 12:00:00 AM,,4,OTHER,,5205096,SHORE ROAD PARK ZONE 2 COMFORT STATION


In [4]:
borough_points_df = points_df[['the_geom', 'NAME', 'BOROUGH']]
borough_points_df.info()
borough_points_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20568 entries, 0 to 20567
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   the_geom  20568 non-null  object 
 1   NAME      20568 non-null  object 
 2   BOROUGH   20357 non-null  float64
dtypes: float64(1), object(2)
memory usage: 482.2+ KB


Unnamed: 0,the_geom,NAME,BOROUGH
0,POINT (-74.00701717096757 40.724634757833414),HOLLAND,1.0
1,POINT (-73.82661642130311 40.797182526598505),WHITESTONE,4.0
2,POINT (-73.99395441100663 40.70384707235758),BROOKLYN,3.0
3,POINT (-73.9919414213091 40.70960010711745),MANHATTAN,1.0
4,POINT (-73.9526609766105 40.73906602249743),PULASKI,3.0
...,...,...,...
20563,POINT (-73.83068969920059 40.71630413026696),ILSE METZGER SITTING AREA,4.0
20564,POINT (-74.20348266269197 40.51217219763163),LEMON CREEK PARK,5.0
20565,POINT (-73.83215882936936 40.76487503425277),FOUR POINTS BY SHERATON,4.0
20566,POINT (-74.03872240103044 40.63208693434625),SHORE ROAD PARK ZONE 2 COMFORT STATION,3.0


Changing the NYC Open Data latitude and longitude from string to separate numerics.

In [5]:
geom = list(borough_points_df['the_geom'])
name = list(borough_points_df['NAME'])
borough = list(borough_points_df['BOROUGH'])

new_df = []

for i in range(len(geom)):
    word = geom[i]
    points = word.split(" ")[1:3]
    long = round(float(points[0].replace('(', '')), 4)
    lat = round(float(points[1].replace(')', '')), 4)
    new_df.append([lat, long, name[i], borough[i]])

In [6]:
borough_df = pd.DataFrame(new_df, columns = ['Lat', 'Lon', 'Name', 'Borough'])
borough_df

Unnamed: 0,Lat,Lon,Name,Borough
0,40.7246,-74.0070,HOLLAND,1.0
1,40.7972,-73.8266,WHITESTONE,4.0
2,40.7038,-73.9940,BROOKLYN,3.0
3,40.7096,-73.9919,MANHATTAN,1.0
4,40.7391,-73.9527,PULASKI,3.0
...,...,...,...,...
20563,40.7163,-73.8307,ILSE METZGER SITTING AREA,4.0
20564,40.5122,-74.2035,LEMON CREEK PARK,5.0
20565,40.7649,-73.8322,FOUR POINTS BY SHERATON,4.0
20566,40.6321,-74.0387,SHORE ROAD PARK ZONE 2 COMFORT STATION,3.0


# Exploration of Combining Data and Getting Locations

This is the beginning trials to test how to combine the above data sets and get location names for Uber (lat, long) pairs. The first few cells are concerned with finding points in both data sets while the following cells explore finding the point of interest in closest haversine proximity to each point in the Uber data then associating that name to it. The end results can be ignored as many of the steps are repeated in the next section to get a final result.

Join data frames to see what is in both sets

In [7]:
combine_bor_df = pd.merge(borough_df, locations_df, on = ['Lat', 'Lon'], how = 'inner')
combine_bor_df

Unnamed: 0,Lat,Lon,Name,Borough,Ride.Count
0,40.7246,-74.0070,HOLLAND,1.0,1
1,40.7096,-73.9919,MANHATTAN,1.0,2
2,40.7390,-73.9427,BORDEN,4.0,1
3,40.8782,-73.9216,HENRY HUDSON,2.0,1
4,40.8782,-73.9216,HENRY HUDSON BRIDGE NB RB,2.0,1
...,...,...,...,...,...
2427,40.7712,-73.9886,GERTRUDE EDERLE RECREATION CENTER PLAYGROUND,1.0,1
2428,40.7337,-73.9888,ZERO IRVING PLACE,1.0,3
2429,40.7135,-73.9981,KIMLAU ARCH,1.0,1
2430,40.8014,-73.9704,THE SHERWOOD,1.0,3


Top 20 locations in both

In [8]:
popular_in_data = combine_bor_df.sort_values('Ride.Count', ascending = False).head(20)
popular_in_data

Unnamed: 0,Lat,Lon,Name,Borough,Ride.Count
22,40.7663,-73.9917,W 52 ST OV AMTRAK 30 ST BRANCH,1.0,181
62,40.7565,-74.0015,W 35 ST OV AMTRAK 30 ST BRANCH,1.0,129
1651,40.711,-74.0055,SPRUCE STREET SCHOOL,1.0,109
19,40.7631,-73.9938,W 47 ST OV AMTRAK 30 ST BRANCH,1.0,62
188,40.7217,-73.9974,PETROSINO SQUARE,1.0,62
26,40.7599,-73.9962,W 42 ST OV AMTRAK 30 ST BRANCH,1.0,58
167,40.764,-73.9736,PULITZER FOUNTAIN,1.0,58
2239,40.7052,-74.0079,ANDAZ HOTEL WALL STREET,1.0,55
42,40.7522,-73.9776,PARK AVE VIADUCT OVER E 42 ST,1.0,55
1528,40.756,-73.9869,TIMES SQ AREA,1.0,52


Saving the dataframe

In [9]:
# popular_in_data.to_csv('./popular_loc_in_data.csv', index = False)

Defining Haversine distance function to be used to calculate distance between points in Uber data against entries in points of interest data.

In [10]:
import math
def distanceBetween(lat1, lon1, lat2, lon2):
    dLat = math.radians(lat2-lat1)
    dLon = math.radians(lon2-lon1)

    lat1 = math.radians(lat1)
    lat2 = math.radians(lat2)

    a = math.sin(dLat/2)**2 + math.sin(dLon/2)**2 * math.cos(lat1) * math.cos(lat2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    return c * 3958 #multiply by 100k to get distance in cm

Get the locations from Uber and NYC Open Data and change the coordinates to radians for haversine calculation.

In [11]:
dist_poi = combine_bor_df[['Lat', 'Lon']]
dist_poi['Lat'] = np.radians(dist_poi['Lat'])
dist_poi['Lon'] = np.radians(dist_poi['Lon'])

dist_data = locations_df[['Lat', 'Lon']]
dist_data['Lat'] = np.radians(locations_df['Lat'])
dist_data['Lon'] = np.radians(locations_df['Lon'])

dist_data_list = list(dist_data.to_numpy())
dist_poi_list = list(dist_poi.to_numpy()) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dist_poi['Lat'] = np.radians(dist_poi['Lat'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dist_poi['Lon'] = np.radians(dist_poi['Lon'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dist_data['Lat'] = np.radians(locations_df['Lat'])
A value is trying to be set on a copy of a slice from a DataFr

Loop through both data sets to find the index of the point of interest corresponding to the minimum distance of Uber location. ***Warning:*** This cell takes a very long time to run.

In [12]:
distance_loc_poi = []

for loc1 in dist_data_list:
    dist_loc1 = []
    for loc2 in dist_poi_list:
        dist_loc1.append(distanceBetween(loc1[0], loc1[1], loc2[0], loc2[1]))
    max_index = dist_loc1.index(min(dist_loc1))
    distance_loc_poi.append([loc1[0], loc1[1], max_index])

Change latitude and longitude back to degrees.

In [13]:
loc_poi_index_df = pd.DataFrame(distance_loc_poi, columns = ['Lat', 'Lon', 'Index'])
loc_poi_index_df['Lat'] = np.degrees(loc_poi_index_df['Lat'])
loc_poi_index_df['Lon'] = np.degrees(loc_poi_index_df['Lon'])
loc_poi_index_df

Unnamed: 0,Lat,Lon,Index
0,39.6569,-74.2258,1559
1,40.7435,-73.9560,184
2,40.7435,-73.9568,184
3,40.7435,-73.9573,184
4,40.7435,-73.9712,2333
...,...,...,...
574553,40.7741,-73.8726,1019
574554,40.6449,-73.7822,1996
574555,40.6448,-73.7820,1996
574556,40.7685,-73.8625,549


Find name of location by index and append to dataframe.

In [14]:
indices = list(loc_poi_index_df[['Index']].to_numpy())
names = list(combine_bor_df[['Name']].to_numpy())

row_name = []

for row in range(len(loc_poi_index_df)):
    index = int(indices[row])
    name = names[index]
    row_name.append(name[0])

In [15]:
loc_poi_index_df['Name'] = row_name

loc_names_df = pd.merge(locations_df, loc_poi_index_df, on = ['Lat', 'Lon'], how = 'left')
loc_names_df

Unnamed: 0,Lat,Lon,Ride.Count,Index,Name
0,39.6569,-74.2258,1,1559.0,JOEL SAMUELS PLAZA
1,40.7435,-73.9560,1,,
2,40.7435,-73.9568,1,,
3,40.7435,-73.9573,1,,
4,40.7435,-73.9712,1,,
...,...,...,...,...,...
574553,40.7741,-73.8726,1921,1019.0,LAGUARDIA AIRPORT TERMINAL B
574554,40.6449,-73.7822,1947,1996.0,JFK BUILDING 245C
574555,40.6448,-73.7820,2079,1996.0,JFK BUILDING 245C
574556,40.7685,-73.8625,2257,549.0,LGA - US AIRWAYS TERMINAL


Check for dupilcates:

In [16]:
loc_names_df.where(loc_names_df['Name'] == 'JFK BUILDING 245C').dropna()

Unnamed: 0,Lat,Lon,Ride.Count,Index,Name
155343,40.6485,-73.7821,1.0,1996.0,JFK BUILDING 245C
155361,40.6485,-73.7901,1.0,1996.0,JFK BUILDING 245C
155362,40.6485,-73.7858,1.0,1996.0,JFK BUILDING 245C
155363,40.6485,-73.7857,1.0,1996.0,JFK BUILDING 245C
155364,40.6485,-73.7848,1.0,1996.0,JFK BUILDING 245C
...,...,...,...,...,...
574549,40.6448,-73.7818,1758.0,1996.0,JFK BUILDING 245C
574550,40.6449,-73.7821,1767.0,1996.0,JFK BUILDING 245C
574554,40.6449,-73.7822,1947.0,1996.0,JFK BUILDING 245C
574555,40.6448,-73.7820,2079.0,1996.0,JFK BUILDING 245C


Since there are many duplicated entries, add up the ride.counts for each name.

In [17]:
loc_counts = loc_names_df.groupby(['Name'])['Ride.Count'].sum().reset_index()
ordered_ride_counts = loc_counts.sort_values('Ride.Count', ascending=False)
ordered_ride_counts.head(20)

Unnamed: 0,Name,Ride.Count
1058,JFK BUILDING 245C,93628
1112,LAGUARDIA AIRPORT TERMINAL B,50455
1137,LGA - US AIRWAYS TERMINAL,40466
1845,SI FERRY TRML PEDESTRIAN,39370
1696,RADEGAST HALL & BIERGARTEN,18748
1050,JANE HOTEL,16898
762,FREEDOM TRIANGLE,15295
2071,THE STANDARD HOTEL HIGH LINE,15118
1081,KATZ WOMENS HOSPITAL AT LONG ISLAND JEWISH MED...,14972
1273,MCCARREN HOTEL & POOL,13922


In [18]:
combine_borough_df = borough_df.merge(ordered_ride_counts, on = 'Name', how = 'inner')

final_borough = combine_borough_df.sort_values('Ride.Count', ascending=False)

In [19]:
final_borough

Unnamed: 0,Lat,Lon,Name,Borough,Ride.Count
1945,40.6489,-73.7841,JFK BUILDING 245C,4.0,93628
999,40.7745,-73.8728,LAGUARDIA AIRPORT TERMINAL B,4.0,50455
545,40.7706,-73.8648,LGA - US AIRWAYS TERMINAL,4.0,40466
95,40.6434,-74.0720,SI FERRY TRML PEDESTRIAN,5.0,39370
2206,40.7166,-73.9614,RADEGAST HALL & BIERGARTEN,3.0,18748
...,...,...,...,...,...
357,40.7500,-73.9717,GUINEA BISSAU MISSION TO UN,1.0,6
1749,40.7535,-73.9700,UK MISSION TO UN,1.0,6
838,40.7742,-73.9843,WALTER READE THEATER,1.0,6
645,40.7300,-73.9946,NEW YORK UNIVERSITY KIMBALL BLOCK,1.0,4


In [20]:
# final_borough.to_csv('./count_boroughs.csv', index = False)

# Top Locations By Borough

In [21]:
manhattan_df = final_borough.where(final_borough['Borough'] == 1)
manhattan_df.dropna().reset_index().head(10)

Unnamed: 0,index,Lat,Lon,Name,Borough,Ride.Count
0,224,40.7383,-74.0093,JANE HOTEL,1.0,16898.0
1,2192,40.7407,-74.0079,THE STANDARD HOTEL HIGH LINE,1.0,15118.0
2,692,40.7391,-74.0049,CORPORAL JOHN A SERAVALLI PLAYGROUND,1.0,13727.0
3,1564,40.742,-73.9922,MANHATTAN VILLAGE ACADEMY HIGH SCHOOL,1.0,12984.0
4,2024,40.725,-73.9991,APPLE STORE SOHO,1.0,12024.0
5,2153,40.751,-73.9949,PENN STATION WEST END CONCOURSE,1.0,11990.0
6,2313,40.7238,-73.9979,MUSEUM OF ICE CREAM,1.0,11757.0
7,234,40.7477,-74.0101,CHELSEA PIERS SKY RINK,1.0,11358.0
8,1936,40.745,-73.9909,PHILLIPS BETH ISRAEL SCHOOL OF NURSING,1.0,11322.0
9,2298,40.7235,-74.0083,HUNTER COLLEGE MFA BUILDING,1.0,11276.0


In [22]:
brooklyn_df = final_borough.where(final_borough['Borough'] == 3)
brooklyn_df.dropna().reset_index().head(10)

Unnamed: 0,index,Lat,Lon,Name,Borough,Ride.Count
0,2206,40.7166,-73.9614,RADEGAST HALL & BIERGARTEN,3.0,18748.0
1,2361,40.6973,-73.9318,FREEDOM TRIANGLE,3.0,15295.0
2,2149,40.7209,-73.9556,MCCARREN HOTEL & POOL,3.0,13922.0
3,1782,40.6685,-73.9791,METHODIST HOSPITAL WESLEY HOUSE,3.0,13233.0
4,1451,40.6824,-74.0059,HUGH L CAREY TUNNEL VENTILATION BUILDING,3.0,10815.0
5,115,40.664,-73.9906,5 AV OVER 27 X PROSPECT EP,3.0,10717.0
6,643,40.6987,-73.9179,HEISSER TRIANGLE,3.0,10288.0
7,2146,40.7176,-73.9578,FORNINO RESTAURANT WILLIAMSBURG,3.0,10009.0
8,1522,40.7256,-73.9445,STEWARD SQUARE,3.0,8794.0
9,188,40.7087,-73.9397,PS 196 TEN EYCK,3.0,8680.0


In [23]:
staten_df = final_borough.where(final_borough['Borough'] == 5)
staten_df.dropna().reset_index()

Unnamed: 0,index,Lat,Lon,Name,Borough,Ride.Count
0,95,40.6434,-74.072,SI FERRY TRML PEDESTRIAN,5.0,39370.0
1,1422,40.5926,-74.1296,FDNY BOROUGH COMMAND,5.0,1866.0
2,540,40.5912,-74.0975,ST ANNS RC CHURCH,5.0,1583.0
3,356,40.6076,-74.09,ST SYLVESTERS CHURCH,5.0,1407.0
4,435,40.6148,-74.1765,HILTON GARDEN INN,5.0,1276.0
5,856,40.5985,-74.163,IMMANUEL LUTHERAN CHURCH,5.0,1229.0
6,547,40.5482,-74.1482,CHRIST LUTHERAN CHURCH,5.0,541.0


In [24]:
bronx_df = final_borough.where(final_borough['Borough'] == 2)
bronx_df.dropna().reset_index().head(10)

Unnamed: 0,index,Lat,Lon,Name,Borough,Ride.Count
0,88,40.8702,-73.8458,GUN HILL ROAD OVER NYCT DYRE AVE,2.0,10717.0
1,86,40.9012,-73.9019,FIELDSTON RD OVER H HUDSON PKWY,2.0,5029.0
2,1513,40.8276,-73.926,TBPD DISTRICT 11,2.0,3469.0
3,2311,40.8808,-73.907,AMBER CHARTER SCHOOL II KINGSBRIDGE,2.0,3012.0
4,1155,40.8544,-73.8854,BISHOP PERNICONE PLAZA,2.0,2548.0
5,1571,40.8538,-73.8726,BRONX ZOO RIVER GATE,2.0,1961.0
6,1855,40.8794,-73.8857,MARGARET J MACK TRIANGLE,2.0,1940.0
7,628,40.8462,-73.9089,BOARD OF ELECTIONS,2.0,1883.0
8,89,40.8268,-73.9227,GRAND CONCOURSE OVER E 161 ST,2.0,1724.0
9,278,40.848,-73.8531,ST FRANCIS XAVIER CHURCH,2.0,1543.0


In [25]:
queens_df = final_borough.where(final_borough['Borough'] == 4)
queens_df.dropna().reset_index().head(10)

Unnamed: 0,index,Lat,Lon,Name,Borough,Ride.Count
0,1945,40.6489,-73.7841,JFK BUILDING 245C,4.0,93628.0
1,999,40.7745,-73.8728,LAGUARDIA AIRPORT TERMINAL B,4.0,50455.0
2,545,40.7706,-73.8648,LGA - US AIRWAYS TERMINAL,4.0,40466.0
3,2309,40.7543,-73.7074,KATZ WOMENS HOSPITAL AT LONG ISLAND JEWISH MED...,4.0,14972.0
4,166,40.7657,-73.9232,ST DEMETRIOS SCHOOL,4.0,9175.0
5,187,40.743,-73.9547,108 PRECINCT,4.0,8065.0
6,2100,40.7273,-73.8536,DEVRY COLLEGE,4.0,6749.0
7,151,40.769,-73.9108,STEINWAY ST OVER I-278 BQE WB,4.0,6413.0
8,505,40.6692,-73.8119,ST ANTHONY OF PADUA CHURCH,4.0,6118.0
9,909,40.7465,-73.9186,HOLY MOUNTAIN PRESCHOOL SKILLMAN AVE,4.0,5122.0
