In [118]:
import pandas as pd
import folium
import math

In [3]:
# Year-month tags (YYYYMM) of the downloaded files: every month of 2019 and
# 2020, followed by January through March of 2021.
lstDates = [
    '{}{:02d}'.format(year, month)
    for year, lastMonth in ((2019, 12), (2020, 12), (2021, 3))
    for month in range(1, lastMonth + 1)
]

In [4]:
# Read every monthly trip file into its own DataFrame and collect them in
# lstdf. A month whose file cannot be read is reported and skipped so that
# the remaining months still load.
lstdf = []
for date in lstDates:
    # the date tag appears twice in the path: once in the folder name and once
    # in the file name
    pathname = r"D:\Dropbox\Dropbox\Teaching\Advanced_GIS_for_Environmental_Planning\Data\NYC\citi_bikes\{}-citibike-tripdata.csv\{}-citibike-tripdata.csv".format(date,date)
    try:
        # add (append) this month's table to the list
        lstdf.append(pd.read_csv(pathname))
    except Exception as err:
        # `except Exception` rather than a bare `except:` so that interrupting
        # the kernel during this long load is not swallowed and misreported as
        # a read error; the actual cause is printed next to the month.
        print('got an error for ',date,':',repr(err))

In [5]:
# stack the monthly tables on top of each other into a single trips table
dfall = pd.concat(objs=lstdf, axis=0)

In [141]:
# Group the big dataframe by start station and summarise a trip variable per
# station; here the median birth year of the riders is taken and then turned
# into an age.
dfAge = (
    dfall.groupby('start station id')
    # named aggregation yields a flat, already-renamed column
    .agg(med_birthYear=('birth year', 'median'))
    # bring the station id back from the index and give it its neutral name
    .reset_index()
    .rename(columns={'start station id': 'station_id'})
    # age counted from 2021
    .assign(Age=lambda frame: 2021 - frame['med_birthYear'])
)

In [142]:
# spot-check two randomly picked rows of the age table
dfAge.sample(n=2)

Unnamed: 0,station_id,med_birthYear,Age
40,245.0,1984.0,37.0
186,435.0,1980.0,41.0


In [30]:
def _firstCoordinates(trips, side):
    """Return the first latitude/longitude recorded for each `side` station.

    `side` is 'start' or 'end'. The result is indexed by '<side> station id'
    and keeps the two-level (column, 'first') header that list-style
    aggregation produces.
    """
    idCol = '{} station id'.format(side)
    latCol = '{} station latitude'.format(side)
    lonCol = '{} station longitude'.format(side)
    return trips[[idCol, latCol, lonCol]].groupby([idCol]).agg({latCol: ['first'], lonCol: ['first']})


# One row per station, consolidated once from the trip origins and once from
# the trip destinations; the first latitude/longitude seen is used (all values
# for a station are expected to be the same).
dfStarts = _firstCoordinates(dfall, 'start')
dfEnds = _firstCoordinates(dfall, 'end')

In [32]:
# Drop the second header level left by the aggregation and turn the station id
# index back into an ordinary column, for both tables.
dfStarts, dfEnds = (
    frame.droplevel(1, axis=1).reset_index()
    for frame in (dfStarts, dfEnds)
)

In [33]:
def _stationColumnMap(side):
    """Map the '<side> station ...' column names to station_id / latitude / longitude."""
    return {
        '{} station id'.format(side): 'station_id',
        '{} station latitude'.format(side): 'latitude',
        '{} station longitude'.format(side): 'longitude',
    }


# give both tables the same column names
dfEnds = dfEnds.rename(columns=_stationColumnMap('end'))
dfStarts = dfStarts.rename(columns=_stationColumnMap('start'))

In [144]:
# spot-check two randomly picked start stations
dfStarts.sample(n=2)

Unnamed: 0,station_id,latitude,longitude
258,514.0,40.760875,-74.002777
82,307.0,40.714275,-73.9899


In [36]:
# Put together start and end stations, keeping exactly one row per station.
# The two tables each took their own "first" coordinates, so de-duplicating on
# whole rows would keep a station twice whenever those coordinates differ;
# de-duplicate on station_id instead (the first occurrence, i.e. the row from
# dfStarts when there is one, is kept).
dfAllStations = pd.concat([dfStarts,dfEnds], ignore_index=True)
dfAllStations = dfAllStations.drop_duplicates(subset='station_id')

In [62]:
# spot-check two randomly picked rows of the combined station table
dfAllStations.sample(n=2)

Unnamed: 0,station_id,latitude,longitude
1018,3975.0,40.820713,-73.911759
1313,4310.0,40.756351,-73.936515


In [147]:
# report how many rows the combined station table holds
stationCount = len(dfAllStations)
print('the total number of stations is: ', stationCount)

the total number of stations is:  1466


In [148]:
# attach the per-station age summary to the station coordinates;
# the outer join keeps stations found in either table
dfAllStationsAge = dfAllStations.merge(right=dfAge, how='outer', on='station_id')

In [149]:
# spot-check two randomly picked rows of the joined table
dfAllStationsAge.sample(n=2)

Unnamed: 0,station_id,latitude,longitude,med_birthYear,Age
856,3805.0,40.776173,-73.959757,1977.0,44.0
1403,4411.0,40.646351,-74.009271,1970.0,51.0


In [81]:
# one [latitude, longitude, station_id, med_birthYear] record per station
pointColumns = ['latitude', 'longitude','station_id','med_birthYear']
locations = dfAllStationsAge.loc[:, pointColumns]
locationlist = locations.to_numpy().tolist()
# show how many points will be drawn
len(locationlist)

1466

In [150]:
# Use Folium to map every station as a small circle with an id/age popup.
# NOTE: the name `map` shadows Python's builtin `map` for the rest of the
# session; it is kept so that anything referring to `map` keeps working.
map = folium.Map(location=[dfAllStations['latitude'].mean(), dfAllStations['longitude'].mean()], zoom_start=12)

# Take the points straight from the joined table so that the popup shows the
# Age column. Formatting med_birthYear under the "median age" label displayed
# a birth year instead of an age.
# Stations that never occur as a start station have no age summary, so their
# popup shows nan.
stationPoints = dfAllStationsAge[['latitude', 'longitude', 'station_id', 'Age']]
for lat, lon, stationId, age in stationPoints.values.tolist():
    folium.Circle(
        radius=5,                                                       # the size of circles
        location=[lat, lon],                                            # lat and long of the point
        popup='Id: {}; median age is: {}'.format(stationId, age),       # the popup message
        color="black",                                                  # color of the circle
        fill=True,
    ).add_to(map)
# display the map as the cell output
map

### Station connections

In [152]:
# choose the id of a station: this is the origin station whose coordinates and
# outgoing trips are looked up by comparing against this value
thisStation = 3354

In [153]:
# look up the chosen station's row and keep its [latitude, longitude] pair
isThisStation = dfAllStations['station_id'] == thisStation
thisCoors = dfAllStations.loc[isThisStation, ['latitude', 'longitude']].iloc[0].tolist()
thisCoors

[40.668132, -73.97363831]

In [154]:
# keep only the trips of the big table that depart from the chosen station
departsFromThis = dfall['start station id'].eq(thisStation)
dfThis = dfall[departsFromThis]
dfThis.sample(n=2)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
1634075,2069,2019-07-24 19:59:55.3260,2019-07-24 20:34:25.2030,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3661.0,Montgomery St & Franklin Ave,40.666439,-73.960556,32206,Subscriber,1995,1
1297337,319,2020-05-29 17:02:49.7280,2020-05-29 17:08:09.6900,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3423.0,West Drive & Prospect Park West,40.661063,-73.979453,39445,Subscriber,1968,1


In [155]:
# Summarise the filtered trips per destination station: the number of trips
# and the mean trip duration.
dfThisDest = (
    dfThis.groupby('end station id')
    # named aggregation yields the flat columns `count` and `tripDuration`
    .agg(count=('tripduration', 'count'), tripDuration=('tripduration', 'mean'))
    .reset_index()
    .rename(columns={'end station id': 'station_id'})
)
dfThisDest.sample(n=2)

Unnamed: 0,station_id,count,tripDuration
675,4419.0,2,1406.0
570,3836.0,5,1617.4


In [156]:
# report how many distinct destinations were reached from the chosen station
destinationCount = len(dfThisDest)
print('the number of destination from station id: {} is: '.format(thisStation), destinationCount)

the number of destination from station id: 3354 is:  679


In [157]:
# Joining the destination summary with the station table (which carries the
# coordinates) gives every destination a location; the inner join keeps only
# the stations present in both tables.
dfThisMergedDest = dfAllStationsAge.merge(right=dfThisDest, how='inner', on='station_id')
dfThisMergedDest.sample(n=2)


Unnamed: 0,station_id,latitude,longitude,med_birthYear,Age,count,tripDuration
60,295.0,40.714067,-73.992939,1983.0,38.0,8,2078.0
6,143.0,40.692395,-73.993379,1978.0,43.0,121,1496.0


In [140]:
# Map the destinations reached from the chosen station: one blue circle per
# destination, sized by its trip count, plus a red circle for the origin.

# one [latitude, longitude, station_id, count, tripDuration, med_birthYear]
# record per destination
locations = dfThisMergedDest[['latitude', 'longitude','station_id','count','tripDuration','med_birthYear']]
locationlist = locations.values.tolist()

# centre the map on the mean destination coordinates
map2 = folium.Map(location=[dfThisMergedDest['latitude'].mean(), dfThisMergedDest['longitude'].mean()], zoom_start=12)

for point in locationlist:
    # `.values` on this mixed int/float selection upcasts every column to
    # float, so the trip count arrives as e.g. 121.0; convert it back so the
    # popup shows a whole number
    tripCount = int(point[3])
    folium.Circle(
        radius=math.sqrt(tripCount)*5,      # square root: circle area scales with the trip count
        location=point[0:2],                # lat and long of the destination
        popup='Id: {}; trip counts: {}'.format(point[2],tripCount),
        color="blue",
        fill=True,
    ).add_to(map2)


# the location of origin station
folium.Circle(
        radius=30,
        location=thisCoors,
        popup='your stations',
        color="red",
        fill=True,
    ).add_to(map2)

# display the map as the cell output
map2