In [1]:
import pandas as pd
import folium
import math
import seaborn as sns

In [2]:
# list of dates for the downloaded files
lstDates = ['202001','202002','202003','202004','202005','202006','202007','202008','202009','202010','202011','202012']

In [3]:
lstdf = []
# a loop to read the monthly csv files 
for date in lstDates:
    try:
        # read each month's csv
        pathname = r"C:\Users\merjl\OneDrive - CUNY\Desktop\CitiBikeFilesUrbanGIS\2020\{}-citibike-tripdata.csv\{}-citibike-tripdata.csv" .format(date,date)
        # add (append) this to a list
        lstdf.append(pd.read_csv(pathname))
    except:
        print('got an error for ',date)

In [4]:
# put together all monthly dataframe and create one full dataframe
dfall = pd.concat(lstdf)

In [5]:
dfall.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19506857 entries, 0 to 1088928
Data columns (total 15 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   tripduration             int64  
 1   starttime                object 
 2   stoptime                 object 
 3   start station id         int64  
 4   start station name       object 
 5   start station latitude   float64
 6   start station longitude  float64
 7   end station id           int64  
 8   end station name         object 
 9   end station latitude     float64
 10  end station longitude    float64
 11  bikeid                   int64  
 12  usertype                 object 
 13  birth year               int64  
 14  gender                   int64  
dtypes: float64(4), int64(6), object(5)
memory usage: 2.3+ GB


In [6]:
#Group by Station ID with corosponding latitude and longitud in new dataframe 
dfLongLat = (dfall[['start station id','start station latitude','start station longitude']]).groupby(['start station id']).agg({'start station latitude':['first'],'start station longitude':['first']})

In [7]:
dfLongLat = dfLongLat.droplevel(1,axis=1).reset_index()

In [8]:
dfLongLat = pd.DataFrame(dfLongLat)

In [9]:
#Change name for future indexing
dfLongLat.rename(columns={'start station id': 'station_id', 'start station latitude':'latitude', 'start station longitude':'longitude'}, inplace = True)

In [10]:
#This dataframe now has only the station id and coordinates
dfLongLat

Unnamed: 0,station_id,latitude,longitude
0,72,40.767272,-73.993929
1,79,40.719116,-74.006667
2,82,40.711174,-74.000165
3,83,40.683826,-73.976323
4,116,40.741776,-74.001497
...,...,...,...
1208,4309,40.772768,-73.927436
1209,4310,40.756351,-73.936515
1210,4311,40.772600,-73.932663
1211,4328,40.840124,-73.939490


In [11]:
#group the full dataframe by the end station id and count total times station appears in dataframe
dfall["Endcount"] = 1
dfcountEnds = dfall.groupby(["end station id"])['Endcount'].count().reset_index()

In [12]:
dfcountEnds['Endcount'].sum()

19506857

In [13]:
dfcountEnds

Unnamed: 0,end station id,Endcount
0,72,34532
1,79,19981
2,82,12170
3,83,19257
4,116,44770
...,...,...
1244,4309,19
1245,4310,2
1246,4311,20
1247,4328,6


In [14]:
#Rename for future index matching
dfcountEnds.rename(columns = {'end station id':'station_id','end station latitude': 'latitude','end station longitude': 'longitude'}, inplace = True)

In [15]:
dfcountEnds

Unnamed: 0,station_id,Endcount
0,72,34532
1,79,19981
2,82,12170
3,83,19257
4,116,44770
...,...,...
1244,4309,19
1245,4310,2
1246,4311,20
1247,4328,6


In [16]:
#group the full dataframe by the start station id and count total times station appears in dataframe
dfall["Startcount"] = 1
dfcountStarts = dfall.groupby(["start station id" ])['Startcount'].count().reset_index()

In [17]:
#Rename for future index matching
dfcountStarts.rename(columns = {'start station id':'station_id'}, inplace = True)

In [18]:
dfcountStarts

Unnamed: 0,station_id,Startcount
0,72,34550
1,79,19560
2,82,12009
3,83,18797
4,116,44608
...,...,...
1208,4309,20
1209,4310,2
1210,4311,19
1211,4328,3


In [19]:
dfcountStarts['Startcount'].sum()

19506857

In [20]:
dfcountEnds_pd = pd.DataFrame(dfcountEnds)

In [21]:
dfcountEnds

Unnamed: 0,station_id,Endcount
0,72,34532
1,79,19981
2,82,12170
3,83,19257
4,116,44770
...,...,...
1244,4309,19
1245,4310,2
1246,4311,20
1247,4328,6


In [22]:
dfcountStarts_pd = pd.DataFrame(dfcountStarts)

In [23]:
dfcountStarts

Unnamed: 0,station_id,Startcount
0,72,34550
1,79,19560
2,82,12009
3,83,18797
4,116,44608
...,...,...
1208,4309,20
1209,4310,2
1210,4311,19
1211,4328,3


In [24]:
#Merge dataframes index matching station id to produce a dataframe of all station start and end count and coordinates
dfMerge = dfcountEnds.merge(dfcountStarts,on='station_id').merge(dfLongLat,on='station_id')

In [25]:
dfMerge

Unnamed: 0,station_id,Endcount,Startcount,latitude,longitude
0,72,34532,34550,40.767272,-73.993929
1,79,19981,19560,40.719116,-74.006667
2,82,12170,12009,40.711174,-74.000165
3,83,19257,18797,40.683826,-73.976323
4,116,44770,44608,40.741776,-74.001497
...,...,...,...,...,...
1208,4309,19,20,40.772768,-73.927436
1209,4310,2,2,40.756351,-73.936515
1210,4311,20,19,40.772600,-73.932663
1211,4328,6,3,40.840124,-73.939490


In [26]:
dfMerge['Endcount'].sum()

19506516

In [27]:
dfMerge['Startcount'].sum()

19506857

In [28]:
#Export as csv
dfMerge.to_csv(r'C:\Users\merjl\OneDrive - CUNY\Desktop\CitiBikeFilesUrbanGIS\2020RidershipMerge.csv', index = False)