# CapitalBikeShare dataset: analysis and manipulation
Important note: the old datasets (up to 03/2020) have the bike_id, while the new ones have the station coordinates, so we join the coordinate data to the old datasets.

In [2]:
import zipfile
import pandas as pd
import numpy as np

# Data directories, relative to the notebook's working directory
tripdatadir = 'tripdata/'
gpxdatadir = 'gpxdata/'
ns3datadir = 'ns3data/'


def extract_to_df(months):
    """Load one or more monthly trip archives into a single DataFrame.

    For every entry ``m`` of ``months`` the archive
    ``<tripdatadir><m>-capitalbikeshare-tripdata.zip`` is unpacked next to
    itself and the resulting CSV is read. If the archive cannot be opened, a
    message is printed and a CSV already present in ``tripdatadir`` is used
    instead, so ``pd.read_csv`` raises only when the CSV is missing too.

    Parameters
    ----------
    months : iterable of str
        Month identifiers used as filename prefixes (e.g. ``'202003'``).

    Returns
    -------
    pandas.DataFrame
        The monthly frames stacked in the given order. Each month keeps its
        own row labels, so labels repeat when several months are loaded.
        An empty DataFrame is returned when ``months`` is empty.
    """
    frames = []
    for m in months:
        filename = m + '-capitalbikeshare-tripdata'
        try:
            with zipfile.ZipFile(tripdatadir + filename + '.zip', 'r') as zip_ref:
                zip_ref.extract(filename + '.csv', path=tripdatadir)
        except OSError:
            # best effort: fall through and try a previously extracted CSV
            print('Zip file ' + filename + '.zip not found.')
        frames.append(pd.read_csv(tripdatadir + filename + '.csv'))
    # pd.concat rejects an empty list, so keep the "no months -> empty frame" contract
    if not frames:
        return pd.DataFrame()
    # a single concatenation instead of re-copying the accumulated frame every month
    return pd.concat(frames)


## Loading datasets

### Trips dataset (up to 03/2020)

Still contains an identifier for bikes, so 03/2020 and previous ones will be the data we are using

(here we can concatenate more than one)

In [3]:
# Month prefixes of the old-format trip archives to load; add entries to concatenate several months
months = ['202003']
trips_dataset = extract_to_df(months)
# last expression of the cell: renders the loaded frame
trips_dataset


Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,1596,2020-03-01 00:01:16,2020-03-01 00:27:52,31646,Maine Ave & 9th St SW,31509,New Jersey Ave & R St NW,W24217,Member
1,448,2020-03-01 00:02:04,2020-03-01 00:09:32,31646,Maine Ave & 9th St SW,31272,Washington & Independence Ave SW/HHS,W21897,Member
2,283,2020-03-01 00:03:45,2020-03-01 00:08:28,31224,18th & L St NW,31200,Massachusetts Ave & Dupont Circle NW,W21703,Member
3,642,2020-03-01 00:05:49,2020-03-01 00:16:31,31603,1st & M St NE,31611,13th & H St NE,78571,Member
4,347,2020-03-01 00:06:53,2020-03-01 00:12:40,31101,14th & V St NW,31114,18th St & Wyoming Ave NW,W23425,Member
...,...,...,...,...,...,...,...,...,...
162525,673,2020-03-31 23:17:58,2020-03-31 23:29:12,31235,19th St & Constitution Ave NW,31265,5th St & Massachusetts Ave NW,W22920,Member
162526,514,2020-03-31 23:18:13,2020-03-31 23:26:47,31203,14th & Rhode Island Ave NW,31324,18th & New Hampshire Ave NW,21054,Member
162527,1524,2020-03-31 23:29:00,2020-03-31 23:54:25,31110,20th St & Florida Ave NW,31403,5th & Kennedy St NW,W24341,Member
162528,565,2020-03-31 23:41:17,2020-03-31 23:50:43,31603,1st & M St NE,31256,10th & E St NW,W22691,Member


### First dataset with coords. (04/2020)
Used to extract the coordinates of stations (missing in the older datasets)

In [4]:
# Load the '202004' archive, whose rows carry start/end station coordinates (start_lat, start_lng, end_lat, end_lng)
coords_dataset = extract_to_df(['202004'])
# last expression of the cell: renders the loaded frame
coords_dataset


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,77A0F1B26D1597B1,docked_bike,2020-04-25 17:28:39,2020-04-25 17:35:04,Rhode Island & Connecticut Ave NW,31239,12th & L St NW,31251.0,38.905996,-77.039802,38.903819,-77.028400,casual
1,8698F10128EA4F18,docked_bike,2020-04-06 07:54:59,2020-04-06 07:57:24,21st & I St NW,31205,18th & L St NW,31224.0,38.900711,-77.046449,38.903741,-77.042452,member
2,AA07819DC0F58872,docked_bike,2020-04-22 17:06:18,2020-04-22 18:08:32,Connecticut Ave & Tilden St NW,31313,Connecticut Ave & Tilden St NW,31313.0,38.941139,-77.061977,38.941139,-77.061977,casual
3,DA909BCA92EF85AB,docked_bike,2020-04-16 15:22:40,2020-04-16 15:58:37,7th & E St SW,31294,7th & E St SW,31294.0,38.883450,-77.021741,38.883450,-77.021741,casual
4,B36F1E14D8C6757E,docked_bike,2020-04-10 13:19:41,2020-04-10 13:23:05,Potomac & Pennsylvania Ave SE,31606,8th & Eye St SE / Barracks Row,31608.0,38.880300,-76.986200,38.879200,-76.995300,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75466,0A780FE44141CC63,docked_bike,2020-04-05 11:34:01,2020-04-05 12:01:18,14th & Girard St NW,31123,Potomac & M St NW,31295.0,38.925284,-77.032375,38.905368,-77.065149,member
75467,2BF73FA35D98F654,docked_bike,2020-04-12 14:14:18,2020-04-12 14:23:11,14th & Girard St NW,31123,7th & T St NW,31109.0,38.925284,-77.032375,38.915500,-77.022200,member
75468,2E52EFE18CAA7C2C,docked_bike,2020-04-25 22:58:37,2020-04-25 23:06:20,14th & Girard St NW,31123,18th & R St NW,31278.0,38.925284,-77.032375,38.912648,-77.041834,member
75469,22756456D7C62A55,docked_bike,2020-04-29 12:58:48,2020-04-29 13:05:44,17th St & Massachusetts Ave NW,31267,18th & R St NW,31278.0,38.908142,-77.038359,38.912648,-77.041834,casual


## Extract stations coord. from newer dataset

Export stations from the first new-format dataset

In [5]:
# Build one row per station (id, name, averaged coordinates) from the
# start/end station columns of the new-format dataset and export it to CSV.

# final coords format
cols = ['station_id', 'station_name', 'lat', 'lon']
# take the start and end station columns separately and give both the final names
start = coords_dataset[['start_station_id', 'start_station_name', 'start_lat', 'start_lng']]
end = coords_dataset[['end_station_id', 'end_station_name', 'end_lat', 'end_lng']]
start.columns = cols
end.columns = cols
# stack them, keep one row per distinct (station, coordinates) pair, discard incomplete rows
df = (
    pd.concat([start, end])
    .drop_duplicates(['station_id', 'lat', 'lon'])
    .dropna()
    .sort_values(by='station_id')
)
# station ids come out as floats once missing values were involved: convert to int
df['station_id'] = df['station_id'].astype(int)
# isolate one station name per id
names = df[['station_id', 'station_name']].drop_duplicates().groupby(['station_id']).first()
# some stations have multiple close-by coords: average over them, join names, rearrange columns.
# Only lat/lon are aggregated: averaging the whole frame would include the string
# column 'station_name', which raises TypeError on pandas >= 2.0.
df = (
    df.groupby(['station_id'])[['lat', 'lon']]
    .mean()
    .reset_index()
    .join(names, 'station_id')[cols]
)
# to file
df.to_csv('202004-stations-coords.csv', index=False)
print(f"Distinct station identifiers: {df['station_id'].drop_duplicates().size}")
df


Distinct station identifiers: 582


Unnamed: 0,station_id,station_name,lat,lon
0,31000,Eads St & 15th St S,38.858971,-77.053230
1,31001,18th St & S Eads St,38.857250,-77.053320
2,31002,Crystal Dr & 20th St S,38.856425,-77.049232
3,31003,Crystal Dr & 15th St S,38.860613,-77.049505
4,31004,Aurora Hills Cmty Ctr / 18th St & S Hayes St,38.857866,-77.059490
...,...,...,...,...
577,32607,S Maple Ave & S Washington St,38.879720,-77.178408
578,32608,Falls Church City Hall / Park Ave & Little Fal...,38.885434,-77.173605
579,32609,W Columbia St & N Washington St,38.885621,-77.166917
580,32900,Motivate BX Tech office,38.964406,-77.010759


Export coord. to gpx

In [6]:
# Export every station of the coordinates CSV as a GPX waypoint.
# Each row of the array is [station_id, station_name, lat, lon].
stations = pd.read_csv('202004-stations-coords.csv').to_numpy()
# the XML declaration announces UTF-8, so the file is opened with that encoding
with open(gpxdatadir + '202004-stations-coords.gpx', 'w', encoding='utf-8') as fb:
    fb.write('<?xml version="1.0" encoding="UTF-8" standalone="no" ?>\n\n')
    fb.write('<gpx version="1.1" creator="Alessandro Aimi">\n')
    for s in stations:
        fb.write(f'    <wpt lat="{s[2]}" lon="{s[3]}">\n')
        # Only the position is exported. Station names contain '&', so they
        # would have to be XML-escaped before being written as <name> elements.
        fb.write('    </wpt>\n')
    # close the root element: without it (and </wpt>) the file is not well-formed XML
    fb.write('</gpx>\n')


Example conversion of the lat,lon of a point to x,y (meters) relative to the center:
```python
    # Example point
    lat_p = 38.879720
    lon_p = -77.178408
    # Converting the latitude is easier
    dlat = lat_p - lat_center # vertical angle delta in degrees over the earth sphere
    dlat_rad = dlat * np.pi / 180 # radians conversion
    y_p = dlat_rad * r # y in meters
    # Converting the longitude requires a reference latitude
    lat_ref = (lat_p + lat_center) / 2 # halfway latitude
    lat_ref_rad = lat_ref * np.pi / 180 # radians conversion
    r_ref = r * np.cos(lat_ref_rad) # reduced radius of reference parallel 
    dlon = lon_p - lon_center # horizontal angle delta in degrees over the earth sphere
    dlon_rad = dlon * np.pi / 180 # radians conversion
    x_p = dlon_rad * r_ref # x in meters
    (x_p,y_p)
```

Obtain coords limits, center and convert to x,y

(as a reminder, latitude is vertical and longitude is horizontal) 

In [7]:
# Project the station coordinates onto a local x,y plane (meters) around a
# fixed center point and export the result.
df = pd.read_csv('202004-stations-coords.csv')

# Earth radius in meters
r = 6.3781 * 10**6

# projection origin: a manually chosen, sensible center point
lat_center = 38.89781
lon_center = -77.02287
print(f"Center point: ({lat_center}, {lon_center})")

# radius of the parallel halfway between each station and the center
r_ref = r * np.cos(((df['lat'].to_numpy() + lat_center) / 2) * np.pi / 180)
# x follows the longitude delta along that parallel, y follows the latitude
# delta along a meridian; station_id is the only original column kept
df = df.assign(
    x=lambda d: ((d['lon'].to_numpy() - lon_center) * np.pi / 180) * r_ref,
    y=lambda d: ((d['lat'].to_numpy() - lat_center) * np.pi / 180) * r,
).drop(columns=['station_name', 'lat', 'lon'])

# extent of the projected area
print(f"Max_x: {df['x'].max()}, min_x: {df['x'].min()}, max_y: {df['y'].max()}, min_y: {df['y'].min()}")
# to file
df.to_csv('202004-stations-xy.csv', index=False)
df


Center point: (38.89781, -77.02287)
Max_x: 17097.22295777176, min_x: -29924.529620044137, max_y: 25401.847245444515, min_y: -12821.370613035891


Unnamed: 0,station_id,x,y
0,31000,-2630.982028,-4323.512622
1,31001,-2638.813335,-4515.092354
2,31002,-2284.558323,-4606.930401
3,31003,-2308.148772,-4140.727078
4,31004,-3173.495091,-4446.519946
...,...,...,...
577,32607,-13476.875285,-2013.757906
578,32608,-13060.185234,-1377.682026
579,32609,-12480.698086,-1356.865402
580,32900,1048.754009,7413.389803


## Prepare ns3 data

(Optionally, trim the dataset beforehand based on the selected stations)

In [26]:
# Turn the old-format trips into per-bike trajectories for ns3:
# times in seconds from a common origin, stations replaced by x,y meters.

# drop unused columns, then rename by column name so a different column
# order in the CSV cannot silently mislabel the data
df = trips_dataset.drop(columns=['Duration', 'Start station',
                                 'End station', 'Member type'])
df = df.rename(columns={
    'Start date': 'started_at',
    'End date': 'ended_at',
    'Start station number': 'start_station_id',
    'End station number': 'end_station_id',
    'Bike number': 'bike_id',
})
# convert datetime columns to whole seconds since t_zero (5 s before the first departure)
start_times = pd.to_datetime(df['started_at'])
end_times = pd.to_datetime(df['ended_at'])
t_zero = start_times.min() - pd.Timedelta('5s')
df = df.assign(
    started_at=(start_times - t_zero) // pd.Timedelta('1s'),
    ended_at=(end_times - t_zero) // pd.Timedelta('1s'),
)
df = df.sort_values(by='started_at')
# join stations coordinates; trips touching a station without coordinates are dropped
stations = pd.read_csv('202004-stations-xy.csv').set_index('station_id')
df = df.join(stations.rename(
    columns={'x': 'start_x', 'y': 'start_y'}), 'start_station_id').dropna()
df = df.join(stations.rename(
    columns={'x': 'end_x', 'y': 'end_y'}), 'end_station_id').dropna()
df = df[['started_at', 'ended_at', 'start_x',
         'start_y', 'end_x', 'end_y', 'bike_id']]
# one file per bike + index (this takes time)
# the loop variable is not called `id`, so the builtin stays usable in later cells
for bike_id, trips in df.groupby('bike_id'):
    trips.to_csv(ns3datadir + f'{bike_id}.csv', index=False)
df[['bike_id']].drop_duplicates().reset_index(drop=True).to_csv(ns3datadir + 'index.csv')
df


Unnamed: 0,started_at,ended_at,start_x,start_y,end_x,end_y,bike_id
0,5,1601,-205.005371,-1933.608338,450.545684,1652.973530,W24217
1,53,501,-205.005371,-1933.608338,788.532550,-1205.805729,W21897
2,154,437,-1696.435590,660.257168,-1865.107362,1368.108605,W21703
3,278,920,1501.185078,878.138709,2999.275399,291.306939,78571
4,342,689,-799.882099,2230.607017,-1619.936526,2337.584427,W23425
...,...,...,...,...,...,...,...
162525,2675807,2676481,-1796.029814,-613.366836,363.255930,347.314796,W22920
162526,2675822,2676336,-816.913526,1201.130338,-1642.372485,1498.129016,21054
162527,2676469,2677994,-1882.362758,1958.098484,264.638511,6539.511696,W24341
162528,2677206,2677772,1501.185078,878.138709,-276.718440,-211.060530,W22691
