In [1]:
import cabi_functions
import pandas as pd

# Creating trip summary table
Our goal is to create a table showing how many trips were taken between each pair of stations.
## Loading trip data
First we load the table of all trips, remove any rows with incomplete data, and convert each column to the most appropriate datatype.

In [2]:
# Load every trip, discard rows containing missing values, then let pandas
# pick the most appropriate dtype for each column.
df = (
    cabi_functions.return_trip_datatable()
    .dropna()
    .convert_dtypes()
)


## Removing invalid trips
Some trips list the start or end station as 0. These trips are invalid, so we remove them.

In [3]:
# Keep only trips whose start AND end station ids are both positive;
# an id of 0 means no valid station was recorded for that end of the trip.
has_valid_start = df.start_station_id > 0
has_valid_end = df.end_station_id > 0
df = df[has_valid_start & has_valid_end]


## Removing trips from removed stations
We also need to know the location of each station, so we load the list of current stations. In the future, if I can find a table with the locations of removed stations, we can add them to the visualization.
### loading our current stations

In [4]:
# Station lookup table published in the cabi-data repository. Its `short_name`
# column is what the trip station ids are matched against further down.
cabi_stations = 'https://raw.githubusercontent.com/mlinds/cabi-data/main/data/stationLookup.csv'
station_lookup = pd.read_csv(cabi_stations)
station_names_list = station_lookup['short_name'].tolist()

### Selecting only trips involving extant stations
We rewrite the dataframe to include only trips whose start and end stations both exist in our location lookup table.

In [5]:
# Keep only trips whose start and end stations both appear in the lookup list.
# `Series.isin` is a vectorised membership test, which avoids scanning the
# Python list once per row of the (large) trip table.
df = df[
    df.start_station_id.isin(station_names_list)
    & df.end_station_id.isin(station_names_list)
]

## Merging based on which stations are involved
Now we group trips by the same *pairing* of stations (i.e. we do not care which station is the origin and which is the destination) to calculate the popularity of each route.

In [8]:
# Undirected key: order each (start, end) id pair ascending so that A->B and
# B->A collapse onto the same pair, then glue the two ids into one integer.
sorted_stations = [
    sorted((int(start), int(end)))
    for start, end in zip(df.start_station_id, df.end_station_id)
]
sorted_stations_combined = [
    int(''.join((str(low), str(high)))) for low, high in sorted_stations
]

# Directed key: the same concatenation, but keeping start before end.
unsorted_stations_combined = [
    int(''.join((str(start), str(end))))
    for start, end in zip(df.start_station_id, df.end_station_id)
]


In [56]:

# Attach both pair keys to the trips once, then group two ways: by the
# undirected (sorted) key and by the directed (unsorted) key.
trips_with_keys = df.assign(
    sorted_stations=sorted_stations_combined,
    unsrt=unsorted_stations_combined,
)
grouped = trips_with_keys.groupby('sorted_stations')

dir_grouped = trips_with_keys.groupby('unsrt')

# Trips per undirected pair go into a separate dataframe. `count()` tallies
# the non-null values of every column; `end_station_id` is used as the counter.
route_popularity = (
    grouped.count()
    .reset_index()
    .loc[:, ['sorted_stations', 'end_station_id']]
    .rename(columns={'sorted_stations': 'sorted', 'end_station_id': 'popularity'})
)

In [57]:
def return_sorted(stationpair: int):
    """Return the direction-independent form of a combined station-pair key.

    The decimal digits of ``stationpair`` are read as two station ids:
    characters 0-4 and characters 5-9. The ids are placed in ascending order
    and concatenated back into a single integer, so a pair and its reverse
    map to the same value.

    Args:
        stationpair: Two station ids written back to back as one integer.

    Returns:
        The integer formed by the smaller id followed by the larger id.
    """
    digits = str(stationpair)
    first, second = int(digits[:5]), int(digits[5:10])
    if first <= second:
        low, high = first, second
    else:
        low, high = second, first
    return int(f"{low}{high}")


In [58]:
# directed station pairs
# `unsrt` only exists on the temporary frame built with `.assign(...)` for the
# groupbys, never on `df` itself, so `df.groupby('unsrt')` raises KeyError on a
# fresh kernel. Count the trips through `dir_grouped` (grouped by `unsrt`).
dir_pairs = dir_grouped.count().reset_index()
# Tag each directed pair with its undirected key so that both directions of a
# route can be matched to the same popularity value.
dir_pairs = dir_pairs.assign(
    sorted=[return_sorted(val) for val in dir_pairs.unsrt]
    )

In [59]:
# Inner-join on the undirected key: both directions of a route receive the
# route's combined trip count in `popularity`.
dir_pairs = pd.merge(dir_pairs, route_popularity, how='inner', on='sorted')
dir_pairs

Unnamed: 0,unsrt,started_at,ended_at,start_station_id,end_station_id,member_casual,sorted_stations,sorted,popularity
0,3100031000,1956,1956,1956,1956,1956,1956,3100031000,1956
1,3100031002,533,533,533,533,533,533,3100031002,957
2,3100231000,424,424,424,424,424,424,3100031002,957
3,3100031003,539,539,539,539,539,539,3100031003,989
4,3100331000,450,450,450,450,450,450,3100031003,989
...,...,...,...,...,...,...,...,...,...
144547,3260932215,1,1,1,1,1,1,3221532609,1
144548,3260932216,1,1,1,1,1,1,3221632609,1
144549,3260932220,5,5,5,5,5,5,3222032609,5
144550,3260932232,5,5,5,5,5,5,3223232609,5


In [60]:
# Split each directed key back into its two station ids: the first five
# digits are the start station, the next five the end station.
pair_digits = [str(val) for val in dir_pairs.unsrt]
a = [int(digits[:5]) for digits in pair_digits]
b = [int(digits[5:10]) for digits in pair_digits]

dir_pairs = dir_pairs.assign(st=a, en=b)


In [61]:
# Display the directed pairs, now including the `st` / `en` columns.
dir_pairs

Unnamed: 0,unsrt,started_at,ended_at,start_station_id,end_station_id,member_casual,sorted_stations,sorted,popularity,st,en
0,3100031000,1956,1956,1956,1956,1956,1956,3100031000,1956,31000,31000
1,3100031002,533,533,533,533,533,533,3100031002,957,31000,31002
2,3100231000,424,424,424,424,424,424,3100031002,957,31002,31000
3,3100031003,539,539,539,539,539,539,3100031003,989,31000,31003
4,3100331000,450,450,450,450,450,450,3100031003,989,31003,31000
...,...,...,...,...,...,...,...,...,...,...,...
144547,3260932215,1,1,1,1,1,1,3221532609,1,32609,32215
144548,3260932216,1,1,1,1,1,1,3221632609,1,32609,32216
144549,3260932220,5,5,5,5,5,5,3222032609,5,32609,32220
144550,3260932232,5,5,5,5,5,5,3223232609,5,32609,32232


In [66]:
# Keep just the two endpoints and the undirected trip count of every pair.
undirected_pop = dir_pairs.loc[:, ['st', 'en', 'popularity']]


Unnamed: 0,st,en,popularity
72291,31229,31096,1
61178,31206,32014,1
93687,31275,32049,1
133838,31811,31241,1
133837,31811,31216,1
...,...,...,...
124242,31619,31613,66400
82191,31258,31249,69514
82190,31249,31258,69514
81170,31258,31247,78744


In [64]:
# Write the connection table to disk without the index column.
undirected_pop.to_csv(
    'data/connections_csv.csv',
    columns=['st', 'en', 'popularity'],
    index=False,
)

We will now reassign the start and end so that we can plot them separately later.