In [1]:
import cabi
import pandas as pd


# Creating trip summary table
Our goal is to create a table showing the stations in a certain pair of stations
## Loading trip data
First we load the table of all trips, and remove any with incomplete data

In [2]:
df = cabi.return_trip_datatable()
df.to_parquet("../data/interim/comb_trips.gzip", compression="gzip")


In [4]:
df = pd.read_parquet("../data/interim/comb_trips.gzip")
total = len(df)
print(f"{total} total trips")
# clean the NA values out
df = df.dropna()
total_nonna = len(df)
print(
    f"{total_nonna} after removing trips with NA values \n {total-total_nonna} trips removed due to NA values"
)


31025537 total trips
30601786 after removing trips with NA values 
 423751 trips removed due to NA values


## Removing invalid trips
We have some number of trips where the end station is listed as 0. Obviously these are invalid

In [5]:
df = df[df.start_station_id > 0]
df = df[df.end_station_id > 0]
total_mappable = len(df)
print(
    f"{total_mappable} trips after removing trips with missing origin or destination \n {total_nonna-total_mappable} trips removed"
)


30600223 trips after removing trips with missing origin or destination 
 1563 trips removed


## Removing trips from removed stations
We also need to make sure we know the location of the station. Therefore we load the list of stations names. Maybe in the future if I can find a table of the locations of removed stations and we can add them to the visualization
### loading our current stations

In [9]:
# Define which attributes to lookup from airports.csv
cabi_stations = "https://raw.githubusercontent.com/mlinds/cabi-data/main/data/processed/stationLookup.csv"
station_names_list = list(pd.read_csv(cabi_stations).short_name)


### Selecting only trips involving extant stations
We rewrite the dataframe to include only stations that exist in our location lookup table

In [25]:
df = df[df.end_station_id.map(lambda x: x in station_names_list)]
df = df[df.start_station_id.map(lambda x: x in station_names_list)]
total_current = len(df)
print(
    f"{total_current} trips after removing trips from stations that dont exist \n {total_mappable-total_current} trips were removed"
)


30282851 trips after removing trips from stations that dont exist 
 317372


## Removing self-trips
There are interesting but they aren't easy to show on a map


In [28]:
df = df[df.start_station_id != df.end_station_id]
total_nonself = len(df)
print(
    f"{total_nonself} trips after removing trips from stations that dont exist \n {total_current-total_nonself} trips were removed"
)


28926153 trips after removing trips from stations that dont exist 
 1356698 trips were removed


## Merging based on which stations are involved
Now to merge based on the same *pairing* of stations (e.g. we do not need to care about which is the origin and which is the destination) to calculate the popularity of that route.

In [30]:
# create a list of the sorted stations
sorted_stations = [
    sorted([int(x), int(y)]) for x, y in zip(df.start_station_id, df.end_station_id)
]
sorted_stations_combined = [int(str(x) + str(y)) for x, y in sorted_stations]

# we also want to create an integer value for the station pair that is not sorted
unsorted_stations_combined = [
    int(str(x) + str(y)) for x, y in zip(df.start_station_id, df.end_station_id)
]


In [31]:
df = df.assign(
    sorted_stations=sorted_stations_combined, unsrt=unsorted_stations_combined
)


In [47]:
a = (
    df.groupby("sorted_stations")
    .count()
    .reset_index()[["sorted_stations", "end_station_id"]]
)
a.columns = ["sorted", "popularity"]


Unnamed: 0,sorted,popularity
0,3100031002,987
1,3100031003,1012
2,3100031004,469
3,3100031005,2222
4,3100031006,1726
...,...,...
84010,3260632608,118
84011,3260632609,36
84012,3260732608,63
84013,3260732609,75


In [60]:
df.unsrt.value_counts().sum() == df.sorted_stations.value_counts().sum()


True

In [94]:
undirected_pop = (
    df.groupby(["start_station_id", "end_station_id", "sorted_stations"])
    .size()
    .reset_index()
    .drop(columns=0)
    .merge(a, left_on="sorted_stations", right_on="sorted")
    .drop(columns=["sorted_stations", "sorted"])
    .sort_values("popularity", ascending=False)
)

undirected_pop.columns = ["st", "en", "popularity"]
undirected_pop


Unnamed: 0,st,en,popularity
82223,31258,31247,79657
82222,31247,31258,79657
83263,31249,31258,70378
83264,31258,31249,70378
125992,31619,31613,67120
...,...,...,...
36523,31093,31642,1
21843,31045,31283,1
136592,31902,32200,1
70615,31223,31924,1


In [97]:
undirected_pop.to_csv(
    "../data/processed/connections_csv.csv",
    columns=["st", "en", "popularity"],
    index=False,
)


In [96]:
undirected_pop.popularity.sum() / 2


28900960.5

We will now reassign the start and end, so that we can seperately plot them later