# Bus Matrices from ETM (Swift data)

## Read Packages and Setup input/output

In [38]:
import os
from pathlib import Path
import pandas as pd
import geopandas as gpd
import numpy as np
from fiona.crs import from_epsg
from tqdm import tqdm
from shapely.geometry import Point
import geopy.distance
import matplotlib.pyplot as plt
import seaborn as sns

# Current working directory
# basepath = os.path.dirname(os.getcwd()
from pathlib import Path
basepath = Path.cwd().parent.parent.parent
print(basepath)
# basepath = Path.cwd().parent.parent

# Read list of stations within model coverage area
raw_data = os.path.join(basepath, '01 Raw Data' )
print(raw_data)

c:\Users\aadil.nawaz\OneDrive - Arup\Projects\CSMT\Matrix Development\Rail Demand
c:\Users\aadil.nawaz\OneDrive - Arup\Projects\CSMT\Matrix Development\Rail Demand\01 Raw Data


## Pre-processing Swift Data

### Import raw Swift LSOA OD matrices
- Data comprises ETM data for 3 months - Sept to Nov 2023
- Consists attributes:
    - Trip ID: unique identifier for each tap-on
    - Journey ID: Grouped trips identifying each OD 
    - Trip Chain ID: unique identifier for each cardholder for that day
    - Interchange ID: indicating origin, transfer, or destination leg of journey
    - Origin/Destination LSOA name and code: based on geospatial mapping of each tap. Destination is inferred based on next tap-on
    - Purpose, ticket_code, Passenger_type, o_product, o_mode are other useful attributes

In [11]:
### Read Swift
lsoa_swift = pd.read_csv(f'{raw_data}/13 Swift 2023/CV/CV/cov_od_9_10_11_23.csv')
lsoa_swift.head()

Unnamed: 0,trip_id,Ticket_Code,o_date,Time_Bin_Hour,o_card_type,o_product,o_mode,O_LSOA21CD,O_LSOA21NM,D_LSOA21CD,D_LSOA21NM,Interchange,Trip_Purpose,Passenger_Type,O_Daynm,O_Day_type,TripChain_ID,Journey_ID
0,09_01_6858,1,01/09/2023,06:00,SWIFT_PHOTO_Card,Monthly Coventry nbus Direct Debit,Bus,E01009658,Coventry 021C,E01034743,Coventry 031E,1,1,Commercial,Fri,Weekday,09_01_20230901633597010730096530,09_01_137697_137697
1,09_01_133496,1,01/09/2023,06:00,SWIFT_PHOTO_Card,Monthly Coventry nbus Direct Debit,Bus,E01034743,Coventry 031E,E01009658,Coventry 021C,3,1,Commercial,Fri,Weekday,09_01_20230901633597010730096530,09_01_137697_137697
2,09_01_41690,1,01/09/2023,10:00,SWIFT_PHOTO_Card,Monthly Coventry nbus Direct Debit,Bus,E01009540,Coventry 039B,E01034744,Coventry 031F,0,1,Commercial,Fri,Weekday,09_01_20230901633597010730777675,09_01_28668_38683
3,09_01_149977,1,01/09/2023,12:00,SWIFT_PHOTO_Card,Monthly Coventry nbus Direct Debit,Bus,E01034744,Coventry 031F,E01009540,Coventry 039B,0,1,Commercial,Fri,Weekday,09_01_20230901633597010730777675,09_01_38683_28668
4,09_01_9297,1,01/09/2023,06:00,SWIFT_PHOTO_Card,Monthly Coventry nbus Direct Debit,Bus,E01009607,Coventry 004C,E01009580,Coventry 019A,0,2,Commercial,Fri,Weekday,09_01_20230901633597010730833148,09_01_151699_80749


### Convert datetime and names of the features


In [12]:
lsoa_swift['Time_Bin_Hour'] = pd.to_datetime(lsoa_swift['Time_Bin_Hour'], format='%H:%M').dt.time
lsoa_swift['o_date'] = pd.to_datetime(lsoa_swift['o_date'], format='%d/%m/%Y').dt.date


lsoa_swift.rename(columns = {'Ticket_Code':'ticket_code',
                                'Time_Bin_Hour':'time',
                                'o_date':'date',
                                'O_LSOA21CD':'o_lsoa21cd',
                                'D_LSOA21CD':'d_lsoa21cd',
                                'Interchange':'interchange',
                                'Trip_Purpose':'trip_purpose',
                                'Passenger_Type':'passenger_type',
                                'O_Day_type':'day_type',
                                'O_Daynm':'day',
                                'TripChain_ID': 'tripchainid',
                                'Journey_ID':'journeyid',
                                'o_product':'o_product',
                                'o_mode':'o_mode',
                                'o_card_type': 'o_card_type'},inplace=True)

# Create a function to categorize the time periods
def categorize_time_period(time):
    if pd.Timestamp('07:00').time() <= time < pd.Timestamp('09:00').time():
        return 'AM'
    elif pd.Timestamp('10:00').time() <= time < pd.Timestamp('12:00').time():
        return 'IP'
    elif pd.Timestamp('16:00').time() <= time < pd.Timestamp('18:00').time():
        return 'PM'
    else:
        return 'OP'

# Apply the function to create the 'time_period' column
lsoa_swift['time_period'] = pd.to_datetime(lsoa_swift['time'], format='%H:%M:%S').dt.time.apply(categorize_time_period)

lsoa_swift.head()

Unnamed: 0,trip_id,ticket_code,date,time,o_card_type,o_product,o_mode,o_lsoa21cd,O_LSOA21NM,d_lsoa21cd,D_LSOA21NM,interchange,trip_purpose,passenger_type,day,day_type,tripchainid,journeyid,time_period
0,09_01_6858,1,2023-09-01,06:00:00,SWIFT_PHOTO_Card,Monthly Coventry nbus Direct Debit,Bus,E01009658,Coventry 021C,E01034743,Coventry 031E,1,1,Commercial,Fri,Weekday,09_01_20230901633597010730096530,09_01_137697_137697,OP
1,09_01_133496,1,2023-09-01,06:00:00,SWIFT_PHOTO_Card,Monthly Coventry nbus Direct Debit,Bus,E01034743,Coventry 031E,E01009658,Coventry 021C,3,1,Commercial,Fri,Weekday,09_01_20230901633597010730096530,09_01_137697_137697,OP
2,09_01_41690,1,2023-09-01,10:00:00,SWIFT_PHOTO_Card,Monthly Coventry nbus Direct Debit,Bus,E01009540,Coventry 039B,E01034744,Coventry 031F,0,1,Commercial,Fri,Weekday,09_01_20230901633597010730777675,09_01_28668_38683,IP
3,09_01_149977,1,2023-09-01,12:00:00,SWIFT_PHOTO_Card,Monthly Coventry nbus Direct Debit,Bus,E01034744,Coventry 031F,E01009540,Coventry 039B,0,1,Commercial,Fri,Weekday,09_01_20230901633597010730777675,09_01_38683_28668,OP
4,09_01_9297,1,2023-09-01,06:00:00,SWIFT_PHOTO_Card,Monthly Coventry nbus Direct Debit,Bus,E01009607,Coventry 004C,E01009580,Coventry 019A,0,2,Commercial,Fri,Weekday,09_01_20230901633597010730833148,09_01_151699_80749,OP


### Filtered Swift data
- Remove duplicates in the Swift data
- Filtering Swift data from 6th Nov to 26th Nov
- Remove data reference from TripID as trip ID would be used to sort trips within journey

In [13]:
#### Filter trips/journeys made on Weekdays between 6th Nov and 11th Nov. There could be some trips which spill over to next day but these are outside peak hours 
start_date = pd.to_datetime('2023-11-06')
end_date = pd.to_datetime('2023-11-26')
lsoa_filtered = lsoa_swift.loc[(pd.to_datetime(lsoa_swift['date'])>= start_date) & (pd.to_datetime(lsoa_swift['date']) <= end_date)]
lsoa_filtered = lsoa_filtered[lsoa_filtered['day_type']=='Weekday'].copy()


columns_to_retain = ['tripchainid','journeyid','trip_id','ticket_code','date','day_type',
                     'day','time','time_period','o_lsoa21cd','d_lsoa21cd','interchange',
                     'trip_purpose','passenger_type','o_product','o_card_type','o_mode']
lsoa_filtered = lsoa_filtered[columns_to_retain]

### Remove duplicates and rows without mode value
lsoa_filtered = lsoa_filtered.drop_duplicates()  ### dropping duplicates 
lsoa_filtered.dropna(subset=['o_mode'], inplace=True)
### Remove data reference from TripID and sort trips by date, tripchainid, journeyid, time and trip_id
lsoa_filtered['trip_id'] = lsoa_filtered['trip_id'].str.split("_").str[-1].astype('int64')
lsoa_filtered.sort_values(by=['date','tripchainid','journeyid','time','trip_id'])


lsoa_filtered = lsoa_filtered.reset_index(drop=True)
print(lsoa_filtered['o_mode'].value_counts())
print(lsoa_filtered.info())
lsoa_filtered.head()


o_mode
Bus     406082
Tram       313
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406395 entries, 0 to 406394
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tripchainid     406395 non-null  object
 1   journeyid       406395 non-null  object
 2   trip_id         406395 non-null  int64 
 3   ticket_code     406395 non-null  int64 
 4   date            406395 non-null  object
 5   day_type        406395 non-null  object
 6   day             406395 non-null  object
 7   time            406395 non-null  object
 8   time_period     406395 non-null  object
 9   o_lsoa21cd      406395 non-null  object
 10  d_lsoa21cd      406395 non-null  object
 11  interchange     406395 non-null  int64 
 12  trip_purpose    406395 non-null  int64 
 13  passenger_type  406395 non-null  object
 14  o_product       406395 non-null  object
 15  o_card_type     406395 non-null  object
 16  o_mode     

Unnamed: 0,tripchainid,journeyid,trip_id,ticket_code,date,day_type,day,time,time_period,o_lsoa21cd,d_lsoa21cd,interchange,trip_purpose,passenger_type,o_product,o_card_type,o_mode
0,11_05_20231105633597010779469531,11_05_86582_41635,87405,4,2023-11-06,Weekday,Mon,00:00:00,OP,E01034743,E01033643,0,1,Concessionary,WMCA Disabled,DISABLED_CONCESSION_CARD,Bus
1,11_05_20231105633597010784252336,11_05_86578_49336,87460,1,2023-11-06,Weekday,Mon,03:00:00,OP,E01009320,E01010165,0,2,Commercial,Monthly Regional nbus Direct Debit,SWIFT_PHOTO_Card,Bus
2,11_05_20231105633597010784534642,11_05_84676_34918,87389,1,2023-11-06,Weekday,Mon,00:00:00,OP,E01033561,E01032536,3,1,Commercial,4 week Regional nbus,SWIFT_PHOTO_Card,Bus
3,11_05_20231105633597010784986305,11_05_71413_24246,87364,1,2023-11-06,Weekday,Mon,00:00:00,OP,E01009638,E01009638,3,2,Commercial,4 week Regional nbus,SWIFT_Blank_TVM_Card,Bus
4,11_05_20231105633597010785010436,11_05_79609_20539,87390,1,2023-11-06,Weekday,Mon,00:00:00,OP,E01009531,E01009587,0,1,Commercial,4 week Regional nbus,SWIFT_Blank_TVM_Card,Bus


## Group data by JourneyID to get total journeys and analyse OD
- Rest of the comment

In [14]:
same_od = lsoa_filtered.groupby(['tripchainid','journeyid']).agg(no_trips = ('trip_id','count'),
                                                                          o_lsoa21cd = ('o_lsoa21cd','first'),
                                                                          d_lsoa21cd = ('d_lsoa21cd','last')
                                                                          ).reset_index()
same_od['same_od'] = np.where(same_od['o_lsoa21cd'] == same_od['d_lsoa21cd'], 1, 0)
swift_nov = pd.merge(lsoa_filtered, same_od[['journeyid','same_od','no_trips']], left_on='journeyid', right_on='journeyid', how='left')
swift_nov.head()

Unnamed: 0,tripchainid,journeyid,trip_id,ticket_code,date,day_type,day,time,time_period,o_lsoa21cd,d_lsoa21cd,interchange,trip_purpose,passenger_type,o_product,o_card_type,o_mode,same_od,no_trips
0,11_05_20231105633597010779469531,11_05_86582_41635,87405,4,2023-11-06,Weekday,Mon,00:00:00,OP,E01034743,E01033643,0,1,Concessionary,WMCA Disabled,DISABLED_CONCESSION_CARD,Bus,0,1
1,11_05_20231105633597010784252336,11_05_86578_49336,87460,1,2023-11-06,Weekday,Mon,03:00:00,OP,E01009320,E01010165,0,2,Commercial,Monthly Regional nbus Direct Debit,SWIFT_PHOTO_Card,Bus,0,1
2,11_05_20231105633597010784534642,11_05_84676_34918,87389,1,2023-11-06,Weekday,Mon,00:00:00,OP,E01033561,E01032536,3,1,Commercial,4 week Regional nbus,SWIFT_PHOTO_Card,Bus,0,1
3,11_05_20231105633597010784986305,11_05_71413_24246,87364,1,2023-11-06,Weekday,Mon,00:00:00,OP,E01009638,E01009638,3,2,Commercial,4 week Regional nbus,SWIFT_Blank_TVM_Card,Bus,1,1
4,11_05_20231105633597010785010436,11_05_79609_20539,87390,1,2023-11-06,Weekday,Mon,00:00:00,OP,E01009531,E01009587,0,1,Commercial,4 week Regional nbus,SWIFT_Blank_TVM_Card,Bus,0,1


### Assign User and validity to determine Uplift/Non-Uplift, Fare/No-Fare

In [75]:
user_validity = pd.read_excel(f'{basepath}/03 Output/04 Matrix Input/09 Bus User Type/Ticket_Types_products.xlsx',
                                 sheet_name='Ticket_Types_products')
user_validity.head()

Unnamed: 0,o_card_type,o_product,Count of Journey_ID,%GT Count of Journey_ID,user_old,validity,fare_nofare,user
0,OVER_60_CONCESSION_CARD,WMCA Age,73734,0.2625,Over 60,Concession,No-Fare,Over 60
1,SWIFT_NX_PHOTO_Child_Card,Child nbus Academic Year,18715,0.0666,Child,Academic,No-Fare,Child
2,DISABLED_CONCESSION_CARD,WMCA Disabled,17141,0.061,Disabled,Concession,No-Fare,Disabled
3,SWIFT_Blank_TVM_Card,4 week Regional nbus,14413,0.0513,Adult,Weekly,No-Fare,Adult
4,SWIFT_NX_Regional_Photo_Card,4 week Regional nbus,12352,0.044,Adult,Weekly,No-Fare,Adult


In [76]:
swift_nov_usertype = pd.merge(swift_nov, user_validity[['o_card_type','o_product','user','validity','fare_nofare']],
                              left_on=['o_card_type','o_product'], right_on=['o_card_type','o_product'], how='left')
swift_nov_usertype.head()

Unnamed: 0,tripchainid,journeyid,trip_id,ticket_code,date,day_type,day,time,time_period,o_lsoa21cd,...,trip_purpose,passenger_type,o_product,o_card_type,o_mode,same_od,no_trips,user,validity,fare_nofare
0,11_05_20231105633597010779469531,11_05_86582_41635,87405,4,2023-11-06,Weekday,Mon,00:00:00,OP,E01034743,...,1,Concessionary,WMCA Disabled,DISABLED_CONCESSION_CARD,Bus,0,1,Disabled,Concession,No-Fare
1,11_05_20231105633597010784252336,11_05_86578_49336,87460,1,2023-11-06,Weekday,Mon,03:00:00,OP,E01009320,...,2,Commercial,Monthly Regional nbus Direct Debit,SWIFT_PHOTO_Card,Bus,0,1,Adult,Monthly,No-Fare
2,11_05_20231105633597010784534642,11_05_84676_34918,87389,1,2023-11-06,Weekday,Mon,00:00:00,OP,E01033561,...,1,Commercial,4 week Regional nbus,SWIFT_PHOTO_Card,Bus,0,1,Adult,Weekly,No-Fare
3,11_05_20231105633597010784986305,11_05_71413_24246,87364,1,2023-11-06,Weekday,Mon,00:00:00,OP,E01009638,...,2,Commercial,4 week Regional nbus,SWIFT_Blank_TVM_Card,Bus,1,1,Adult,Weekly,No-Fare
4,11_05_20231105633597010785010436,11_05_79609_20539,87390,1,2023-11-06,Weekday,Mon,00:00:00,OP,E01009531,...,1,Commercial,4 week Regional nbus,SWIFT_Blank_TVM_Card,Bus,0,1,Adult,Weekly,No-Fare


### Data Check
- To see the merge of o_product type with raw filtered dataset

In [77]:
swift_nov_usertype.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406395 entries, 0 to 406394
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tripchainid     406395 non-null  object
 1   journeyid       406395 non-null  object
 2   trip_id         406395 non-null  int64 
 3   ticket_code     406395 non-null  int64 
 4   date            406395 non-null  object
 5   day_type        406395 non-null  object
 6   day             406395 non-null  object
 7   time            406395 non-null  object
 8   time_period     406395 non-null  object
 9   o_lsoa21cd      406395 non-null  object
 10  d_lsoa21cd      406395 non-null  object
 11  interchange     406395 non-null  int64 
 12  trip_purpose    406395 non-null  int64 
 13  passenger_type  406395 non-null  object
 14  o_product       406395 non-null  object
 15  o_card_type     406395 non-null  object
 16  o_mode          406395 non-null  object
 17  same_od         406395 non-nu

In [78]:
swift_nov_usertype[(swift_nov_usertype['o_card_type']=='SWIFT_NX_PHOTO_Child_Card')&(swift_nov_usertype['o_product']=='Monthly Direct Debit Term Plus')].head(10)

Unnamed: 0,tripchainid,journeyid,trip_id,ticket_code,date,day_type,day,time,time_period,o_lsoa21cd,...,trip_purpose,passenger_type,o_product,o_card_type,o_mode,same_od,no_trips,user,validity,fare_nofare
17492,11_06_20231106633597011252468420,11_06_30742_199847,51227,2,2023-11-06,Weekday,Mon,07:00:00,AM,E01009302,...,1,Commercial,Monthly Direct Debit Term Plus,SWIFT_NX_PHOTO_Child_Card,Bus,0,2,Child,Academic,No-Fare
17493,11_06_20231106633597011252468420,11_06_30742_199847,69920,2,2023-11-06,Weekday,Mon,08:00:00,AM,E01010125,...,1,Commercial,Monthly Direct Debit Term Plus,SWIFT_NX_PHOTO_Child_Card,Bus,0,2,Child,Academic,No-Fare
17494,11_06_20231106633597011252468420,11_06_199847_30742,171643,2,2023-11-06,Weekday,Mon,15:00:00,OP,E01031022,...,1,Commercial,Monthly Direct Debit Term Plus,SWIFT_NX_PHOTO_Child_Card,Bus,0,2,Child,Academic,No-Fare
17495,11_06_20231106633597011252468420,11_06_199847_30742,274845,2,2023-11-06,Weekday,Mon,15:00:00,OP,E01010129,...,1,Commercial,Monthly Direct Debit Term Plus,SWIFT_NX_PHOTO_Child_Card,Bus,0,2,Child,Academic,No-Fare
17496,11_06_20231106633597011252479682,11_06_48369_199870,53197,2,2023-11-06,Weekday,Mon,08:00:00,AM,E01010114,...,3,Commercial,Monthly Direct Debit Term Plus,SWIFT_NX_PHOTO_Child_Card,Bus,0,1,Child,Academic,No-Fare
17497,11_06_20231106633597011252479682,11_06_199870_48369,262607,2,2023-11-06,Weekday,Mon,15:00:00,OP,E01031022,...,3,Commercial,Monthly Direct Debit Term Plus,SWIFT_NX_PHOTO_Child_Card,Bus,0,1,Child,Academic,No-Fare
17498,11_06_20231106633597011252480284,11_06_221654_221899,45544,2,2023-11-06,Weekday,Mon,07:00:00,AM,E01009623,...,3,Commercial,Monthly Direct Debit Term Plus,SWIFT_NX_PHOTO_Child_Card,Bus,0,1,Child,Academic,No-Fare
17499,11_06_20231106633597011252480284,11_06_221899_221654,267188,2,2023-11-06,Weekday,Mon,15:00:00,OP,E01034743,...,3,Commercial,Monthly Direct Debit Term Plus,SWIFT_NX_PHOTO_Child_Card,Bus,0,1,Child,Academic,No-Fare
17504,11_06_20231106633597011252497585,11_06_107642_149742,34187,2,2023-11-06,Weekday,Mon,07:00:00,AM,E01009582,...,1,Commercial,Monthly Direct Debit Term Plus,SWIFT_NX_PHOTO_Child_Card,Bus,0,2,Child,Academic,No-Fare
17505,11_06_20231106633597011252497585,11_06_107642_149742,55782,2,2023-11-06,Weekday,Mon,08:00:00,AM,E01009638,...,3,Commercial,Monthly Direct Debit Term Plus,SWIFT_NX_PHOTO_Child_Card,Bus,0,2,Child,Academic,No-Fare


### Process Round trips

### Remove the same od journeys and split journeys


In [79]:
### Journeys with same origin and destination which needs to split into two journeys
swift_nov_same_od = swift_nov_usertype[swift_nov_usertype['same_od']==1].copy()
same_od_trips = swift_nov_same_od['journeyid'].nunique()


### Journeys with different origin and destination which dont require processing
swift_nov_diff_od = swift_nov_usertype[swift_nov_usertype['same_od']==0].copy()
diff_od_trips = swift_nov_diff_od['journeyid'].nunique()


print(f"Journeys with same OD: {same_od_trips} i.e. {same_od_trips*100/(same_od_trips+diff_od_trips)} %")
print(f"Journeys with different OD {diff_od_trips} i.e. {diff_od_trips*100/(same_od_trips+diff_od_trips)} %")

Journeys with same OD: 31650 i.e. 11.263305112793192 %
Journeys with different OD 249351 i.e. 88.73669488720681 %


### Rule for splitting journeys with same OD

- if no of trips = 1, then  its an internal zone trip and should be ignored as it would affect assignment
- if no of trips = 2, then  split journeys into two journeys:
   - Trip 1: O: Origin of Interchange type 1 trip, D: Destination of interchange type 1 trip
   - Trip 2: O: Origin of Interchange type 3 trip, D: Destination of interchange type 3 trip

- if no of trips > 2, then  split journeys into two journeys :
   - Trip 1: O: Origin of Interchange type 1 trip, D: Origin of Median of Interchange type 2 Trip
   - Trip 2: O: Origin of Median of Interchange type 2 trip, D: Destination of interchange type 3 trip



In [80]:
def middle(input_list):
    middle = len(input_list) // 2
    if len(input_list) % 2 != 0:
        return input_list[middle]
    else:
        return input_list[middle - 1]

In [81]:
# Assuming swift_nov_same_od is your DataFrame
journey_sameod = swift_nov_same_od['journeyid'].unique()
split_same_od_df = pd.DataFrame(columns=swift_nov_same_od.columns)

for journey in tqdm(journey_sameod, desc="Processing journeys with same od"):
    try:
        temp_df = swift_nov_same_od[swift_nov_same_od['journeyid'] == journey].copy()
        n_trips = len(temp_df)

        if n_trips == 2:
            trip_1 = temp_df.loc[temp_df['interchange'] == 1].copy()
            if not trip_1.empty:
                trip_1.loc[trip_1.index[0], 'journeyid'] = trip_1.loc[trip_1.index[0], 'journeyid'] + "_1"
                split_same_od_df = pd.concat([split_same_od_df, trip_1])

            trip_2 = temp_df.loc[temp_df['interchange'] == 3].copy()
            if not trip_2.empty:
                trip_2.loc[trip_2.index[0], 'journeyid'] = trip_2.loc[trip_2.index[0], 'journeyid'] + "_2"
                split_same_od_df = pd.concat([split_same_od_df, trip_2])

        if n_trips > 2:
            trip_1 = temp_df.loc[temp_df['interchange'] == 1].copy()


            interim_trips = temp_df.loc[temp_df['interchange'] == 2].copy()
            if trip_1.empty and not interim_trips.empty:
                trip_1 = interim_trips.iloc[[0]]

            if not interim_trips.empty:
                middle_row_trip = interim_trips.loc[interim_trips['trip_id'] == middle(interim_trips['trip_id'].values)].copy()
                if not middle_row_trip.empty:
                    trip_1.loc[trip_1.index[0], 'd_lsoa21cd'] = middle_row_trip.iloc[0]['o_lsoa21cd']
                    trip_1.loc[trip_1.index[0], 'journeyid'] = trip_1.loc[trip_1.index[0], 'journeyid'] + "_1"
                    split_same_od_df = pd.concat([split_same_od_df, trip_1])

                    trip_2 = temp_df[temp_df['interchange'] == 3].copy()
                    if not trip_2.empty:
                        trip_2.loc[trip_2.index[0], 'o_lsoa21cd'] = middle_row_trip.iloc[0]['d_lsoa21cd']
                        trip_2.loc[trip_2.index[0], 'time'] = middle_row_trip.iloc[0]['time']
                        trip_2.loc[trip_2.index[0], 'time_period'] = middle_row_trip.iloc[0]['time_period']
                        trip_2.loc[trip_2.index[0], 'journeyid'] = trip_2.loc[trip_2.index[0], 'journeyid'] + "_2"
                        split_same_od_df = pd.concat([split_same_od_df, trip_2])

    except IndexError as e:
        print(f"Error encountered for journeyid: {journey}")
        print(temp_df[['journeyid','trip_id','time','interchange','o_lsoa21cd','o_lsoa21cd']])
        # print(e)
split_same_od_df.head()

Processing journeys with same od: 100%|██████████| 31650/31650 [06:39<00:00, 79.13it/s] 


Unnamed: 0,tripchainid,journeyid,trip_id,ticket_code,date,day_type,day,time,time_period,o_lsoa21cd,...,trip_purpose,passenger_type,o_product,o_card_type,o_mode,same_od,no_trips,user,validity,fare_nofare
20,11_06_20231106633597010730833148,11_06_305897_305897_1,134527,1,2023-11-06,Weekday,Mon,12:00:00,OP,E01009607,...,1,Commercial,Monthly Coventry nbus Direct Debit,SWIFT_PHOTO_Card,Bus,1,4,Adult,Monthly,No-Fare
23,11_06_20231106633597010730833148,11_06_305897_305897_2,258919,1,2023-11-06,Weekday,Mon,13:00:00,OP,E01009607,...,1,Commercial,Monthly Coventry nbus Direct Debit,SWIFT_PHOTO_Card,Bus,1,4,Adult,Monthly,No-Fare
63,11_06_20231106633597010733618462,11_06_22109_22109_1,36629,2,2023-11-06,Weekday,Mon,07:00:00,AM,E01009612,...,1,Commercial,Child nbus Winter Term,SWIFT_PHOTO_Child_Card,Bus,1,2,Child,Academic,No-Fare
64,11_06_20231106633597010733618462,11_06_22109_22109_2,214569,2,2023-11-06,Weekday,Mon,08:00:00,AM,E01034743,...,1,Commercial,Child nbus Winter Term,SWIFT_PHOTO_Child_Card,Bus,1,2,Child,Academic,No-Fare
73,11_06_20231106633597010733620807,11_06_14363_14363_1,42646,2,2023-11-06,Weekday,Mon,07:00:00,AM,E01009662,...,1,Commercial,Child 1 week nbus,SWIFT_PHOTO_Child_Card,Bus,1,2,Child,Weekly,No-Fare


### Merge both df with different and same od together

In [82]:
swift_nov_upd = pd.concat([swift_nov_diff_od,split_same_od_df])
swift_nov_upd['o_mode'].value_counts()

o_mode
Bus     392418
Tram       294
Name: count, dtype: int64

### Groupby Journey ID 
- Aggregate trips/boardings to journeys which represent OD
- For that sort trips in a journey by trip_id, time, journeyid, tripchainid
- Groupby by tripchainid, journeyid
- Ideally only trips starting within peak period should be considered in the peak period matrices
- There could be consideration for trips that start before peak period but still going on during peak period can be considered. For such trips actual OD should be considered

In [83]:
def concat_unique_modes(series):
    unique_modes = series.astype(str).unique()
    return ", ".join(unique_modes)

jour_agg = swift_nov_upd.groupby(['tripchainid','journeyid']).agg(no_trips = ('trip_id','count'),
                                                                  date = ('date','first'),
                                                                  first_trips = ('trip_id','first'),
                                                                  last_trips = ('trip_id','last'),
                                                                  start_time = ('time','first'),
                                                                  end_time = ('time','last'),
                                                                  start_peak = ('time_period','first'),
                                                                  end_peak = ('time_period','last'),
                                                                  o_lsoa21cd = ('o_lsoa21cd','first'),
                                                                  d_lsoa21cd = ('d_lsoa21cd','last'),
                                                                  interchange = ('interchange','median'),
                                                                  passenger_type = ('passenger_type','first'),
                                                                  trip_purpose = ('trip_purpose','first'),
                                                                  ticket_code = ('ticket_code','first'),
                                                                  o_product = ('o_product','first'),
                                                                  o_card_type = ('o_card_type','first'),
                                                                  user = ('user','first'),
                                                                  validity = ('validity','first'),
                                                                  fare_nofare = ('fare_nofare','first'),
                                                                  same_od = ('same_od','first'),
                                                                  mode=('o_mode', concat_unique_modes)
                                                                  ).reset_index()
jour_agg.head()


Unnamed: 0,tripchainid,journeyid,no_trips,date,first_trips,last_trips,start_time,end_time,start_peak,end_peak,...,passenger_type,trip_purpose,ticket_code,o_product,o_card_type,user,validity,fare_nofare,same_od,mode
0,11_05_20231105633597010779469531,11_05_86582_41635,1,2023-11-06,87405,87405,00:00:00,00:00:00,OP,OP,...,Concessionary,1,4,WMCA Disabled,DISABLED_CONCESSION_CARD,Disabled,Concession,No-Fare,0,Bus
1,11_05_20231105633597010784252336,11_05_86578_49336,1,2023-11-06,87460,87460,03:00:00,03:00:00,OP,OP,...,Commercial,2,1,Monthly Regional nbus Direct Debit,SWIFT_PHOTO_Card,Adult,Monthly,No-Fare,0,Bus
2,11_05_20231105633597010784534642,11_05_84676_34918,1,2023-11-06,87389,87389,00:00:00,00:00:00,OP,OP,...,Commercial,1,1,4 week Regional nbus,SWIFT_PHOTO_Card,Adult,Weekly,No-Fare,0,Bus
3,11_05_20231105633597010785010436,11_05_79609_20539,1,2023-11-06,87390,87390,00:00:00,00:00:00,OP,OP,...,Commercial,1,1,4 week Regional nbus,SWIFT_Blank_TVM_Card,Adult,Weekly,No-Fare,0,Bus
4,11_05_20231105633597010785113073,11_05_71412_5101,1,2023-11-06,87365,87365,00:00:00,00:00:00,OP,OP,...,Commercial,1,1,4 week Regional nbus,SWIFT_Blank_TVM_Card,Adult,Weekly,No-Fare,0,Bus


### Add coordinates and sector

In [84]:
lsoa_name_coord = pd.read_csv(f'{raw_data}/11 Sector Correspondance/LSOA21_Coord.csv')
lsoa_sector = pd.read_csv(f'{raw_data}/11 Sector Correspondance/LSOA_Zones_Sectors_Correspondance.csv')
lsoa_mapping = pd.merge(lsoa_name_coord,lsoa_sector,left_on='LSOA21CD',right_on='LSOA21CD', how='left')
jour_agg_mapped = pd.merge(jour_agg,lsoa_mapping,left_on='o_lsoa21cd',right_on='LSOA21CD', how='left')
jour_agg_mapped.rename(columns={'LSOA21NM':'o_lsoa21nm','centroid_lat':'o_lat','centroid_lon':'o_lon','Sector':'o_sector'},inplace=True)
jour_agg_mapped.drop(columns='LSOA21CD', inplace=True) 
jour_agg_mapped = pd.merge(jour_agg_mapped,lsoa_mapping,left_on='d_lsoa21cd',right_on='LSOA21CD', how='left')
jour_agg_mapped.rename(columns={'LSOA21NM':'d_lsoa21nm','centroid_lat':'d_lat','centroid_lon':'d_lon','Sector':'d_sector'},inplace=True)
jour_agg_mapped.drop(columns='LSOA21CD', inplace=True) 

jour_agg_mapped.head()

Unnamed: 0,tripchainid,journeyid,no_trips,date,first_trips,last_trips,start_time,end_time,start_peak,end_peak,...,same_od,mode,o_lsoa21nm,o_lat,o_lon,o_sector,d_lsoa21nm,d_lat,d_lon,d_sector
0,11_05_20231105633597010779469531,11_05_86582_41635,1,2023-11-06,87405,87405,00:00:00,00:00:00,OP,OP,...,0,Bus,Coventry 031E,52.409515,-1.513131,Coventry Central,Birmingham 082F,52.453954,-1.862875,West (England + Wales)
1,11_05_20231105633597010784252336,11_05_86578_49336,1,2023-11-06,87460,87460,03:00:00,03:00:00,OP,OP,...,0,Bus,Birmingham 081F,52.453009,-1.769186,West (England + Wales),Solihull 017C,52.434265,-1.653869,FMA West (+Solilhull)
2,11_05_20231105633597010784534642,11_05_84676_34918,1,2023-11-06,87389,87389,00:00:00,00:00:00,OP,OP,...,0,Bus,Birmingham 050E,52.485012,-1.884991,Birmingham City Centre,Coventry 001F,52.454237,-1.473431,Coventry North East
3,11_05_20231105633597010785010436,11_05_79609_20539,1,2023-11-06,87390,87390,00:00:00,00:00:00,OP,OP,...,0,Bus,Coventry 010D,52.424477,-1.566653,Coventry South West,Coventry 011A,52.436946,-1.510351,Coventry North West
4,11_05_20231105633597010785113073,11_05_71412_5101,1,2023-11-06,87365,87365,00:00:00,00:00:00,OP,OP,...,0,Bus,Coventry 024C,52.412803,-1.503559,Coventry North West,Coventry 010D,52.424477,-1.566653,Coventry South West


In [85]:
# Function to calculate distance
def calculate_distance(row):
    coords_1 = (row['o_lat'], row['o_lon'])
    coords_2 = (row['d_lat'], row['d_lon'])
    return geopy.distance.geodesic(coords_1, coords_2).km  # Distance in kilometers

# Calculate distance for each trip
jour_agg_mapped['distance'] = jour_agg_mapped.apply(calculate_distance, axis=1)

In [40]:
jour_agg_mapped.to_csv(f'{basepath}/03 Output/09 Swift in CSMT/Processed_swift_v5.csv')

## Read Output directly to avoid rework


In [86]:
# jour_agg = pd.read_csv(f'{basepath}/03 Output/09 Swift in CSMT/jour_validation_v3.csv')
jour_agg_mapped['start_time'] = pd.to_datetime(jour_agg_mapped['start_time'], format='%H:%M:%S').dt.time
jour_agg_mapped['end_time'] = pd.to_datetime(jour_agg_mapped['end_time'], format='%H:%M:%S').dt.time
jour_agg_mapped['date'] = pd.to_datetime(jour_agg_mapped['date'], format='%Y-%m-%d').dt.date
jour_agg_mapped.head()

Unnamed: 0,tripchainid,journeyid,no_trips,date,first_trips,last_trips,start_time,end_time,start_peak,end_peak,...,mode,o_lsoa21nm,o_lat,o_lon,o_sector,d_lsoa21nm,d_lat,d_lon,d_sector,distance
0,11_05_20231105633597010779469531,11_05_86582_41635,1,2023-11-06,87405,87405,00:00:00,00:00:00,OP,OP,...,Bus,Coventry 031E,52.409515,-1.513131,Coventry Central,Birmingham 082F,52.453954,-1.862875,West (England + Wales),24.296434
1,11_05_20231105633597010784252336,11_05_86578_49336,1,2023-11-06,87460,87460,03:00:00,03:00:00,OP,OP,...,Bus,Birmingham 081F,52.453009,-1.769186,West (England + Wales),Solihull 017C,52.434265,-1.653869,FMA West (+Solilhull),8.11387
2,11_05_20231105633597010784534642,11_05_84676_34918,1,2023-11-06,87389,87389,00:00:00,00:00:00,OP,OP,...,Bus,Birmingham 050E,52.485012,-1.884991,Birmingham City Centre,Coventry 001F,52.454237,-1.473431,Coventry North East,28.177209
3,11_05_20231105633597010785010436,11_05_79609_20539,1,2023-11-06,87390,87390,00:00:00,00:00:00,OP,OP,...,Bus,Coventry 010D,52.424477,-1.566653,Coventry South West,Coventry 011A,52.436946,-1.510351,Coventry North West,4.073092
4,11_05_20231105633597010785113073,11_05_71412_5101,1,2023-11-06,87365,87365,00:00:00,00:00:00,OP,OP,...,Bus,Coventry 024C,52.412803,-1.503559,Coventry North West,Coventry 010D,52.424477,-1.566653,Coventry South West,4.484923


### Redetermine Peak for analysis Based on following definition
- AM: start_time<9:00:00, end_time>=07:00:00
- PM: start_time<1800:00, end_time>=16:00:00
- IP: start_time<12:00:00, end_time>=10:00:00
- Other: other time period

In [48]:
jour_agg_mapped['start_peak'].value_counts()

start_peak
OP    164015
AM     57096
IP     44316
PM     39010
Name: count, dtype: int64

In [49]:
jour_agg_mapped['end_peak'].value_counts()

end_peak
OP    156903
AM     57385
PM     45992
IP     44157
Name: count, dtype: int64

In [50]:
# def determine_peak(start_time, end_time):
#     if start_time < pd.to_datetime('09:00:00').time() and end_time >= pd.to_datetime('07:00:00').time():
#         return 'AM'
#     elif start_time < pd.to_datetime('18:00:00').time() and end_time >= pd.to_datetime('16:00:00').time():
#         return 'PM'
#     elif start_time < pd.to_datetime('12:00:00').time() and end_time >= pd.to_datetime('10:00:00').time():
#         return 'IP'
#     else:
#         return 'Other'
    
# jour_agg_mapped['jour_peak'] = jour_agg_mapped.apply(lambda row: determine_peak(row['start_time'], row['end_time']), axis=1)

# jour_agg_mapped['jour_peak'].value_counts()

In [62]:
jour_agg_mapped['trip_purpose'].value_counts()

trip_purpose
1    245035
3     37902
2     21500
Name: count, dtype: int64

### Mapping Journey Purpose

In [87]:
purpose = {1:'Others',2:'Work',3:'Education'}
jour_agg_mapped['trip_purpose'] = jour_agg_mapped['trip_purpose'].map(purpose)

jour_agg_mapped.loc[:,'user_type'] = jour_agg_mapped['user'].apply(lambda x: 'Adult' if x in ['Adult', 'Student'] else 'Non-Adult')

# mask = (jour_agg_mapped['trip_purpose'] == 'Education') & (jour_agg_mapped['user_type'] == 'Adult')
# jour_agg_mapped = jour_agg_mapped[~mask]

In [64]:
jour_agg_mapped.head()

Unnamed: 0,tripchainid,journeyid,no_trips,date,first_trips,last_trips,start_time,end_time,start_peak,end_peak,...,o_lsoa21nm,o_lat,o_lon,o_sector,d_lsoa21nm,d_lat,d_lon,d_sector,distance,user_type
0,11_05_20231105633597010779469531,11_05_86582_41635,1,2023-11-06,87405,87405,00:00:00,00:00:00,OP,OP,...,Coventry 031E,52.409515,-1.513131,Coventry Central,Birmingham 082F,52.453954,-1.862875,West (England + Wales),24.296434,Non-Adult
1,11_05_20231105633597010784252336,11_05_86578_49336,1,2023-11-06,87460,87460,03:00:00,03:00:00,OP,OP,...,Birmingham 081F,52.453009,-1.769186,West (England + Wales),Solihull 017C,52.434265,-1.653869,FMA West (+Solilhull),8.11387,Adult
2,11_05_20231105633597010784534642,11_05_84676_34918,1,2023-11-06,87389,87389,00:00:00,00:00:00,OP,OP,...,Birmingham 050E,52.485012,-1.884991,Birmingham City Centre,Coventry 001F,52.454237,-1.473431,Coventry North East,28.177209,Adult
3,11_05_20231105633597010785010436,11_05_79609_20539,1,2023-11-06,87390,87390,00:00:00,00:00:00,OP,OP,...,Coventry 010D,52.424477,-1.566653,Coventry South West,Coventry 011A,52.436946,-1.510351,Coventry North West,4.073092,Adult
4,11_05_20231105633597010785113073,11_05_71412_5101,1,2023-11-06,87365,87365,00:00:00,00:00:00,OP,OP,...,Coventry 024C,52.412803,-1.503559,Coventry North West,Coventry 010D,52.424477,-1.566653,Coventry South West,4.484923,Adult


In [65]:
jour_agg_mapped['trip_purpose'].value_counts()

trip_purpose
Others       245035
Education     37902
Work          21500
Name: count, dtype: int64

### Groupby to create LSOA matrix
- Option 1: Grouped by end_peak
- Option 2: Grouped by start_peak

Option 1

In [88]:
total_dates = jour_agg_mapped['date'].nunique()
print(total_dates)


swift_agg = jour_agg_mapped.groupby(['o_lsoa21cd','d_lsoa21cd','user_type','trip_purpose','fare_nofare','end_peak']).agg(trips = ('journeyid','count')).reset_index()
swift_agg['trips'] = swift_agg['trips']/total_dates
swift_agg.head()

swift_agg['user_fare_pur_peak'] = swift_agg['user_type']+'-'+swift_agg['fare_nofare']+'-'+swift_agg['trip_purpose'] +'-'+ swift_agg['end_peak']
swift_agg['user_fare_peak'] = swift_agg['user_type']+'-'+swift_agg['fare_nofare'] +'-'+ swift_agg['end_peak']
swift_agg = swift_agg[swift_agg['end_peak']!='OP']
swift_agg = swift_agg.reset_index(drop=True)
swift_agg.head()


15


Unnamed: 0,o_lsoa21cd,d_lsoa21cd,user_type,trip_purpose,fare_nofare,end_peak,trips,user_fare_pur_peak,user_fare_peak
0,E01008881,E01009638,Adult,Others,No-Fare,AM,0.133333,Adult-No-Fare-Others-AM,Adult-No-Fare-AM
1,E01008881,E01009638,Non-Adult,Others,No-Fare,IP,0.066667,Non-Adult-No-Fare-Others-IP,Non-Adult-No-Fare-IP
2,E01008881,E01009704,Non-Adult,Education,No-Fare,AM,0.066667,Non-Adult-No-Fare-Education-AM,Non-Adult-No-Fare-AM
3,E01008881,E01031021,Adult,Others,No-Fare,AM,0.4,Adult-No-Fare-Others-AM,Adult-No-Fare-AM
4,E01008883,E01009320,Non-Adult,Others,No-Fare,IP,0.066667,Non-Adult-No-Fare-Others-IP,Non-Adult-No-Fare-IP


In [89]:
swift_agg['user_fare_pur_peak'].value_counts()

user_fare_pur_peak
Non-Adult-No-Fare-Others-IP       6235
Non-Adult-No-Fare-Others-PM       6105
Adult-No-Fare-Others-PM           4179
Non-Adult-No-Fare-Others-AM       4114
Adult-No-Fare-Others-AM           3664
Adult-No-Fare-Others-IP           2901
Non-Adult-No-Fare-Education-AM    2170
Adult-No-Fare-Work-AM             1358
Non-Adult-No-Fare-Education-PM    1145
Adult-No-Fare-Work-PM              841
Adult-No-Fare-Work-IP              429
Non-Adult-No-Fare-Work-AM          177
Non-Adult-No-Fare-Work-PM          124
Non-Adult-No-Fare-Education-IP     111
Adult-Fare-Others-PM                74
Adult-Fare-Others-IP                58
Adult-Fare-Work-AM                  48
Adult-Fare-Others-AM                46
Non-Adult-No-Fare-Work-IP           35
Adult-Fare-Work-PM                  30
Adult-Fare-Work-IP                   5
Name: count, dtype: int64

In [90]:
swift_agg['user_fare_peak'].value_counts()

user_fare_peak
Non-Adult-No-Fare-PM    7374
Non-Adult-No-Fare-AM    6461
Non-Adult-No-Fare-IP    6381
Adult-No-Fare-AM        5022
Adult-No-Fare-PM        5020
Adult-No-Fare-IP        3330
Adult-Fare-PM            104
Adult-Fare-AM             94
Adult-Fare-IP             63
Name: count, dtype: int64

### Read Zone LSOA Mapping


In [91]:
zone_lsoa_pc = gpd.read_file(f'{basepath}/03 Output/09 Swift in CSMT/csmt_lsoa.shp')
zone_lsoa_mapping = zone_lsoa_pc[['zone','model_area','LSOA21CD','overlap']]
zone_lsoa_mapping.head()


Unnamed: 0,zone,model_area,LSOA21CD,overlap
0,101,External,E01005908,0.06
1,101,External,E01005912,0.09434
2,101,External,E01007610,0.013699
3,101,External,E01007835,0.052632
4,101,External,E01007842,0.037037


### Assign Swift to Lsoa CSMT zone overlap

In [92]:
exp_df = []
for idx, od in tqdm(swift_agg.iterrows(), total = len(swift_agg)):
    csmt_o = zone_lsoa_mapping[zone_lsoa_mapping['LSOA21CD']==od['o_lsoa21cd']]
    csmt_d = zone_lsoa_mapping[zone_lsoa_mapping['LSOA21CD']==od['d_lsoa21cd']]

    for id1, org in csmt_o.iterrows():
        for id2, dest in csmt_d.iterrows():
            df = {}
            df['org'] = org['zone']
            df['dest'] = dest['zone']
            overlap = org['overlap']*dest['overlap']
            df['trips'] = overlap*od['trips']
            df['dseg_w_pur'] = od['user_fare_pur_peak']
            df['dseg'] = od['user_fare_peak']
            exp_df.append(df)
swift_csmt = pd.DataFrame(exp_df)

print(swift_csmt['trips'].sum())
#### Check of total before and after distribution
print(swift_agg['trips'].sum())


100%|██████████| 33849/33849 [02:13<00:00, 254.01it/s]


9835.600000000004
9835.600000000002


In [93]:
swift_csmt.head()

Unnamed: 0,org,dest,trips,dseg_w_pur,dseg
0,1029,13801,0.011594,Adult-No-Fare-Others-AM,Adult-No-Fare-AM
1,1029,13802,0.014493,Adult-No-Fare-Others-AM,Adult-No-Fare-AM
2,1029,13803,0.037681,Adult-No-Fare-Others-AM,Adult-No-Fare-AM
3,1029,13804,0.017391,Adult-No-Fare-Others-AM,Adult-No-Fare-AM
4,1029,13805,0.052174,Adult-No-Fare-Others-AM,Adult-No-Fare-AM


In [94]:

### Two hour matrices
swift_csmt.to_csv(f'{basepath}/03 Output/09 Swift in CSMT/swift_csmt_od_24062024.csv')
