In [60]:
import pandas as pd
data = pd.read_csv('145_trips1.csv')

In [61]:
from datetime import timedelta

# Function to convert time string to datetime, adjusting for 24 hour+ times
def convert_time(time_str):
    h, m, s = map(int, time_str.split(':'))
    if h >= 24:
        h = h - 24
    return timedelta(hours=h, minutes=m, seconds=s)

# Apply the function to the 'arrival_time' column
data['arrival_time_new'] = data['arrival_time'].apply(convert_time)

# Check the data types again
data[['trip_id', 'stop_sequence', 'arrival_time_new']].dtypes

trip_id                      object
stop_sequence                 int64
arrival_time_new    timedelta64[ns]
dtype: object

In [62]:
# Group by 'trip_id' and sort within each group by 'stop_sequence'
grouped = data.sort_values('stop_sequence').groupby('trip_id')

# Initialize an empty list to hold the unique trip_ids where the arrival time decreases
decreasing_arrival_times = []

# Iterate over each group
for name, group in grouped:
    # Check if the 'arrival_time' series is monotonically increasing
    if not group['arrival_time_new'].is_monotonic:
        # If not, add the trip_id to the list
        decreasing_arrival_times.append(name)
        print(name)

# Count the unique trip_ids
num_unique_trips = len(decreasing_arrival_times)

num_unique_trips

2508.y1001.60-145-b12-1.331.I
9101.y1003.60-145-b12-1.331.I


  if not group['arrival_time_new'].is_monotonic:


2

In [63]:
def get_day_of_week(trip_id):
    if 'y1001' in trip_id:
        return 'weekday'
    elif 'y1002' in trip_id:
        return 'Saturday'
    elif 'y1003' in trip_id:
        return 'Sunday'
    else:
        return 'unknown'  # or any default value for when none of the keywords are in the trip_id

# Create a new column 'day_of_the_week' using the apply function
data['day_of_the_week'] = data['service_id'].apply(get_day_of_week)

In [65]:
# Count unique 'trip_id' for each 'day_of_the_week'
unique_trip_ids = data.groupby('day_of_the_week')['trip_id'].nunique()

print('Number of unique trip ids for each day of the week:\n', unique_trip_ids)

Number of unique trip ids for each day of the week:
 day_of_the_week
Saturday     47
Sunday       62
weekday     104
Name: trip_id, dtype: int64


In [66]:
data.to_csv("1_weekday_sat_sun.csv",index=False)

In [67]:
# Filter rows where 'day_of_the_week' is 'weekday'
df_weekday = data[data['day_of_the_week'] == 'weekday']

# Reset the index
df_weekday = df_weekday.reset_index(drop=True)

In [69]:
from datetime import timedelta

# Function to convert time string to datetime, adjusting for 24 hour+ times
def convert_time(time_str):
    h, m, s = map(int, time_str.split(':'))
    if h >= 24:
        h = h - 24
    return timedelta(hours=h, minutes=m, seconds=s)

# Apply the function to the 'arrival_time' column
df_weekday['arrival_time_new_1'] = df_weekday['arrival_time'].apply(convert_time)


In [70]:
# Group by 'trip_id' and sort within each group by 'stop_sequence'
grouped_weekday = df_weekday.sort_values('stop_sequence').groupby('trip_id')

# Create a new dataframe to hold the first stop of each trip
first_stops = grouped_weekday.first().reset_index()

# Find the range of 'arrival_time_new'
min_time = first_stops['arrival_time_new_1'].min()
max_time = first_stops['arrival_time_new_1'].max()

# Create five equally spaced time intervals within this range
time_intervals = pd.cut(first_stops['arrival_time_new'], bins=5, labels=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'])

# Assign these intervals (days) to a new 'day' column
first_stops['day'] = time_intervals

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Display the first few rows of the dataframe
first_stops.head()

Unnamed: 0,trip_id,route_id,service_id,shape_id,trip_headsign,direction_id,trip_id.1,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,arrival_time_new,day_of_the_week,arrival_time_new_1,day
0,1606.y1001.60-145-b12-1.330.I,60-145-b12-1,y1001,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,1606.y1001.60-145-b12-1.330.I,09:40:00,09:40:00,8350DB007574,1,Heuston Station,0,0,0.0,0 days 09:40:00,weekday,0 days 09:40:00,Tuesday
1,1608.y1001.60-145-b12-1.330.I,60-145-b12-1,y1001,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,1608.y1001.60-145-b12-1.330.I,13:10:00,13:10:00,8350DB007574,1,Heuston Station,0,0,0.0,0 days 13:10:00,weekday,0 days 13:10:00,Wednesday
2,1610.y1001.60-145-b12-1.330.I,60-145-b12-1,y1001,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,1610.y1001.60-145-b12-1.330.I,16:50:00,16:50:00,8350DB007574,1,Heuston Station,0,0,0.0,0 days 16:50:00,weekday,0 days 16:50:00,Thursday
3,1612.y1001.60-145-b12-1.330.I,60-145-b12-1,y1001,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,1612.y1001.60-145-b12-1.330.I,20:30:00,20:30:00,8350DB007574,1,Heuston Station,0,0,0.0,0 days 20:30:00,weekday,0 days 20:30:00,Friday
4,1614.y1001.60-145-b12-1.331.I,60-145-b12-1,y1001,60-145-b12-1.331.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,1614.y1001.60-145-b12-1.331.I,23:00:00,23:00:00,8350DB007574,1,Aston Quay,0,0,0.0,0 days 23:00:00,weekday,0 days 23:00:00,Friday


In [73]:
# Merge 'day' column from 'first_stops' back to 'weekday_data'
df_weekday = pd.merge(df_weekday, first_stops[['trip_id', 'day']], on='trip_id', how='left')

df_weekday.head()

Unnamed: 0,route_id,service_id,trip_id,shape_id,trip_headsign,direction_id,trip_id.1,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,arrival_time_new,day_of_the_week,arrival_time_new_1,day_x,day_y,day
0,60-145-b12-1,y1001,2203.y1001.60-145-b12-1.330.I,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,2203.y1001.60-145-b12-1.330.I,18:30:00,18:30:00,8350DB007574,1,Heuston Station,0,0,0.0,0 days 18:30:00,weekday,0 days 18:30:00,Thursday,Thursday,Thursday
1,60-145-b12-1,y1001,2203.y1001.60-145-b12-1.330.I,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,2203.y1001.60-145-b12-1.330.I,18:32:32,18:32:32,8350DB004177,2,Heuston Station,0,0,448.42,0 days 18:32:32,weekday,0 days 18:32:32,Thursday,Thursday,Thursday
2,60-145-b12-1,y1001,2203.y1001.60-145-b12-1.330.I,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,2203.y1001.60-145-b12-1.330.I,18:33:09,18:33:09,8350DB004178,3,Heuston Station,0,0,696.52,0 days 18:33:09,weekday,0 days 18:33:09,Thursday,Thursday,Thursday
3,60-145-b12-1,y1001,2203.y1001.60-145-b12-1.330.I,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,2203.y1001.60-145-b12-1.330.I,18:34:24,18:34:24,8350DB004179,4,Heuston Station,0,0,1187.07,0 days 18:34:24,weekday,0 days 18:34:24,Thursday,Thursday,Thursday
4,60-145-b12-1,y1001,2203.y1001.60-145-b12-1.330.I,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,2203.y1001.60-145-b12-1.330.I,18:35:24,18:35:24,8350DB002993,5,Heuston Station,0,0,1607.52,0 days 18:35:24,weekday,0 days 18:35:24,Thursday,Thursday,Thursday


In [74]:
# Drop the 'day_x' column
df_weekday = df_weekday.drop(columns=['day_x','day'])

# Rename 'day_y' to 'day'
df_weekday = df_weekday.rename(columns={'day_y': 'day'})

# Display the first few rows of the dataframe
df_weekday.head()

Unnamed: 0,route_id,service_id,trip_id,shape_id,trip_headsign,direction_id,trip_id.1,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,arrival_time_new,day_of_the_week,arrival_time_new_1,day
0,60-145-b12-1,y1001,2203.y1001.60-145-b12-1.330.I,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,2203.y1001.60-145-b12-1.330.I,18:30:00,18:30:00,8350DB007574,1,Heuston Station,0,0,0.0,0 days 18:30:00,weekday,0 days 18:30:00,Thursday
1,60-145-b12-1,y1001,2203.y1001.60-145-b12-1.330.I,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,2203.y1001.60-145-b12-1.330.I,18:32:32,18:32:32,8350DB004177,2,Heuston Station,0,0,448.42,0 days 18:32:32,weekday,0 days 18:32:32,Thursday
2,60-145-b12-1,y1001,2203.y1001.60-145-b12-1.330.I,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,2203.y1001.60-145-b12-1.330.I,18:33:09,18:33:09,8350DB004178,3,Heuston Station,0,0,696.52,0 days 18:33:09,weekday,0 days 18:33:09,Thursday
3,60-145-b12-1,y1001,2203.y1001.60-145-b12-1.330.I,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,2203.y1001.60-145-b12-1.330.I,18:34:24,18:34:24,8350DB004179,4,Heuston Station,0,0,1187.07,0 days 18:34:24,weekday,0 days 18:34:24,Thursday
4,60-145-b12-1,y1001,2203.y1001.60-145-b12-1.330.I,60-145-b12-1.330.I,Kilmacanogue (Esso Garage) - Outside Heuston T...,1,2203.y1001.60-145-b12-1.330.I,18:35:24,18:35:24,8350DB002993,5,Heuston Station,0,0,1607.52,0 days 18:35:24,weekday,0 days 18:35:24,Thursday


In [75]:
df_weekday.to_csv("2_just_weekdays.csv",index=False)

In [76]:
df_not_weekday = data[data['day_of_the_week'] != 'weekday']
df_not_weekday.to_csv("3_just_weekends.csv")

In [77]:
# Load both CSVs into DataFrames
df1 = pd.read_csv('2_just_weekdays.csv')
df2 = pd.read_csv('3_just_weekends.csv')

# Define a function to apply to each row of df1
def set_extra_col_value(row):
    if row['day_of_the_week'] == 'saturday':
        return 'Saturday'
    
    if row['day_of_the_week'] == 'sunday':
        return 'Sunday'

# Add the 'extra_col' to df1 with a value depending on 'column_name'
df2['day'] = df2.apply(set_extra_col_value, axis=1)

# Append df1 to df2
df3 = df2.append(df1, ignore_index=True)

  df3 = df2.append(df1, ignore_index=True)


In [78]:
df3.to_csv("4_appened_data.csv",index=False)

In [80]:
# Replace 'day_of_the_week' values where there is 'weekday' with the 'day' value
df3.loc[df3['day_of_the_week'] == 'weekday', 'day_of_the_week'] = df3['day']

# Remove specified columns
columns_to_drop = ['drop_off_type', 'pickup_type', 'day', 'arrival_time_new_1', 'trip_id.1', 
                   'direction_id', 'trip_headsign', 'shape_id', 'route_id']
df3 = df3.drop(columns=columns_to_drop)

# Display the first few rows of the dataframe
df3.head()

Unnamed: 0.1,Unnamed: 0,service_id,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,shape_dist_traveled,arrival_time_new,day_of_the_week
0,0.0,y1003,8563.y1003.60-145-b12-1.330.I,17:53:37,17:53:37,8250DB003142,24,Heuston Station,7863.69,0 days 17:53:37,Sunday
1,1.0,y1003,8563.y1003.60-145-b12-1.330.I,17:55:05,17:55:05,8250DB003143,25,Heuston Station,8478.67,0 days 17:55:05,Sunday
2,2.0,y1003,8563.y1003.60-145-b12-1.330.I,17:56:01,17:56:01,8250DB003144,26,Heuston Station,8930.37,0 days 17:56:01,Sunday
3,3.0,y1003,8563.y1003.60-145-b12-1.330.I,17:56:38,17:56:38,8250DB003145,27,Heuston Station,9247.7,0 days 17:56:38,Sunday
4,4.0,y1003,8563.y1003.60-145-b12-1.330.I,17:57:19,17:57:19,8250DB003146,28,Heuston Station,9649.17,0 days 17:57:19,Sunday


In [81]:
df3.to_csv("5_new_cleaned_data_labeled.csv",index=False)

In [82]:
# Count unique 'trip_id' for each 'day_of_the_week'
unique_trip_ids = df3.groupby('day_of_the_week')['trip_id'].nunique()

print('Number of unique trip ids for each day of the week:\n', unique_trip_ids)

Number of unique trip ids for each day of the week:
 day_of_the_week
Friday       14
Monday       26
Saturday     47
Sunday       62
Thursday     21
Tuesday      21
Wednesday    22
Name: trip_id, dtype: int64


In [83]:
filtered_df = df3[df3['stop_headsign'].str.strip() == 'Heuston Station']

In [84]:
filtered_df.to_csv("6_hueston_trips_final.csv",index=False)

In [85]:
# Remove specified columns
columns_to_drop = ['service_id', 'stop_id', 'stop_headsign']
filtered_df = filtered_df.drop(columns=columns_to_drop)

# Display the first few rows of the dataframe
filtered_df.to_csv("7_cleaned_trips.csv",index=False)