# Package Activation

In [None]:
# Enable 3rd party extensions for data importing, cleaning, manipulation, calculation, and aggregation
import pandas as pd
import numpy as np
from datetime import *

from google.colab import drive
from google.colab import files

# Connect this Google Colab notebook to your Google Drive
drive.mount('/content/drive')

# Disable pandas' chained-assignment (SettingWithCopy) check for the whole notebook
pd.set_option('mode.chained_assignment', None)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Selector

In [None]:
# Month_Year Shifter
# Selects which monthly folder of the data repository is used when building file paths,
# e.g. change the value from 03_2023 to 04_2023 to move to the next month.
# Format: MM_YYYY
# Example: 03_2023

# month_year = input("Please Input Month_Year:")

month_year = '05_2023'

In [None]:
# Bi-Weekly Shifter
# Selects which bi-weekly sub-folder (inside the month_year folder) is used when building file paths,
# e.g. change the value from 01_03_2023 to 02_03_2023 to move to the next period.
# Format: Bi-Week_MM_YYYY
# Example: 01_03_2023

# week_number = input("Please Input Bi-Week_Month_Year:")

week_number = '01_05_2023'

# Data Cleaning

# Import reservations data

In [None]:
# Load reservations data stored in repository [Reservations.csv]
df_reservations = pd.read_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Input/" + "/Reservations.csv")
# df_reservations = pd.read_csv('Reservations.csv')

# Remove whitespace around the driver's name; a trailing space is sometimes left when Fleet updates the data
df_reservations['driver_name'] = df_reservations['driver_name'].str.strip()

# Normalise 'attempt_datetime' and 'reservation_date' to ISO-style strings so they sort and
# compare consistently with the other event sources.
# The incoming date format depends on how the data was downloaded from Redash/Metabase;
# forcing the format in the SQL query is recommended.
df_reservations['attempt_datetime'] = (
    pd.to_datetime(df_reservations['attempt_datetime'], format='%d/%m/%y %H:%M')
    .dt.strftime('%Y-%m-%d %H:%M:%S')
)
df_reservations['reservation_date'] = (
    pd.to_datetime(df_reservations['reservation_date'], format='%d/%m/%y')
    .dt.strftime('%Y-%m-%d')
)

# Convert 'pickup_postcode' from a float to a string and remove the trailing '.0'.
# The pattern is anchored to the end of the string so that only the float suffix is removed,
# never a '.0' occurring elsewhere in the value.
df_reservations['pickup_postcode'] = (
    df_reservations['pickup_postcode'].astype(str).str.replace(r'\.0$', '', regex=True)
)

# 'distance_m' uses the literal text 'Null' for missing values; turn those into NaN and make the column numeric
df_reservations['distance_m'] = df_reservations['distance_m'].replace('Null', np.nan).astype(float)

# Flag to indicate if reservation waypoint was updated >=1,000m away from shipper location
# (NaN distances compare False and therefore get 0)
df_reservations['faraway_flag'] = np.where(df_reservations['distance_m'] >= 1000, 1, 0)

# Create new column to represent the event type
df_reservations['Event'] = 'reservation attempt'

# Create new column to distinguish events based on the driver ID and the reservation date
df_reservations['UICol'] = df_reservations['driver_id'].astype(str) + df_reservations['reservation_date'].astype(str)

# Keep only attempted reservations and renumber the rows
attempted = df_reservations['reservation_status'].isin(['Success', 'Fail'])
df_reservations = df_reservations.loc[attempted].reset_index(drop=True)

# Drop columns that are not needed downstream (raises KeyError if the export lacks them)
df_reservations = df_reservations.drop(columns=['reservation_id', 'shipper_name', 'driver_type'])

# Select and order the columns
df_reservations = df_reservations.reindex(columns=['driver_id',
                                                   'driver_name',
                                                   'route_id',
                                                   'pickup_postcode',
                                                   'attempt_datetime',
                                                   'reservation_status',
                                                   'reservation_date',
                                                   'distance_m',
                                                   'faraway_flag',
                                                   'legacy_shipper_id',
                                                   'UICol',
                                                   'Event'])

# Rename headers to the names shared by all event sources
df_reservations = df_reservations.rename(columns={'driver_id': 'Driver ID',
                                                  'driver_name': 'Driver name',
                                                  'route_id': 'Route ID',
                                                  'pickup_postcode': 'Postcode',
                                                  'attempt_datetime': 'Start time',
                                                  'reservation_status': 'Reservation status',
                                                  'reservation_date': 'Route date',
                                                  'distance_m': 'Distance updated',
                                                  'faraway_flag': 'Faraway flag',
                                                  'legacy_shipper_id': 'Legacy Shipper ID'})

# Sort data set based on Driver ID then Start time
df_reservations = df_reservations.sort_values(by=['Driver ID', 'Start time'], ascending=[True, True])

df_reservations.head(5)


Unnamed: 0,Driver ID,Driver name,Route ID,Postcode,Start time,Reservation status,Route date,Distance updated,Faraway flag,Legacy Shipper ID,UICol,Event
22605,1013338,West 3 - Staff - Syaparudin Aris,1935053,129809,2023-05-08 11:23:00,Success,2023-05-08,34.234835,0,199887,10133382023-05-08,reservation attempt
13348,1013338,West 3 - Staff - Syaparudin Aris,1935053,609930,2023-05-08 12:54:00,Success,2023-05-08,40.865165,0,28449,10133382023-05-08,reservation attempt
22208,1013338,West 3 - Staff - Syaparudin Aris,1935053,608599,2023-05-08 13:10:00,Success,2023-05-08,42.79912,0,246154,10133382023-05-08,reservation attempt
19939,1013338,West 3 - Staff - Syaparudin Aris,1935053,608609,2023-05-08 13:17:00,Success,2023-05-08,39.099922,0,35925,10133382023-05-08,reservation attempt
24401,1013338,West 3 - Staff - Syaparudin Aris,1935053,608609,2023-05-08 13:20:00,Success,2023-05-08,71.784901,0,213030,10133382023-05-08,reservation attempt


# Import PUDO drop data

In [None]:
# Read the PUDO drop export [DP drop.csv] for the selected period
dp_drop_csv = "/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Input/" + "/DP drop.csv"
df_drop = pd.read_csv(dp_drop_csv)
# Local alternative: df_drop = pd.read_csv('DP drop.csv')

# Driver IDs must be whole numbers
df_drop['Driver ID'] = df_drop['Driver ID'].astype(int)

# Parse the ISO 'T'-separated timestamp once, then derive both string columns from it
drop_started = pd.to_datetime(df_drop['Start time'], format='%Y-%m-%dT%H:%M:%S')
df_drop['Start time'] = drop_started.dt.strftime('%Y-%m-%d %H:%M:%S')
df_drop['Route date'] = drop_started.dt.strftime('%Y-%m-%d')

# Key (driver + route date), event label and the shared 'Postcode' column name
df_drop['UICol'] = df_drop['Driver ID'].astype(str) + df_drop['Route date'].astype(str)
df_drop['Event'] = 'dp_drop'
df_drop['Postcode'] = df_drop['postcode']

# Align the route column name with the other sources, then select and order the columns
df_drop = (
    df_drop
    .rename(columns={'route_id': 'Route ID'})
    .reindex(columns=['Driver ID',
                      'Driver name',
                      'Route ID',
                      'DP_id',
                      'Route date',
                      'Postcode',
                      'Start time',
                      'UICol',
                      'Event'])
)

df_drop.head()

Unnamed: 0,Driver ID,Driver name,Route ID,DP_id,Route date,Postcode,Start time,UICol,Event
0,1459038,PUDO West - Staff - Shaffiee Affendy,1936007,7514,2023-05-09,670609,2023-05-09 16:39:58,14590382023-05-09,dp_drop
1,1551316,PUDO North - Staff - Syamri Othman,1938910,22124,2023-05-12,786015,2023-05-12 10:24:25,15513162023-05-12,dp_drop
2,1458253,East 4 - Staff - Siti Nurul Fatin Binte Abdul ...,1938927,10121,2023-05-12,519599,2023-05-12 12:49:50,14582532023-05-12,dp_drop
3,1459038,PUDO West - Staff - Shaffiee Affendy,1936007,2421,2023-05-09,671524,2023-05-09 16:59:33,14590382023-05-09,dp_drop
4,1220043,PUDO North - Staff - Zaid Rosli,1935986,4191,2023-05-09,530332,2023-05-09 10:13:30,12200432023-05-09,dp_drop


# Import PUDO inbound data

In [None]:
# Read the PUDO inbound export [PUDO inbound.csv] for the selected period
pudo_inbound_csv = "/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Input/" + "/PUDO inbound.csv"
df_inbound = pd.read_csv(pudo_inbound_csv)
# Local alternative: df_inbound = pd.read_csv('PUDO inbound.csv')

# Missing driver IDs become 0 so the column can be an integer; those rows are removed below
df_inbound['Driver ID'] = df_inbound['Driver ID'].fillna(0).astype(int)

# The inbound timestamp is the event's start time; dates are normalised to 'YYYY-MM-DD' strings
inbounded_at = pd.to_datetime(df_inbound['inbounded_at'], format='%Y-%m-%dT%H:%M:%S')
df_inbound['Start time'] = inbounded_at.dt.strftime('%Y-%m-%d %H:%M:%S')
route_dates = pd.to_datetime(df_inbound['Route date'], format='%Y-%m-%d')
df_inbound['Route date'] = route_dates.dt.strftime('%Y-%m-%d')

# Key (driver + route date) and event label
df_inbound['UICol'] = df_inbound['Driver ID'].astype(str) + df_inbound['Route date'].astype(str)
df_inbound['Event'] = 'pudo_inbound'

# Discard rows without a driver, then select and order the columns
has_driver = df_inbound['Driver ID'] != 0
df_inbound = df_inbound[has_driver].reindex(columns=['Driver ID',
                                                     'Driver name',
                                                     'Driver team',
                                                     'Route date',
                                                     'Start time',
                                                     'UICol',
                                                     'Event', 'Route ID'])

df_inbound.head()

Unnamed: 0,Driver ID,Driver name,Driver team,Route date,Start time,UICol,Event,Route ID
0,1220043,PUDO North - Staff - Zaid Rosli,,2023-05-17,2023-05-17 09:39:28,12200432023-05-17,pudo_inbound,1942604
1,1570796,PUDO North - Staff - Sumathy,,2023-05-13,2023-05-13 11:21:22,15707962023-05-13,pudo_inbound,1939838
2,1352407,East 3 - Jauhar Maknun Bin Razali,,2023-05-15,2023-05-15 10:14:23,13524072023-05-15,pudo_inbound,1940819
3,1473707,East 3 - Staff - Suren Rajakumaran Suren,,2023-05-10,2023-05-10 10:17:34,14737072023-05-10,pudo_inbound,1936887
4,1566829,East 1 - Staff - Fauzi Ismail,,2023-05-17,2023-05-17 10:03:16,15668292023-05-17,pudo_inbound,1942636


# Import Geotab data

In [None]:
# Load Geotab trip data stored in repository [Geotab.csv]
df_geotab = pd.read_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Input/" + "/Geotab.csv")
# df_geotab = pd.read_csv('Geotab.csv')

# Remove stray whitespace around the driver's name
df_geotab['Driver Name'] = df_geotab['Driver Name'].str.strip()

# Normalise trip timestamps to 'YYYY-MM-DD HH:MM:SS' strings.
# If the export uses a 12-hour clock with seconds, use format='%d/%m/%Y %I:%M:%S %p' instead.
df_geotab['Trip Started'] = pd.to_datetime(df_geotab['Trip Started'], format='%d/%m/%Y %H:%M').dt.strftime('%Y-%m-%d %H:%M:%S')
df_geotab['Trip Ended'] = pd.to_datetime(df_geotab['Trip Ended'], format='%d/%m/%Y %H:%M').dt.strftime('%Y-%m-%d %H:%M:%S')

# Route date = calendar date on which the trip started.
# 'Trip Started' was just written in a known format, so it is parsed once with that explicit format.
df_geotab['Route date'] = pd.to_datetime(df_geotab['Trip Started'], format='%Y-%m-%d %H:%M:%S').dt.strftime('%Y-%m-%d')

# Driver IDs may be read as floats ('1013338.0'); strip only the trailing '.0' before casting to int.
# The pattern is anchored to the end so a '.0' elsewhere in the text is never removed.
df_geotab['Driver ID'] = (
    df_geotab['Driver ID'].astype(str).str.replace(r'\.0$', '', regex=True).astype(int)
)

# Key (driver + route date) and event label
df_geotab['UICol'] = df_geotab['Driver ID'].astype(str) + df_geotab['Route date'].astype(str)
df_geotab['Event'] = 'geotab'

# Drop unnecessary columns (raises KeyError if the export lacks them)
df_geotab = df_geotab.drop(columns=['Device ID',
                                    'User ID',
                                    'Max Speed',
                                    'Idling Duration',
                                    'Location',
                                    'Start During Work Hours',
                                    'Device Group',
                                    'Driver Group',
                                    'Depart Date',
                                    'Stop During Work Hours'])

# Select and order the columns
df_geotab = df_geotab.reindex(columns=['Driver ID',
                                       'Driver Name',
                                       'Zone Types',
                                       'Trip Started',
                                       'Trip Ended',
                                       'Driving Duration',
                                       'Distance',
                                       'Stop Duration',
                                       'Route date',
                                       'UICol',
                                       'Event'])

# Rename columns to the names shared by all event sources
df_geotab = df_geotab.rename(columns={'Trip Started': 'Start time',
                                      'Trip Ended': 'End time',
                                      'Driver Name': 'Driver name'})

# Drop rows with a NaN in any of the remaining columns, then renumber the rows
df_geotab = df_geotab.dropna(axis=0).reset_index(drop=True)

# Sort data set based on Driver ID then Start time
df_geotab = df_geotab.sort_values(by=['Driver ID', 'Start time'], ascending=[True, True])

df_geotab.head(5)

Unnamed: 0,Driver ID,Driver name,Zone Types,Start time,End time,Driving Duration,Distance,Stop Duration,Route date,UICol,Event
1211,1013338,West 3 - Staff - Syaparudin Aris,0,2023-05-11 18:21:00,2023-05-11 18:36:00,0:14,25.232443,0:26,2023-05-11,10133382023-05-11,geotab
1212,1013338,West 3 - Staff - Syaparudin Aris,0,2023-05-11 19:03:00,2023-05-11 19:03:00,0:00,0.0,14:12,2023-05-11,10133382023-05-11,geotab
1213,1013338,West 3 - Staff - Syaparudin Aris,0,2023-05-12 09:15:00,2023-05-12 09:16:00,0:00,8.278785,0:32,2023-05-12,10133382023-05-12,geotab
1214,1013338,West 3 - Staff - Syaparudin Aris,0,2023-05-12 09:48:00,2023-05-12 09:48:00,0:00,0.0,0:00,2023-05-12,10133382023-05-12,geotab
1215,1013338,West 3 - Staff - Syaparudin Aris,0,2023-05-12 09:49:00,2023-05-12 10:09:00,0:19,3.477367,0:09,2023-05-12,10133382023-05-12,geotab


# Concat all tables

In [None]:
# Stack all four event sources, order them by driver/date key and start time, and renumber the rows
df_concat = (
    pd.concat([df_reservations, df_geotab, df_drop, df_inbound])
    .sort_values(by=['UICol', 'Start time'], ascending=[True, True])
    .reset_index(drop=True)
)

# Turn the string timestamps back into real datetimes for the time arithmetic that follows
for stamp_column in ('Start time', 'End time'):
    df_concat[stamp_column] = pd.to_datetime(df_concat[stamp_column], format='%Y-%m-%d %H:%M:%S')
df_concat['Route date'] = pd.to_datetime(df_concat['Route date'], format='%Y-%m-%d')

df_concat.head(5)

Unnamed: 0,Driver ID,Driver name,Route ID,Postcode,Start time,Reservation status,Route date,Distance updated,Faraway flag,Legacy Shipper ID,UICol,Event,Zone Types,End time,Driving Duration,Distance,Stop Duration,DP_id,Driver team
0,1000949,Jeroldine DisplayName,1939638.0,,2023-05-12 11:53:28,,2023-05-12,,,,10009492023-05-12,pudo_inbound,,NaT,,,,,
1,1013338,West 3 - Staff - Syaparudin Aris,1935053.0,129809.0,2023-05-08 11:23:00,Success,2023-05-08,34.234835,0.0,199887.0,10133382023-05-08,reservation attempt,,NaT,,,,,
2,1013338,West 3 - Staff - Syaparudin Aris,1935053.0,609930.0,2023-05-08 12:54:00,Success,2023-05-08,40.865165,0.0,28449.0,10133382023-05-08,reservation attempt,,NaT,,,,,
3,1013338,West 3 - Staff - Syaparudin Aris,1935053.0,608599.0,2023-05-08 13:10:00,Success,2023-05-08,42.79912,0.0,246154.0,10133382023-05-08,reservation attempt,,NaT,,,,,
4,1013338,West 3 - Staff - Syaparudin Aris,1935053.0,608609.0,2023-05-08 13:17:00,Success,2023-05-08,39.099922,0.0,35925.0,10133382023-05-08,reservation attempt,,NaT,,,,,


In [None]:
# Inspect the offending rows with:
# df_concat[ df_concat['Stop Duration'].str.len() > 5]['Stop Duration']

# Reset any Stop Duration text longer than 5 characters (i.e. not in 'H:MM' / 'HH:MM' form) to '0:00'.
# Rows that did not come from Geotab hold NaN here (a float, which has no len()), so only
# string values are measured; everything else is passed through unchanged.
stop_duration = [
    '0:00' if isinstance(value, str) and len(value) > 5 else value
    for value in df_concat['Stop Duration']
]

df_concat['Stop Duration'] = stop_duration

In [None]:
# TEMP
# Row label 23141 is hard-coded for one particular data export. errors='ignore' makes the drop a
# no-op when that label is absent (other periods, or re-running this cell), instead of stopping
# the notebook with a KeyError.
# NOTE(review): replace with a rule-based filter (see the commented inspection below) once the
# malformed 'Driving Duration' rows are understood.
df_concat = df_concat.drop(index=23141, errors='ignore')

# df_concat = df_concat.drop(1024)
# df_concat[ df_concat['Driving Duration'].str.len() > 5]

KeyError: ignored

# Data Structuring

In [None]:
# Data standardization: give every column a single, known type so later steps need no type checks.
# Note that casting to str turns missing values into the literal text 'nan'.
text_columns = ['Driver ID', 'Driver name', 'Postcode', 'Reservation status', 'UICol', 'Event',
                'Driving Duration', 'Stop Duration', 'Driver team']
# These stay float (not int) because they contain NaN / missing values
float_columns = ['Route ID', 'Legacy Shipper ID', 'Distance', 'DP_id']

for column in text_columns:
    df_concat[column] = df_concat[column].astype(str)

for column in float_columns:
    df_concat[column] = df_concat[column].astype(float)

In [None]:
# Save the combined dataset into the repository's Output folder so the team can check the raw combined data
df_concat_csv = "/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Output/" + "/df_concat.csv"
df_concat.to_csv(df_concat_csv, index=False)

# Import StaffAny Data

In [None]:
# Import StaffAny data from the repository - Temporary solution before we streamline the process
staff_data_csv = "/content/drive/My Drive/FM Productivity - test/Data Repository/StaffAny/staff_data.csv"
staffany_week_1_csv = "/content/drive/My Drive/FM Productivity - test/Data Repository/StaffAny/StaffAny_08_05-14_05_2023.csv"
staffany_week_2_csv = "/content/drive/My Drive/FM Productivity - test/Data Repository/StaffAny/StaffAny_15_05-21_05_2023.csv"
# Previous period, for reference: StaffAny_01_04-09_04_2023.csv and StaffAny_10_04-16_04_2023.csv
# Local alternative: read 'staff_data.csv' and the two weekly files from the working directory

# Staff master data
df_staffany = pd.read_csv(staff_data_csv)

# Weekly timesheets; the first two lines of each export are skipped
df_staffany_hours_1 = pd.read_csv(staffany_week_1_csv, skiprows=2)
df_staffany_hours_2 = pd.read_csv(staffany_week_2_csv, skiprows=2)

# Structure & Clean StaffAny Data

In [None]:
# Trim whitespace around 'Employee ID' in all three StaffAny tables so the later merge keys match
for staffany_frame in (df_staffany, df_staffany_hours_1, df_staffany_hours_2):
    staffany_frame['Employee ID'] = staffany_frame['Employee ID'].str.strip()

In [None]:
# Keep only rows that have a Name, and only the timesheet columns used downstream
timesheet_columns = ['Name', 'Section', 'Employee ID', 'Date', 'Scheduled In', 'Scheduled Out',
                     'Time In', 'Time Out', 'Scheduled Hours']
df_staffany_hours_1_2 = df_staffany_hours_1.dropna(subset=['Name'])[timesheet_columns]
df_staffany_hours_2_2 = df_staffany_hours_2.dropna(subset=['Name'])[timesheet_columns]

# A lone '-' marks an empty time cell; store it as NaN instead.
# ('Time Out' is deliberately left untouched here; its '-' values are handled when the clock-out is rebuilt.)
for weekly_hours in (df_staffany_hours_1_2, df_staffany_hours_2_2):
    for time_column in ('Scheduled In', 'Scheduled Out', 'Time In'):
        weekly_hours[time_column] = weekly_hours[time_column].replace('-', np.nan)

In [None]:
# Parse the StaffAny text columns into datetimes for both weekly tables.
# 'Date' looks like '08 May 2023, Mon'; the clock columns look like '09:53 AM' and, parsed without
# a date, carry pandas' default date until they are combined with 'Date' later on.
for weekly_hours in (df_staffany_hours_1_2, df_staffany_hours_2_2):
    weekly_hours['Date'] = pd.to_datetime(weekly_hours['Date'], format='%d %b %Y, %a')
    for clock_column in ('Time In', 'Scheduled In', 'Scheduled Out'):
        weekly_hours[clock_column] = pd.to_datetime(weekly_hours[clock_column], format='%I:%M %p')

In [None]:
# Stack the two weekly timesheets into one table (original row labels are kept, so labels may repeat)
df_staffany_hours = pd.concat([df_staffany_hours_1_2,df_staffany_hours_2_2])

In [None]:
# Attach the staff master data to every timesheet row via 'Employee ID'.
# how='inner' keeps only rows whose Employee ID is present in both tables.
staff_any = df_staffany_hours.merge(df_staffany,how='inner',on='Employee ID')
# Store the ID as text; it is later compared against df_concat's string 'Driver ID'
staff_any['Driver Strength ID'] = staff_any['Driver Strength ID'].astype(str)

In [None]:
# Some rows of data are missing Time Out.
# Keep only shifts that have a scheduled start, a clock-in and a clock-out value;
# .copy() gives an independent frame that can be modified safely.
staff_any2 = staff_any.dropna(subset=['Scheduled In','Time In','Time Out']).copy()

In [None]:
# Need to account for OT past 12am: a clock-out ending in '(+1)' belongs to the following day

new_time_out = []
for shift_date, raw_time_out in zip(staff_any2['Date'], staff_any2['Time Out']):
    ends_next_day = raw_time_out.endswith('(+1)')
    if not ends_next_day and raw_time_out == '-':
        # '-' means no clock-out was recorded
        new_time_out.append(np.nan)
        continue

    # Remove the 4-character '(+1)' marker (if any) before parsing the clock text
    clock_text = raw_time_out[:-4] if ends_next_day else raw_time_out
    clock_out = pd.to_datetime(clock_text.strip(), format='%I:%M %p').time()
    clock_out_date = shift_date + timedelta(days=1) if ends_next_day else shift_date
    new_time_out.append(datetime.combine(clock_out_date, clock_out))

# Replace the placeholder date of each parsed clock time with the shift's own date
for clock_column in ('Scheduled In', 'Scheduled Out', 'Time In'):
    staff_any2[clock_column] = staff_any2.apply(
        lambda row, column=clock_column: datetime.combine(row['Date'], row[column].time()),
        axis=1,
    )
staff_any2['Time Out'] = new_time_out

In [None]:
# FIX when joining to Postcodes Enriched by Postcode/postcode
# Remove only a trailing '.0' (left over from casting a float ID to text).
# str.strip('.0') must not be used here: it treats '.0' as a set of characters and removes every
# leading/trailing '0' and '.', so an ID such as '136790.0' would be corrupted to '13679'.
staff_any2['Driver Strength ID'] = staff_any2['Driver Strength ID'].str.replace(r'\.0$', '', regex=True)

In [None]:
# Preview the cleaned StaffAny shifts
staff_any2.head()

Unnamed: 0,Name,Section,Employee ID,Date,Scheduled In,Scheduled Out,Time In,Time Out,Scheduled Hours,Full Name,Driver Strength ID,Vocation
0,Kasmat Bin Hashim,PUDO (Central/ West),P-056,2023-05-08,2023-05-08 10:00:00,2023-05-08 20:00:00,2023-05-08 09:53:00,2023-05-08 20:13:00,8.0,Kasmat Bin Hashim,136798,Class 3 Driver I
1,Kasmat Bin Hashim,PUDO (Central/ West),P-056,2023-05-09,2023-05-09 10:00:00,2023-05-09 20:00:00,2023-05-09 09:51:00,2023-05-09 20:00:00,8.0,Kasmat Bin Hashim,136798,Class 3 Driver I
2,Kasmat Bin Hashim,PUDO (Central/ West),P-056,2023-05-10,2023-05-10 10:00:00,2023-05-10 20:00:00,2023-05-10 09:51:00,2023-05-10 20:00:00,8.0,Kasmat Bin Hashim,136798,Class 3 Driver I
3,Kasmat Bin Hashim,PUDO (Central/ West),P-056,2023-05-11,2023-05-11 10:00:00,2023-05-11 20:00:00,2023-05-11 09:48:00,2023-05-11 20:00:00,8.0,Kasmat Bin Hashim,136798,Class 3 Driver I
4,Kasmat Bin Hashim,PUDO (Central/ West),P-056,2023-05-12,2023-05-12 10:00:00,2023-05-12 20:00:00,2023-05-12 09:46:00,2023-05-12 20:00:00,8.0,Kasmat Bin Hashim,136798,Class 3 Driver I


In [None]:
# Tag every event with the StaffAny shift it falls inside (if any).
# For each driver, an event is matched to the first shift for which
#   scheduled_in < event start time < time_out
# Matched events get in_out_checker = 1 plus the shift details; unmatched events get 0 and NaN.
new_df_concat = pd.DataFrame({})
overnight_routes = []

# Columns added to each event row, in the order their values are produced below
shift_columns = ['in_out_checker', 'new_route_id', 'scheduled_in', 'scheduled_out',
                 'time_in', 'time_out', 'scheduled_hours']
# Values recorded for an event that falls outside every shift (0 = remove)
unmatched_values = (0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)

# Per-driver results are collected here and concatenated once at the end; concatenating
# inside the loop would copy the growing frame on every iteration.
tagged_frames = []
for i in df_concat['Driver ID'].unique():
    driver_id = i.strip()
    temp = df_concat[df_concat['Driver ID']==driver_id].reset_index(drop=True)
    temp_sa = staff_any2[staff_any2['Driver Strength ID']==driver_id].reset_index(drop=True)

    # Read this driver's shifts once instead of looking each value up again for every event
    shifts = list(zip(temp_sa['Scheduled In'], temp_sa['Scheduled Out'], temp_sa['Time In'],
                      temp_sa['Time Out'], temp_sa['Scheduled Hours']))

    tagged = {column: [] for column in shift_columns}
    for j in temp['Start time']:
        for scheduled_in, scheduled_out, time_in, time_out, scheduled_hours in shifts:
            if scheduled_in < j < time_out:
                # Route key = driver ID + date of the scheduled start; 1 = keep
                route_id = str(i) + '_' + scheduled_in.date().strftime('%Y-%m-%d')
                values = (1, route_id, scheduled_in, scheduled_out, time_in, time_out, scheduled_hours)
                break
        else:
            # No shift contained this event
            values = unmatched_values
        for column, value in zip(shift_columns, values):
            tagged[column].append(value)

    for column in shift_columns:
        temp[column] = tagged[column]

    tagged_frames.append(temp)

# pd.concat rejects an empty list, so keep the empty frame when there were no drivers
if tagged_frames:
    new_df_concat = pd.concat(tagged_frames)

In [None]:
# Show the Stop Duration values that are present (missing values were stored as the text 'nan')
has_stop_duration = new_df_concat['Stop Duration'] != 'nan'
new_df_concat.loc[has_stop_duration, 'Stop Duration']

70      0:26
71     14:12
72      0:32
73      0:00
74      0:09
       ...  
205     0:13
206     0:01
207    00:00
392     2:15
393     0:03
Name: Stop Duration, Length: 22702, dtype: object

In [None]:
# new_df_concat[new_df_concat['time_out'].dt.time == time(0, 0,0)]['time_out'].dt.time
# Note: when a driver clocks out at exactly 12 midnight, the time_out column displays only the date; the 00:00:00 time component is still stored

# Exclude events outside of StaffAny times

In [None]:
# Keep only events flagged in_out_checker == 1 (inside a StaffAny shift window) and renumber the rows
df_concat2 = new_df_concat[new_df_concat['in_out_checker']==1].reset_index(drop=True)

In [None]:
# Save the cut (shift-filtered) combined dataset into the repository's Output folder so the team can check it
df_concat2_csv = "/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Output/" + "/df_concat2.csv"
df_concat2.to_csv(df_concat2_csv, index=False)

# Convert Driving Duration 

In [None]:
# Helper that expresses an Hour:Minute duration as a total number of minutes
def fxy (hour,minute):
    """Return the total minutes in `hour` hours plus `minute` minutes."""
    minutes_from_hours = hour * 60
    total_minutes = minutes_from_hours + minute
    return total_minutes

def _add_duration_minutes(frame):
    """Add minute-based duration columns to `frame` in place.

    - 'Driving Duration (modified)': 'Driving Duration' parsed as an 'H:MM' clock value.
    - 'Driving Duration (min)': the same duration as total minutes.
    - 'Stop Duration': replaced by its value in total minutes.

    Text that does not match 'H:MM' (including the 'nan' placeholder) becomes NaT / NaN instead of
    aborting the whole cell with a ValueError.
    """
    driving = pd.to_datetime(frame['Driving Duration'], format='%H:%M', errors='coerce')
    frame['Driving Duration (modified)'] = driving
    frame['Driving Duration (min)'] = fxy(driving.dt.hour, driving.dt.minute)

    # 'Stop Duration' is overwritten with numbers, so skip it when the cell is run a second time;
    # re-parsing the numbers as 'H:MM' text would wipe the column.
    if not pd.api.types.is_numeric_dtype(frame['Stop Duration']):
        stopped = pd.to_datetime(frame['Stop Duration'], format='%H:%M', errors='coerce')
        frame['Stop Duration'] = fxy(stopped.dt.hour, stopped.dt.minute)


# Convert the durations of the full dataset ...
_add_duration_minutes(new_df_concat)

# ... and of the dataset restricted to events inside StaffAny shift windows
_add_duration_minutes(df_concat2)

ValueError: ignored

# Check for POH events before conversion


In [None]:
# Count the rows per event type before any POH rows are inserted
df_concat2['Event'].value_counts()

reservation attempt    29531
geotab                 14885
dp_drop                 3201
pudo_inbound             251
Name: Event, dtype: int64

In [None]:
# Inspect the column types of the shift-filtered dataset
df_concat2.dtypes

Driver ID                              object
Driver name                            object
Route ID                              float64
Postcode                               object
Start time                     datetime64[ns]
Reservation status                     object
Route date                     datetime64[ns]
Distance updated                      float64
Faraway flag                          float64
Legacy Shipper ID                     float64
UICol                                  object
Event                                  object
Zone Types                             object
End time                       datetime64[ns]
Driving Duration                       object
Distance                              float64
Stop Duration                          object
DP_id                                 float64
Driver team                            object
in_out_checker                          int64
new_route_id                           object
scheduled_in                   dat

# Geotab zone type to POH

In [None]:
# Build df_concat3: a copy of df_concat2 in which a synthetic 'POH' event row is inserted
# directly after every qualifying Hub/Station row, processed one route (new_route_id) at a time.
df_concat3 = pd.DataFrame({})
for i in df_concat2['new_route_id'].unique(): #[30:31]:
    # All events of this route. reset_index() is called without drop=True, so the previous
    # row labels are kept as an extra 'index' column.
    temp = df_concat2[df_concat2['new_route_id']==i].reset_index()
    
    df_temp = pd.DataFrame({})
    indexes = []
    
    # Collect the positions of rows whose zone type is 'Hub' or 'Station' and which start at
    # least one hour after the route's earliest start time.
    for j in range(0,len(temp['Event'])):
        temp2 = temp.iloc[j]
        if temp2['Start time'] >= temp['Start time'].min() + timedelta(hours=1):
            if temp2['Zone Types'] == 'Hub' or temp2['Zone Types'] == 'Station':
                indexes.append(j)
    # Copy the route in slices, each ending at a collected row, and add a POH row after each slice
    start = 0
    for x in indexes:
        temp2 = temp.iloc[start:x+1]
        start = x+1
        to_dup = temp.iloc[x]

        # One-row frame for the POH event, copying identity and shift fields from the Hub/Station row.
        # NOTE(review): the key 'Stop duration' (lower-case 'd') creates a new column, separate from
        # the existing 'Stop Duration', and it is filled with the Route date - confirm this is intended.
        # NOTE(review): no 'Start time' / 'End time' is set, so they are NaT for POH rows - confirm.
        temp3 = pd.DataFrame({ 'Driver ID':[to_dup['Driver ID']], 'Driver name':[to_dup['Driver name']], 'Route ID':[to_dup['Route ID']],
                              
                               'Stop duration':[to_dup['Route date']],'Route date':[to_dup['Route date']], 'Event':['POH'],
                              
                               'Driver team':[to_dup['Driver team']], 'in_out_checker':[to_dup['in_out_checker']], 'new_route_id':[to_dup['new_route_id']],
                              
                               'scheduled_in':[to_dup['scheduled_in']], 'scheduled_out':[to_dup['scheduled_out']], 'time_in':[to_dup['time_in']],
                              
                               'time_out':[to_dup['time_out']], 'scheduled_hours':[to_dup['scheduled_hours']]})
        
        df_temp = pd.concat([df_temp,temp2,temp3])
    else:
        # The loop contains no break, so this branch always runs: append the rows after the last
        # collected position (the whole route when nothing was collected).
        df_temp = pd.concat([df_temp,temp.iloc[start:]])
    
    df_temp = df_temp.reset_index(drop=True)
    df_concat3 = pd.concat([df_concat3,df_temp])

In [None]:
# Renumber the rows 0..n-1 (the per-route pieces each started again from 0)
df_concat3 = df_concat3.reset_index(drop=True)

In [None]:
# Count the rows per event type after the POH rows were inserted
df_concat3['Event'].value_counts()

reservation attempt    29531
geotab                 14885
dp_drop                 3201
POH                     1418
pudo_inbound             251
Name: Event, dtype: int64

In [None]:
# Order the full dataset by route and then chronologically, and renumber the rows
new_df_concat = (
    new_df_concat
    .sort_values(by=['new_route_id', 'Start time'], ascending=[True, True])
    .reset_index(drop=True)
)

In [None]:
# Order the POH-enriched dataset by route and then chronologically, and renumber the rows.
# NOTE(review): rows with a missing 'Start time' sort last within their route (pandas' default
# na_position); the POH rows are created without a 'Start time' - confirm this ordering is intended.
df_concat3 = (
    df_concat3
    .sort_values(by=['new_route_id', 'Start time'], ascending=[True, True])
    .reset_index(drop=True)
)

# Create columns to save event timings

In [None]:
# Create 'empty' (NaN) columns, one per productive-time measure plus the Gantt start/finish markers
timing_columns = ['service_time', 'dp_service_time', 'poh_time',
                  'break_time', 'break_drive_time',
                  'route_collection_time',
                  'gantt_start', 'gantt_finish']

for timing_column in timing_columns:
    new_df_concat[timing_column] = np.nan

In [None]:
# Create the same 'empty' (NaN) timing columns for the 'cleaned' dataset
cleaned_timing_columns = ['service_time', 'dp_service_time', 'poh_time',
                          'break_time', 'break_drive_time',
                          'route_collection_time',
                          'gantt_start', 'gantt_finish']

for timing_column in cleaned_timing_columns:
    df_concat3[timing_column] = np.nan

# Event time calculations

In [None]:
# Create functions to calculate each productive event time 

def _record_stop_durations(fm_data, stop_event, time_column):
    """Time every `stop_event` row that directly follows a driving (geotab) row.

    For each geotab row immediately followed by a `stop_event` row, the stop is taken to last
    from the 'End time' of that geotab row until the 'Start time' of the next geotab row
    further down the data. The duration in minutes is written to `time_column` on the stop
    row, and the two boundaries to 'gantt_start' / 'gantt_finish' on the same row.
    Stops with no later geotab row are left untouched.

    Parameters
    ----------
    fm_data : DataFrame holding the events of one route, in chronological order. Must contain
        'Event', 'Start time', 'End time', `time_column`, 'gantt_start' and 'gantt_finish'.
    stop_event : value of 'Event' that identifies the stop to time.
    time_column : column that receives the duration in minutes.

    Returns
    -------
    The same DataFrame object, modified in place.
    """
    row_count = len(fm_data)
    events = fm_data['Event'].tolist()

    # next_drive[p] = position of the first geotab row strictly after p (None if there is none)
    next_drive = [None] * row_count
    upcoming = None
    for pos in range(row_count - 1, -1, -1):
        next_drive[pos] = upcoming
        if events[pos] == 'geotab':
            upcoming = pos

    # Collect the values first; only 'Event' and the two time columns are read, so deferring
    # the writes cannot change the result
    pending = []
    for pos in range(row_count - 1):
        if events[pos] != 'geotab' or events[pos + 1] != stop_event:
            continue
        resume = next_drive[pos + 1]
        # If there is no next geotab event, just skip this part of the data
        if resume is None:
            continue
        left_at = fm_data['End time'].iloc[pos]
        back_at = fm_data['Start time'].iloc[resume]
        minutes = (back_at - left_at).total_seconds() / 60
        pending.append((pos + 1, minutes, left_at, back_at))

    if not pending:
        return fm_data

    # The gantt columns are created as float NaN but receive timestamps: make them object
    # columns explicitly instead of relying on an implicit upcast during the write
    for gantt_column in ('gantt_start', 'gantt_finish'):
        if fm_data[gantt_column].dtype.kind in 'biuf':
            fm_data[gantt_column] = fm_data[gantt_column].astype(object)

    time_col = fm_data.columns.get_loc(time_column)
    start_col = fm_data.columns.get_loc('gantt_start')
    finish_col = fm_data.columns.get_loc('gantt_finish')

    # Single-step positional writes: unlike fm_data[col].iloc[i] = v (chained assignment) these
    # still reach fm_data when pandas copy-on-write is active
    for row, minutes, left_at, back_at in pending:
        fm_data.iloc[row, time_col] = minutes
        fm_data.iloc[row, start_col] = left_at
        fm_data.iloc[row, finish_col] = back_at

    return fm_data


def service_time(fm_data):
    """Fill 'service_time' (minutes) for reservation attempts that directly follow a geotab event.

    The time runs from the end of the preceding geotab event to the start of the next geotab
    event, so it covers every non-geotab event in between. fm_data is modified in place and
    returned.

    Open question carried over from the original code ([QUESTION]): should a POH event after
    the reservation attempt stop the timing? Currently it does not.
    """
    return _record_stop_durations(fm_data, 'reservation attempt', 'service_time')


def dp_service_time(fm_data):
    """Fill 'dp_service_time' (minutes) for dp_drop events that directly follow a geotab event.

    Same rule as service_time: end of the preceding geotab event to the start of the next
    geotab event. fm_data is modified in place and returned.
    """
    return _record_stop_durations(fm_data, 'dp_drop', 'dp_service_time')


# To be reviewed & maybe changed
def poh_time(fm_data):
    """Fill 'poh_time' (minutes) for POH events that directly follow a geotab event.

    Same rule as service_time: end of the preceding geotab event to the start of the next
    geotab event. fm_data is modified in place and returned.
    """
    return _record_stop_durations(fm_data, 'POH', 'poh_time')

# Issues
# Overlap between break time and driving time - quite hard to cut but will try
# Overlap between Route Collection/PUDO Inbound with DP Drop

def break_time(fm_data):
    """Fill 'break_time' / 'break_drive_time' for stretches of consecutive geotab events.

    A break is recorded when a geotab row starts a run of at least three consecutive geotab
    rows that is followed by some other event. With x the first row of the run and `last`
    its final geotab row:

    * break_time       = 'Start time' of row `last` minus 'End time' of row x, in minutes
    * break_drive_time = sum of 'Driving Duration (min)' over the geotab rows strictly
                         between x and `last`

    Both values, plus 'gantt_start' / 'gantt_finish', are written to row x+1 and the rest of
    the run is skipped. Runs of only two geotab rows and runs that reach the end of the data
    are not counted. A run of two or more geotab rows that follows a pudo_inbound event is
    skipped without recording a break, to avoid overlapping with the route collection time.

    fm_data (events of one route in chronological order) is modified in place and returned.
    """
    row_count = len(fm_data)
    events = fm_data['Event'].tolist()

    # (row position, break minutes, driving minutes, gantt start, gantt finish)
    pending = []

    # Number of upcoming rows that belong to a run that has already been handled
    skipper = 0
    for x in range(0, row_count - 1):
        if skipper != 0:
            skipper -= 1
            continue

        geotab_follows = events[x + 1] == 'geotab'
        starts_break = geotab_follows and events[x] == 'geotab'
        # Added to account for overlap between the break time and the route inbound calculation
        after_inbound = geotab_follows and events[x] == 'pudo_inbound'
        if not (starts_break or after_inbound):
            continue

        # Walk to the last geotab row of the consecutive run that begins at x+1
        last = x + 1
        while last + 1 < row_count and events[last + 1] == 'geotab':
            last += 1

        # The run reaches the end of the data: no following productive event, nothing to record
        if last + 1 == row_count:
            continue
        # Only one geotab row after x: too short to count, and nothing to skip
        if last - x <= 1:
            continue

        # Skip the remainder of the run (rows x+1 .. last) in the following iterations
        skipper = last - x
        if after_inbound:
            continue

        break_started = fm_data['End time'].iloc[x]
        break_ended = fm_data['Start time'].iloc[last]
        minutes = (break_ended - break_started).total_seconds() / 60.0
        driving_minutes = fm_data['Driving Duration (min)'].iloc[x+1:last].sum()
        pending.append((x + 1, minutes, driving_minutes, break_started, break_ended))

    if not pending:
        return fm_data

    # The gantt columns are created as float NaN but receive timestamps: make them object
    # columns explicitly instead of relying on an implicit upcast during the write
    for gantt_column in ('gantt_start', 'gantt_finish'):
        if fm_data[gantt_column].dtype.kind in 'biuf':
            fm_data[gantt_column] = fm_data[gantt_column].astype(object)

    break_col = fm_data.columns.get_loc('break_time')
    drive_col = fm_data.columns.get_loc('break_drive_time')
    start_col = fm_data.columns.get_loc('gantt_start')
    finish_col = fm_data.columns.get_loc('gantt_finish')

    # Single-step positional writes: unlike fm_data[col].iloc[i] = v (chained assignment) these
    # still reach fm_data when pandas copy-on-write is active
    for row, minutes, driving_minutes, break_started, break_ended in pending:
        fm_data.iloc[row, break_col] = minutes
        fm_data.iloc[row, drive_col] = driving_minutes
        fm_data.iloc[row, start_col] = break_started
        fm_data.iloc[row, finish_col] = break_ended

    return fm_data
# What if there is no geotab event after pudo_inbound but after a productive event
def route_collection_time(fm_data):
    """Fill 'route_collection_time' (minutes) on the earliest pudo_inbound event of a route.

    The time runs from the 'Start time' of the earliest pudo_inbound event to the 'Start time'
    of the last geotab event located between that inbound and the first productive event
    (reservation attempt or dp_drop, whichever comes first) after it. 'gantt_start' and
    'gantt_finish' on the inbound row receive the same two boundaries.

    Nothing is calculated (the data is returned as-is) when the route has no pudo_inbound
    event, when a reservation attempt or a dp_drop is missing after the inbound, or when there
    is no geotab event in between.
    NOTE(review): requiring BOTH a reservation attempt and a dp_drop after the inbound is kept
    from the original code - confirm that routes with only one of them should be skipped.

    Parameters
    ----------
    fm_data : DataFrame with the events of one route ('Event', 'Start time',
        'route_collection_time', 'gantt_start', 'gantt_finish').

    Returns
    -------
    The input frame itself when the route has no pudo_inbound event; otherwise a copy with
    the index reset to 0..n-1 (updated when a time could be calculated).
    """
    # If there are no pudo inbound events in the input data, just return the data
    if not (fm_data['Event'] == 'pudo_inbound').any():
        return fm_data

    # From here on row labels equal row positions
    fm_data = fm_data.reset_index(drop=True)

    # Earliest pudo inbound event: its time and its position
    inbound_starts = fm_data.loc[fm_data['Event'] == 'pudo_inbound', 'Start time']
    first_van_ibs = inbound_starts.min()
    van_ibs_index = inbound_starts.idxmin()

    # Everything from the earliest pudo inbound event onwards
    after_inbound = fm_data.iloc[van_ibs_index:]
    rsvn_starts = after_inbound.loc[after_inbound['Event'] == 'reservation attempt', 'Start time']
    drop_starts = after_inbound.loc[after_inbound['Event'] == 'dp_drop', 'Start time']
    if len(rsvn_starts) == 0 or len(drop_starts) == 0:
        return fm_data

    # Position of the earliest of the two productive events after the inbound
    min_index = min(rsvn_starts.idxmin(), drop_starts.idxmin())

    # Rows 'sandwiched' between the inbound and that productive event.
    # min_index is a position in the full frame, so the slice is taken on the full frame;
    # applying it to the already-sliced data would run past the productive event.
    sandwiched = fm_data.iloc[van_ibs_index:min_index]

    # Last geotab event inside the sandwich; without one there is nothing to measure
    geotab_starts = sandwiched.loc[sandwiched['Event'] == 'geotab', 'Start time']
    if len(geotab_starts) == 0:
        return fm_data
    geo_max = geotab_starts.max()

    # Time between the first pudo inbound event and the last geotab event in the sandwich
    time = (geo_max - first_van_ibs).total_seconds() / 60.0

    # The gantt columns are created as float NaN but receive timestamps: make them object
    # columns explicitly instead of relying on an implicit upcast during the write
    for gantt_column in ('gantt_start', 'gantt_finish'):
        if fm_data[gantt_column].dtype.kind in 'biuf':
            fm_data[gantt_column] = fm_data[gantt_column].astype(object)

    inbound_row = fm_data.index[van_ibs_index]
    fm_data.loc[inbound_row, 'route_collection_time'] = time
    fm_data.loc[inbound_row, 'gantt_start'] = first_van_ibs
    fm_data.loc[inbound_row, 'gantt_finish'] = geo_max

    return fm_data

In [None]:
# Will no longer give different results from below because of new_route_id method
# # Rename dataframe to run smoother with existing code
# fm = new_df_concat.copy()

# # Create an empty dataframe with columns from the renamed dataframe
# fm_result = fm.iloc[:0]

# # Go through every unique route date in the data
# for route in fm['new_route_id'].unique():
#     fm_temp = fm[fm['new_route_id']==route]
#     # Run all previously created time calculation functions
#     fm_temp2 = service_time(fm_temp)
#     fm_temp3 = dp_service_time(fm_temp2)
#     fm_temp4 = poh_time(fm_temp3)
        
#     fm_temp5 = break_time(fm_temp4)
        
#     fm_temp6 = route_collection_time(fm_temp5)
        
#     # Store all newly calculated data in the empty dataframe created
#     fm_result = pd.concat([fm_result, fm_temp6], ignore_index=True, axis=0)

In [None]:
# fm_result.to_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Output/" + "/fm_result.csv", index=False)

In [None]:
# Repeat the above for the cleaned dataset

fm2 = df_concat3.copy()

# Run every event-time calculation route by route. The per-route results are collected in a
# list and concatenated once at the end (concatenating inside the loop re-copies the growing
# result on every iteration). The list starts with an empty frame that has the same columns,
# so the result is still a valid frame when there are no routes.
route_results = [fm2.iloc[:0]]

for route in fm2['new_route_id'].unique():
    # Explicit copy: the functions below write into the frame they are given
    fm_temp = fm2[fm2['new_route_id']==route].copy()

    # Each step receives the output of the previous one
    for timing_step in (service_time, dp_service_time, poh_time, break_time, route_collection_time):
        fm_temp = timing_step(fm_temp)

    route_results.append(fm_temp)

# Store all newly calculated data in one dataframe
fm_result2 = pd.concat(route_results, ignore_index=True, axis=0)

# Break to Time Out

In [None]:
# Add a closing 'Break' event to every route whose clock-out ('time_out') is later than the
# last recorded event of the route. Per-route frames are collected and concatenated once.
route_frames = []

for i in fm_result2['new_route_id'].unique():
    # reset_index() (without drop=True) is kept on purpose: it adds the helper column(s)
    # that a later cell removes with drop(['level_0','index'])
    temp = fm_result2[fm_result2['new_route_id']==i].reset_index()
    if len(temp) == 0:
        continue

    first_event = temp.iloc[0]
    last_event = temp.iloc[-1]
    time_out = first_event['time_out']

    # The break starts when the last event is over: at its 'End time' for geotab / POH
    # events, otherwise at its 'Start time'.
    # (The previous test `Event == 'geotab' and Event == 'POH'` could never be true, so the
    # 'End time' was never used.)
    # NOTE(review): falls back to 'Start time' when the 'End time' is missing - confirm that
    # POH events carry an 'End time'.
    break_start = last_event['Start time']
    if last_event['Event'] in ('geotab', 'POH') and pd.notna(last_event['End time']):
        break_start = last_event['End time']

    pieces = [temp]
    if pd.notna(time_out) and pd.notna(break_start) and time_out > break_start:
        # total_seconds() keeps whole days; .seconds would silently drop them
        break_over = (time_out - break_start).total_seconds()/60
        save = pd.DataFrame({'new_route_id':[i],'in_out_checker':[1],'Event':['Break'],'Start time':[break_start],'break_time':[break_over],
                       'scheduled_in':[first_event['scheduled_in']], 'scheduled_out':[first_event['scheduled_out']],
                       'time_in':[first_event['time_in']], 'time_out':[time_out], 'scheduled_hours':[first_event['scheduled_hours']],
                       'gantt_start':[break_start], 'gantt_finish':[time_out]})
        # Only routes that actually have a closing break get the extra row; previously the
        # row built for an earlier route was appended again (or `save` was undefined)
        pieces.append(save)

    df_temp = pd.concat(pieces).reset_index(drop=True)
    route_frames.append(df_temp)

fm_result3 = pd.concat(route_frames) if route_frames else pd.DataFrame({})

In [None]:
# Renumber the rows 0..n-1 (the per-route pieces were concatenated with their own indexes)
fm_result3 = fm_result3.reset_index(drop=True)

In [None]:
# Save the event-level result (including the closing 'Break' rows) to the Output folder of the selected bi-week
fm_result3.to_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Output/" + "/fm_result3.csv", index=False)

In [None]:
# fm_result2[fm_result2['new_route_id']=='1093943_2023-05-19'][['Event','break_time','Start time','time_out']].tail(10)

In [None]:
# fm_result3[fm_result3['new_route_id']=='1093943_2023-05-19'][['Event','break_time','Start time','time_out']].tail(10)

# Import Postcodes Enriched

In [None]:
# Import Postcodes Enriched: the postcode lookup stored in the Input folder of the selected bi-week
pe = pd.read_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Input/" + "/postcodes_enriched.csv")
# pe = pd.read_csv('postcodes_enriched.csv')

In [None]:
# Change the data type of the column 'postcode' in Postcodes Enriched to a String so it can be
# joined against the string 'Postcode' column of the event data
# NOTE(review): the original note says the column is read in as a Float - confirm the source dtype
pe['postcode'] = pe['postcode'].astype(str)

In [None]:
# pe[pe['route_zone'].str.len()>6] # Most likely due to whitespace, will be removed in time

In [None]:
# Remove leading/trailing whitespace from the event postcodes before they are matched to the lookup
# fm_result['Postcode'] = fm_result['Postcode'].str.strip()
fm_result3['Postcode'] = fm_result3['Postcode'].str.strip()

In [None]:
# Drop the helper index columns; presumably these are left over from earlier reset_index() calls made without drop=True
fm_result3 = fm_result3.drop(['level_0','index'],axis=1)

# Combine data with Postcodes Enriched

In [None]:
# Join Postcodes Enriched's data (Postcode, Building Type) to the data
# Left join: every event row is kept, rows without a matching postcode get NaN in 'bldg_type'
# fm_result = fm_result.merge(pe[['postcode','bldg_type']],left_on='Postcode',right_on='postcode',how='left')
fm_result3 = fm_result3.merge(pe[['postcode','bldg_type']],left_on='Postcode',right_on='postcode',how='left')

In [None]:
############################

# Data Chrun for Theo

In [None]:
# test = fm_result3[fm_result3['new_route_id']=='1093943_2023-05-19']
#[['new_route_id','Event','Distance','Postcode','Driving Duration (min)']]

In [None]:
# Trim every route to the span between its first non-geotab event and its last productive
# event (anything that is neither 'geotab' nor 'Break'). Routes without any productive event
# are recorded in issue_routes and left out.
route_slices = []
issue_routes = []
for i in fm_result3['new_route_id'].unique():
  temp = fm_result3[fm_result3['new_route_id']==i].reset_index(drop=True)

  is_geotab = temp['Event'].eq('geotab').to_numpy(dtype=bool, na_value=False)
  is_break = temp['Event'].eq('Break').to_numpy(dtype=bool, na_value=False)

  # Positions of the productive events of the route
  productive_positions = np.flatnonzero(~is_geotab & ~is_break)
  if len(productive_positions) == 0:
    # Also covers routes made up of geotab events only, where scanning forward for the
    # first non-geotab row would run past the end of the data
    issue_routes.append(i)
    continue

  # Skip the leading geotab events; a productive event exists, so a non-geotab row exists too
  start_index = int(np.flatnonzero(~is_geotab)[0])
  end_index = int(productive_positions[-1])

  route_slices.append(temp.iloc[start_index:end_index+1])

# Concatenate once instead of growing the frame inside the loop
fm_result_theo = pd.concat(route_slices) if route_slices else pd.DataFrame({})

In [None]:
# Renumber the rows 0..n-1 (each route slice kept its own positions when concatenated)
fm_result_theo = fm_result_theo.reset_index(drop=True)

In [None]:
# Preview the trimmed per-route event data
fm_result_theo

Unnamed: 0,Driver ID,Driver name,Route ID,Postcode,Start time,Reservation status,Route date,Distance updated,Faraway flag,Legacy Shipper ID,...,service_time,dp_service_time,poh_time,break_time,break_drive_time,route_collection_time,gantt_start,gantt_finish,postcode,bldg_type
0,1013338,West 3 - Staff - Syaparudin Aris,1935053.0,129809,2023-05-08 11:23:00,Success,2023-05-08,34.234835,0.0,199887.0,...,,,,,,,,,129809,Office
1,1013338,West 3 - Staff - Syaparudin Aris,1935053.0,609930,2023-05-08 12:54:00,Success,2023-05-08,40.865165,0.0,28449.0,...,,,,,,,,,609930,Office
2,1013338,West 3 - Staff - Syaparudin Aris,1935053.0,608599,2023-05-08 13:10:00,Success,2023-05-08,42.799120,0.0,246154.0,...,,,,,,,,,608599,Office
3,1013338,West 3 - Staff - Syaparudin Aris,1935053.0,608609,2023-05-08 13:17:00,Success,2023-05-08,39.099922,0.0,35925.0,...,,,,,,,,,608609,Office
4,1013338,West 3 - Staff - Syaparudin Aris,1935053.0,608609,2023-05-08 13:20:00,Success,2023-05-08,71.784901,0.0,213030.0,...,,,,,,,,,608609,Office
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46178,1588737,East 3 - Staff - Mohammed Asy'ari,1944498.0,409286,2023-05-19 16:51:00,Success,2023-05-19,24.082546,0.0,168152.0,...,,,,,,,,,409286,Condo
46179,1588737,East 3 - Staff - Mohammed Asy'ari,1944498.0,409286,2023-05-19 16:51:00,Success,2023-05-19,24.670129,0.0,168152.0,...,,,,,,,,,409286,Condo
46180,1588737,East 3 - Staff - Mohammed Asy'ari,1944498.0,409286,2023-05-19 16:54:00,Success,2023-05-19,1158.199533,1.0,176158.0,...,,,,,,,,,409286,Condo
46181,1588737,East 3 - Staff - Mohammed Asy'ari,1944498.0,409286,2023-05-19 16:54:00,Success,2023-05-19,1159.822960,1.0,176158.0,...,,,,,,,,,409286,Condo


In [None]:
# Save the trimmed event data to the Output folder of the selected bi-week
fm_result_theo.to_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Output/" + "/after_raw_theo.csv", index=False)

In [None]:
# issue_routes

In [None]:
# Per-route totals of the trimmed data: one entry per route in each list
new_route_id = []
dist_bw_postcodes = []
travelling_time_bw_postcodes = []
# postcodes = [] Waiting on Fel
unique_postcodes = []

for route_key in fm_result_theo['new_route_id'].unique():
  route_rows = fm_result_theo.loc[fm_result_theo['new_route_id'] == route_key]

  new_route_id.append(route_key)
  # Total distance and total driving minutes recorded on the route
  dist_bw_postcodes.append(route_rows['Distance'].sum())
  travelling_time_bw_postcodes.append(route_rows['Driving Duration (min)'].sum())
  # Number of distinct postcodes on the route
  unique_postcodes.append(route_rows['Postcode'].nunique())

In [None]:
# One row per route: total distance, total driving minutes and number of distinct postcodes
df_theo = pd.DataFrame({'new_route_id':new_route_id,'dist_bw_postcodes':dist_bw_postcodes,
                        'travelling_time_bw_postcodes':travelling_time_bw_postcodes,'unique_postcodes':unique_postcodes})

In [None]:
#############################

# Save data Before & After cleaning + Raw & Aggregate

In [None]:
# fm_result.to_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Output/" + "/before_raw.csv", index=False)

In [None]:
# # Aggregate the current data and count how many events there are for each route
# a = fm_result.groupby(['Route date','Driver ID','Event']).size().unstack(fill_value=0)
# a2 = a.reset_index()
# a2.to_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Output/" + "/before.csv", index=False)
# a2.head()

In [None]:
# Save the event-level data after cleaning, now including the building type joined from Postcodes Enriched
fm_result3.to_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Output/" + "/after_raw.csv", index=False)

In [None]:
# Repeat the above for the cleaned dataset
# Count the events of each type per 'Route date' and 'Driver ID': one row per date/driver,
# one column per event type, 0 where a driver had no event of that type
b = fm_result3.groupby(['Route date','Driver ID','Event']).size().unstack(fill_value=0)
b2 = b.reset_index()
b2.to_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Output/" + "/after.csv", index=False)
b2.head()

Event,Route date,Driver ID,reservation attempt,POH,geotab,dp_drop,pudo_inbound
0,2023-05-08,1013338,36,0,0,0,0
1,2023-05-08,1023598,27,2,3,0,0
2,2023-05-08,1045278,44,4,12,0,0
3,2023-05-08,1061018,56,3,19,0,0
4,2023-05-08,1065688,12,5,26,9,1


# Creation of FM Postcode Index

In [None]:
# Remove any rows of data with no service time or Postcode. These 2 values are needed for FM Postcode Index
index = fm_result3.dropna(subset=['service_time','Postcode']).reset_index(drop=True)

# Also skip rows whose Postcode is not NaN/missing but a string representation of nan
index_rows = index[index['Postcode'] != 'nan']

# Extract the values needed for the FM Postcode Index as plain lists (one entry per remaining row).
# Taking whole columns replaces the row-by-row loop and gives the same lists.
# NOTE: 'service_time' below reuses the name of the service_time() function defined earlier;
# re-run the function definitions before recalculating the event times.
postcode = index_rows['Postcode'].tolist()
driver_id = index_rows['Driver ID'].tolist()
driver_name = index_rows['Driver name'].tolist()
driver_team = index_rows['Driver team'].tolist()
service_time = index_rows['service_time'].tolist()

In [None]:
# Create a dataframe to store data to create FM Postcode Index (one row per timed reservation attempt)
fm_postcode_index = pd.DataFrame({'postcode':postcode, 'driver_id':driver_id, 'driver_name':driver_name, 'driver_team':driver_team, 'service_time':service_time})

# Sort the FM Postcode Index by postcode
# Postcodes are strings, so the order is alphabetical rather than numeric (e.g. '100077' sorts before '99418')
fm_postcode_index = fm_postcode_index.sort_values(by='postcode').reset_index(drop=True)
fm_postcode_index

Unnamed: 0,postcode,driver_id,driver_name,driver_team,service_time
0,100077,1435806,PUDO Central - Staff - Zulfadhli Khairudin,,11.0
1,100077,1435806,PUDO Central - Staff - Zulfadhli Khairudin,,10.0
2,100077,1435806,PUDO Central - Staff - Zulfadhli Khairudin,,24.0
3,100077,1435806,PUDO Central - Staff - Zulfadhli Khairudin,,8.0
4,100077,1435806,PUDO Central - Staff - Zulfadhli Khairudin,,9.0
...,...,...,...,...,...
6794,99418,1411165,Central 3 - Staff - Taufiq Adain,,6.0
6795,99418,1411165,Central 3 - Staff - Taufiq Adain,,5.0
6796,99418,1411165,Central 3 - Staff - Taufiq Adain,,6.0
6797,99418,1411165,Central 3 - Staff - Taufiq Adain,,5.0


In [None]:
# Save the FM Postcode Index to the Output folder of the selected bi-week
fm_postcode_index.to_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Output/" + "/fm_postcodes_index.csv", index=False)

# Import shippers with no pickup scan

In [None]:
# Import list of shippers with no pickup scan from the Input folder of the selected bi-week
# Its 'Legacy ID' column is matched against 'Legacy Shipper ID' in the final aggregation
shippers = pd.read_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Input/" + "/shippers_no_pu_scan.csv")
# shippers = pd.read_csv('shippers_no_pu_scan.csv')

In [None]:
# Note on what does column names mean
# Distance = Driving distance
# Distance updated = FM faraway reservations' distance

# Overall data aggregation - For final output


In [None]:
# Create empty lists to store existing and newly created data.
# Every list receives exactly one value per route, in the same order, so that they can be
# combined into one dataframe afterwards.
# NOTE: service_time, break_time and route_collection_time reuse the names of the functions
# defined earlier; re-run the function definitions before recalculating the event times.
route_date = []
route_id = []
new_route_id = []

driver_id = []
driver_name = []
team = []

service_time = []
handover_time = []
break_time = []
breakdrive_time = []
driving_time = []
dp_drop_time = []
route_collection_time = []
first_start_time = []
last_end_time = []

geotab_events = []
rsvn_events = []
dp_events = []
poh_events = []

dps_serviced = []
bldg_breakdown = []
shippers_no_pu_scan = []

no_geotab_flag = []

sa_overtime_flag = []
scheduled_hours = []
scheduled_in_time = []
scheduled_out_time = []
time_in_time = []
time_out_time = []
last_productive_event_type = []
last_productive_event_time = []

early_route_end_flag = []

# faraway_reservations = []

total_distance_in_route = []
total_productive_distance = []

for i in fm_result3['new_route_id'].unique():
    # Create a temporary variable to store a specific route
    temp = fm_result3[fm_result3['new_route_id']==i]
    new_route_id.append(i)

    # faraway_reservations.append(temp['Faraway flag'].sum())

    # Productive distance: distance of the geotab (driving) events that were not counted as a break.
    # skipna=False keeps the previous behaviour: a missing distance makes the total NaN.
    productive_drive = (temp['Event'] == 'geotab') & temp['break_time'].isna()
    total_productive_distance.append(temp.loc[productive_drive, 'Distance'].sum(skipna=False))
    total_distance_in_route.append(temp['Distance'].sum())

    # Flag is 1 if there are no geotab events in the route, 0 if there is at least 1
    no_geotab_flag.append(0 if (temp['Event'] == 'geotab').any() else 1)

    # StaffAny overtime: clocked out at least 5 minutes after the scheduled end
    diff = pd.Timedelta(temp['time_out'].values[0] - temp['scheduled_out'].values[0])
    sa_overtime_flag.append(1 if diff.total_seconds()/60 >= 5 else 0)

    scheduled_hours.append(temp['scheduled_hours'].values[0])
    scheduled_in_time.append(temp['scheduled_in'].dropna().iloc[0])
    scheduled_out_time.append(temp['scheduled_out'].dropna().iloc[0])
    time_in_time.append(temp['time_in'].dropna().iloc[0])
    time_out_time.append(temp['time_out'].dropna().iloc[0])

    # Last productive event of the route (anything that is neither 'geotab' nor 'Break')
    productive = temp[ (temp['Event']!='geotab') & (temp['Event']!='Break') ].sort_values(by='Start time')
    if len(productive) == 0:
        last_productive_event_type.append(np.nan)
        last_productive_event_time.append(np.nan)
    else:
        last_productive = productive.iloc[-1]
        last_productive_event_type.append(last_productive['Event'])
        last_productive_event_time.append(last_productive['Start time'])

    route_date.append(temp['Route date'].dropna().values[0])

    # Use the first 'Route ID' in the route, NaN when the route has none
    try:
        route_id.append(temp['Route ID'].dropna().values[0])
    except IndexError:
        route_id.append(np.nan)

    driver_id.append(temp['Driver ID'].dropna().values[0])

    # Use the first 'Driver name' in the route, NaN when it is missing or not text
    try:
        # driver_name.append(temp[temp['Event']=='reservation attempt']['Driver name'].values[0].strip())
        driver_name.append(temp['Driver name'].dropna().values[0].strip())
    except (IndexError, AttributeError):
        driver_name.append(np.nan)

    # Get the number of each event in the route; event types that do not occur count as 0
    events = temp['Event'].value_counts()
    geotab_events.append(events.get('geotab', 0))
    rsvn_events.append(events.get('reservation attempt', 0))
    dp_events.append(events.get('dp_drop', 0))
    poh_events.append(events.get('POH', 0))

    temp2 = temp.reset_index(drop = True)

    # Calculate Service Time
    service_time.append(temp2['service_time'].sum())

    # Calculate DP Service Time
    dp_drop_time.append(temp2['dp_service_time'].sum())

    # Calculate POH Time
    handover_time.append(temp2['poh_time'].sum())

    # Calculate Break Time
    break_time.append(temp2['break_time'].sum())
    breakdrive_time.append(temp2['break_drive_time'].sum())

    # Calculate Driving Time (driving that happened during a break is not counted)
    driving_time.append(temp2['Driving Duration (min)'].sum() - temp2['break_drive_time'].sum())

    # Calculate Route Collection Time
    route_collection_time.append(temp2['route_collection_time'].sum())

    # Get the first Start time in route
    first_start_time.append(temp2['Start time'].dropna().min())
    # Get the last End time in route
    last_end_time.append(temp2['End time'].dropna().max())

    # Number of events per building type, stored as a dictionary
    bldg_breakdown.append(dict(temp2['bldg_type'].value_counts()))

    # Number of events whose shipper is on the list of shippers with no pickup scan
    shippers_no_pu_scan.append( sum(temp2['Legacy Shipper ID'].dropna().astype(int).isin(shippers['Legacy ID']) ) )

    dps_serviced.append(temp2['DP_id'].nunique())

    # Team = the part of the driver name before the first '-', taken from the first
    # reservation attempt; NaN when there is no reservation attempt or the name is not text
    try:
        team.append(temp2[temp2['Event']=='reservation attempt']['Driver name'].values[0].split('-',1)[0].strip())
    except (IndexError, AttributeError):
        team.append(np.nan)

    # Early route end: the last two productive events are at least 60 minutes apart.
    # NaN when the route has fewer than two productive events with a Start time.
    temp3 = temp2[ (temp2.Event!='geotab') & (temp2.Event!='Break')]['Start time'].dropna().sort_values().reset_index(drop=True)
    if len(temp3) < 2:
        early_route_end_flag.append(np.nan)
    elif pd.Timedelta(temp3.iloc[-1]-temp3.iloc[-2]).total_seconds()/60 >= 60:
        early_route_end_flag.append(1)
    else:
        early_route_end_flag.append(0)

In [None]:
# Create a dataframe for the final aggregated dataset
# One row per route (new_route_id); every list below holds one value per route in the same order
results = pd.DataFrame({'Route ID': route_id, 'New Route ID':new_route_id, 'Route date':route_date,
                        'Driver ID':driver_id,'Driver Name':driver_name, 'Driver Team':team,
                        'Service Time':service_time,'Handover Time':handover_time,
                        
                        'Driving Time':driving_time, 'Break Time': break_time, 'Break Driving Time':breakdrive_time,
                        
                        'Route Collection Time':route_collection_time, 'DP Service Time':dp_drop_time,
                        'First Start Time':first_start_time, 'Last End Time':last_end_time,
                        'StaffAny Overtime Flag':sa_overtime_flag, 

                        'Scheduled Hours':scheduled_hours,
                        'Scheduled In Time':scheduled_in_time, 'Scheduled Out Time':scheduled_out_time, 'Time In':time_in_time, 'Time Out':time_out_time, 'Last Productive Event Type':last_productive_event_type, 'Last Productive Event Time':last_productive_event_time,

                        'Total Distance in Route':total_distance_in_route, 'Total Productive Distance':total_productive_distance,

                        'Early Route End Flag':early_route_end_flag,
                        'No GeoTab Events Flag':no_geotab_flag,

                        'Count GeoTab Events': geotab_events, 'Count RSVN Events':rsvn_events,
                        'Count DP Drop Events': dp_events, 'Count POH Events': poh_events,
                        
                        'Building Breakdown':bldg_breakdown, 'Count of Shippers with no PU Scan':shippers_no_pu_scan,
                        'DPs Serviced':dps_serviced
                        # 'Faraway Reservations':faraway_reservations
                       })

# Creation of new columns - Help FM filter and understand the data better

In [None]:
# Create additional aggregate variables that help FM understand the output data better
# Both totals start from the same three components; only the calculated total includes breaks
service_handover_driving = results['Service Time'] + results['Handover Time'] + results['Driving Time']
results['Total Calculated Time'] = service_handover_driving + results['Break Time'] + results['Route Collection Time'] + results['DP Service Time']
results['Total Working Time'] = service_handover_driving + results['Route Collection Time'] + results['DP Service Time']

# Calculated overtime: worked minutes reach the scheduled hours (converted to minutes)
scheduled_minutes = results['Scheduled Hours']*60.0
results['Calculated Overtime Flag'] = (results['Total Working Time'] >= scheduled_minutes).astype(int)
# Invalid overtime: the StaffAny flag and the calculated flag disagree
results['Invalid Overtime Flag'] = (results['StaffAny Overtime Flag'] != results['Calculated Overtime Flag']).astype(int)

In [None]:
# Express each event type's count as a percentage of all events recorded on the route.
# Keys are the new percentage columns, values the count columns they are derived from.
_percent_sources = {
    'geo_percent': 'Count GeoTab Events',
    'rsvn_percent': 'Count RSVN Events',
    'dp_drop_percent': 'Count DP Drop Events',
    'poh_percent': 'Count POH Events',
}

# Denominator shared by all four percentages: the route's total event count.
_total_events = (
    results['Count GeoTab Events']
    + results['Count RSVN Events']
    + results['Count DP Drop Events']
    + results['Count POH Events']
)

for _percent_column, _count_column in _percent_sources.items():
    results[_percent_column] = results[_count_column] / _total_events * 100

# Problematic Flag Creation

In [None]:
# Flag routes whose share of GeoTab events looks abnormal.
#   0 -> GeoTab events make up between GEO_PERCENT_MIN and GEO_PERCENT_MAX percent (inclusive)
#   1 -> the share falls outside that band
# When the share cannot be compared at all (NaN, e.g. a route with no events of any type),
# fall back to the raw count: the route is problematic only if it has no GeoTab events.
GEO_PERCENT_MIN = 20
GEO_PERCENT_MAX = 60

_geo_share = results['geo_percent']

# Vectorised replacement for a row-by-row loop; np.select applies the first matching
# condition per row and uses `default` when none match.
problematic_flag = np.select(
    [
        _geo_share.between(GEO_PERCENT_MIN, GEO_PERCENT_MAX),
        (_geo_share < GEO_PERCENT_MIN) | (_geo_share > GEO_PERCENT_MAX),
        results['Count GeoTab Events'] != 0,
    ],
    [0, 1, 0],
    default=1,
)

results['Problematic Flag'] = problematic_flag

# Parcels Picked

In [None]:
# Import the number of parcels picked by each driver on each route date
# ON HOLD FOR NOW
# parcels_picked = pd.read_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/" + month_year + "/" + week_number + "/Input/" + "/parcels_picked.csv")

# parcels_picked['route_id'] = parcels_picked['route_id']
# parcels_picked['parcels_picked'] = parcels_picked['parcels_picked'].astype(int)
# results2 = results.merge(parcels_picked, left_on='Route ID', right_on='route_id', how='left')

# Final Output: proper_routes.csv

In [None]:
# Map internal column names to the labels used in the final output
_output_column_names = {
    'Route date': 'Route Date',
    'First Start Time': 'Route Start Time',
    'Last End Time': 'Route End Time',
    'geo_percent': 'Geotab Percent',
    'rsvn_percent': 'RSVN Percent',
    'dp_drop_percent': 'DP Drop Percent',
    'poh_percent': 'POH Percent',
    # 'parcels_picked': 'Parcels Picked',
}

results2 = results.rename(columns=_output_column_names)

In [None]:
# Column layout of the output dataset, grouped by theme
_output_column_order = [
    # Route and driver identity
    'Route ID', 'New Route ID', 'Route Date',
    'Driver ID', 'Driver Name', 'Driver Team',
    # Route composition
    'Count of Shippers with no PU Scan', 'Building Breakdown',
    'DPs Serviced',
    # 'Faraway Reservations',
    # Time spent per activity
    'Service Time', 'Handover Time', 'Driving Time', 'Break Time',
    'Break Driving Time', 'Route Collection Time', 'DP Service Time',
    # Totals and route boundaries
    'Total Calculated Time', 'Total Working Time',
    'Route Start Time', 'Route End Time',
    # Schedule and attendance
    'Scheduled In Time', 'Scheduled Out Time', 'Time In', 'Time Out', 'Scheduled Hours',
    'Last Productive Event Type', 'Last Productive Event Time',
    # Overtime flags and distances
    'StaffAny Overtime Flag', 'Calculated Overtime Flag', 'Invalid Overtime Flag',
    'Total Distance in Route', 'Total Productive Distance',
    # Data-quality flags
    'Problematic Flag', 'No GeoTab Events Flag', 'Early Route End Flag',
]

# Keep only the listed columns, in the listed order
results3 = results2[_output_column_order]

In [None]:
# Preview the first rows of the reordered output dataset.
# NOTE(review): the rendered preview includes driver names and IDs -- clear cell outputs before sharing the notebook.
results3.head()

Unnamed: 0,Route ID,New Route ID,Route Date,Driver ID,Driver Name,Driver Team,Count of Shippers with no PU Scan,Building Breakdown,DPs Serviced,Service Time,...,Last Productive Event Type,Last Productive Event Time,StaffAny Overtime Flag,Calculated Overtime Flag,Invalid Overtime Flag,Total Distance in Route,Total Productive Distance,Problematic Flag,No GeoTab Events Flag,Early Route End Flag
0,1935053.0,1013338_2023-05-08,2023-05-08,1013338,West 3 - Staff - Syaparudin Aris,West 3,0,"{'Office': 34, 'Industrial/Factory/Warehouse':...",0,0.0,...,reservation attempt,2023-05-08 21:39:00,1,0,1,0.0,0.0,1,1,0.0
1,1935973.0,1013338_2023-05-09,2023-05-09,1013338,West 3 - Staff - Syaparudin Aris,West 3,0,"{'Office': 20, 'Industrial/Factory/Warehouse': 1}",0,0.0,...,reservation attempt,2023-05-09 16:11:00,0,0,0,0.0,0.0,1,1,0.0
2,1936870.0,1013338_2023-05-10,2023-05-10,1013338,West 3 - Staff - Syaparudin Aris,West 3,0,{'Office': 13},0,0.0,...,reservation attempt,2023-05-10 15:04:00,0,0,0,0.0,0.0,1,1,0.0
3,,1013338_2023-05-11,2023-05-11,1013338,West 3 - Staff - Syaparudin Aris,,0,{},0,0.0,...,,NaT,0,0,0,25.232443,25.232443,1,0,
4,,1013338_2023-05-12,2023-05-12,1013338,West 3 - Staff - Syaparudin Aris,,0,{},0,0.0,...,,NaT,0,0,0,13.146244,13.146244,1,0,


In [None]:
# Write the final route-level dataset to this bi-week's Output folder in the repository.
# NOTE(review): unlike the proper_routes_theo.csv export, this write keeps the DataFrame index
# as an extra unnamed first column -- confirm downstream readers expect that.
_proper_routes_path = (
    "/content/drive/My Drive/FM Productivity - test/Data Repository/"
    + month_year + "/" + week_number + "/Output/" + "proper_routes.csv"
)
results3.to_csv(_proper_routes_path)

In [None]:
# Attach the columns of df_theo to each route; the left join keeps every row of results3
# even when df_theo has no matching new_route_id.
results_theo = pd.merge(
    results3,
    df_theo,
    how='left',
    left_on='New Route ID',
    right_on='new_route_id',
)

In [None]:
# Write the routes joined with df_theo to this bi-week's Output folder, without the index column.
_proper_routes_theo_path = (
    "/content/drive/My Drive/FM Productivity - test/Data Repository/"
    + month_year + "/" + week_number + "/Output/" + "/proper_routes_theo.csv"
)
results_theo.to_csv(_proper_routes_theo_path, index=False)

# Gantt Chart Data Accumulator




In [None]:
# Read repo file to store new batch of gantt_data
# Only run this once per new dataset

# gantt_repo = pd.read_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/Gantt Chart Data/gantt_repo.csv")
# gantt_data = fm_result3[['new_route_id','Driver name','Event','Start time','End time','Driving Duration (min)',
#                          'service_time','dp_service_time','poh_time','break_time', 'break_drive_time','route_collection_time','gantt_start','gantt_finish']]
# gantt_data['timestamp'] = week_number

# gantt_repo = pd.concat([gantt_repo,gantt_data], ignore_index=True, axis=0)
# gantt_repo.to_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/Gantt Chart Data/gantt_repo.csv",index=False)
# gantt_data.to_csv("/content/drive/My Drive/FM Productivity - test/Data Repository/Gantt Chart Data/gantt_repo_" + week_number + ".csv", index=False)

# Route Checking

In [None]:
# results4[ (results4['Problematic Flag']==0)].sample(20)['New Route ID'] # Can use new_route_id to select a route
# new_df_concat[ (new_df_concat['in_out_checker']==0) & (new_df_concat['Event']=='geotab') ][['new_route_id']]

In [None]:
# Before and After Cleaning
# print(staff_any2[ (staff_any2['Driver Strength ID']=='1324379') & (staff_any2['Date']=='2023-04-15') ][['Scheduled In','Time Out']])
# print('------------------------------------------')
# print(new_df_concat[new_df_concat['new_route_id']=='1324379_2023-04-15'].head()[['Start time','in_out_checker']])
# print(new_df_concat[new_df_concat['new_route_id']=='1324379_2023-04-15'].tail()[['Start time','in_out_checker']])
# print('------------------------------------------')
# print(df_concat2[df_concat2['new_route_id']=='1324379_2023-04-15'].head()[['Start time','in_out_checker']])
# print(df_concat2[df_concat2['new_route_id']=='1324379_2023-04-15'].tail()[['Start time','in_out_checker']])

In [None]:
# new_df_concat
# df_concat2
# new_df_concat[new_df_concat['new_route_id']=='1324379_2023-04-15']