# 2023: Week 46 - Late For School
### November 15, 2023

 
 - Input data
 - Exclude null values from the Arrival Time field (these are the days the student wasn't present)
 - Turn the Scheduled Start Time and Arrival Time fields into Date Time fields
 - Calculate how many minutes late each student is each day
 - If they arrive before the Scheduled Start Time, they'll have a negative value for this field
 - Rank the days of the week in order of Average Lateness
 - Convert the Average Lateness into 2 fields: 1 for Minutes and 1 for Seconds
 - Create Output 1
 - Rank the students by the percentage of days they were “very late” (more than 5 minutes) during the year
 - Create Output 2

In [40]:
import os
import pandas as pd
import numpy as np
import datetime as dt

In [41]:
# Load the raw card-entry records; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer="Prep School Card Entries.csv")

In [42]:
# Normalise the headers: lower-case, trim surrounding whitespace, and swap
# spaces for underscores so the columns can be reached as attributes.
df.columns = df.columns.map(lambda name: name.lower().strip().replace(' ', '_'))

In [43]:
# Drop the rows that have no arrival time, then renumber the remaining rows from 0
df = df.dropna(subset=['arrival_time']).reset_index(drop=True).copy()

In [44]:
# Build full timestamps by joining the date text with each time-of-day text
day_text = df['date']
df['scheduled_start_time'] = pd.to_datetime(day_text + ' ' + df['scheduled_start_time'])
df['arrival_time'] = pd.to_datetime(day_text + ' ' + df['arrival_time'])

# The date column itself is converted only after its text form was used above
df['date'] = pd.to_datetime(day_text)

# Lateness in minutes (arrival minus scheduled start); negative when early
delay = df['arrival_time'] - df['scheduled_start_time']
df['minutes_late'] = delay.dt.total_seconds().div(60)


In [45]:
# Average lateness per day of the week, ordered from lowest to highest average
dow_means = df.groupby(by=['day_of_week']).mean(numeric_only=True)
df_dow = dow_means.drop(columns='student_id').sort_values('minutes_late').copy()

# Split the average into a whole-minute part and the leftover seconds.
# Both integer casts truncate toward zero rather than rounding.
whole_minutes = df_dow['minutes_late'].astype('int64')
leftover_minutes = df_dow['minutes_late'] - whole_minutes

df_dow['minutes'] = whole_minutes
df_dow['seconds'] = (leftover_minutes * 60).astype('int')

In [46]:
# Write Output 1; the day_of_week index is saved as the first CSV column
df_dow.to_csv(path_or_buf='output1.csv')

# Optional: open the written file with its default application
# os.startfile('output1.csv')

In [83]:
# % Days very late by student
# A day counts as "very late" when the student arrived more than 5 minutes
# after the scheduled start time.
df['very_late'] = df.minutes_late > 5

# Per student: number of rows with a date and number of very-late days
df_students = (
    df.groupby('student_id')
    .agg({'date': 'count', 'very_late': 'sum'})
    .reset_index()
)
df_students['perc_late'] = df_students['very_late'] / df_students['date']
# Express the share as a percentage rounded to one decimal place
df_students['perc_late'] = [round(i * 100, 1) for i in df_students['perc_late']]

# rank
df_students = df_students.sort_values('perc_late', ascending=False).reset_index(drop=True)
# method='min' gives tied students the same whole-number rank (1, 1, 3, ...);
# the default method='average' would produce fractional ranks such as 1.5.
# The nullable 'Int64' dtype keeps the cast safe if a rank is missing.
df_students['rank'] = (
    df_students.perc_late.rank(method='min', ascending=False).astype('Int64')
)
df_students[['rank', 'student_id', 'perc_late']].to_csv('output2.csv', index=False)

# os.startfile exists only on Windows; skip the convenience preview elsewhere
# instead of failing with AttributeError after the file has been written.
if hasattr(os, 'startfile'):
    os.startfile('output2.csv')