In [13]:
# Imports
import pandas as pd
import altair as alt

# Location of the sessions CSV (Google Drive direct-download link)
url = 'https://drive.google.com/uc?export=download&id=14O91N5OlVkvdGxXNJUj5jIsV5RexhzbB'
# Read the CSV and discard every row that contains at least one missing value
sessions = pd.read_csv(url).dropna()
# Remove the columns that the rest of the analysis does not use
sessions_drop = sessions.drop(
    ['hashedEmail', 'original_start_time', 'original_end_time'],
    axis='columns',
)

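Rows with missing data are dropped, as well as the columns that are not needed for the analysis (`hashedEmail`, `original_start_time`, `original_end_time`).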

In [14]:
# Parse the start/end timestamp strings (day/month/year hour:minute) into datetimes.
# The raw strings are read from `sessions`; the parsed values are stored on `sessions_drop`.
for time_col in ('start_time', 'end_time'):
    sessions_drop[time_col] = pd.to_datetime(sessions[time_col], format='%d/%m/%Y %H:%M')

# Work on a separate frame so the derived columns are not added to `sessions_drop`
sessions_days = sessions_drop.copy()
# Day of the week of each session start, with Monday as 0 and Sunday as 6
sessions_days['day_of_week'] = sessions_days['start_time'].dt.dayofweek
# Session length in minutes: elapsed time divided by a one-minute timedelta
elapsed = sessions_days['end_time'] - sessions_days['start_time']
sessions_days['duration_minutes'] = elapsed / pd.Timedelta(minutes=1)

In [15]:
# Mean session duration (minutes) for each day of the week (Monday=0 ... Sunday=6)
avg_duration = sessions_days.groupby('day_of_week')['duration_minutes'].mean().reset_index()

# Horizontal bar chart: one bar per day of the week, bar length = mean duration
duration_plot = alt.Chart(
    avg_duration,
    title='Average session duration by day of the week'
).mark_bar().encode(
    x=alt.X('duration_minutes:Q').title('Average duration of each session(minutes)'),
    y=alt.Y('day_of_week:N').title('Day of week'),
    color=alt.Color('day_of_week:N', legend=alt.Legend(title='Day of the week(Monday=0)'))
)

# End the cell with the chart so that it is actually rendered in the notebook
duration_plot

In [21]:
# Convert each session's start time into a decimal hour of day
# (hours plus minutes as a fraction of an hour).
# The raw strings in `sessions` are parsed once, with the same explicit
# day/month/year format used for the earlier datetime conversion. Reading from
# `sessions` (not `sessions_days`) keeps this cell safe to re-run.
start_parsed = pd.to_datetime(sessions['start_time'], format='%d/%m/%Y %H:%M')
# NOTE: this replaces the datetime `start_time` column with a float hour of day;
# the weekend plot further down reads `start_time` in this decimal-hour form too.
sessions_days['start_time'] = start_parsed.dt.hour + start_parsed.dt.minute / 60

# Keep only weekday sessions (Monday=0 ... Friday=4)
week_days = sessions_days[sessions_days['day_of_week'].isin([0, 1, 2, 3, 4])]

# Scatter plot of session duration against start hour for weekday sessions,
# with points coloured by day of the week
start_duration_weekday = alt.Chart(
    week_days,
    title='Duration of session relative to time on weekdays'
).mark_point(opacity=0.4).encode(
    x=alt.X('start_time:Q',
        title='Hour of Day',
        axis=alt.Axis(values=list(range(0, 24)), tickMinStep=1)),
    y=alt.Y('duration_minutes').title('Duration of session (minutes)'),
    color=alt.Color('day_of_week:N', title='Day of the week (Monday=0)')
).properties(
    width=500
)

start_duration_weekday

In [23]:
# Keep only weekend sessions (Saturday=5, Sunday=6)
week_ends = sessions_days[sessions_days['day_of_week'].isin([5, 6])]

# Scatter plot of session duration against start hour for weekend sessions,
# with points coloured by day of the week.
# `start_time` is expected to already hold the decimal hour of day, as set in
# the weekday plotting cell above.
start_duration_weekends = alt.Chart(
    week_ends,
    title='Duration of session relative to time on weekends'
).mark_point(opacity=0.4).encode(
    x=alt.X('start_time:Q',
        title='Hour of Day',
        axis=alt.Axis(values=list(range(0, 24)), tickMinStep=1)),
    y=alt.Y('duration_minutes').title('Duration of session (minutes)'),
    color=alt.Color('day_of_week:N', title='Day of the week (Monday=0)')
).properties(
    width=500
)

start_duration_weekends