**Import Packages**

In [1]:
import pandas as pd
import numpy as np

**Constants**

In [2]:
START_TIME = pd.to_datetime('2025-04-10 18:15:00', format='%Y-%m-%d %H:%M:%S') # Start time of online conference

**Import Clean Data**

In [3]:
# Set folder path
folder_path = r"..\data\clean"

# Import clean zoom data
zoom_df = pd.read_csv(folder_path+r'\clean-zoom-data.csv')

# Convert join_time and leave_time to pd.DateTime()
zoom_df['join_time'] = pd.to_datetime(zoom_df['join_time'], format='%Y-%m-%d %H:%M:%S')
zoom_df['leave_time'] = pd.to_datetime(zoom_df['leave_time'],format='%Y-%m-%d %H:%M:%S')

# Import clean eventbrite data
eventbrite_df = pd.read_csv(folder_path+r'\clean-eventbrite-data.csv')

In [4]:
zoom_df.head()

Unnamed: 0,user_name,email,join_time,leave_time,time_in_session_in_minutes
0,Vi,artcici9@gmail.com,2025-04-10 18:30:41,2025-04-10 21:03:36,153
1,Rosil Burnett-Gibbs,gibbsfamily32@gmail.com,2025-04-10 18:34:07,2025-04-10 20:59:44,146
2,Keesha,keesha.williams@yahoo.com,2025-04-10 18:34:15,2025-04-10 18:39:18,6
3,Keesha - Toronto,keesha.williams@yahoo.com,2025-04-10 18:39:13,2025-04-10 20:03:25,85
4,Simone,simoneibyrne@gmail.com,2025-04-10 19:02:31,2025-04-10 20:26:08,84


In [5]:
eventbrite_df.head()

Unnamed: 0,first_name,last_name,email,city,province_territory,postal_zip_code,please_specify
0,Camille,Williams,camillekwilliams@yahoo.ca,Toronto,ON,M3A 3J9,
1,Elaine,Goulbourne,Elaine.goulbourne@wchospital.ca,Toronto,ON,M5S1B2,
2,Ciara,M Shanks,shanksciara@gmail.com,MILWAUKEE,International (residing outside of Canada),53218,Wisconsin
3,Mercy,Osayi,mercywodicommunity@gmail.com,Kitchener,ON,N2A2P4,
4,Texas State Officer Constance,Jones,crjones2264@gmail.com,Houston,MB,77042,


# Pre-processing
- Merge datasets on email
- Find the following subsets:
    + Registered and Attended --> _merge col: 'both' → email in registrations and attendance
    + Registered and Did not Attend --> _merge col: 'left_only' → in registrations only (registered but did not attend)
    + Did not Register and Attended --> _merge col: 'right_only' → in attendance only (attended but did not register)
- Combine data sets for Registered and Attended + Not Registered and Attended
    + This will be used for the breakdown of attendees by location
- Resolve Multiple Instances of Join and Leave Times
    + The approach for this will be to group by email and keep the minimum join time and the max leave time
    + Calculate Time in Session by subtracting Join Time from Leave Time

In [6]:
# Merge data sets
merged = eventbrite_df.merge(
    zoom_df[['email']],           # keep only email from zoom to avoid column collisions
    on='email',
    how='outer',                  # keep union so you can see all combos
    indicator=True
)

merged.head()

Unnamed: 0,first_name,last_name,email,city,province_territory,postal_zip_code,please_specify,_merge
0,Mo,Akins,1life2lve@gmail.com,Hamilton,ON,L8T3A7,,left_only
1,Charlene,Mcfarlane,2canwinpr@gmail.com,Whitby,ON,L1N 6W6,,left_only
2,Althea,Mcknight,424x8wc7m6@privaterelay.appleid.com,Toronto,ON,M4B2E5,,left_only
3,Alisha,Edouard,Ajansandy@gmail.com,Toronto,ON,M6h 2t3,,left_only
4,Yvonne,azaglo,Amaley@gmail.com,Brampton,ON,L7A 4Z8,,left_only


**Create Subsets for Further Analysis**

In [7]:
# registered AND attended
registered_and_attended = merged[merged['_merge'] == 'both'].copy()

# registered AND did NOT attend (registered only)
registered_not_attended = merged[merged['_merge'] == 'left_only'].copy()

# attended AND did NOT register (attended only) -- useful extra subset
attended_not_registered = merged[merged['_merge'] == 'right_only'].copy()

# Create one DataFrame for all attendees, regardeless of registration
all_attendees = pd.concat([registered_and_attended, attended_not_registered], axis=0)
all_attendees = all_attendees.reset_index(drop=True)

In [8]:
# Save counts for each case
registered_and_attended_count = len(registered_and_attended)
attended_not_registered_count = len(attended_not_registered)
all_attendees_count = len(all_attendees)
registered_not_attended_count = len(registered_not_attended)

# Display values
print(f'registered & attended: {registered_and_attended_count}')
print(f'not registered & attended: {attended_not_registered_count}')
print(f'total attendees: {all_attendees_count}')
print(f'registered & not attended: {registered_not_attended_count}')

registered & attended: 448
not registered & attended: 158
total attendees: 606
registered & not attended: 773


**Eventbrite Data Pre-processing**

In [9]:
registered_and_attended.shape

(448, 8)

In [18]:
# Remove duplicates
registered_and_attended.drop_duplicates(subset=["email", "city", "province_territory", "please_specify"], inplace=True)

# Create DataFrame for Canada only. Remove nulls and blanks from province_territory column
registered_and_attended_can = registered_and_attended[registered_and_attended["province_territory"].notna() & (registered_and_attended["province_territory"] != "")]
registered_and_attended_can.reset_index(drop=True, inplace=True)

# Create DataFrame for International only. Keep non-null values in please_specify column
registered_and_attended_int = registered_and_attended[registered_and_attended["please_specify"].notna()]
registered_and_attended_int.reset_index(drop=True, inplace=True)

In [19]:
registered_and_attended_can.head()

Unnamed: 0,first_name,last_name,email,city,province_territory,postal_zip_code,please_specify,_merge
0,Aisha,Karidio,a.karidio@gmail.com,Edmonton,AB,T6W0P1,,both
1,Nicole,Gray,aa33nics@hotmail.com,Newmarket,ON,L3X1X5,,both
2,Andrea,Cain,acain@rogers.com,Ajax,ON,L1Z1K4,,both
3,Dawn,Martin,adassa2000@yahoo.com,Bampton,ON,L7A 1M1,,both
4,Andrene,Vitelli,afvitelli@gmail.com,Toronto,ON,M3A1Y8,,both


In [None]:
registered_and_attended_int.head()

Unnamed: 0,first_name,last_name,email,city,province_territory,postal_zip_code,please_specify,_merge
0,Angela,Prescod,alprescod60@gmail.com,Port St. Lucie,International (residing outside of Canada),34987,US,both
1,Kiara,Moore,brekiara@gmail.com,Pittsburgh,International (residing outside of Canada),15228,United States,both
2,brenda,Jones,brendagrantj@aol.com,Boston,International (residing outside of Canada),2130,United States,both
3,Careen,Garvey-Palmer,careen.garveypalmer@gmail.com,Loganville,International (residing outside of Canada),30052,USA,both
4,Melanie,Trelles,coheirencia@gmail.com,San José,International (residing outside of Canada),95035,California,both


**Zoom Data Pre-processing**

In [11]:
# Aggregate: first join (min) and last leave (max)
agg_span = (
    zoom_df
    .groupby('email', as_index=False)
    .agg(first_join=('join_time', 'min'), # keep min. join time
         last_leave =('leave_time', 'max'), # keep max. leave time
         session_count = ('join_time', 'size'))  # number of records for each email
)

# Compute duration in conference
agg_span['time_in_session'] = np.ceil(((agg_span['last_leave'] - agg_span['first_join']).dt.total_seconds() / 60)) # Convert time delta to minutes and round to the nearest minute

# Compute join_minute
agg_span['join_minute'] = np.ceil(((agg_span['first_join'] - START_TIME).dt.total_seconds() / 60))
# Compute left_minute
agg_span['leave_minute'] = np.ceil(((agg_span['last_leave'] - START_TIME).dt.total_seconds() / 60))

# Create a mask for attendee(s) who joined before the start time
join_before_start_time_mask = agg_span['join_minute'] < 0

# Remove attendee(s) who joined before the start time
agg_span = agg_span[~join_before_start_time_mask]

# Remove unnecessary columns
agg_span = agg_span.drop(columns=['email', 'first_join', 'last_leave', 'session_count'])
agg_span.head()

Unnamed: 0,time_in_session,join_minute,leave_minute
0,156.0,14.0,170.0
1,114.0,57.0,170.0
2,141.0,29.0,169.0
3,32.0,25.0,57.0
4,143.0,16.0,159.0


**Save Pre-Processed Data**

In [12]:
# Export clean zoom data
agg_span.to_csv(r'..\data\preprocessed\zoom-session-data.csv', index=False)

# Export clean eventbrite data
eventbrite_df.to_csv(r'..\data\preprocessed\clean-eventbrite-data.csv', index=False)