**Import Packages**

In [1]:
import pandas as pd

**Import Clean Data**

In [6]:
# Set folder path
folder_path = r"..\data\clean"

# Import clean zoom data
zoom_df = pd.read_csv(folder_path+r'\clean-zoom-data.csv')

# Import clean eventbrite data
eventbrite_df = pd.read_csv(folder_path+r'\clean-eventbrite-data.csv')

In [7]:
zoom_df.head()

Unnamed: 0,user_name,email,join_time,leave_time,time_in_session_in_minutes
0,Vi,artcici9@gmail.com,2025-04-10 18:30:41,2025-04-10 21:03:36,153
1,Rosil Burnett-Gibbs,gibbsfamily32@gmail.com,2025-04-10 18:34:07,2025-04-10 20:59:44,146
2,Keesha,keesha.williams@yahoo.com,2025-04-10 18:34:15,2025-04-10 18:39:18,6
3,Keesha - Toronto,keesha.williams@yahoo.com,2025-04-10 18:39:13,2025-04-10 20:03:25,85
4,Simone,simoneibyrne@gmail.com,2025-04-10 19:02:31,2025-04-10 20:26:08,84


In [8]:
eventbrite_df.head()

Unnamed: 0,first_name,last_name,email,city,province_territory,postal_zip_code,please_specify
0,Camille,Williams,camillekwilliams@yahoo.ca,Toronto,ON,M3A 3J9,
1,Elaine,Goulbourne,Elaine.goulbourne@wchospital.ca,Toronto,ON,M5S1B2,
2,Ciara,M Shanks,shanksciara@gmail.com,MILWAUKEE,International (residing outside of Canada),53218,Wisconsin
3,Mercy,Osayi,mercywodicommunity@gmail.com,Kitchener,ON,N2A2P4,
4,Texas State Officer Constance,Jones,crjones2264@gmail.com,Houston,MB,77042,


**Pre-processing**
- Merge datasets on email
- Find the following subsets:
    + Registered and Attended --> _merge col: 'both' → email in registrations and attendance
    + Registered and Did not Attend --> _merge col: 'left_only' → in registrations only (registered but did not attend)
    + Did not Register and Attended --> _merge col: 'right_only' → in attendance only (attended but did not register)
- Combine data sets for Registered and Attended + Not Registered and Attended
    + This will be used for the breakdown of attendees by location
- Resolve Multiple Instances of Join and Leave Times
    + The approach for this will be to group by email and keep the minimum join time and the max leave time
- Calculate Time in Session

In [9]:
# Merge data sets
merged = eventbrite_df.merge(
    zoom_df[['email']],           # keep only email from zoom to avoid column collisions
    on='email',
    how='outer',                  # keep union so you can see all combos
    indicator=True
)

merged.head()

Unnamed: 0,first_name,last_name,email,city,province_territory,postal_zip_code,please_specify,_merge
0,Mo,Akins,1life2lve@gmail.com,Hamilton,ON,L8T3A7,,left_only
1,Charlene,Mcfarlane,2canwinpr@gmail.com,Whitby,ON,L1N 6W6,,left_only
2,Althea,Mcknight,424x8wc7m6@privaterelay.appleid.com,Toronto,ON,M4B2E5,,left_only
3,Alisha,Edouard,Ajansandy@gmail.com,Toronto,ON,M6h 2t3,,left_only
4,Yvonne,azaglo,Amaley@gmail.com,Brampton,ON,L7A 4Z8,,left_only


**Create Subsets for Further Analysis**

In [14]:
# registered AND attended
registered_and_attended = merged[merged['_merge'] == 'both'].copy()

# registered AND did NOT attend (registered only)
registered_not_attended = merged[merged['_merge'] == 'left_only'].copy()

# attended AND did NOT register (attended only) -- useful extra subset
attended_not_registered = merged[merged['_merge'] == 'right_only'].copy()

# Create one DataFrame for all attendees, regardeless of registration
all_attendees = pd.concat([registered_and_attended, attended_not_registered], axis=0)
all_attendees = all_attendees   .reset_index(drop=True)


In [21]:
# Save counts for each case
registered_and_attended_count = len(registered_and_attended)
attended_not_registered_count = len(attended_not_registered)
all_attendees_count = len(all_attendees)
registered_not_attended_count = len(registered_not_attended)

# Display values
print(f'registered & attended: {registered_and_attended_count}')
print(f'not registered & attended: {attended_not_registered_count}')
print(f'total attendees: {all_attendees_count}')
print(f'registered & not attended: {registered_not_attended_count}')

registered & attended: 448
not registered & attended: 158
total attendees: 606
registered & not attended: 773
