**Import Packages**

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

**Create Functions**

In [24]:
# Function to normalize DataFrame columns
def normalize_columns(df):
    """
    Normalize column names by:
    - Converting to lowercase
    - Replacing spaces and certain special characters (- / #) with underscores
    - Removing all other special characters (keep letters, numbers, and underscores)
    - Collapsing multiple underscores into one
    - Stripping leading/trailing underscores
    """
    new_cols = (
        df.columns
        .str.lower()
        .str.replace(r'[\s\-\/#]+', '_', regex=True)      # replace space, -, /, # with _
        .str.replace(r'[^a-z0-9_]', '', regex=True)       # remove remaining special chars
        .str.replace(r'_+', '_', regex=True)              # collapse multiple underscores
        .str.strip('_')                                   # trim leading/trailing _
    )
    df.columns = new_cols
    return df


**Import & Inspect Raw Data**

In [3]:
zoom_df_raw = pd.read_csv(r'..\data\raw\attendee_20250410.csv',skiprows=34)
zoom_df_raw.head()

Unnamed: 0,Attended,User Name (Original Name),Email,Join Time,Leave Time,Time in Session (minutes),Is Guest,Country/Region Name
0,Yes,Vi,artcici9@gmail.com,04/10/2025 06:30:41 PM,04/10/2025 09:03:36 PM,153,Yes,Canada
1,Yes,Rosil Burnett-Gibbs,gibbsfamily32@gmail.com,04/10/2025 06:34:07 PM,04/10/2025 08:59:44 PM,146,Yes,Canada
2,Yes,Keesha,keesha.williams@yahoo.com,04/10/2025 06:34:15 PM,04/10/2025 06:39:18 PM,6,Yes,Canada
3,Yes,Keesha - Toronto,keesha.williams@yahoo.com,04/10/2025 06:39:13 PM,04/10/2025 08:03:25 PM,85,Yes,Canada
4,Yes,Simone,simoneibyrne@gmail.com,04/10/2025 07:02:31 PM,04/10/2025 08:26:08 PM,84,Yes,Canada


In [4]:
eventbrite_df_raw = pd.read_excel(r'..\data\raw\Eventbrite Report-2025-04-10.xlsx')
eventbrite_df_raw.head()

Unnamed: 0,Order #,Order Date,First Name,Last Name,Email,Quantity,Price Tier,Ticket Type,Attendee #,Group,...,"Would you like us to contact you about future events, opportunities to participate in research studies, or other volunteer opportunities to support the health of Black women?",Please specify,Please specify:,Please specify.1,Please specify:.1,Please specify:.2,"Please specity (Trans refers to a person who identifies with a gender other than the one assigned to them at birth, or to a person whose gender identity and gender expression differs from stereotypical masculine and feminine norms):",Please specify.2,Please specify.3,Please specify.4
0,11638705683,2025-02-04 16:29:10,Camille,Williams,camillekwilliams@yahoo.ca,1,,Best Health For Black Women 2025,19139001583,,...,Yes,,,,,Member of the project team,,,,
1,11681475383,2025-02-10 08:55:22,Info Requested,Info Requested,Info Requested,1,,Best Health For Black Women 2025,19203545333,,...,,,,,,,,,,
2,11720732823,2025-02-14 11:07:56,Elaine,Goulbourne,Elaine.goulbourne@wchospital.ca,1,,Best Health For Black Women 2025,19260726553,,...,Yes,,,,,,,RN,,
3,11737136963,2025-02-16 12:08:51,Ciara,M Shanks,shanksciara@gmail.com,1,,Best Health For Black Women 2025,19284881443,,...,Yes,Wisconsin,,,,Eventbrite while signing up for another event,,,,
4,11768272123,2025-02-19 10:27:27,Mercy,Osayi,mercywodicommunity@gmail.com,1,,Best Health For Black Women 2025,19328112363,,...,Yes,,,,,,,,,


In [6]:
zoom_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Attended                   599 non-null    object
 1   User Name (Original Name)  599 non-null    object
 2   Email                      599 non-null    object
 3   Join Time                  599 non-null    object
 4   Leave Time                 599 non-null    object
 5   Time in Session (minutes)  599 non-null    int64 
 6   Is Guest                   599 non-null    object
 7   Country/Region Name        599 non-null    object
dtypes: int64(1), object(7)
memory usage: 37.6+ KB


In [46]:
eventbrite_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1487 entries, 0 to 1486
Data columns (total 45 columns):
 #   Column                                                                                                                                                                                                                                    Non-Null Count  Dtype         
---  ------                                                                                                                                                                                                                                    --------------  -----         
 0   Order #                                                                                                                                                                                                                                   1487 non-null   int64         
 1   Order Date                                                                    

In [10]:
zoom_df_raw.columns

Index(['Attended', 'User Name (Original Name)', 'Email', 'Join Time',
       'Leave Time', 'Time in Session (minutes)', 'Is Guest',
       'Country/Region Name'],
      dtype='object')

In [11]:
eventbrite_df_raw.columns

Index(['Order #', 'Order Date', 'First Name', 'Last Name', 'Email', 'Quantity',
       'Price Tier', 'Ticket Type', 'Attendee #', 'Group', 'Order Type',
       'Currency', 'Total Paid', 'Fees Paid', 'Eventbrite Fees',
       'Eventbrite Payment Processing', 'Attendee Status', 'Home Address 1',
       'Home Address 2', 'Home City', 'Home State', 'Home Zip', 'Home Country',
       'Home Phone', 'City', 'Province/Territory', 'Postal/Zip Code',
       'What is your current gender identity?', 'Please select your age group',
       'Which of the following best describes your background?',
       'I am (select all that apply)',
       'I am interested in information about (select all that apply)',
       'Are there any specific questions you would like to see answered during this event? *Please note – it may not be possible for us to address all questions during the event.',
       'How did you hear about this event?',
       'I consent to be contacted by the event organizer for feedback on m

**Data Cleaning**
- Create copies of the DataFrames
- Keep columns needed for analysis
- Ensure columns are of correct type
- Remove duplicate values
- Save cleaned data sets

In [44]:
# Create copy of zoom data
zoom_df = zoom_df_raw[['User Name (Original Name)', 
                       'Email', 
                       'Join Time',
                       'Leave Time', 
                       'Time in Session (minutes)']].copy()

# Rename columns
zoom_df.rename(columns={'User Name (Original Name)': 'User Name',
                        'Time in Session (minutes)': 'Time in Session in minutes'}, inplace=True)

# Normalize columns
zoom_df = normalize_columns(zoom_df)

# Reset index
zoom_df.reset_index(drop=True, inplace=True)

# Convert join_time and leave_time to pd.DateTime()
zoom_df['join_time'] = pd.to_datetime(zoom_df['join_time'], format='%m/%d/%Y %I:%M:%S %p')
zoom_df['leave_time'] = pd.to_datetime(zoom_df['leave_time'],format='%m/%d/%Y %I:%M:%S %p')

# Remove duplicate values
zoom_df = zoom_df.drop_duplicates()

zoom_df.head()

Unnamed: 0,user_name,email,join_time,leave_time,time_in_session_in_minutes
0,Vi,artcici9@gmail.com,2025-04-10 18:30:41,2025-04-10 21:03:36,153
1,Rosil Burnett-Gibbs,gibbsfamily32@gmail.com,2025-04-10 18:34:07,2025-04-10 20:59:44,146
2,Keesha,keesha.williams@yahoo.com,2025-04-10 18:34:15,2025-04-10 18:39:18,6
3,Keesha - Toronto,keesha.williams@yahoo.com,2025-04-10 18:39:13,2025-04-10 20:03:25,85
4,Simone,simoneibyrne@gmail.com,2025-04-10 19:02:31,2025-04-10 20:26:08,84


In [40]:
# Create copy of eventbrite data
eventbrite_df = eventbrite_df_raw[['First Name', 
                                   'Last Name', 
                                   'Email', 
                                   'City', 
                                   'Province/Territory', 
                                   'Postal/Zip Code',
                                   'Please specify']].copy()
# Normailize columns
eventbrite_df = normalize_columns(eventbrite_df)

# Reset index
eventbrite_df.reset_index(drop=True, inplace=True)

# Remove rows where the first_name is 'Info Requested'
info_requested_mask = eventbrite_df['first_name'] == 'Info Requested'
eventbrite_df = eventbrite_df[~info_requested_mask]

# Remove duplicates across first_name, last_name and email columns
eventbrite_df = eventbrite_df.drop_duplicates(subset=['first_name', 'last_name', 'email'])

eventbrite_df.head()

Unnamed: 0,first_name,last_name,email,city,province_territory,postal_zip_code,please_specify
0,Camille,Williams,camillekwilliams@yahoo.ca,Toronto,ON,M3A 3J9,
2,Elaine,Goulbourne,Elaine.goulbourne@wchospital.ca,Toronto,ON,M5S1B2,
3,Ciara,M Shanks,shanksciara@gmail.com,MILWAUKEE,International (residing outside of Canada),53218,Wisconsin
4,Mercy,Osayi,mercywodicommunity@gmail.com,Kitchener,ON,N2A2P4,
5,Texas State Officer Constance,Jones,crjones2264@gmail.com,Houston,MB,77042,


In [41]:
eventbrite_df.shape

(1133, 7)

In [45]:
zoom_df.shape

(599, 5)

**Save Cleaned Data**

In [50]:
# Export clean zoom data
zoom_df.to_csv(r'..\data\clean\clean-zoom-data.csv')

# Export clean eventbrite data
eventbrite_df.to_csv(r'..\data\clean\clean-eventbrite-data.csv')

**Post Cleaning Observations/Notes**
- Raw data sets:
    + 599 for Zoom
    + 1487 for eventbrite
- Cleaned data sets:
    + 599 for Zoom (removing duplicates accross entire data set)
    + 1133 for eventbrite (removing duplicates accross Firt Name, Last Name and Email)
- Column names have been normalized
- Column types have been adjusted