# Exploratory Data Analysis

In [1]:
import pandas as pd
import pyarrow
import os
import matplotlib.pyplot as plt
folder_path = os.getenv("MY_WORKSPACE", 'empty') + '/acl_spring_24_bulls2'
os.chdir(folder_path)



## Import and Merge Data

In [2]:
# Read the parquet files
tickets = pd.read_parquet('data/processed/season_tickets.parquet')
games = pd.read_parquet('data/processed/home_games.parquet')
email_sends = pd.read_parquet('data/processed/season_email_send.parquet')
email_ctrs = pd.read_parquet('data/processed/season_email_ctr.parquet')
liva = pd.read_parquet('data/processed/liv_a.parquet')

In [3]:
tickets_games = pd.merge(tickets, games, on = "event_date", how = "left")

In [10]:
len(tickets_games)

270086

In [4]:
tickets_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270086 entries, 0 to 270085
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   season_name_x             270086 non-null  object        
 1   event_date                270086 non-null  datetime64[ns]
 2   event_weekday             270086 non-null  object        
 3   opponent_short            270086 non-null  object        
 4   parent_ticket_categories  270086 non-null  object        
 5   add_date                  270086 non-null  datetime64[ns]
 6   days_before_event         270086 non-null  int64         
 7   purchaser_email           268908 non-null  object        
 8   total_seats               270086 non-null  int64         
 9   seat_location             270086 non-null  object        
 10  seat_level                270086 non-null  object        
 11  arrival_time              237206 non-null  datetime64[ns]
 12  at

In [5]:
liva.columns

Index(['cust_city_nm', 'cust_state_nm', 'cust_postal_cd', 'cust_ctry_nm',
       'cust_email_addr', 'livea_match_cd', 'gndr_input_indv_cd',
       'gndr_1st_indv_cd', 'gndr_2nd_indv_cd', 'age_two_yr_incr_input_indv',
       ...
       'client_sp', 'client_sale_dt_min', 'client_sale_dt_max',
       'client_pe_tkt_cnt', 'client_pe_sp', 'client_tkt_price',
       'client_tkt_price_max', 'client_tkt_price_min', 'vehicle_type', 'year'],
      dtype='object', length=223)

In [6]:
tickets_games_liva = pd.merge(tickets_games, liva, left_on = "attendee_email", right_on = 'cust_email_addr', how = "left")

In [7]:
tickets_games_liva.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15671001 entries, 0 to 15671000
Columns: 251 entries, season_name_x to year
dtypes: bool(1), boolean(35), datetime64[ns](8), float64(122), int64(2), object(83)
memory usage: 26.1+ GB


In [9]:
len(tickets_games_liva)

15671001

In [11]:
liva['cust_email_addr'].value_counts()

cust_email_addr
markruane@ruaneconstruction.com    4
nirav.batavia@gmail.com            3
jaeder@jdirealty.com               3
scantwell06@yahoo.com              3
sslee225@yahoo.com                 3
                                  ..
jsumm83@gmail.com                  1
lutzjamesp@gmail.com               1
snunberg@yahoo.com                 1
eoyola2@yahoo.com                  1
flannerm22@gmail.com               1
Name: count, Length: 115498, dtype: int64

In [14]:
email_liva = liva[liva['livea_match_cd'] == 'E']

In [15]:
email_liva['cust_email_addr'].value_counts()

cust_email_addr
nirav.batavia@gmail.com          3
mike@nikolich.com                2
jeff.sindelar@gmail.com          2
calchaney@comcast.net            2
nathanieljfalk@gmail.com         2
                                ..
krissyteam4@gmail.com            1
bryant@wallscottsolutions.com    1
J.fallucca@palermospizza.com     1
derekgonzalez2628@yahoo.com      1
moilim21@gmail.com               1
Name: count, Length: 47961, dtype: int64

In [16]:
liva[liva["cust_email_addr"] == 'nirav.batavia@gmail.com']

Unnamed: 0,cust_city_nm,cust_state_nm,cust_postal_cd,cust_ctry_nm,cust_email_addr,livea_match_cd,gndr_input_indv_cd,gndr_1st_indv_cd,gndr_2nd_indv_cd,age_two_yr_incr_input_indv,...,client_sp,client_sale_dt_min,client_sale_dt_max,client_pe_tkt_cnt,client_pe_sp,client_tkt_price,client_tkt_price_max,client_tkt_price_min,vehicle_type,year
49033,,,,,nirav.batavia@gmail.com,E,M,M,F,40.0,...,627.2,2021-10-15,2021-10-15,4.0,627.2,156.8,156.8,156.8,L1,2024
65040,,,,,nirav.batavia@gmail.com,E,M,M,F,40.0,...,627.2,2021-10-15,2021-10-15,4.0,627.2,156.8,156.8,156.8,L1,2023
112484,,,,,nirav.batavia@gmail.com,E,M,M,F,40.0,...,627.2,2021-10-15,2021-10-15,4.0,627.2,156.8,156.8,156.8,L1,2023


In [18]:
df = liva[liva["cust_email_addr"] == 'nirav.batavia@gmail.com']

# Identify duplicate rows
duplicate_rows = df.duplicated()

# Print the duplicate rows
duplicate_rows


49033     False
65040     False
112484     True
dtype: bool

In [26]:
# Get a list of all column names
all_columns = liva.columns.tolist()

# Remove the names of the columns you want to exclude
columns_to_consider = [col for col in all_columns if col not in ['liva_match_cd', 'year']]

# Use df.duplicated() with the subset parameter
duplicates = liva.drop_duplicates(subset=columns_to_consider)

In [27]:
sum(duplicates)

7685

In [28]:
len(df)

3