# Colosseum ticket availability webscraper

## Written by Nicholas Fasano
## Created: 04/09/2023
### Description: Repeatedly query Colosseum website for ticket availability. This script is optimized for finding tickets on a specific day, namely the day the tickets are released. Option to send email alerts when desired tickets are available for easy and fast purchase.
### Website: https://ecm.coopculture.it

In [None]:
# Load in python packages
import numpy as np
import pandas as pd

# for time and date manipulations
import datetime
import time

# load in Selenium functionalities
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# import sending email functionality
import smtplib, ssl

# Path to the Chrome webdriver executable -- install the build that matches your version of Chrome.
# See: https://chromedriver.chromium.org/downloads
# Written as a raw string so the Windows backslashes are never read as escape sequences.
PATH_CHROME = r"C:\Program Files (x86)\ManuallyInstalled\chromedriver_v111.exe"

# User defined functions used in webscraping https://ecm.coopculture.it

- NOTE: need to set receiver/sender information in send_email() function

In [None]:
def send_email(message):
    # Input: message: full text handed unchanged to sendmail
    #        (the caller in this file starts it with a 'Subject: ...' line)
    # Output: none -- logs in to the SMTP server over SSL and sends the message

    # account settings -- edit these before first use
    sender_email = "sender@gmail.com"             # Enter your address
    receiver_email = "receiver@gmail.com"         # Enter receiver address
    password = 'enter_your_password_here'         # enter your password

    # server endpoint; 465 is the port for SSL
    host, ssl_port = "smtp.gmail.com", 465

    with smtplib.SMTP_SSL(host, ssl_port, context=ssl.create_default_context()) as mail_server:
        mail_server.login(sender_email, password)
        mail_server.sendmail(sender_email, receiver_email, message)


def table_indices(date):
    # Input: date: string in 'mm/dd/yyyy' format
    # Output: (row, col) of the html calendar table cell that date belongs to
    #         row = week number within the month + 2, col = day of week (Sunday = 1 ... Saturday = 7)
    # This function assumes that the calendar column format is 'S M T W Th F S'
    # Adapted from: https://stackoverflow.com/questions/3806473/week-number-of-the-month

    parsed = datetime.datetime.strptime(date, '%m/%d/%Y')

    # .weekday() outputs Monday = 0 ... Sunday = 6; rotating by one gives
    # Sunday = 0 ... Saturday = 6, and the table columns are that value plus one
    col = (parsed.weekday() + 1) % 7 + 1

    # number of empty cells before the 1st of the month in the first week
    # (0 when the month starts on a Sunday, 6 when it starts on a Saturday)
    leading_blanks = (parsed.replace(day=1).weekday() + 1) % 7

    # integer ceiling of (day + leading blanks) / 7 is the week number in the month
    week_of_month = -(-(parsed.day + leading_blanks) // 7)

    # add 2 to account for month label and day of week label in the html table
    # the +2 is specific to the colosseum website calendar and may vary for other types of calendars
    row = week_of_month + 2

    return row, col

def calendar_day_color(day_color):
    # Input: day_color: style text of the html element that carries the color coding of the calendar day
    # Output: 'green', 'red', 'grey' or 'white'

    known_styles = (
        ('background: rgb(206, 234, 208);', 'green'),  # tickets available
        ('background: rgb(255, 224, 223);', 'red'),    # tickets soldout
        ('background: rgb(238, 238, 238);', 'grey'),   # date unavailable (day has passed or tickets not released yet)
    )
    for style_text, color_name in known_styles:
        if day_color == style_text:
            return color_name

    # anything else: tickets not being sold online for this day (white)
    return 'white'
    

# List urls to different types of tickets

In [None]:
# Ticket pages on https://ecm.coopculture.it, listed in the order url1 ... url7
_all_ticket_urls = [
    # url1: Full experience ticket (underground + arena)
    'https://ecm.coopculture.it/index.php?option=com_snapp&view=event&id=D7E12B2E-46C4-074B-5FC5-016ED579426D&catalogid=DDDA3AB3-47BC-0A49-7752-0174490F632A&lang=en',
    # url2: Full experience ticket (arena only)
    'https://ecm.coopculture.it/index.php?option=com_snapp&view=event&id=3C38AB77-8D5A-5394-05B2-0172EB8E7D46&catalogid=F3CB77BD-6A43-108B-6723-0174490EB610&lang=en',
    # url3: Full experience with english didactic tour
    'https://ecm.coopculture.it/index.php?option=com_snapp&view=event&id=490E25D6-2465-ED3A-6A13-016ED583FB68&catalogid=238D971A-D296-07C6-7A82-0174490F9C7B&lang=en',
    # url4: Regular entrance
    'https://ecm.coopculture.it/index.php?option=com_snapp&view=event&id=3793660E-5E3F-9172-2F89-016CB3FAD609&catalogid=B79E95CA-090E-FDA8-2364-017448FF0FA0&lang=en',
    # url5: Regular entrance plus english tour
    'https://ecm.coopculture.it/index.php?option=com_snapp&view=event&id=96DBBFE1-BC45-FDCB-5E1B-016CF138154A&catalogid=7049E852-9020-9834-008D-017FD631028C&lang=en',
    # url6: Colosseum by night (english tour)
    'https://ecm.coopculture.it/index.php?option=com_snapp&view=event&id=2E4CF2E0-F5B7-7757-4241-0173584374D6&catalogid=06B97E06-CA46-981E-360A-017A9FE4C3F6&lang=en',
    # url7: Regular entrance for groups and schools
    'https://ecm.coopculture.it/index.php?option=com_snapp&view=event&id=1C12B646-0675-43E4-6480-016CDCFD059A&catalogid=343A9780-C74C-B357-316E-01750CD2B9BA&lang=en',
]

url1, url2, url3, url4, url5, url6, url7 = _all_ticket_urls


# Set control parameters for monitoring ticket availability

In [None]:
# date to search for tickets, in 'mm/dd/yyyy' format
search_date = '06/05/2023'

# ticket types to track: (unique string identifier, url from above)
# the identifiers end up in the dataframe index and are used for filtering
_tracked_tickets = [
    ('FEUA', url1),
    ('FEA', url2),
    ('FEUAG', url3),
]
name_urls = [ticket_name for ticket_name, _ in _tracked_tickets]
urls = [ticket_url for _, ticket_url in _tracked_tickets]

# time window in which tickets are tracked (military time used throughout)
# enter your local time (NOT ROME time [CET])
#   start_time < 0  will allow tracking immediately
#   end_time   > 23 will track through midnight
start_time, end_time = -1.0, 18.5

# pause between successive rounds of webpage calls (in seconds)
time_between_calls = 0.2

# minimum delay between two emails for the same ticket type (in seconds)
# emails are only sent if tickets are available
time_between_email = 20

# number of iterations between saves of the dataframe to a pickle file
iter_between_save = 10

# pickle file saved as: search_date.replace('/','_') + "_single_day_" + file_extension + ".pkl"
file_extension = 'v3'

# Initialize pandas dataframe for recording webscraped data

In [None]:
# create ticket time list: ticket times are in intervals of 5 minutes from 9am-11pm (Rome time [CET])
# each slot is a zero-padded 'HH:MM' label, from '09:00' up to and including '22:55'
time_list = ['{:02d}:{:02d}'.format(*divmod(minute_of_day, 60))
             for minute_of_day in range(9 * 60, 23 * 60, 5)]
time_list.insert(0, 'Color')  # track color of calendar day

# one all-NaN sparse column per label; the array is built from a one-element
# sequence because constructing a SparseArray from a bare scalar is deprecated in pandas
data = {label: pd.arrays.SparseArray([np.nan], fill_value=np.nan) for label in time_list}

# the single initial row is indexed by the creation timestamp
df = pd.DataFrame(data=data, columns=time_list,
                  index=[datetime.datetime.now().strftime('%m/%d/%Y_%H:%M:%S')])

# Begin primary loop to webscrape ticket availability on the specific date given by search_date

In [None]:
# XPath locators used on the calendar / ticket pages
_NEXT_MONTH_XPATH = "//*[@id='calendar']/table/tbody/tr[1]/td[3]/div"
_TICKET_TIME_XPATH = "//div[@class='col-md-4 col-sm-4 col-xs-4']"
_NUMBER_AVAIL_XPATH = "//div[@class='col-md-8 col-sm-8 col-xs-8 nopadding']"
_PAGE_LINK_XPATH = "//a[@class = 'page-link']"
_NEXT_PAGE_XPATH = "//a[@class = 'page-link next']"
_NO_RESULT_XPATH = '//*[@id="msg"]'


def _local_hour():
    # Output: current local time as a fractional hour (e.g. 18.5 for 18:30),
    # so that fractional start_time / end_time values are honored
    now = datetime.datetime.now()
    return now.hour + now.minute / 60.0


def _months_ahead(date_text):
    # Input: date_text: string in 'mm/dd/yyyy' format
    # Output: number of calendar months between the current month and the month of date_text
    #         (takes the year into account; never negative)
    target = datetime.datetime.strptime(date_text, '%m/%d/%Y')
    today = datetime.datetime.now()
    return max((target.year - today.year) * 12 + (target.month - today.month), 0)


def _advance_calendars(browsers, n_months):
    # Click the calendar's month-advance control n_months times in every browser
    for _ in range(n_months):
        for browser in browsers:
            WebDriverWait(browser, 20).until(
                EC.element_to_be_clickable((By.XPATH, _NEXT_MONTH_XPATH))
            ).click()
        time.sleep(3)


def _read_ticket_page(browser):
    # Output: (ticket times, numbers available) read from the page currently shown in browser
    # the number available is the integer between '(' and ')' in the element text
    slot_times = [el.text for el in browser.find_elements(By.XPATH, _TICKET_TIME_XPATH)]
    avail_texts = [el.text for el in browser.find_elements(By.XPATH, _NUMBER_AVAIL_XPATH)]
    slot_counts = [int(text[text.find('(') + 1:text.find(')')]) for text in avail_texts]
    return slot_times, slot_counts


def _availability_message(ticket_name, date_text, page_url, slot_times, slot_counts):
    # Output: email text -- a 'Subject:' line, one line per ticket time, then the ticket url
    header = 'Subject: ' + ticket_name + ' TICKETS AVAILABLE on ' + date_text + '\n'
    body = ''.join(
        slot_times[j].replace(':', '.') + ' - ' + str(slot_counts[j]) + ' Tickets Available \n'
        for j in range(len(slot_times))
    )
    return header + body + page_url + '\n'


# hold off on making webpage calls until start_time -- tickets go live at 9am Rome time
current_time = _local_hour()
while current_time < start_time or current_time > end_time:
    time.sleep(120)
    current_time = _local_hour()

# position of the searched day in the calendar table -- identical for every ticket type, so computed once
row, col = table_indices(search_date)
search_date_str = "//*[@id='calendar']/table/tbody/tr[" + str(row) + "]/td[" + str(col) + "]"

# number of times the calendar has to be advanced to reach the month of search_date
advance_month = _months_ahead(search_date)

pickle_path = search_date.replace('/', '_') + "_single_day_" + file_extension + ".pkl"

# open a browser for each ticket that you want to search - store each driver in a list called drivers
drivers = []
try:
    for url in urls:
        # NOTE(review): newer Selenium 4 releases expect a Service object instead of a
        # positional driver path -- confirm against the installed Selenium version
        driver = webdriver.Chrome(PATH_CHROME)
        drivers.append(driver)  # registered first so the cleanup below sees it even if loading fails
        driver.get(url)
        driver.maximize_window()
        driver.execute_script("window.scrollTo(0, 500)")

    # advance the month, if needed
    _advance_calendars(drivers, advance_month)

    # set parameters for sending emails
    t0 = [time.time()] * len(urls)   # time tracker
    reset_time = [0] * len(urls)     # set to 1 when an email is sent; t0 is then restarted at the end of the iteration

    save_counter = 0  # iteration tracker used to save the dataframe every iter_between_save calls
    current_time = _local_hour()

    # keep tracking while the local time is inside [start_time, end_time]
    while start_time <= current_time <= end_time:

        for jurl, driver in enumerate(drivers):
            ticket_time = []
            number_avail = []
            search_time = datetime.datetime.now().strftime("%m/%d/%Y_%H:%M:%S") + '_' + name_urls[jurl] + '_'

            # find day's color and if color green then get availability too
            day_color = calendar_day_color(
                driver.find_element(By.XPATH, search_date_str).get_attribute('style'))

            if day_color == 'green':
                # click desired day
                WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((By.XPATH, search_date_str + "/div"))
                ).click()
                time.sleep(3)

                # an element with id 'msg' is shown when
                # 'ATTENTION:\n No result is available day' is presented
                if not driver.find_elements(By.XPATH, _NO_RESULT_XPATH):

                    # get ticket time and availability from the first page ...
                    ticket_time, number_avail = _read_ticket_page(driver)

                    # ... and from the following pages
                    num_pages = len(driver.find_elements(By.XPATH, _PAGE_LINK_XPATH))
                    for _ in range(num_pages):
                        driver.find_element(By.XPATH, _NEXT_PAGE_XPATH).click()
                        time.sleep(3)
                        page_times, page_counts = _read_ticket_page(driver)
                        ticket_time = ticket_time + page_times
                        number_avail = number_avail + page_counts

                    # periodically send emails with ticket availability
                    if time.time() - t0[jurl] > time_between_email:
                        reset_time[jurl] = 1
                        send_email(_availability_message(
                            name_urls[jurl], search_date, urls[jurl], ticket_time, number_avail))

            # every row records the day's color (sold out / no availability rows hold only this)
            ticket_time.append('Color')
            number_avail.append(day_color)

            df.loc[search_time] = {ticket_time[j]: number_avail[j] for j in range(len(ticket_time))}

        # reload every page and move the calendars back to the month of search_date
        for driver, url in zip(drivers, urls):
            driver.get(url)
            driver.execute_script("window.scrollTo(0, 500)")
        _advance_calendars(drivers, advance_month)

        # save data every 'iter_between_save' iterations
        if save_counter % iter_between_save == 0:
            df.to_pickle(pickle_path)
        save_counter = save_counter + 1

        # reset email time tracker for specific tickets where emails were sent
        for jurl in range(len(urls)):
            if reset_time[jurl] == 1:
                reset_time[jurl] = 0
                t0[jurl] = time.time()

        time.sleep(time_between_calls)

        # refresh the clock so the loop ends once end_time has passed
        current_time = _local_hour()

finally:
    # runs on normal exit, on errors and on manual interruption:
    # keep the rows collected since the last periodic save, then shut the browsers down
    try:
        df.to_pickle(pickle_path)
    finally:
        for driver in drivers:
            # quit() rather than close() so the whole driver session is ended
            driver.quit()