# Grab tour dates from the George Birge website and display them on a map

## Start by importing everything we need

In [1]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install folium

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By

In [5]:
import pandas as pd
import os
import folium
from geopy.geocoders import Nominatim

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [6]:
from bs4 import BeautifulSoup
import re

In [7]:
from datetime import datetime

In [8]:
import random

In [9]:
import csv

## All of the variables we need to run the program

In [10]:
# Tour page that lists the performances we want to scrape
url = "https://www.georgebirge.com/tour"

# CSV file that holds the performance history: read back in before mapping,
# then rewritten with the merged list at the end of the notebook
file_name = 'performances_GB.csv'

## Do the web scraping

In [11]:
class Performance:
    """A single tour date.

    Only the raw card markup and its visible text are stored at construction
    time; the remaining attributes (date, venue, city, ...) are attached to the
    instance by later cells.
    """

    def __init__(self, raw_html, raw_text):
        self.raw_html = raw_html  # outerHTML of the tour-date card
        self.raw_text = raw_text  # visible text of the card, blank lines removed


# One Performance object per tour-date card found on the page
performances = []

# Initialize Safari WebDriver
driver = webdriver.Safari()
try:
    # Implicit wait: element lookups keep polling for up to 10 seconds before
    # giving up, which gives the dynamically rendered tour list time to appear.
    driver.implicitly_wait(10)

    # Open the URL in the browser
    driver.get(url)

    # Find all elements with class="sqs-tourdates__item" (one per performance)
    performance_cards = driver.find_elements(By.CLASS_NAME, 'sqs-tourdates__item')

    for performance_card in performance_cards:
        # Keep only the non-blank lines of the card's visible text
        card_text = '\n'.join(
            line for line in performance_card.text.strip().splitlines() if line.strip()
        )
        performances.append(
            Performance(performance_card.get_attribute('outerHTML'), card_text)
        )
finally:
    # Always close the browser, even if loading or reading the page failed,
    # so a failed run does not leave a Safari automation session open.
    driver.quit()

## Break down the html data into the performance object properties

In [12]:
# Break each card's raw HTML down into attributes on its Performance object.
# BeautifulSoup is used for the parsing. The bracketed numbers are the column
# positions these values take in the CSV file.

# Class string shared by the Tickets / RSVP / Presale buttons on a card
BUTTON_CLASS = 'sqs-editable-button sqs-button-element--primary sqs-tourdates__button'


def _find_button_href(soup, label):
    """Return the href of the card button whose text contains `label`.

    Returns '' when there is no such button or it has no href.
    """
    button = soup.find(
        'a',
        class_=BUTTON_CLASS,
        # The string handed to the filter can be None (e.g. a tag with several
        # children); without the guard `label in None` raises TypeError.
        text=lambda s: s is not None and label in s,
    )
    if button is None:
        return ''
    return button.get('href', '').strip()


for performance in performances:
    # get each raw_html into BeautifulSoup
    soup = BeautifulSoup(performance.raw_html, 'html.parser')
    # datetime of the performance [0]
    performance.datetime = soup.find('span', class_='sqs-tourdates__timeframe')['data-tour-datetime']
    # date of the performance, reformatted as YYYY-MM-DD [1]
    performance.date = datetime.strptime(performance.datetime, "%Y-%m-%dT%H:%M:%S").strftime("%Y-%m-%d")
    # venue of the performance: the text before any '@' [2]
    performance.venue = soup.find('div', class_='sqs-tourdates__venue-name').text.strip().split('@')[0].strip()
    # location of the venue, split on commas into city [3], state [4], country [5]
    performance.location = soup.find('a', class_='sqs-tourdates__venue-link').text.strip()
    location_parts = [part.strip() for part in performance.location.split(',')]
    # Pad with empty strings so a location with fewer than three comma-separated
    # parts no longer raises IndexError; extra parts are ignored as before.
    location_parts += [''] * (3 - len(location_parts))
    performance.city, performance.state, performance.country = location_parts[:3]
    # details (generally the other artists that will be there), '|'-separated [6]
    performance.details = '|'.join(
        item.text.strip()
        for item in soup.find_all('div', class_='sqs-tourdates__lineup-item')
    )
    # links to Tickets / RSVP / Presale; each is '' when the card has no such button
    performance.ticket_link = _find_button_href(soup, "Tickets")
    performance.rsvp_link = _find_button_href(soup, "RSVP")
    performance.presale_link = _find_button_href(soup, "Presale")
    # coordinates are filled in later by the geocoding cell
    performance.latitude = None # [7]
    performance.longitude = None # [8]

## For website performances, find the coordinates

In [13]:
# Look up coordinates for every performance that does not have them yet and
# pick the marker colour: blue for upcoming dates, light gray for past ones.
geolocator = Nominatim(user_agent="palmercjones@comcast.net")
today = datetime.today()

for performance in performances:
    if performance.latitude is None or performance.longitude is None:
        coordinates_query = f"{performance.city}, {performance.state}, {performance.country}"
        coordinates = geolocator.geocode(coordinates_query)
        if coordinates is None:
            # geocode() returns None when nothing matches; report it instead of
            # failing with AttributeError and leave the coordinates unset.
            print(f"Could not find coordinates for {coordinates_query!r}")
        else:
            # small random offset so pins don't fall directly on top of one another
            performance.latitude = coordinates.latitude + random.uniform(-0.01, 0.01)
            performance.longitude = coordinates.longitude + random.uniform(-0.01, 0.01)
    if datetime.strptime(performance.date, "%Y-%m-%d") > today:
        performance.color = "blue"
    else:
        performance.color = "lightgray"
    print(f"Coordinates for {performance.city}: Latitude = {performance.latitude}, Longitude = {performance.longitude}")

Coordinates for Alpharetta: Latitude = 34.07059646211284, Longitude = -84.27871897047739
Coordinates for Jacksonville: Latitude = 30.33135355557352, Longitude = -81.64976614233449
Coordinates for Nashville: Latitude = 36.15407923923397, Longitude = -86.78100230710974
Coordinates for Myrtle Beach: Latitude = 33.68708498067581, Longitude = -78.89964973117661
Coordinates for Nashville: Latitude = 36.15406938132024, Longitude = -86.78244658607014
Coordinates for Bend: Latitude = 44.06125319948446, Longitude = -121.30863183043773
Coordinates for Auburn: Latitude = 47.31609965835903, Longitude = -122.23333727713863
Coordinates for Nampa: Latitude = 43.57911714037149, Longitude = -116.56823284942274
Coordinates for Harrisburg: Latitude = 40.26115069298861, Longitude = -76.8806595876541
Coordinates for Camden: Latitude = 39.954278395757115, Longitude = -75.11503328332762
Coordinates for Bristow: Latitude = 38.717651438716, Longitude = -77.54433121436013
Coordinates for Virginia Beach: Latitude

## Import the existing csv of performances, if it exists, make into objects and then add to performances list

In [14]:
# Rows read from the existing CSV; stays empty when the file does not exist
csv_data = []

# Check if the file exists
if os.path.exists(file_name):
    # newline='' is what the csv module expects for files it reads; without it,
    # line breaks inside quoted fields are not interpreted correctly.
    with open(file_name, 'r', newline='') as csv_file:
        # Each row becomes a list of strings
        csv_data = list(csv.reader(csv_file))

    print("Data imported successfully:")
else:
    print(f"The file {file_name} does not exist.")

Data imported successfully:


## Compare the CSV rows with the performances we just found on the website

In [15]:
# Add every CSV row that is not already in `performances` (same date, city and
# venue) as a new Performance, so dates no longer on the website stay on the map.

def _coordinate_from_csv(value):
    """Convert a CSV coordinate field to float; a blank field becomes None."""
    return float(value) if value not in (None, '') else None


if len(csv_data) == 0:
    print("no data from csv")
else:
    # (date, city, venue) identifies a performance; start with what is already loaded
    known_keys = {(p.date, p.city, p.venue) for p in performances}
    today = datetime.today()
    for row in csv_data:
        if len(row) < 9:
            # blank or truncated line: the column indexes used below do not exist
            print(f"Skipping malformed row: {row}")
            continue
        key = (row[1], row[3], row[2])
        if key in known_keys:
            # already in the list, so this row is not added again
            print('Match found: ' + row[1] + ', ' + row[3] + ', ' + row[2])
            continue
        print('No match found.')
        performance = Performance('', '')
        performance.datetime = row[0]
        performance.date = row[1]
        performance.venue = row[2]
        performance.city = row[3]
        performance.state = row[4]
        performance.country = row[5]
        performance.details = row[6]
        # the CSV does not store the links, so they are left empty
        performance.ticket_link = ''
        performance.rsvp_link = ''
        performance.presale_link = ''
        # the csv reader yields strings; store numbers like the geocoded entries
        performance.latitude = _coordinate_from_csv(row[7])
        performance.longitude = _coordinate_from_csv(row[8])
        # Same colour rule as for the scraped performances: upcoming dates are
        # blue, past dates light gray (the comparison used to be inverted here).
        if datetime.strptime(performance.date, "%Y-%m-%d") > today:
            performance.color = "blue"
        else:
            performance.color = "lightgray"
        performances.append(performance) # append to performances
        known_keys.add(key)  # a later duplicate row now counts as a match
        print(performance.city + ', ' + performance.state)

Match found: 2024-05-31, Alpharetta, Ameris Bank Amphitheatre
Match found: 2024-06-01, Jacksonville, Daily's Place
Match found: 2024-06-06, Nashville, CMA Music Festival 2024
Match found: 2024-06-07, Myrtle Beach, Carolina Country Music Festival 2024
Match found: 2024-06-08, Nashville, Tin Roof Broadway
Match found: 2024-06-28, Bend, Hayden Homes Amphitheater
Match found: 2024-06-29, Auburn, White River Amphitheatre
Match found: 2024-06-30, Nampa, Ford Idaho Center Amphitheater
Match found: 2024-07-10, Harrisburg, XL Live
Match found: 2024-07-11, Camden, Freedom Mortgage Pavilion
Match found: 2024-07-12, Bristow, Jiffy Lube Live
Match found: 2024-07-13, Virginia Beach, Veterans United Home Loans Amphitheater at Virginia Beach
Match found: 2024-07-14, Dewey Beach, Bottle & Cork
Match found: 2024-07-18, Uncasville, Mohegan Sun Arena
Match found: 2024-07-19, Cuyahoga Falls, Blossom Music Center
Match found: 2024-07-20, Burgettstown, The Pavilion at Star Lake
Match found: 2024-07-25, Jacks

## Put the performances in order

## Do the mapping and save to html file

In [16]:
from html import escape  # stdlib; used to escape scraped text placed in HTML tooltips

# The map is written to the current working directory. (The previous
# dirname(abspath(<relative name>)) expression also resolved to the working
# directory, so the chdir that followed it had no effect.)
current_dir = os.getcwd()

# Create a map centered at the geographical center of the US
m = folium.Map(location=[39.8283, -98.5795], zoom_start=4)

# Add markers for each location
for performance in performances:
    if performance.latitude in (None, '') or performance.longitude in (None, ''):
        # a marker cannot be placed without coordinates; skip instead of failing
        print(f"Skipping {performance.venue}: no coordinates")
        continue
    # The tooltip is HTML (lines joined with <br>), so the scraped text is
    # escaped to keep characters such as '&' or '<' from being read as markup.
    tooltip_html = '<br>'.join([
        escape(performance.venue),
        escape(performance.city + ', ' + performance.state),
        escape(performance.date),
    ])
    folium.Marker(
        location=[performance.latitude, performance.longitude],
        tooltip=tooltip_html,
        icon=folium.Icon(color=performance.color)
    ).add_to(m)

# Specify the path to save the HTML file
html_file_path = os.path.join(current_dir, 'map_GB.html')

# Save the map to an HTML file in the current directory
m.save(html_file_path)

print(f"Map saved to: {html_file_path}")

m

Map saved to: /Users/palmerjones/Website/Projects/map_GB.html


## Save to CSV

In [17]:
# Write every performance (scraped and restored) back to the CSV file.
# Column order: datetime, date, venue, city, state, country, details,
# latitude, longitude. No header row is written.

# newline='' keeps the csv module from emitting extra blank lines
with open(file_name, 'w', newline='') as out_file:
    csv.writer(out_file).writerows(
        [
            p.datetime,
            p.date,
            p.venue,
            p.city,
            p.state,
            p.country,
            p.details,
            p.latitude,
            p.longitude,
        ]
        for p in performances
    )

print(f"Data saved to {file_name}")

Data saved to performances_GB.csv
