Brief overview of the thought process behind the code:
1. Create two functions: one for scraping normal pages and one for the two special pages (Alaska and American Samoa)
2. Loop over all pages to scrape the required data
3. After collecting all the data, append it to one list, then create a DataFrame object

In [1]:
# Importing the necessary libraries: pandas, BeautifulSoup, and Selenium
import pandas as pd 
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [2]:
# Set up the Chrome WebDriver session that the scraping cells below rely on.
# Service() and ChromeOptions() are both created with their defaults (no extra
# arguments or flags are set here).
service = Service() 
options = webdriver.ChromeOptions()
# One browser session, kept in the module-level name `driver`; the scraping
# functions read `driver.page_source` and the main loop calls `driver.get(...)`.
driver = webdriver.Chrome(service=service, options=options)

The chromedriver version (123.0.6312.58) detected in PATH at c:\Users\quent\repos\ISOM3400\Assignments\chromedriver.exe might not be compatible with the detected chrome version (124.0.6367.93); currently, chromedriver 124.0.6367.91 is recommended for chrome 124.*, so it is advised to delete the driver in PATH and retry


In [3]:
# States (and territories) with a Democratic results page.
states_democratic = [
    'Alabama', 'American Samoa', 'Arkansas', 'California', 'Colorado',
    'Iowa', 'Maine', 'Massachusetts', 'Minnesota', 'North Carolina',
    'Oklahoma', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
]
# States with a Republican results page.
states_republican = [
    'Alabama', 'Alaska', 'Arkansas', 'California', 'Colorado',
    'Maine', 'Massachusetts', 'Minnesota', 'North Carolina',
    'Oklahoma', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
]

# Lower-cased names of the states that appear in BOTH lists, in the order of
# the Democratic list.
states_both = [
    dem_state.lower()
    for dem_state in states_democratic
    if dem_state in states_republican
]
# 'iowa' is only in the Democratic list, so it is put back by hand at index 4
# (its alphabetical position, right after 'colorado').
states_both[4:4] = ['iowa']

In [4]:
# Function for scraping normal pages
def scrape_election_page(state_nm):
    """Yield one result row per candidate for every county card on the page.

    Parses the HTML currently loaded in the module-level Selenium ``driver``
    (the caller must already have navigated to the results page).

    Args:
        state_nm: URL slug of the state, e.g. ``'texas'`` or
            ``'north-carolina'``.

    Yields:
        dict: keys ``'State'``, ``'County'``, ``'Candidate'``, ``'Party'``,
        ``'Incumbent'``, ``'Votes'``, ``'Percentage'``, ``'Winner'`` and
        ``'Delegates'``. Rows without a candidate name are skipped.
    """
    winner_class = 'isWinner-3g_AYM'

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    columns = soup.select('div.tile-h_ymFU.cnn-pcl-13b0kh1[data-testid="card"]')
    county_name_ls = soup.select("div.tile-h_ymFU.cnn-pcl-13b0kh1[data-testid='card'] > article > div.header-container-1LzJY9 > h2.header-2-AOgLYo")

    # Delegate totals, taken from the rows flagged as winners anywhere on the
    # page (rows without a delegates cell simply contribute 0).
    candidate_delegates = {}
    for row in soup.select(f'tr.cnn-pcl-1me6450.{winner_class}'):
        name_element = row.select_one('span[data-testid="candidate-name"]')
        candidate = name_element.get_text(strip=True) if name_element is not None else None
        if not candidate:
            continue

        delegate_element = row.select_one('td[data-testid="delegates"]')
        delegates = 0
        if delegate_element is not None:
            try:
                delegates = int(delegate_element.get_text(strip=True))
            except ValueError:
                # Non-numeric cell text: count it as zero, as the special-page
                # scraper does.
                delegates = 0
        candidate_delegates[candidate] = candidate_delegates.get(candidate, 0) + delegates

    # The state name is the same for every row, so resolve it once.
    # The page header only gives the last word of the name, hence the explicit
    # handling of the one two-word state scraped by this function.
    if state_nm == 'north-carolina':
        state = "North Carolina"
    else:
        header = soup.select_one('h2.header-2-AOgLYo.cnn-pcl-xk8c6r')
        header_words = header.get_text().split() if header is not None else []
        # Fall back to the URL slug if the header is missing or empty.
        state = header_words[-1] if header_words else state_nm.replace('-', ' ').title()

    for i, column in enumerate(columns):
        table = column.select_one('table.cnn-pcl-1me6450')
        if table is None:
            # Card without a results table: nothing to read.
            continue
        county_name = county_name_ls[i].text if i < len(county_name_ls) else None

        for row in table.select('tr.cnn-pcl-1me6450'):
            name_element = row.select_one('span[data-testid="candidate-name"]')
            candidate = name_element.get_text(strip=True) if name_element is not None else None
            if not candidate:
                # Header/footer rows carry no candidate name.
                continue

            party_element = row.select_one('span[data-testid="party-label"]')
            party_text = party_element.get_text(strip=True) if party_element is not None else None
            party = party_text.split(',')[0] if party_text is not None else None
            incumbent_status = 'Yes' if party_text and 'Incumbent' in party_text else 'No'

            vote_percent_element = row.select_one('td[data-testid="votepercent"]')
            percentage = vote_percent_element.get_text(strip=True) if vote_percent_element is not None else None

            vote_count_element = row.select_one('span[data-testid="votes"]')
            votes = 0
            if vote_count_element is not None:
                try:
                    votes = int(vote_count_element.get_text(strip=True).replace(',', ''))
                except ValueError:
                    votes = 0

            # Winner comes from the row's own winner marker, not from
            # incumbency (which made every non-incumbent a non-winner).
            # NOTE(review): assumes county-table rows carry the same isWinner
            # class that the delegate selector above relies on -- confirm
            # against the live markup.
            winner = 'Yes' if winner_class in (row.get('class') or []) else 'No'

            yield {
                'State': state,
                'County': county_name,
                'Candidate': candidate,
                'Party': party,
                'Incumbent': incumbent_status,
                'Votes': votes,
                'Percentage': percentage,
                'Winner': winner,
                'Delegates': candidate_delegates.get(candidate, 0),
            }


In [5]:
# Function for scraping the two special pages
def scrape_election_page_special(state_nm):
    """Yield result rows from the Alaska / American Samoa overview pages.

    Parses the HTML currently loaded in the module-level Selenium ``driver``.
    Only Republican rows are kept for ``'alaska'`` and only Democratic rows
    for ``'american-samoa'``; any other ``state_nm`` yields nothing.

    Args:
        state_nm: URL slug, ``'alaska'`` or ``'american-samoa'``.

    Yields:
        dict: keys ``'State'``, ``'County'`` (always ``None`` here),
        ``'Candidate'``, ``'Party'``, ``'Incumbent'``, ``'Votes'``,
        ``'Percentage'``, ``'Winner'`` and ``'Delegates'``.
    """
    winner_class = 'isWinner-3g_AYM'
    # Which party's contest is wanted from each special page.
    wanted_party = {'alaska': 'Republican', 'american-samoa': 'Democratic'}.get(state_nm)
    if wanted_party is None:
        return

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # The state name is the same for every row, so resolve it once.
    # The page header only gives the last word of the name, hence the explicit
    # handling of the two-word "American Samoa".
    if state_nm == 'american-samoa':
        state = "American Samoa"
    else:
        header = soup.select_one('h2.header-2-AOgLYo.cnn-pcl-xk8c6r')
        header_words = header.get_text().split() if header is not None else []
        # Fall back to the URL slug if the header is missing or empty.
        state = header_words[-1] if header_words else state_nm.replace('-', ' ').title()

    for column in soup.select('article.core-result'):
        table = column.select_one('table.cnn-pcl-1me6450')
        if table is None:
            continue

        for row in table.select('tr.cnn-pcl-1me6450'):
            party_element = row.select_one('span[data-testid="party-label"]')
            party_text = party_element.get_text(strip=True) if party_element is not None else None
            party = party_text.split(',')[0] if party_text is not None else None
            if party != wanted_party:
                # Row belongs to the other party's contest (or is not a
                # candidate row at all).
                continue

            name_element = row.select_one('span[data-testid="candidate-name"]')
            candidate = name_element.get_text(strip=True) if name_element is not None else None
            if not candidate:
                # Same rule as the normal-page scraper: no name, no row.
                continue

            # Delegates are read straight from this row; the cell text is not
            # always numeric, so a failed conversion counts as zero.
            delegate_element = row.select_one('td[data-testid="delegates"]')
            delegates = 0
            if delegate_element is not None:
                try:
                    delegates = int(delegate_element.get_text(strip=True))
                except ValueError:
                    delegates = 0

            incumbent_status = 'Yes' if 'Incumbent' in party_text else 'No'

            vote_percent_element = row.select_one('td[data-testid="votepercent"]')
            percentage = vote_percent_element.get_text(strip=True) if vote_percent_element is not None else None

            vote_count_element = row.select_one('span[data-testid="votes"]')
            votes = 0
            if vote_count_element is not None:
                try:
                    votes = int(vote_count_element.get_text(strip=True).replace(',', ''))
                except ValueError:
                    votes = 0

            # Winner comes from the row's own winner marker, not from
            # incumbency (which made every non-incumbent a non-winner).
            # NOTE(review): assumes winning rows on these pages carry the
            # isWinner class used on the normal pages -- confirm against the
            # live markup.
            winner = 'Yes' if winner_class in (row.get('class') or []) else 'No'

            yield {
                'State': state,
                'County': None,  # these pages are scraped without a county breakdown
                'Candidate': candidate,
                'Party': party,
                'Incumbent': incumbent_status,
                'Votes': votes,
                'Percentage': percentage,
                'Winner': winner,
                'Delegates': delegates,
            }

In [6]:
prefix_dem = "https://edition.cnn.com/election/2024/primaries-and-caucuses/results/"
suffix_dem = "/democratic-presidential-primary"
prefix_rep = "https://edition.cnn.com/election/2024/primaries-and-caucuses/results/"
suffix_rep = "/republican-presidential-primary"

combined_data = [] # This list will store all data returned by the functions
columns = ['State', 'County', 'Candidate', 'Party', 'Incumbent', 'Votes', 'Percentage', 'Winner', 'Delegates']
combined_df = pd.DataFrame(columns=columns)


def _scrape_paginated_results(link, state_name):
    """Open ``link`` and return the rows from every page of its county view.

    The page shows counties a few at a time; the right-hand ("next") button
    advances to the following page. Each page is scraped before the button is
    clicked, and the final page (where the button is no longer clickable) is
    scraped after the loop.
    """
    driver.get(link)
    # NOTE: the 1-second timeout doubles as the "no more pages" signal, so a
    # very slow page load is indistinguishable from a single-page state.
    wait = WebDriverWait(driver, 1)
    next_button_locator = (By.CSS_SELECTOR, "button.rightButton.cnn-pcl-13b0kh1")

    rows = []
    while True:
        try:
            button = wait.until(EC.element_to_be_clickable(next_button_locator))
        except TimeoutException:
            # No clickable "next" button: this is the last (or only) page.
            break
        rows.extend(scrape_election_page(state_name))
        button.click()

    # Scrape the last page (no right button).
    rows.extend(scrape_election_page(state_name))
    return rows


# Append data for the Democratic Party
for i in states_both:
    # Turn the state name into its URL slug
    state_name = i.replace(" ", "-")
    link = f'{prefix_dem}{state_name}{suffix_dem}'
    combined_data.extend(_scrape_paginated_results(link, state_name))


# Append data for the Republican Party.
# 'iowa' is skipped so its Republican data is not scraped. states_both itself
# is left untouched so that re-running this cell still covers Iowa's
# Democratic page.
for i in states_both:
    if i == 'iowa':
        continue
    # Turn the state name into its URL slug
    state_name = i.replace(" ", "-")
    link = f'{prefix_rep}{state_name}{suffix_rep}'
    combined_data.extend(_scrape_paginated_results(link, state_name))


# Append data for the two special webpages (Alaska and American Samoa)
states_special = ['Alaska', 'American Samoa']
for i in states_special:
    state_name = i.lower().replace(" ", "-")
    prefix = "https://edition.cnn.com/election/2024/primaries-and-caucuses/results/"
    link = f'{prefix}{state_name}'
    driver.get(link)
    combined_data.extend(scrape_election_page_special(state_name))


In [7]:
# Build the final table from the scraped rows and report its size.
combined_df = pd.DataFrame(combined_data)
print(combined_df.shape)

# Renumber the rows so the in-memory index starts at 1.
combined_df = combined_df.reset_index(drop=True)
combined_df.index += 1

# Represent missing values uniformly as pd.NA.
combined_df = combined_df.fillna(value=pd.NA)

# 'Votes' must be a plain integer column: missing counts become 0.
vote_counts = combined_df['Votes'].fillna(0)
combined_df['Votes'] = vote_counts.astype('int')

# Write the result without the index column.
combined_df.to_csv('data.csv', index=False)

(10960, 9)
