In [12]:
import time
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Chrome launch configuration shared by every WebDriver created in this file.
chrome_options = webdriver.ChromeOptions()

# Optional switches, currently disabled:
# chrome_options.add_argument("--incognito")
# chrome_options.add_argument("--headless")
for _chrome_flag in (
    "--no-sandbox",
    "--disable-gpu",
    "--disable-dev-shm-usage",  # Overcome limited resource problems
):
    chrome_options.add_argument(_chrome_flag)

# Path of the Chrome binary to launch.
chrome_options.binary_location = "/usr/bin/google-chrome"

# Optional user-agent override, currently disabled:
# chrome_options.add_argument(
#     "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
# )

# ChromeDriver executable wrapped in the Service object handed to webdriver.Chrome.
chromedriver_path = "/usr/local/bin/chromedriver"
service = Service(chromedriver_path)

def create_webdriver():
    """
    Start a new Chrome WebDriver session.

    The session uses the module-level ``chrome_options`` and ``service``
    objects.

    Returns:
        webdriver.Chrome: The newly created WebDriver instance.
    """
    driver = webdriver.Chrome(options=chrome_options, service=service)
    return driver

# def extract_main_heading(browser):
#     """
#     Extract the main heading from the page.

#     Args:
#         browser (webdriver.Chrome): The WebDriver instance.

#     Returns:
#         tuple: A tuple containing text before '/', between '/' and '-', and after '-'.
#     """
#     main_heading_element = browser.find_element(By.XPATH, '//*[@id="body"]/div/div[1]/div/div[3]/div[1]/h1')
#     main_heading_text = main_heading_element.text
#     heading_pattern = r'^(.*?) / (.*?) - (.*)$'
#     heading_match = re.search(heading_pattern, main_heading_text)
    
#     if heading_match:
#         text_before_slash = heading_match.group(1)
#         text_between_slash_and_dash = heading_match.group(2)
#         text_after_dash = heading_match.group(3)
#         return text_before_slash, text_between_slash_and_dash, text_after_dash
#     return None, None, None

def extract_main_heading(browser):
    """
    Split the page's main heading into its identifier and title parts.

    The heading is expected to look like "<a> / <b> - <title>".

    Args:
        browser (webdriver.Chrome): The WebDriver instance.

    Returns:
        tuple: ("<a> / <b>", "<title>") when the heading has the expected
        shape, otherwise (None, None).
    """
    heading_xpath = '//*[@id="body"]/div/div[1]/div/div[3]/div[1]/h1'
    heading_text = browser.find_element(By.XPATH, heading_xpath).text

    parts = re.search(r'^(.*?) / (.*?) - (.*)$', heading_text)
    if parts is None:
        return None, None

    # Re-join the two pieces around the slash exactly as they appear on the page.
    identifier = f"{parts.group(1)} / {parts.group(2)}"
    title = parts.group(3)
    return identifier, title
    
def extract_date_time(browser):
    """
    Read the schedule text from the page and split it into date and time.

    The text is expected to look like "<Month> <day>, <year>, <time range>".

    Args:
        browser (webdriver.Chrome): The WebDriver instance.

    Returns:
        tuple: (date, time range) when the text has the expected shape,
        otherwise (None, None).
    """
    schedule_xpath = '//*[@id="body"]/div/div[1]/div/table/tbody/tr/td[1]/span'
    schedule_text = browser.find_element(By.XPATH, schedule_xpath).text

    found = re.search(r"([a-zA-Z]+\s\d{1,2},\s\d{4}),\s(.+)", schedule_text)
    if found is None:
        return None, None

    # Group 1 is the calendar date, group 2 is everything after it.
    return found.group(1), found.group(2)

def extract_authors_address(browser):
    """
    Extract the author list, last author, and address from the page.

    The element text is searched for a '*' followed by the comma-separated
    author list, then "; " and the address.

    Args:
        browser (webdriver.Chrome): The WebDriver instance.

    Returns:
        tuple: (full author string, last author, address) when the text has
        the expected shape, otherwise (None, None, None). The first element
        is the whole author list as it appears on the page, not only the
        first author.
    """
    authors_address_element = browser.find_element(By.XPATH, '//*[@id="body"]/div/div[2]/div[2]/dl/dd[1]')
    authors_address_text = authors_address_element.text
    authors_address_pattern = r"\*(.*?);\s(.*)"
    authors_address_match = re.search(authors_address_pattern, authors_address_text)

    if authors_address_match is None:
        return None, None, None

    authors_string = authors_address_match.group(1)
    address = authors_address_match.group(2)
    # The last comma-separated entry is taken as the last author.
    last_author = authors_string.split(',')[-1].strip()
    return authors_string, last_author, address

def extract_abstract(browser):
    """
    Read the abstract text from the page.

    Args:
        browser (webdriver.Chrome): The WebDriver instance.

    Returns:
        str: The text of the abstract element.
    """
    abstract_xpath = '//*[@id="body"]/div/div[2]/div[2]/dl/dd[3]'
    return browser.find_element(By.XPATH, abstract_xpath).text

def extract_location(browser):
    """
    Read the location text from the page.

    Args:
        browser (webdriver.Chrome): The WebDriver instance.

    Returns:
        str: The text of the location element.
    """
    location_xpath = '//*[@id="body"]/div/div[1]/div/table/tbody/tr/td[2]/span'
    return browser.find_element(By.XPATH, location_xpath).text

def extract_poster_type(browser):
    """
    Read the poster type text from the page.

    Args:
        browser (webdriver.Chrome): The WebDriver instance.

    Returns:
        str: The text of the poster type element.
    """
    poster_type_xpath = '//*[@id="body"]/div/div[2]/div[1]/div/dl/dd[2]'
    return browser.find_element(By.XPATH, poster_type_xpath).text

def extract_category_type(browser):
    """
    Split the session heading link into session number and session title.

    The link text is expected to look like "Session <number> - <title>".

    Args:
        browser (webdriver.Chrome): The WebDriver instance.

    Returns:
        tuple: (session number, session title) when the text has the
        expected shape, otherwise (None, None).
    """
    session_xpath = '//*[@id="body"]/div/div[1]/div/div[3]/div[1]/h2/a'
    session_text = browser.find_element(By.XPATH, session_xpath).text

    found = re.search(r'^Session (\S+) - (.*)$', session_text)
    if found is None:
        return None, None

    # Group 1 is the session number, group 2 is the text after the dash.
    return found.group(1), found.group(2)



def reload_and_get_content(browser, url, url_id, max_retries=5):
    """
    Load the page and extract information, retrying on failure.

    An attempt fails when the page load or the wait for the
    "Add to Itinerary" marker times out, or when one of the extractors
    cannot find (or loses) the element it reads. Failed attempts are
    logged with ``print`` and retried up to ``max_retries`` times.

    Args:
        browser (webdriver.Chrome): The WebDriver instance.
        url (str): The URL of the page to scrape.
        url_id (int): The ID or index of the URL. Currently unused; kept so
            existing callers keep working.
        max_retries (int): The maximum number of attempts.

    Returns:
        dict: A dictionary containing extracted data, or None if all
        attempts fail.
    """
    # Imported here so this function does not depend on changes to the
    # file-level import block.
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
    )

    # The page is treated as loaded once any element containing this text exists.
    page_ready = EC.presence_of_element_located(
        (By.XPATH, '//*[contains(text(), "Add to Itinerary")]')
    )
    retryable_errors = (
        TimeoutException,
        NoSuchElementException,
        StaleElementReferenceException,
    )

    for attempt in range(max_retries):
        try:
            # Inside the try so a page-load timeout is retried as well.
            browser.get(url)
            WebDriverWait(browser, timeout=12, poll_frequency=3).until(page_ready)

            # Extract information from the page
            text_before_dash, text_after_dash = extract_main_heading(browser)
            date, time_range = extract_date_time(browser)
            full_author_list, last_author, address = extract_authors_address(browser)
            location = extract_location(browser)
            session_number, category_type = extract_category_type(browser)

            # The abstract and poster type are not part of the output, so
            # they are not read from the page. To add them, call
            # extract_abstract / extract_poster_type here and add the keys.
            return {
                "Poster # / Board #": text_before_dash,
                "Poster title": text_after_dash,
                "Full Author List": full_author_list,
                "Date": date,
                "Time": time_range,
                "Location": location,
                "Session #": session_number,
                "Session Type": category_type,
                # Key spelling kept as-is: it is the output column name.
                "Affliliation": address,
                "Senior Author/PI Account": last_author,
                "url": url,
            }

        except retryable_errors as exc:
            # Log the retry attempt and wait before retrying
            print(f"{type(exc).__name__}: Retry attempt {attempt + 1}")
            if attempt < max_retries - 1:
                # Refresh and pause only when another attempt follows.
                browser.refresh()
                time.sleep(3)

    # Log failure if all retries fail
    print(f"Failed to retrieve content from {url} after {max_retries} attempts.")
    return None

# Read the URLs from CSV (expects a 'url' column)
urls_df = pd.read_csv(r'urls.csv')
all_results = []

# Process each URL with its own browser session
for index, row in urls_df.iterrows():
    url = row['url']
    browser = create_webdriver()
    try:
        result = reload_and_get_content(browser, url, index)
    finally:
        # Always close the browser, even if scraping raised, so no Chrome
        # process is left behind.
        browser.quit()

    # URLs that failed every retry return None and are skipped.
    if result:
        all_results.append(result)

# Save results to CSV
results_df = pd.DataFrame(all_results)
results_df.to_csv('output_data.csv', index=False)

# Display the first few rows of the results
print(results_df.head())

  Poster # / Board #                                       Poster title  \
0    PSTR034.01 / J6  Expression of senescence-associated β-galactos...   

                         Full Author List             Date               Time  \
0  T. KOMORI1, E. KURIYAMA2, Y. MORIKAWA1  October 5, 2024  1:00 PM - 5:00 PM   

     Location Session #                Session Type  \
0  MCP Hall A   PSTR034  Cellular Actions of Stress   

                                        Affliliation Senior Author/PI Account  
0  1Dept. of Anat. & Neurobio., 2Dept. of Neurolo...             Y. MORIKAWA1  
