In [None]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.16.0-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.23.1-py3-none-any.whl (448 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m448.3/448.3 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?

In [None]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from google.colab import files
from bs4 import BeautifulSoup

# XPath of the per-result "Show Highlights" toggles on a search results page.
_HIGHLIGHT_XPATH = '//a[@title="Show Highlights"]'


def _build_driver():
    """Create the headless Firefox WebDriver used for the whole scrape."""
    options = webdriver.FirefoxOptions()
    options.add_argument('--verbose')
    options.add_argument('--headless')
    # NOTE(review): the next four arguments look like Chromium-style flags; Firefox may
    # ignore them (its user agent is normally set through a preference) - confirm.
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920,1200')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/97.0')
    return webdriver.Firefox(options=options)


def _expand_page(driver):
    """Click every "Show Highlights" link and every hidden-authors button on the current page."""
    try:
        highlights = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.XPATH, _HIGHLIGHT_XPATH))
        )
    except TimeoutException:
        # A page without highlight links must not abort the whole run.
        print("No 'Show Highlights' links found within 20s, continuing without expanding them")
        highlights = []

    for highlight in highlights:
        try:
            # Scroll to the element to make it clickable
            driver.execute_script("arguments[0].scrollIntoView(true);", highlight)
            # Wait on this specific element; waiting on the XPath would only ever
            # check the first match on the page.
            WebDriverWait(driver, 3).until(EC.element_to_be_clickable(highlight))
            highlight.click()
        except Exception:
            # Best effort: fall back to a JavaScript click when the native click fails.
            driver.execute_script("arguments[0].click()", highlight)

    # Uncover the authors hidden behind the truncated author lists
    for hidden_author in driver.find_elements(By.CLASS_NAME, "removed-items-count"):
        try:
            hidden_author.click()
        except Exception as e:
            print(f"Error clicking on element: {e}")
            driver.execute_script("arguments[0].click();", hidden_author)


def _scrape_author(driver, profile_url):
    """Load one author profile page and return its Author / Institution / URL record."""
    driver.get(profile_url)

    # Extract institution; any failure (timeout, stale element, ...) degrades to 'N/A'
    try:
        institution = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, '//a[contains(@href, "/institution")]'))
        )
        institution_text = institution.text.strip() if institution else 'N/A'
    except Exception:
        institution_text = 'N/A'

    # Extract author name; find_element raises when the heading is absent
    try:
        author = driver.find_element(By.XPATH, '//div[@class="item-meta-row"]/h2')
        author_text = author.text.strip() if author else 'N/A'
    except Exception:
        author_text = 'N/A'

    return {
        'Author': author_text,
        'Institution': institution_text,
        'URL': profile_url
    }


def _paragraph_text(container):
    """Join the stripped text of all <p> tags in container; 'N/A' if there are none."""
    if container is None:
        return 'N/A'
    paragraphs = container.find_all('p')
    if not paragraphs:
        return 'N/A'
    return ' '.join(p.text.strip() for p in paragraphs)


def _parse_article(article_div):
    """Turn one parsed search-result <div> into the article record stored in the CSV."""
    # Publication date lives in the tooltip attribute of the date block
    pub_date_div = article_div.find('div', class_='bookPubDate simple-tooltip__block--b')
    publication_date = pub_date_div.get('data-title', 'N/A') if pub_date_div is not None else 'N/A'

    # Title: guard both lookups so a result without a title heading cannot crash the run
    title_heading = article_div.find('h5', class_='issue-item__title')
    title_span = title_heading.find('span', class_='hlFld-Title') if title_heading is not None else None
    title = title_span.text.strip() if title_span is not None else 'N/A'

    # Publication details: text of the first link inside the detail block
    detail_div = article_div.find('div', class_='issue-item__detail')
    publication_link = detail_div.find('a') if detail_div is not None else None
    publication_text = publication_link.text.strip() if publication_link is not None else 'N/A'

    # Author names and profile links
    authors_list = article_div.find('ul', class_='rlist--inline loa truncate-list trunc-done')
    authors = [a.text.strip() for a in authors_list.find_all('a')] if authors_list else ['N/A']
    # Anchors without an href are skipped; a None entry would break ', '.join below.
    href_list = [
        link.get('href')
        for link in article_div.select('ul.rlist--inline.loa li a')
        if link.get('href')
    ]

    return {
        'Publication Date': publication_date,
        'Title': title,
        'Publication Type': publication_text,
        'Authors': ', '.join(authors),
        'Authors URL': ', '.join(href_list),
        'Abstract': _paragraph_text(article_div.find('div', class_='abstract-text')),
        'Full Text': _paragraph_text(article_div.find('div', class_='full-text'))
    }


def scrape_acm_data(search_keyword="LISP", start_year="1981", end_year="1995",
                    first_page=1, last_page=53):
    """Scrape ACM Digital Library search results and the profiles of their authors.

    Args:
        search_keyword: Text placed in the AllField query parameter (URL-encoded here).
        start_year: Value of the AfterYear query parameter.
        end_year: Value of the BeforeYear query parameter.
        first_page: First results page to scrape, using the 1-based numbering shown
            on the ACM website.
        last_page: Last results page to scrape (inclusive), same numbering.

    Returns:
        A tuple (all_articles, all_authors). all_articles is a list of dicts with the
        keys 'Publication Date', 'Title', 'Publication Type', 'Authors', 'Authors URL',
        'Abstract' and 'Full Text'. all_authors is a list of dicts with the keys
        'Author', 'Institution' and 'URL', one per distinct profile URL.
    """
    from urllib.parse import quote_plus

    base_url = "https://dl.acm.org/action/doSearch?AllField={}&pageSize=20&AfterYear={}&BeforeYear={}&startPage=".format(
        quote_plus(str(search_keyword)), start_year, end_year
    )

    all_articles = []
    all_authors = []
    # Profile URLs already visited; a set keeps the duplicate check O(1).
    seen_profile_urls = set()

    # Use a headless browser (Firefox)
    driver = _build_driver()

    try:
        # The startPage query parameter is 0-based, the website's page numbers are 1-based.
        for page in range(first_page - 1, last_page):
            url = base_url + str(page)
            print(url)
            driver.get(url)

            print("starting scraping page" + str(page+1))

            _expand_page(driver)

            # Find all author's profile URL
            profile_links = driver.find_elements(By.XPATH, '//a[starts-with(@href, "/profile")]')
            profile_urls = [link.get_attribute("href") for link in profile_links]

            # Snapshot the expanded results page now: visiting the author profiles
            # below navigates the driver away from it.
            soup = BeautifulSoup(driver.page_source, 'lxml')
            article_divs = soup.find_all('div', class_='issue-item issue-item--search clearfix')

            # Visit each author profile not seen before and collect the author data
            for profile_url in profile_urls:
                if profile_url in seen_profile_urls:
                    print(f"URL {profile_url} already exists, skipping...")
                    continue

                author_record = _scrape_author(driver, profile_url)

                # Print intermediate results
                print(f"   - Author: {author_record['Author']}")
                print(f"   - Author URL: {author_record['URL']}")
                print(f"   - Institution: {author_record['Institution']}")

                all_authors.append(author_record)
                seen_profile_urls.add(profile_url)

            for article_div in article_divs:
                article = _parse_article(article_div)

                # Print intermediate results
                print(f"   - Publication Date: {article['Publication Date']}")
                print(f"   - Title: {article['Title']}")
                print(f"   - Publication Type: {article['Publication Type']}")
                print(f"   - Authors: {article['Authors']}")
                print(f"   - Authors URL: {article['Authors URL']}")
                print(f"   - Abstract: {article['Abstract']}")
                print(f"   - Full text: {article['Full Text']}")

                all_articles.append(article)
    finally:
        # Always shut the browser down, even on an exception or a manual interrupt.
        driver.quit()

    return all_articles, all_authors




def save_to_csv(data, filename):
    """Write a list of dicts to filename as CSV, using the first row's keys as header.

    Args:
        data: List of dicts that all share the keys of the first element. May be
            empty, in which case an empty file is created so that later steps
            that expect the file to exist still find it.
        filename: Path of the CSV file to create or overwrite.
    """
    fields = list(data[0].keys()) if data else []
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's default
    # encoding; newline='' is what the csv module requires for file objects.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        if not fields:
            return
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        writer.writerows(data)

if __name__ == "__main__":
    # Each filename is defined once so the file that is saved is the file that is
    # downloaded (the download step previously asked for names that were never written).
    articles_csv = "all_articles.csv"
    authors_csv = "all_authors.csv"

    all_articles, all_authors = scrape_acm_data()
    save_to_csv(all_articles, articles_csv)
    save_to_csv(all_authors, authors_csv)

    # Colab helper: triggers a browser download of the files just written
    files.download(articles_csv)
    files.download(authors_csv)
    print("Scraping complete. Data saved")


https://dl.acm.org/action/doSearch?AllField=LISP&pageSize=20&AfterYear=1981&BeforeYear=1995&startPage=0


KeyboardInterrupt: ignored