This is part one of an attempt to expand on a working paper, "[Extracting protest events from newspaper articles with ChatGPT](https://osf.io/dvht7)" that I wrote with Andy Andrews and Rashawn Ray. In that paper, we tested whether ChatGPT could replace my undergraduate RAs in extracting details about Black Lives Matter protests from media accounts. This time, I want to expand it to include more articles, movements, and variables.

In this part, I largely copy [old code on downloading](https://nealcaren.github.io/notes/posts/scraping/bulk-download.html) to help gather a couple of thousand articles from the [Crowd Counting Consortium](https://github.com/nonviolent-action-lab/crowd-counting-consortium)'s dataset. Their dataset includes event characteristics for over a hundred thousand protest events and the source web addresses. I aim to test if GPT models can replicate their hand-coding results, but this script just gets the data.

In [41]:
pip install undetected-chromedriver

^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [17]:
import os
import asyncio
import nest_asyncio
from random import shuffle
from collections import Counter
from urllib.parse import urlparse
import re
from concurrent.futures import ThreadPoolExecutor

from slugify import slugify

import pandas as pd


In [None]:
df = pd.read_csv(
    "https://github.com/nonviolent-action-lab/crowd-counting-consortium/raw/master/ccc_compiled_2021-present.csv",
    encoding="latin",
    low_memory=False,
)
print(len(df))

106443


In [None]:
# Limit to just 2023 or 2024
df = df[pd.to_datetime(df["date"]).dt.year.isin([2023,2024])]
print(len(df))

# Keep only with one source 
df.dropna(subset=['source_2'], inplace=True)
len(df)
df['Keep'] = True
print(df['Keep'].sum())

# Step 2: Eliminate social media URLs and ensure they contain 'http'
social_media_domains = ["twitter.com", "youtube.com", "facebook.com", "instagram.com", "tiktok.com", "bsky.com"]
for sm_domain in social_media_domains:
    df['Keep'] = df['Keep'] & (~df['source_1'].str.contains(sm_domain) & df['source_1'].str.contains("http"))

print(df['Keep'].sum())

# Step 3: Filter URLs to keep only those that appear once
# Count occurrences of each URL
url_counts = df.loc[df['Keep'], 'source_1'].value_counts()

# Here, we use 'map' to align counts with the original DataFrame, checking if each count equals 1
unique_url_mask = df['source_1'].map(url_counts) == 1

# Update the 'Keep' column: True only if previously True AND the URL is unique (appears once)
df['Keep'] = df['Keep'] & unique_url_mask

print(df['Keep'].sum())


39808
17258
8826
3574


In [None]:
# Save the subset

df = df[df['Keep']]
df.to_json('ccc_sample.json', orient='records')

In [None]:
# Load the subset and make a list of the URLS

df = pd.read_json('ccc_sample.json')
urls = df['source_1'].values
shuffle(urls)

In [63]:
import undetected_chromedriver as uc

import os


# Ensure the HTML directory exists
html_dir = "HTML"
os.makedirs(html_dir, exist_ok=True)

# List of URLs that couldn't be fetched
bad_urls = []

# User agent to be used for all requests

# User agent to be used for all requests

def new_driver():
    import undetected_chromedriver as uc

    options = uc.ChromeOptions()
    options.add_argument('--load-extension=bypass-paywalls-chrome-clean-3.6.1.0.crx')
    driver = uc.Chrome(headless=True,
                       options=options,
                       use_subprocess=False)
    return driver

def slugify(url):
    """Create a slug from a URL to use as a filename."""
    parse_result = urlparse(url)
    filename = re.sub(r'[^a-zA-Z0-9]+', '-', parse_result.netloc + parse_result.path)
    return filename.strip('-')

def fetch(url, driver = new_driver()):
    filename = slugify(url) + ".html"
    file_path = os.path.join(html_dir, filename)

    if os.path.isfile(file_path):
        print(f"File {file_path} already exists, skipping download.")
        return

    if url in bad_urls:
        print(f"Skipping bad URL: {url}")
        return

    try:
        # Setup Selenium with Chrome

        # Navigate to the URL
        driver.get(url)
        content = driver.page_source
        # Save the content to a file
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(content)
    
        print(f"Content from {url} has been saved to {file_path}")

    except Exception as e:
        print(f"An error occurred while fetching {url}: {e}")
        bad_urls.append(url)
    finally:
        driver.quit()

# Example usage:
url = 'https://pantagraph.com/news/local/video-activists-gather-for-a-rally-in-support-of-palestinians-on-monday-in-normal/video_21ebf46a-1b3b-54f7-a761-457a60930271.html'
url = "https://www.bozemandailychronicle.com/news/international/protestors-picket-testers-weekend-fundraiser/article_241b4486-bec9-11ee-8e46-db5dbc78639a.html"
fetch(url)


File HTML/www-bozemandailychronicle-com-news-international-protestors-picket-testers-weekend-fundraiser-article-241b4486-bec9-11ee-8e46-db5dbc78639a-html.html already exists, skipping download.


In [55]:
def bulk_download(urls):
    # Number of drivers/workers you want to use
    num_drivers = 2

    # Initialize the drivers
    drivers = {_:new_driver() for _ in range(num_drivers)}

    # Use ThreadPoolExecutor to manage parallel execution
    with ThreadPoolExecutor(max_workers=num_drivers) as executor:
        # Create a future for each URL. Assign each driver in a round-robin fashion.
        futures = [executor.submit(fetch, url, drivers[i % num_drivers]) for i, url in enumerate(urls)]

        # Optionally, you can wait for all futures to complete and handle their results
        for future in futures:
            future.result()  # This will re-raise any exceptions caught during the fetch execution

    # Cleanup: Quit drivers after the tasks are done
    for i in range(num_drivers):
        drivers[i].quit()


In [56]:
bulk_download(urls[:4])

File HTML/www-mobilize-us-mobilize-event-545609.html already exists, skipping download.File HTML/www-wral-com-story-local-jewish-community-plans-gathering-in-support-of-israel-on-monday-night-21088078.html already exists, skipping download.

Content from https://www.fox13news.com/news/the-people-of-israel-live-hundreds-rally-in-solidarity-with-israel-at-curtis-hixon-park has been saved to HTML/www-fox13news-com-news-the-people-of-israel-live-hundreds-rally-in-solidarity-with-israel-at-curtis-hixon-park.html
Content from https://www.channel3000.com/news/all-i-want-is-just-for-it-to-stop-calls-for-support-peace-at-pro/article_fdf4876c-6fbe-11ee-883f-abffc254bea0.html has been saved to HTML/www-channel3000-com-news-all-i-want-is-just-for-it-to-stop-calls-for-support-peace-at-pro-article-fdf4876c-6fbe-11ee-883f-abffc254bea0-html.html


In [20]:
user_agent = '''Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'''

In [18]:
def slugurl(url):
    filename = slugify(url) + ".html"
    file_path = os.path.join('HTML', filename)
    return file_path

slug = slugurl(url)

In [19]:
import subprocess

slug = slugurl(url)
command = f'''shot-scraper html {url} -o {slug} --bypass-csp --user-agent "{user_agent}" '''
process = subprocess.run(command, shell=True, text=True, capture_output=True)


In [58]:
import subprocess
import json

# Command line to be executed
command = f"""
shot-scraper javascript "{url}" --bypass-csp --user-agent "{user_agent}" "async () => {{
  const readability = await import('https://cdn.skypack.dev/@mozilla/readability');
  return (new readability.Readability(document)).parse();
}}"
"""

# Run the command line
process = subprocess.run(command, shell=True, text=True, capture_output=True)

# Check if the command was successful
if process.returncode == 0:
    # Assuming the output is JSON, we can parse it
    result = json.loads(process.stdout)
    print(result)
else:
    print("Error running command:", process.stderr)



In [31]:
from selenium import webdriver
import undetected_chromedriver as uc


options = uc.ChromeOptions()
options.add_argument('--load-extension=bypass-paywalls-chrome-clean')
uc_driver = uc.Chrome(headless=True,
                       options=options,
                       use_subprocess=False)


In [27]:



options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
options.add_argument('--load-extension=bypass-paywalls-chrome-clean')
options.add_argument(f"--user-agent={user_agent}")
selenium_bypass_driver = webdriver.Chrome(options=options)

options = webdriver.ChromeOptions()
options.add_argument("--headless=True")
options.add_argument(f"--user-agent={user_agent}")
selenium_driver = webdriver.Chrome(options=options)


In [28]:
url = 'https://www.fox13news.com/news/the-people-of-israel-live-hundreds-rally-in-solidarity-with-israel-at-curtis-hixon-park'
#url = 'https://www.nytimes.com/2024/03/25/nyregion/trump-bond-reduced.html'
selenium_driver.get(url)
selenium_driver.get_screenshot_as_file('nyt.png')

True

In [29]:
selenium_bypass_driver.get(url)
selenium_driver.get_screenshot_as_file('nyt_bp.png')

True

In [32]:
uc_driver.get(url)
uc_driver.get_screenshot_as_file('nyt_ul.png')

True

In [84]:
def slugurl(url):
    filename = slugify(url) + ".html"
    file_path = os.path.join('HTML', filename)
    return file_path
file_path = slugurl(url)

In [81]:
content = driver.page_source
with open(file_path, "w", encoding="utf-8") as file:
    file.write(content)