### Web-Scraping the files 

Scraping the cases from: https://law.justia.com/cases/federal/appellate-courts/cit/
Note: We restricted it to cases ruled on by the U.S. Court of International Trade, between 2020 and 2023. 

In [1]:
import time
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Start a Chrome session controlled by Selenium; webdriver_manager supplies
# the path of the chromedriver binary that the Service wraps.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # headless: no visible browser window
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# Root of the case listings; the year is appended to this URL when scraping
base_url = "https://law.justia.com/cases/federal/appellate-courts/cit/"

# Folder (under the user's home directory) that receives the downloaded PDFs;
# it is created on first use and reused on later runs.
download_dir = os.path.expanduser("~/Downloads/pdf_cases")
os.makedirs(download_dir, exist_ok=True)

# Function to scrape all cases for a given year
def _process_case(case):
    """Print one listing entry's metadata and download its PDF if needed.

    ``case`` is a single case ``div`` from the parsed listing page. Fields
    whose element is missing are reported as 'Not available'. An entry with
    no case link, no "Download PDF" link, or a failing download is reported
    and skipped instead of raising, so one bad entry cannot abort the run.
    """
    name_elem = case.find('strong')
    date_elem = case.find('span', class_='color-emperor')
    citation_elem = case.find('span', class_='justia-citation')
    # NOTE(review): the recorded run printed 'Not available' for every docket
    # number, so this selector may not match the live markup -- verify.
    docket_number_elem = case.find('strong', string='Docket Number:')

    case_name = name_elem.text.strip() if name_elem is not None else 'Not available'

    if date_elem is not None:
        date_text = date_elem.text.strip()
        # The value follows a "<label>: " prefix and carries extra leading
        # whitespace, so strip it again; fall back to the whole text when the
        # separator is absent.
        case_date = date_text.partition(": ")[2].strip() or date_text
    else:
        case_date = 'Not available'

    citation = citation_elem.text.strip() if citation_elem is not None else 'Not available'

    docket_span = docket_number_elem.find_next('span') if docket_number_elem is not None else None
    docket_number = docket_span.text.strip() if docket_span is not None else 'Not available'

    print(f"Case Name: {case_name}\nDate: {case_date}\nCitation: {citation}\nDocket Number: {docket_number}")

    link_elem = case.find('a', class_='case-name')
    case_link = link_elem.get('href') if link_elem is not None else None
    if not case_link:
        print("No case link found, skipping...\n")
        return

    driver.get(f"https://law.justia.com{case_link}")
    time.sleep(2)

    # find_elements returns an empty list when nothing matches, whereas
    # find_element would raise and stop the whole scrape.
    pdf_anchors = driver.find_elements(By.PARTIAL_LINK_TEXT, 'Download PDF')
    pdf_link = pdf_anchors[0].get_attribute('href') if pdf_anchors else None
    if not pdf_link:
        print("No PDF link found, skipping...\n")
        return

    # File name is the last URL path segment with any query string removed
    file_name = pdf_link.split('/')[-1].split('?')[0]
    file_path = os.path.join(download_dir, file_name)

    if os.path.exists(file_path):
        print(f"PDF already exists: {file_path}, skipping...\n")
        return

    print(f"Downloading PDF: {pdf_link}\n")
    try:
        download_pdf(pdf_link, file_path)
    except requests.RequestException as err:
        print(f"Download failed for {pdf_link}: {err}\n")


def scrape_year(year):
    """Scrape every case listed for ``year`` and download each opinion PDF.

    Loads ``{base_url}{year}/``, processes each case block on the page, then
    follows the "next" pagination link until there is none.

    Args:
        year: Year segment appended to ``base_url`` (e.g. ``2023``).
    """
    driver.get(f"{base_url}{year}/")
    time.sleep(2)

    while True:
        # Parse the listing once, before navigating away to case pages. All
        # later navigation uses absolute URLs, so the browser never needs to
        # go back to this page.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        case_elements = soup.find_all('div', class_="has-padding-content-block-30 -zb")

        for case in case_elements:
            _process_case(case)

        next_button = soup.find('a', class_='next')
        if next_button is None:
            break
        driver.get(f"https://law.justia.com{next_button['href']}")
        time.sleep(2)

# Function to download the PDF
def download_pdf(pdf_url, file_path, timeout=60):
    """Download ``pdf_url`` and save it to ``file_path``.

    Args:
        pdf_url: URL of the PDF to fetch.
        file_path: Destination path of the saved file.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scrape indefinitely.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    pdf_response = requests.get(pdf_url, timeout=timeout)
    # Without this check an HTTP error page would be saved under a .pdf name.
    pdf_response.raise_for_status()

    # Write to a temporary name and rename only when complete, so an
    # interrupted write never leaves a truncated file at file_path.
    partial_path = f"{file_path}.part"
    with open(partial_path, 'wb') as f:
        f.write(pdf_response.content)
    os.replace(partial_path, file_path)
    print(f"Downloaded: {file_path}")

# Scrape the 2023 and 2024 listings (range() excludes its end value, so 2025
# is not visited). The browser is closed in `finally` so that an error or a
# manual interrupt mid-scrape does not leave a headless Chrome process running.
try:
    for year in range(2023, 2025):
        print(f"Scraping year: {year}")
        scrape_year(year)
finally:
    driver.quit()


Scraping year: 2023
Case Name: Navneet Education Ltd. v. United States
Date:  December 29, 2023
Citation: 23-191
Docket Number: Not available
PDF already exists: /Users/Test/Downloads/pdf_cases/22-00132-2023-12-29.pdf, skipping...

Case Name: GoPro, Inc. v. United States
Date:  December 28, 2023
Citation: 23-190
Docket Number: Not available
PDF already exists: /Users/Test/Downloads/pdf_cases/20-00176-2023-12-28.pdf, skipping...

Case Name: Brooklyn Bedding, LLC v. United States
Date:  December 22, 2023
Citation: 23-189
Docket Number: Not available
PDF already exists: /Users/Test/Downloads/pdf_cases/21-00285-2023-12-22.pdf, skipping...

Case Name: AG der Dillinger Hüttenwerke v. United States
Date:  December 21, 2023
Citation: 23-187
Docket Number: Not available
PDF already exists: /Users/Test/Downloads/pdf_cases/17-00158-2023-12-21.pdf, skipping...

Case Name: Jilin Bright Future Chem. Co., Ltd. v. United States
Date:  December 21, 2023
Citation: 23-188
Docket Number: Not available
PDF

ReadTimeoutError: HTTPConnectionPool(host='localhost', port=50134): Read timed out. (read timeout=120)

Note: Manually interrupted to limit the dataset size (we underestimated the number of cases per year).