In [1]:
from pymongo import MongoClient
import requests
import json


We start by fetching and saving cases for each year from 1990 to 2024 from the Oyez API.
1. We set up a MongoDB client instance using credentials from environment variables and connects to the MongoDB database.
2. For each year, it sends a GET request to the Oyez API to fetch cases for that year.
3. If the request is successful, it inserts each case into the `cases` collection in the MongoDB database.
4. If the request fails, it prints an error message with the status code.

In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

MONGODB_USERNAME = os.getenv('MONGODB_USERNAME')
MONGODB_PASSWORD = os.getenv('MONGODB_PASSWORD')
MONGODB_HOST = os.getenv('MONGODB_HOST')
MONGODB_DATABASE = os.getenv('MONGODB_DATABASE')

mongo_uri = f"mongodb+srv://{MONGODB_USERNAME}:{MONGODB_PASSWORD}@{MONGODB_HOST}/{MONGODB_DATABASE}?authSource=admin&replicaSet=db-mongo-graph-explorer"
client = MongoClient(mongo_uri)
db = client[MONGODB_DATABASE]
cases_collection = db.cases

# Fetch and save cases for each year from 1990 to 2024
for year in range(1990, 2025):
    url = f"https://api.oyez.org/cases?per_page=0&filter=term:{year}"
    response = requests.get(url)
    if response.status_code == 200:
        cases = response.json()
        for case in cases:
            cases_collection.insert_one(case)
    else:
        print(f"Failed to fetch data for year {year}: {response.status_code}")


 The code connects to a MongoDB database and iterates over each document in the `cases` collection. For each document, it fetches the `href` field and makes a GET request to that URL. If the request is successful, it inserts the fetched data along with the original `href` and document ID into the `expanded_cases_collection`. If the request fails or an exception occurs, it prints an error message. Finally, it prints "Processing complete." when all documents have been processed.


In [7]:
import requests
from tqdm import tqdm

cases_collection = db.cases
expanded_cases_collection = db.expanded_cases

# Iterate over each document in the cases collection
for doc in tqdm(cases_collection.find(), desc="Processing cases"):
    href = doc.get('href')
    if href:
        try:
            response = requests.get(href)
            if response.status_code == 200:
                href_data = response.text
                expanded_cases_collection.insert_one({
                    'href': href,
                    'data': href_data,
                    'original_id': doc.get('_id')
                })
            else:
                print(f"Failed to fetch href {href}: {response.status_code}")
        except requests.RequestException as e:
            print(f"Error fetching href {href}: {e}")
    else:
        print(f"No href found in document with ID: {doc.get('_id')}")

print("Processing complete.")

Processing cases: 1461it [25:47,  1.01s/it]

Failed to fetch href https://api.oyez.org/cases/2005/04-1131: 503


Processing cases: 1468it [25:53,  1.07it/s]

Failed to fetch href https://api.oyez.org/cases/2005/04-473: 503


Processing cases: 1496it [26:22,  1.08s/it]

Failed to fetch href https://api.oyez.org/cases/2005/04-1544: 503


Processing cases: 1498it [26:24,  1.10s/it]

Failed to fetch href https://api.oyez.org/cases/2005/04-1329: 503


Processing cases: 1501it [26:27,  1.06s/it]

Failed to fetch href https://api.oyez.org/cases/2005/05-502: 503


Processing cases: 1504it [26:30,  1.07it/s]

Failed to fetch href https://api.oyez.org/cases/2005/04-1495: 503


Processing cases: 1506it [26:31,  1.08it/s]

Failed to fetch href https://api.oyez.org/cases/2005/05-416: 503


Processing cases: 2884it [49:50,  1.04s/it]

Processing complete.





The code connects to a MongoDB database and iterates over each document in the `expanded_cases_collection`.
For each document, it retrieves the `data` and `href` fields.
If `data` is present, it attempts to parse it as JSON and adds the `href` to the parsed data.
The parsed data is then inserted into the `processed_cases_collection`.
If parsing fails, it prints an error message with the `href`.
If `data` is not found, it prints a message indicating the missing data for the `href`.


In [8]:
expanded_cases_collection = db.expanded_cases
processed_cases_collection = db.processed_cases

for doc in expanded_cases_collection.find():
    data = doc.get('data')
    href = doc.get('href')
    if data:
        try:
            parsed_data = json.loads(data)
            parsed_data['href'] = href
            processed_cases_collection.insert_one(parsed_data)
        except json.JSONDecodeError as e:
            print(f"Error parsing data for href {href}: {e}")
    else:
        print(f"No data found in document with href {href}")

Error parsing data for href https://api.oyez.org/cases/2005/04-1034: Unterminated string starting at: line 1 column 32708 (char 32707)


The code connects to a MongoDB database and iterates over each document in the `expanded_cases_collection`.
For each document, it retrieves the `data` and `href` fields.
If `data` is present, it attempts to parse it as JSON and adds the `href` to the parsed data.
The parsed data is then inserted into the `processed_cases_collection`.
If parsing fails, it prints an error message with the `href`.
If `data` is not found, it prints a message indicating the missing data for the `href`.


In [9]:
import requests
from bs4 import BeautifulSoup

def fetch_and_convert_to_markdown(url):
    # Fetch the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the div with the class "primary-content"
        primary_content_div = soup.find('div', class_='primary-content')

        if primary_content_div:
            # Convert the HTML to markdown
            def html_to_markdown(soup):
                markdown = ""
                for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'li']):
                    if element.name.startswith('h'):
                        level = int(element.name[1])
                        markdown += f"{'#' * level} {element.get_text()}\n\n"
                    elif element.name == 'p':
                        markdown += f"{element.get_text()}\n\n"
                    elif element.name in ['ul', 'ol']:
                        for li in element.find_all('li'):
                            markdown += f"- {li.get_text()}\n"
                        markdown += "\n"
                return markdown

            return html_to_markdown(primary_content_div)
        else:
            return "No primary content found in the document."
    else:
        return f"Failed to fetch the URL: {response.status_code}"



This code initializes a connection to a MongoDB database using the provided URI and database name.
It then retrieves the collections for processed cases and opinions.
The code iterates over each document in the processed cases collection.
For each document, it checks if there are any written opinions.
If written opinions are found, it iterates over each opinion.
For each opinion, it retrieves the Justia opinion URL, opinion ID, and title.
If all three values are present, it fetches the markdown content from the Justia opinion URL.
The fetched markdown content, along with the opinion ID, case ID, and title, is then inserted into the opinions collection.
Finally, the MongoDB connection is closed.


In [12]:
from pymongo import MongoClient
from tqdm.notebook import tqdm

# Initialize MongoDB connection
client = MongoClient(mongo_uri)
db = client[MONGODB_DATABASE]
cases_collection = db.processed_cases
opinions_collection = db.opinions

# Iterate over each document in the cases collection
for doc in tqdm(cases_collection.find()):
    written_opinions = doc.get('written_opinion', [])
    if written_opinions is not None:
        for opinion in written_opinions:
            justia_opinion_url = opinion.get('justia_opinion_url')
            justia_opinion_id = opinion.get('id')
            title = opinion.get('title')

            if justia_opinion_url and justia_opinion_id and title:
                # Fetch the markdown content
                markdown_content = fetch_and_convert_to_markdown(justia_opinion_url)

                # Save the fetched document in the opinions collection
                opinions_collection.insert_one({
                    'id': justia_opinion_id,
                    'case_id': doc.get('ID'),
                    'title': title,
                    'content': markdown_content
                })

# Close the MongoDB connection
client.close()

0it [00:00, ?it/s]

CursorNotFound: cursor id 7757750565071782110 not found, full error: {'ok': 0.0, 'errmsg': 'cursor id 7757750565071782110 not found', 'code': 43, 'codeName': 'CursorNotFound', '$clusterTime': {'clusterTime': Timestamp(1724361114, 1), 'signature': {'hash': b'm\x89v\x82\x8a\x08Zj*#\xb9\xc0l\x95\xc8Y\x82\np;', 'keyId': 7401203771792949255}}, 'operationTime': Timestamp(1724361114, 1)}