In [22]:
"""
This script gets a list of all the IDs of available services on the TOSDR API.
It then goes through the list of ids and downloads the JSON object associated with the 
service, containing its details, full text of documents, points, etc. 
The script also downloads all the cases from TOSDR

Both services and cases are inserted into a remote MongoDB instance. This means we do not
need to pass around a huge .txt file or re-scrape the data we need from the API each time 
we run a test.
"""
import json
import os
import pprint
import time

import requests
from pymongo import MongoClient
from tqdm import tqdm

# Requires third-party packages: pip install requests pymongo tqdm

# Connection settings for the remote MongoDB instance.
# The connection string (which embeds the username and password) is read from the
# ATLAS_URI environment variable so that credentials are never stored in the notebook.
# NOTE(review): a password was previously hardcoded here; it should be rotated,
# since anything that has been saved/shared in a notebook must be considered leaked.
ATLAS_URI = os.environ.get("ATLAS_URI")
if not ATLAS_URI:
    # Fail early with a clear message instead of a confusing connection error later.
    raise RuntimeError(
        "Set the ATLAS_URI environment variable to the MongoDB connection string "
        "before running this notebook."
    )
DB_NAME = "tosleuth"

client = MongoClient(ATLAS_URI)
db = client[DB_NAME]

# Collection handles used by the download cells below
services = db["services"]
cases = db["cases"]

In [None]:
# Collect all IDs of services that have been comprehensively reviewed
# and write them, one per line, to ids.txt.
service_ids = []

services_endpoint = "https://api.tosdr.org/service/v2"
# Pagination is enabled on this API, so the first request is used only to
# find out which page range has to be walked.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of a confusing KeyError below.
response = requests.get(services_endpoint, timeout=30)
response.raise_for_status()
page_info = response.json()["parameters"]["_page"]
start = page_info["start"]
end = page_info["end"]

for i in tqdm(range(start, end + 1), desc="Retrieving all service ids:"):
    response = requests.get(services_endpoint, params={"page": i}, timeout=30)
    response.raise_for_status()
    current_services = response.json()["parameters"]["services"]
    # Keep only services that offer useful data, i.e. drop those where
    # is_comprehensively_reviewed is falsy or rating is null/missing.
    service_ids += [
        service["id"]
        for service in current_services
        if service.get("is_comprehensively_reviewed") and service.get("rating")
    ]
    # Sleep between pages so we don't get banned
    time.sleep(2)

# Persist the ids so the download cell can run without re-querying the API
with open("ids.txt", "w") as f:
    f.writelines(f"{service_id}\n" for service_id in service_ids)

In [21]:
# Go through each service ID stored in ids.txt, fetch the full service object
# (documents with full text and points) from TOSDR's old API and insert it
# into the "services" collection.
old_services_endpoint = "https://api.tosdr.org/service/v1/"

# Read all ids up front so the file is closed before the long, rate-limited
# download loop starts. Blank lines are skipped so they cannot crash int().
with open("ids.txt", "r") as f:
    service_ids = [int(line) for line in f if line.strip()]

# NOTE(review): insert_one adds a new document on every run, so re-running this
# cell will presumably create duplicates in the collection — confirm whether the
# collection should be cleared (or an upsert used) before a re-run.
for service_id in tqdm(service_ids, desc="Downloading service records:"):
    # A timeout avoids hanging forever; raise_for_status() stops on HTTP errors
    # instead of inserting or mis-parsing an error response.
    response = requests.get(
        old_services_endpoint, params={"service": service_id}, timeout=30
    )
    response.raise_for_status()
    service = response.json()["parameters"]
    # Insert it into MongoDB to be used later
    services.insert_one(service)
    # Sleep between requests to stay under the API's rate limits
    time.sleep(1.5)

SyntaxError: invalid syntax (1381832889.py, line 11)

In [23]:
# Collect all cases from TOSDR and insert them into the "cases" collection
cases_endpoint = "https://api.tosdr.org/case/v1/"

# The first request is used only to find out which page range has to be walked.
response = requests.get(cases_endpoint, timeout=30)
response.raise_for_status()
page_info = response.json()["parameters"]["_page"]
start = page_info["start"]
end = page_info["end"]

# Walk the page range reported by the API (same approach as the services cell).
# The loop variable is the one actually sent as the "page" parameter; previously
# a stale variable from another cell was sent, so every iteration requested the
# same page (or failed with NameError on a fresh kernel).
for page in tqdm(range(start, end + 1), desc="Downloading cases:"):
    response = requests.get(cases_endpoint, params={"page": page}, timeout=30)
    response.raise_for_status()
    curr_cases = response.json()["parameters"]["cases"]
    # insert_many rejects an empty list, so skip pages without any cases
    if curr_cases:
        cases.insert_many(curr_cases)


Downloading cases:: 100%|██████████| 4/4 [00:04<00:00,  1.05s/it]
