In [22]:
"""
This script gets a list of all the IDs of available services on the TOSDR API.
It then goes through the list of ids and downloads the JSON object associated with the 
service, containing its details, full text of documents, points, etc. 
The script also downloads all the cases from TOSDR

Both services and cases are inserted into a remote MongoDB instance. This means we do not
need to pass around a huge .txt file or re-scrape the data we need from the API each time 
we run a test.
"""
import requests
import json
import pprint
import time
from pymongo import MongoClient
from tqdm import tqdm

# Do pip install pymongo and pip install tqdm

ATLAS_URI="mongodb+srv://user:9ZDgfo2r3Rc6BCh6@tosleuth.mn1yhns.mongodb.net/?retryWrites=true&w=majority"
DB_NAME="tosleuth"

client = MongoClient(ATLAS_URI)
db = client[DB_NAME]

# Create collections for testing
services = db["services"]
cases = db["cases"]

In [None]:
# Collect all IDs of services that have been comprehensively reviewed
service_ids = []

services_endpoint = "https://api.tosdr.org/service/v2"
# Pagination is enabled on this API so we can only get 100 at a time
# Find how many pages we need first
response = requests.get(services_endpoint)
start = json.loads(response.text)["parameters"]["_page"]["start"]
end = json.loads(response.text)["parameters"]["_page"]["end"]
for i in tqdm(range(start, end+1), desc="Retrieving all service ids:"):
    response = requests.get(services_endpoint, params={"page": i})
    current_services = json.loads(response.text)["parameters"]["services"]
    # Filter services that would not offer useful data:
    # is_comprehensively_reviewed = False 
    # rating = null
    # pprint.pprint(json.loads(response.text))
    service_ids += [
        service["id"] for service in current_services
        if (
            service["is_comprehensively_reviewed"]
            and service["rating"]
        )
    ]
    # sleep so we don't get banned 
    time.sleep(2)
with open("ids.txt", "w+") as f:
    f.writelines([str(id) + "\n" for id in service_ids])

In [21]:
# Now we go through each service ID and get the service object, documents with full text and points to insert into the DB
# from TOSDR's old API
old_services_endpoint = "https://api.tosdr.org/service/v1/"
with open("ids.txt", "r") as f:
    service_ids = f.readlines()
    for index in tqdm(range(0, len(service_ids)), desc="Downloading service records:"):
        service_id = service_ids[index]
        # For each service_id, get the response from the API
        # print(service_id)
        response = requests.get(old_services_endpoint, params={"service": int(service_id)})
        service = json.loads(response.text)["parameters"]
        # pprint.pprint(response)
        # Insert it into MongoDB to be used later
        services.insert_one(service)
        time.sleep(1.5)

SyntaxError: invalid syntax (1381832889.py, line 11)

In [5]:
import requests
import json
from tqdm import tqdm
import pprint
# Collect all cases from TOSDR and insert them in MongoDB
cases_endpoint = "https://api.tosdr.org/case/v1/"
# Find how many pages we need first
response = requests.get(cases_endpoint)
start = json.loads(response.text)["parameters"]["_page"]["start"]
end = json.loads(response.text)["parameters"]["_page"]["end"]
for index in tqdm(range(0, end+1), desc="Downloading cases:"):
    response = requests.get(cases_endpoint, params={"page": index})
    curr_cases = json.loads(response.text)["parameters"]["cases"]
    # Insert into our database collection called "cases"
    cases.insert_many(curr_cases)


Downloading cases::  25%|██▌       | 1/4 [00:01<00:03,  1.15s/it]

[{'classification': {'hex': 0, 'human': 'good'},
  'created_at': {'pgsql': '2018-01-16T15:26:09.452Z',
                 'timezone': 'Europe/Berlin',
                 'unix': 1516116369},
  'description': 'You can ask the service to remove your content at any time, '
                 "and it's deleted within a reasonable time.",
  'id': '175',
  'links': {'crisp': {'api': 'https://api.tosdr.org/case/v1/?case=175'},
            'phoenix': {'case': 'https://edit.tosdr.org/case/175',
                        'edit': 'https://edit.tosdr.org/case/175/edit',
                        'new_comment': 'https://edit.tosdr.org/case/175/case_comments/new'}},
  'title': 'You can delete your content from this service',
  'updated_at': {'pgsql': '2021-02-27T14:23:56.238Z',
                 'timezone': 'Europe/Berlin',
                 'unix': 1614435836}},
 {'classification': {'hex': 0, 'human': 'bad'},
  'created_at': {'pgsql': '2018-01-16T15:26:08.388Z',
                 'timezone': 'Europe/Berlin',
  

Downloading cases::  50%|█████     | 2/4 [00:02<00:02,  1.10s/it]

[{'classification': {'hex': 0, 'human': 'good'},
  'created_at': {'pgsql': '2018-01-16T15:26:09.452Z',
                 'timezone': 'Europe/Berlin',
                 'unix': 1516116369},
  'description': 'You can ask the service to remove your content at any time, '
                 "and it's deleted within a reasonable time.",
  'id': '175',
  'links': {'crisp': {'api': 'https://api.tosdr.org/case/v1/?case=175'},
            'phoenix': {'case': 'https://edit.tosdr.org/case/175',
                        'edit': 'https://edit.tosdr.org/case/175/edit',
                        'new_comment': 'https://edit.tosdr.org/case/175/case_comments/new'}},
  'title': 'You can delete your content from this service',
  'updated_at': {'pgsql': '2021-02-27T14:23:56.238Z',
                 'timezone': 'Europe/Berlin',
                 'unix': 1614435836}},
 {'classification': {'hex': 0, 'human': 'bad'},
  'created_at': {'pgsql': '2018-01-16T15:26:08.388Z',
                 'timezone': 'Europe/Berlin',
  

Downloading cases::  75%|███████▌  | 3/4 [00:03<00:01,  1.15s/it]

[{'classification': {'hex': 0, 'human': 'good'},
  'created_at': {'pgsql': '2018-09-23T22:17:51.199Z',
                 'timezone': 'Europe/Berlin',
                 'unix': 1537741071},
  'description': 'Binding Arbitration is a dispute resolution method involving '
                 'a neutral third party which will render a decision instead '
                 'of going to court. Since the arbitrator can decide the '
                 'outgoing of a dispute without evidences or a legal basis, it '
                 'is preferable that users decide whether they want or not '
                 'their dispute to be resolved this way.',
  'id': '335',
  'links': {'crisp': {'api': 'https://api.tosdr.org/case/v1/?case=335'},
            'phoenix': {'case': 'https://edit.tosdr.org/case/335',
                        'edit': 'https://edit.tosdr.org/case/335/edit',
                        'new_comment': 'https://edit.tosdr.org/case/335/case_comments/new'}},
  'title': 'You aren’t forced into bindi

Downloading cases:: 100%|██████████| 4/4 [00:04<00:00,  1.06s/it]

[{'classification': {'hex': 0, 'human': 'neutral'},
  'created_at': {'pgsql': '2018-01-16T15:26:08.898Z',
                 'timezone': 'Europe/Berlin',
                 'unix': 1516116368},
  'description': '',
  'id': '150',
  'links': {'crisp': {'api': 'https://api.tosdr.org/case/v1/?case=150'},
            'phoenix': {'case': 'https://edit.tosdr.org/case/150',
                        'edit': 'https://edit.tosdr.org/case/150/edit',
                        'new_comment': 'https://edit.tosdr.org/case/150/case_comments/new'}},
  'title': 'Spidering, crawling, or accessing the site through any automated '
           'means is not allowed',
  'updated_at': {'pgsql': '2021-07-25T10:42:48.562Z',
                 'timezone': 'Europe/Berlin',
                 'unix': 1627209768}},
 {'classification': {'hex': 0, 'human': 'blocker'},
  'created_at': {'pgsql': '2021-03-31T14:12:47.586Z',
                 'timezone': 'Europe/Berlin',
                 'unix': 1617199967},
  'description': 'Users c


