In [22]:
from pymongo import MongoClient
from tqdm import tqdm
import pprint
import random 

ATLAS_URI="mongodb+srv://user:9ZDgfo2r3Rc6BCh6@tosleuth.mn1yhns.mongodb.net/?retryWrites=true&w=majority"
DB_NAME="tosleuth"

client = MongoClient(ATLAS_URI)
db = client[DB_NAME]

# Create collections for testing
services = db["services"]
cases = db["cases"]

In [32]:
services_data = services.find()
# Only get cases classified as good or bad

In [34]:
# Randomly sample from services 
n = 10
# Set seed for reproducibility 
random.seed(42069)
test_services = random.sample(list(services_data), n)

ValueError: Sample larger than population or is negative

In [30]:
for service in test_services:
    print(service["name"])

Shopify
USSeek
HP | Hewlett-Packard
MIT App Inventor
Movistar
Roblox
Turnitin
ptgms Industries
WikiHow
Tubi


In [19]:
# Testing part
import requests
import json 
from bs4 import BeautifulSoup

add_endpoint = "http://127.0.0.1:8000/add"
query_endpoint = "http://127.0.0.1:8000/query"
tosdr_point_url = "https://edit.tosdr.org/points/"

# store our test result data here
data = []

for service in test_services:
    doc_id_to_name = {}
    for doc in service["documents"]:
        doc_obj = {
            "service": service["name"],
            "url": doc["url"],
            "name": doc["name"],
            "text": doc["text"]
        }
        # pprint.pprint(doc_obj)
        response = requests.post(add_endpoint, json=doc_obj)
        doc_id_to_name[doc["id"]] = doc["name"]
    # print(f"doc_id_to_name: {doc_id_to_name}")
    # Get 50/50 mix of valid and non valid cases
    # only get approved points 
    valid_points = [
        point for point in service["points"] 
        if point["status"] == 'approved'
    ]
    filtered_valid_cases = []
    invalid_cases = []
    # Get the case id for each point and check its rating
    # We don't really care about neutral cases so filter those out
    for point in valid_points:
        valid_case_from_point = cases.find_one({"id": str(point["case_id"])})
        if valid_case_from_point["classification"]["human"] in ["good", "blocker", "bad"]:
            filtered_valid_cases.append(point)
    # Get cases that do not apply to this service (negative cases) 
    invalid_cases = list(cases.aggregate(
        [
            {
                "$match": {
                    "classification.human": {
                        "$in": ["good", "blocker", "bad"]
                    },
                    "id": {
                        "$nin": [i["case_id"] for i in filtered_valid_cases]
                    }
                }
            },
            {
                "$sample": {
                    "size": len(filtered_valid_cases)
                }
            }
        ]
    ))
    # print(invalid_cases)

    # valid_case_ids = [point["case_id"] for point in valid_points]
    
    # for id in valid_case_ids:
    #     case = cases.find_one({"id": id})
    #     if case["classification"]["human"] in ["good", "blocker"]:
    #         filtered_valid_cases.append()
    # invalid_cases = []
    # count = 0
    
    # while count < len(valid_points):
    #     for valid_case in random.sample(test_cases, len(test_cases)):
    #         if valid_case["id"] not in valid_case_ids:
    #             invalid_cases.append(valid_case)
    #             count += 1
                
    # Sample random valid cases:
    valid_cases_random_sample = random.sample(filtered_valid_cases, len(filtered_valid_cases) // 2)
    invalid_cases_random_sample = random.sample(invalid_cases, len(filtered_valid_cases) // 2)
    # print(len(valid_cases_random_sample))
    # print(len(invalid_cases_random_sample))
    # print(valid_cases_random_sample)
    # print('-'*100)
    # print(invalid_cases_random_sample)
    # Sample random invalid cases

    invalid_llm_outputs = 0
    false_negatives = 0
    false_positives = 0
    true_positives = 0
    true_negatives = 0
    # TODO: Batch processing of valid and invalid cases
    # for idx in range(0, len(randomized_points)):
    for idx in tqdm(range(0, len(valid_cases_random_sample)), desc=f"Validating points for {service['name']}..."):
        valid_case = valid_cases_random_sample[idx]
        invalid_case = invalid_cases_random_sample[idx]

        # Go to point page and scrape the quote
        # curr_point_url = tosdr_point_url + str(point["id"])
        # page = requests.get(curr_point_url)
        # soup = BeautifulSoup(page.content, "html.parser")
        # quote = soup.select_one(".col-sm-10>blockquote").get_text()
        
        # Valid case 
        # Handle missing document ids
        try:
            valid_doc_name = doc_id_to_name[valid_case["document_id"]]
        except KeyError:
            valid_doc_name = "PLACEHOLDER"

        valid_llm_query = {
            "tosdr_cases": [valid_case["title"]],
            "service": service["name"],
            "doc_name": valid_doc_name
        }
        valid_llm_response = requests.post(url=query_endpoint, json=valid_llm_query)
        if valid_llm_response.status_code == 200:
            body = valid_llm_response.json()["results"][0]
            if body["error"] == 2:
                # LLM was not able to generate a valid JSON object and the backend
                # couldn't parse it
                invalid_llm_outputs += 1
            elif body["error"] == 1:
                # LLM returned 0 on a valid case - false negative
                false_negatives += 1
            else:
                # Validate true / false positives here
                true_positives += 1
            
        # Invalid case
        invalid_doc_name = "PLACEHOLDER"
        
        invalid_llm_query = {
            "tosdr_cases": [invalid_case["title"]],
            "service": service["name"],
            "doc_name": invalid_doc_name
        }
        invalid_llm_response = requests.post(url=query_endpoint, json=invalid_llm_query)
        if invalid_llm_response.status_code == 200:
            body = invalid_llm_response.json()["results"][0]
            if body["error"] == 2:
                invalid_llm_outputs += 1 
            elif body["error"] == None:
                # LLM should return 0 on a negative case - false positive
                false_positives += 1 
            elif body["error"] == 1:
                # Correct: LLM should return 0 on negative case
                true_negatives += 1
                # print("="*50)
                # print(f"query: {point['title']} point_id: {point['id']}")
                # print("-"*50)
                # print(quote)
                # print("-"*50)
                # print(body["source_text"])
                # quote_tokenized = word_tokenize(quote)
                # source_tokenized = word_tokenize(body["source_text"])
                # print(quote_tokenized, source_tokenized)
                # diff = set(source_tokenized).intersection(quote_tokenized)
                # print("-"*50)
                # print(diff)
                # print(f"{len(diff)} {len(quote_tokenized)} {len(source_tokenized)}")
                # if (
                #     (len(diff) > len(quote_tokenized) // 2)
                # ):
                #     # The LLM's chosen text snippet and the real quote associated with the point
                #     # are sufficiently similar if they share more than 50% of vocab (rough heuristic)
                #     print("Text choice verified")
                #     pass
                # else:
                #     print("Invalid choice")
                #     invalid_choices += 1
                # print("="*50)
    data.append({
        "service": service["name"],
        "num_valid_cases_tested": len(valid_cases_random_sample),
        "num_invalid_cases_tested": len(invalid_cases_random_sample),
        "false_positives": false_positives,
        "true_positives": true_positives,
        "false_negatives": false_negatives,
        "true_negatives": true_negatives,
        "num_invalid_llm_outputs": invalid_llm_outputs
    })

Validating points for KidzSearch...: 100%|██████████| 4/4 [00:11<00:00,  2.98s/it]
Validating points for Amazon...: 100%|██████████| 8/8 [00:19<00:00,  2.49s/it]
Validating points for BlueMail...: 100%|██████████| 5/5 [00:13<00:00,  2.72s/it]
Validating points for Free...: 100%|██████████| 10/10 [00:30<00:00,  3.00s/it]
Validating points for Zoom Video Communications...: 100%|██████████| 17/17 [00:45<00:00,  2.67s/it]
Validating points for Google Analytics...: 100%|██████████| 7/7 [00:18<00:00,  2.71s/it]
Validating points for HuffPost...: 100%|██████████| 2/2 [00:05<00:00,  2.67s/it]
Validating points for Truth Social...: 100%|██████████| 4/4 [00:10<00:00,  2.65s/it]
Validating points for OneSignal...: 100%|██████████| 11/11 [00:27<00:00,  2.49s/it]
Validating points for ToS;DR...: 100%|██████████| 7/7 [00:01<00:00,  4.25it/s]


In [20]:
total_invalid_llm_outputs = sum(i["num_invalid_llm_outputs"] for i in data)
total_valid_cases_tested = sum(i["num_valid_cases_tested"] for i in data)
total_invalid_cases_tested = sum(i["num_invalid_cases_tested"] for i in data)
total_avg_invalid_llm_output_rate = total_invalid_llm_outputs / (total_invalid_cases_tested + total_valid_cases_tested)
print(f"Total number of invalid LLM outputs: {total_invalid_llm_outputs}/{total_valid_cases_tested + total_invalid_cases_tested}")
print(f"Average overall failure rate: {total_avg_invalid_llm_output_rate*100:.1f}%")
avg_invalid_llm_output_rate = sum(i['num_invalid_llm_outputs'] / (i['num_invalid_cases_tested']+i['num_valid_cases_tested']) for i in data) / len(data)
print(f"Average failure rate per service: {avg_invalid_llm_output_rate*100:.1f}%" )
avg_false_positive_rate = sum(
    [
        i["false_positives"] / i["num_invalid_cases_tested"]
        for i in data
    ]) / len(data)
avg_true_positive_rate = sum(
    [
        i["true_positives"] / i["num_valid_cases_tested"]
        for i in data
    ]) / len(data)
avg_true_negative_rate = sum(
    [
        i["true_negatives"] / + i["num_invalid_cases_tested"]
        for i in data
    ]) / len(data)
avg_false_negative_rate = sum(
    [
        i["false_negatives"] / i["num_valid_cases_tested"]
        for i in data
    ]) / len(data)
print(f"Average false positive rate: {avg_false_positive_rate*100:.1f}%")
print(f"Average true positive rate: {avg_true_positive_rate*100:.1f}%")
print(f"Average true negative rate: {avg_true_negative_rate*100:.1f}%")
print(f"Average false negative rate: {avg_false_negative_rate*100:.1f}%")

total_true_positives = sum(i["true_positives"] for i in data)
total_true_negatives = sum(i["true_negatives"] for i in data)
total_false_positives = sum(i["false_positives"] for i in data)
total_false_negatives = sum(i["false_negatives"] for i in data)

total_true_positive_rate = total_true_positives / (total_valid_cases_tested)
total_false_positive_rate = total_false_positives / (total_invalid_cases_tested)
total_true_negative_rate = total_true_negatives / (total_invalid_cases_tested)
total_false_negative_rate = total_false_negatives / (total_valid_cases_tested)
print(f"Total false positive rate: {total_false_positive_rate*100:.1f}%")
print(f"Total true positive rate: {total_true_positive_rate*100:.1f}%")
print(f"Total true negative rate: {total_true_negative_rate*100:.1f}%")
print(f"Total false negative rate: {total_false_negative_rate*100:.1f}%")
print("="*100)
print(f"Accuracy: {(total_true_positives+total_true_negatives)/(total_true_positives+total_true_negatives+total_false_negatives+total_false_positives)}")
print(f"Misclassification: {(total_false_positives+total_false_negatives)/(total_true_positives+total_true_negatives+total_false_negatives+total_false_positives)}")
print(f"Precision: {(total_true_positives/(total_true_positives+total_false_positives))}")
print(f"Sensitivity: {(total_true_positives)/(total_true_positives+total_false_negatives)}")
print(f"Specificity: {total_true_negatives/(total_true_negatives+total_false_positives)}")

Total number of invalid LLM outputs: 46/150
Average overall failure rate: 30.7%
Average failure rate per service: 32.4%
Average false positive rate: 60.2%
Average true positive rate: 51.0%
Average true negative rate: 1.0%
Average false negative rate: 0.0%
Total false positive rate: 61.3%
Total true positive rate: 54.7%
Total true negative rate: 1.3%
Total false negative rate: 0.0%
Accuracy: 0.4772727272727273
Misclassification: 0.5227272727272727
Precision: 0.47126436781609193
Sensitivity: 1.0
Specificity: 0.02127659574468085
