# Argument Mining API query testing

Load the target data file and explore how the API query results are structured.

In [1]:
# Reload local package definitions before each cell runs, so edits to the
# `arg_mine` sources are picked up without restarting the kernel.
# (`%autoreload 2` is the autoreload mode that re-imports all modules.)
%load_ext autoreload
%autoreload 2

In [2]:
import os
import time

from dotenv import find_dotenv, load_dotenv
import requests
import grequests
import pandas as pd

from arg_mine import DATA_DIR
from arg_mine.data.loaders import get_gdelt_df
from arg_mine.api import classify, auth, session, errors
from arg_mine import utils

  curious_george.patch_all(thread=False, select=False)


In [3]:
# load the env variables to get the API key
# The returned pair is unpacked into `user_id` / `api_key`, which later cells
# send as the "userID" / "apiKey" fields of every request payload.
user_id, api_key = auth.load_auth_tokens()

In [None]:
# Path of the raw snippet CSV inside the project data directory:
#   <DATA_DIR>/raw/2020-climate-change-narrative/WebNewsEnglishSnippets.2020.csv
csv_datapath = os.path.join(DATA_DIR, "raw", "2020-climate-change-narrative")
csv_filepath = os.path.join(csv_datapath, "WebNewsEnglishSnippets.2020.csv")

# Load the CSV through the package loader; later cells read the article links
# from the `content_url` column of the result.
url_df = get_gdelt_df(csv_filepath)
# Preview the first rows only (never dump the whole frame).
url_df.head()

## Build the `classify` request

In [None]:
# Use the first article link in the frame as the target of the hand-built request.
url = url_df.content_url.iloc[0]

In [None]:
# Endpoint and settings for a hand-built `classify` request.
CLASSIFY_BASE_URL = "https://api.argumentsearch.com/en/classify"
timeout = 5  # handed to requests.post as its timeout

topic = "climate change"

# JSON body of the request: the credentials, the article to analyse and the
# classification options, in the same key order as before.
payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    topicRelevance="word2vec",
    predictStance=True,
    computeAttention=True,
    showOnlyArguments=False,
)

response = requests.post(CLASSIFY_BASE_URL, json=payload, timeout=timeout)
# Raise immediately on an HTTP error status instead of parsing an error body.
response.raise_for_status()

In [None]:
# Decode the JSON body of the raw HTTP response and list its top-level keys
# to see how the query result is structured.
out = response.json()
out.keys()

In [None]:
# Same query through the package helper instead of a hand-built request.
# NOTE(review): this rebinds `response` (previously the requests.Response from
# the manual call) to the helper's return value, which the following cells
# index as response['sentences']; re-running the `response.json()` cell after
# this one will therefore fail.
response = classify.classify_url_sentences(topic, url_df.content_url.values[0], user_id, api_key)

In [None]:
# Parse the first entry of response['sentences'] into a ClassifiedSentence.
# NOTE(review): `url` is whatever the kernel last bound; it matches the URL
# queried by the helper only when the cells are run top to bottom -- confirm.
classify.ClassifiedSentence.from_dict(url, topic, response['sentences'][0])

In [None]:
# Show the raw first entry of response['sentences'], i.e. the input that was
# handed to ClassifiedSentence.from_dict.
response['sentences'][0]

In [None]:
# The fourth link in the frame is a good example of a link that is no longer valid.
url = url_df.content_url.values[3]
try:
    response = classify.classify_url_sentences(topic, url, user_id, api_key)
except errors.Refused as refused_error:
    # Expected for this URL: show the error message rather than a traceback.
    print(refused_error)

## Run a batch of urls

In [None]:
# Size of the URL column, i.e. how many links are available for the batch run.
url_df["content_url"].shape

In [None]:
# Time a run of the package's batch helper over the first 20 URLs.
start_time = time.time()

# The helper's three results are unpacked into documents, sentences and
# refused documents; the refused list is inspected in the next cell.
doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(
    topic,
    url_df.content_url.values[:20],
)

print("iteration took {:.3} s".format(time.time() - start_time))

In [None]:
# How many URLs of the batch ended up in the refused list ...
print(len(refused_doc_list))
# ... and the refused entries themselves (rich display as the cell output).
refused_doc_list

## Try concurrent requests with grequests

In [None]:

# Tunables for the concurrent run in the next cell:
# `pool_size` is passed to session.get_session() and to grequests.map(size=...)
pool_size=5
# `chunk_size` is the number of URLs sliced off the URL list per batch
chunk_size=100

In [None]:
# Send the first 20 URLs to the classify endpoint concurrently, in chunks.
url_list = url_df.content_url.values[:20]

start_time = time.time()

# One session shared by every request of the run.
s = session.get_session(pool_size=pool_size)
full_list = []

# Walk the URL list `chunk_size` entries at a time. The loop header had been
# commented out while its body still used `i`, which raised a NameError on a
# fresh kernel and left `full_list` empty.
for i in range(0, len(url_list), chunk_size):
    print(">>>> iteration: {}".format(i))
    chunk_urls = url_list[i:i + chunk_size]
    # Lazily build the unsent requests for this chunk.
    unsent_requests = (
        grequests.post(
            session.ApiUrl.CLASSIFY_BASE_URL,
            json=classify.bundle_payload(topic, u),
            session=s,
            allow_redirects=False,
        )
        for u in chunk_urls
    )
    # grequests.map sends the chunk with at most `pool_size` requests in flight.
    # Without an exception handler, failed requests come back as None entries
    # (an `exception_handler=classify.exception_handler` argument had been
    # tried here and was left disabled).
    full_list.extend(grequests.map(unsent_requests, size=pool_size))

# Responses of all chunks in request order; the next cell displays `output`.
output = full_list

print("iteration took {:.3} s".format(time.time() - start_time))

In [None]:
# Display the result of grequests.map from the previous cell.
output

## collect data for unit tests
Using tools in the package, run queries that give known responses, and save those responses as JSON fixtures for unit tests.

In [None]:
import pkg_resources
import json
from tests.fixtures import save_json_request_fixture, load_json_fixture
import logging

# logging.basicConfig() only configures the root logger and returns None, so
# assigning its result left `logger` as None. Configure first, then fetch a
# real logger object.
logging.basicConfig()
logger = logging.getLogger(__name__)

# Safety switch: set to True to let the cells below (re)write the JSON
# fixtures used by the unit tests.
SAVE_FIXTURES = False

In [None]:
# Query a known-good article and keep the response as fixture material.
url = url_df.content_url.values[0]
payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    model="default",
    topicRelevance=classify.TopicRelevance.WORD2VEC,
    # NOTE(review): the previous inline comment read "we don't want to predict
    # stance without context" although the flag is True -- confirm the intent.
    predictStance=True,
    computeAttention=False,  # does not work for BERT-based models (the default model)
    showOnlyArguments=True,  # only return sentences classified as arguments
    userMetadata=url,
)

result = session.fetch(session.ApiUrl.CLASSIFY_BASE_URL, payload)

In [None]:
# Top-level keys of the value returned by session.fetch.
result.keys()

In [None]:
# Write the fixture only when explicitly enabled; 'apiKey' is passed in
# drop_keys so the key is not stored with the fixture.
if SAVE_FIXTURES:
    save_json_request_fixture(
        "response_classify_only_args.json",
        payload,
        result,
        status_code=200,
        drop_keys=['apiKey'],
    )

### known bad article request, gives 404 on server

In [None]:
# The fourth URL gives a known 404 on the article server.
url = url_df.content_url.values[3]
result = None  # stays None when the fetch below raises
print(url)
payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    model="default",
    topicRelevance=classify.TopicRelevance.WORD2VEC,
    # NOTE(review): the previous inline comment read "we don't want to predict
    # stance without context" although the flag is True -- confirm the intent.
    predictStance=True,
    computeAttention=False,  # does not work for BERT-based models (the default model)
    showOnlyArguments=True,  # only return sentences classified as arguments
    userMetadata=url,
)

try:
    result = session.fetch(session.ApiUrl.CLASSIFY_BASE_URL, payload)
except errors.Refused as refused_error:
    # Show the (unset) result followed by the error message.
    print(result)
    print(refused_error)

In [None]:
# Repeat the request with plain `requests` to capture the raw error body.
response = requests.post(session.ApiUrl.CLASSIFY_BASE_URL, json=payload)
# Decode the body once, show it, and reuse the decoded value for the fixture.
error_response = response.json()
print(error_response)
if SAVE_FIXTURES:
    save_json_request_fixture(
        "response_classify_refused_remote_404.json",
        payload,
        error_response,
        status_code=400,
        drop_keys=['apiKey'],
    )

### Bad parameters test

In [None]:
# A good URL combined with a deliberately invalid model name.
url = url_df.content_url.values[0]
result = None  # stays None when the fetch below raises
print(url)
payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    model="i_am_a_ bad_model",
    userMetadata=url,
)
try:
    result = session.fetch(session.ApiUrl.CLASSIFY_BASE_URL, payload)
except errors.InternalGatewayError as gateway_error:
    # Show the (unset) result followed by the error message.
    print(result)
    print(gateway_error)

In [None]:
# Repeat the bad-model request with plain `requests`; the fixture stores an
# empty body together with whatever status code came back.
response = requests.post(session.ApiUrl.CLASSIFY_BASE_URL, json=payload)
if SAVE_FIXTURES:
    save_json_request_fixture(
        "response_classify_500_bad_payload.json",
        payload,
        "",
        status_code=response.status_code,
        drop_keys=['apiKey'],
    )

### timeout test

In [None]:
# A valid request sent with a very short timeout through session.fetch.
url = url_df.content_url.values[0]
result = None  # stays None when the fetch below raises
print(url)
payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    model="default",
    topicRelevance=classify.TopicRelevance.WORD2VEC,
    # NOTE(review): the previous inline comment read "we don't want to predict
    # stance without context" although the flag is True -- confirm the intent.
    predictStance=True,
    computeAttention=False,  # does not work for BERT-based models (the default model)
    showOnlyArguments=True,  # only return sentences classified as arguments
    userMetadata=url,
)
try:
    result = session.fetch(session.ApiUrl.CLASSIFY_BASE_URL, payload, timeout=0.1)
except errors.NotResponding as not_responding_error:
    # Show the (unset) result followed by the error message.
    print(result)
    print(not_responding_error)

In [None]:
# Same short-timeout request with plain `requests`, for comparison with the
# session.fetch version.
try:
    response = requests.post(
        session.ApiUrl.CLASSIFY_BASE_URL,
        json=payload,
        timeout=0.1,
    )
except requests.Timeout as timeout_error:
    print(timeout_error)