# Argument Mining API query testing

Load the target data file and inspect how the query results are structured.

In [1]:
# Reload local package definitions before each cell runs, so edits to the
# project's modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from dotenv import find_dotenv, load_dotenv
import requests
import pandas as pd

from arg_mine import DATA_DIR
from arg_mine.data.loaders import get_gdelt_df
from arg_mine.api import classify, auth, session
from arg_mine import utils

In [3]:
# Load the env variables to get the API credentials (user id and API key),
# so that no secret is hard-coded in the notebook itself.
user_id, api_key = auth.load_auth_tokens()

In [4]:
# Directory holding the raw climate-change narrative export, under the project DATA_DIR.
csv_datapath = os.path.join(DATA_DIR, "raw", "2020-climate-change-narrative")
# Full path of the English web-news snippets CSV for 2020.
csv_filepath = os.path.join(csv_datapath, "WebNewsEnglishSnippets.2020.csv")

# Load the CSV with the project loader and preview the first rows.
url_df = get_gdelt_df(csv_filepath)
url_df.head()

Unnamed: 0,datetime,title,headline_image_url,content_url,snippit,timestamp
0,20200115111500,Liberal MPs back Science Minister Karen Andrew...,https://static.ffx.io/images/$zoom_0.2627%2C$m...,https://www.smh.com.au/politics/federal/libera...,the science in her interview with The Age and...,2020-01-15 11:15:00
1,20200117184500,Several crowd-pullers on day two of KLF,https://www.thehindu.com/news/cities/kozhikode...,https://www.thehindu.com/news/cities/kozhikode...,"Guha, who talked about patriotism and jingois...",2020-01-17 18:45:00
2,20200106233000,Seven arrested after gas rig protest,https://www.stourbridgenews.co.uk/resources/im...,https://www.stourbridgenews.co.uk/news/nationa...,three demands for the Scottish and UK Governm...,2020-01-06 23:30:00
3,20200101111500,Australia sending aid to wildfire towns as dea...,https://bloximages.newyork1.vip.townnews.com/h...,https://www.heraldmailmedia.com/news/nation/au...,this season the worst on record and reignited ...,2020-01-01 11:15:00
4,20200107101500,"A hot, dry country caught between fire and a c...",,http://global.chinadaily.com.cn/a/202001/07/WS...,", which is burned to generate electricity, wit...",2020-01-07 10:15:00


## Build the `classify` request

In [5]:
# Article URL of the first row; used as the crawl target for the request below.
url = url_df["content_url"].iloc[0]

In [6]:
# Endpoint and request settings for a hand-built call to the classify API.
CLASSIFY_BASE_URL = "https://api.argumentsearch.com/en/classify"
timeout = 5  # seconds to wait before requests gives up on the call
topic = "climate change"

# JSON body of the request; the credentials travel inside the payload.
payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    topicRelevance="word2vec",
    predictStance=True,
    computeAttention=True,
    showOnlyArguments=False,
)

# POST the payload and raise immediately on any 4xx/5xx status.
response = requests.post(url=CLASSIFY_BASE_URL, json=payload, timeout=timeout)
response.raise_for_status()

In [7]:
# Decode the JSON body of the classify response for inspection.
out = response.json()

## Collect data for unit tests
Using the tools in the package, run queries that give known responses, and save those responses as JSON fixtures for the unit tests.

In [117]:
import pkg_resources
import json
from tests.fixtures import save_json_request_fixture, load_json_fixture

# Safety switch: cells guarded by `if SAVE_FIXTURES:` only write fixture files
# when this is set to True, so a plain re-run does not overwrite them.
SAVE_FIXTURES = False

In [114]:
# First article in the dataset; it is both the crawl target and the
# user-metadata tag attached to the request.
url = url_df["content_url"].values[0]

payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    model="default",
    topicRelevance=classify.TOPIC_RELEVANCE.WORD2VEC,
    # Stance prediction is enabled here.
    # NOTE(review): an earlier note questioned predicting stance without context; confirm this value is intended.
    predictStance=True,
    # Attention does not work for BERT-based models (the default model).
    computeAttention=False,
    # Only return sentences classified as arguments.
    showOnlyArguments=True,
    userMetadata=url,
)

# Send the request through the package's session helper.
result = session.fetch(session.ApiUrl.CLASSIFY_BASE_URL, payload)

In [119]:
# Inspect the top-level keys of the fetched result.
result.keys()

dict_keys(['metadata', 'sentences'])

In [118]:
# Persist the request/response pair as a unit-test fixture, but only when
# fixture writing has been explicitly enabled via SAVE_FIXTURES.
if SAVE_FIXTURES:
    fixture_filename = "response_classify_only_args.json"
    # Pass the variable rather than repeating the literal, so the file name is
    # defined in exactly one place. 'apiKey' is listed in drop_keys; presumably
    # the helper strips it before writing -- confirm in tests.fixtures.
    save_json_request_fixture(
        fixture_filename,
        payload,
        result,
        status_code=200,
        drop_keys=['apiKey'],
    )

In [126]:
# This URL gives a known 404 on the article server.
url = url_df["content_url"].values[3]
result = None  # reset so a stale result from an earlier cell cannot be mistaken for this one
print(url)

payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    model="default",
    topicRelevance=classify.TOPIC_RELEVANCE.WORD2VEC,
    # Stance prediction is enabled here.
    # NOTE(review): an earlier note questioned predicting stance without context; confirm this value is intended.
    predictStance=True,
    # Attention does not work for BERT-based models (the default model).
    computeAttention=False,
    # Only return sentences classified as arguments.
    showOnlyArguments=True,
    userMetadata=url,
)

result = session.fetch(session.ApiUrl.CLASSIFY_BASE_URL, payload)

# Summarise how many sentences came back versus how many were classified.
returned_count = len(result['sentences'])
classified_count = result['metadata']['totalClassifiedSentences']
print("Found {} topic sentences of {}".format(returned_count, classified_count))

https://www.heraldmailmedia.com/news/nation/australia-sending-aid-to-wildfire-towns-as-death-toll-rises/article_883fa793-6c0a-547e-8f77-b5964f1d7182.html


400 : Website could not be crawled or returned an empty result. Please contact an administrator.


****** inside HTTPError catch
{'response': <Response [400]>, 'request': <PreparedRequest [POST]>}


Refused: Website could not be crawled or returned an empty result. Please contact an administrator.

In [125]:
# Repeat the request directly with `requests` to inspect the HTTPError that is
# raised for the refused URL. `response` is bound before raise_for_status()
# raises, so it is still available afterwards.
try:
    response = requests.post(session.ApiUrl.CLASSIFY_BASE_URL, json=payload, timeout=5)
    response.raise_for_status()
except requests.HTTPError as e:
    # Show the attributes carried by the exception (the response and the request).
    # The bare `print` that followed was a no-op expression in Python 3 and was removed.
    print(e.__dict__)

{'response': <Response [400]>, 'request': <PreparedRequest [POST]>}


In [122]:
# Keep the JSON body of the failed request so it can be saved as a fixture.
error_response = response.json()

In [111]:
# Scratch: construct an HTTPError by hand to view its repr; it is neither raised nor stored.
requests.exceptions.HTTPError("400 Client Error: BAD REQUEST for url: https://api.argumentsearch.com/en/classify")

requests.exceptions.HTTPError('400 Client Error: BAD REQUEST for url: https://api.argumentsearch.com/en/classify')

In [113]:
# Save the refused (HTTP 400) response as a unit-test fixture.
# The variable previously held the name of the "only_args" fixture and was
# never used; it now names the file that is actually written.
fixture_filename = "response_classify_refused_remote_404.json"

# Guarded by SAVE_FIXTURES, like the other fixture-writing cell, so re-running
# the notebook does not overwrite the stored fixture by accident.
if SAVE_FIXTURES:
    save_json_request_fixture(
        fixture_filename,
        payload,
        error_response,
        status_code=400,
        drop_keys=['apiKey'],
    )

In [33]:
# Classify a single URL through the package's high-level helper and display the result.
# NOTE(review): the recorded run of this cell failed with requests' InvalidSchema,
# with the payload dict reported where a URL was expected -- this looks like an
# argument mix-up somewhere in the package's call chain; confirm.
response = classify.classify_url_sentences(topic, url, user_id, api_key)
response

InvalidSchema: No connection adapters were found for "{'topic': 'climate change', 'userID': '<REDACTED>', 'apiKey': '<REDACTED>', 'targetUrl': 'https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html', 'model': 'default', 'topicRelevance': 'word2vec', 'predictStance': True, 'computeAttention': False, 'showOnlyArguments': True, 'userMetadata': 'https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html'}"

In [None]:
# Build a ClassifiedSentence from the first sentence entry of the response
# (this cell was not executed in the saved run).
classify.ClassifiedSentence.from_dict(url, topic, response['sentences'][0])

In [None]:
# Show the first raw sentence entry of the response.
response['sentences'][0]

In [31]:
# Good example of a link that is no longer valid (the article URL at position 3 of the dataset).

response = classify.classify_url_sentences(topic, url_df.content_url.values[3], user_id, api_key)

InvalidSchema: No connection adapters were found for "{'topic': 'climate change', 'userID': '<REDACTED>', 'apiKey': '<REDACTED>', 'targetUrl': 'https://www.heraldmailmedia.com/news/nation/australia-sending-aid-to-wildfire-towns-as-death-toll-rises/article_883fa793-6c0a-547e-8f77-b5964f1d7182.html', 'model': 'default', 'topicRelevance': 'word2vec', 'predictStance': True, 'computeAttention': False, 'showOnlyArguments': True, 'userMetadata': 'https://www.heraldmailmedia.com/news/nation/australia-sending-aid-to-wildfire-towns-as-death-toll-rises/article_883fa793-6c0a-547e-8f77-b5964f1d7182.html'}"

## Run a batch of URLs

In [15]:
# Run the package's batch helper over the first ten article URLs. Judging by the
# unpacked names it returns documents, sentences, and refused documents -- confirm
# against classify.collect_sentences_by_topic.
doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(topic, url_df.content_url.values[:10])

InvalidSchema: No connection adapters were found for "{'topic': 'climate change', 'userID': '<REDACTED>', 'apiKey': '<REDACTED>', 'targetUrl': 'https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html', 'model': 'default', 'topicRelevance': 'word2vec', 'predictStance': True, 'computeAttention': False, 'showOnlyArguments': True, 'userMetadata': 'https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html'}"

In [None]:
# Count the refused documents from the batch, then display them.
print(len(refused_doc_list))
refused_doc_list

In [None]:
# Number of sentences collected across the batch.
len(sentence_list)

In [None]:
# Look at the first collected sentence.
sentence_list[0]

In [None]:
# Debugging aid: list the registered loggers via the classify module's logger
# (assumes classify.logger is a standard logging.Logger -- confirm).
classify.logger.root.manager.loggerDict

In [25]:
import json

In [26]:
def load_json_fixture(fixture_filename):
    """Load a JSON test fixture from the ``tests.fixtures`` package.

    NOTE(review): a function with the same name is also imported from
    ``tests.fixtures`` elsewhere in this notebook; whichever cell runs last
    wins. Confirm whether this local copy is still needed.

    Args:
        fixture_filename: Name of the fixture file, resolved relative to the
            ``tests.fixtures`` package.

    Returns:
        The deserialized JSON content of the file, as produced by ``json.load``.

    Raises:
        OSError: If the resolved file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    json_path = pkg_resources.resource_filename("tests.fixtures", fixture_filename)
    # Read as UTF-8 explicitly: the default for open() depends on the platform
    # locale, which can corrupt non-ASCII text in the fixtures.
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)

In [28]:
# Read the saved fixture back in to check its contents.
blob = load_json_fixture("response_classify_only_args.json")

In [30]:
# Show the 'response' entry stored in the fixture.
blob['response']

{'metadata': {'computeAttention': False,
  'language': 'en',
  'modelVersion': 0.1,
  'predictStance': True,
  'removeDuplicates': True,
  'showOnlyArguments': True,
  'sortBy': 'argumentConfidence',
  'timeArgumentPrediction': 0.45038652420043945,
  'timeAttentionComputation': -1,
  'timeLogging': 0.07773518562316895,
  'timePreprocessing': 3.647804260253906e-05,
  'timeStancePrediction': -1,
  'timeTotal': 0.7465801239013672,
  'topic': 'climate change',
  'totalArguments': 5,
  'totalClassifiedSentences': 37,
  'totalContraArguments': 3,
  'totalNonArguments': 32,
  'totalProArguments': 2,
  'userMetadata': 'https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html'},
 'sentences': [{'argumentConfidence': 0.8324097394943237,
   'argumentLabel': 'argument',
   'sentenceOriginal': '"So she\'s right - there is no point wasting time discussing climate change - because we are not going to stop it from happening," he 