# Argument Mining API query testing

Load the target data file and inspect what the API queries return.

In [1]:
# Enable IPython's autoreload extension so edits to local package modules
# are picked up without restarting the kernel (mode 2).
%load_ext autoreload
%autoreload 2

In [2]:
import os
from dotenv import find_dotenv, load_dotenv
import requests
import pandas as pd

from arg_mine import DATA_DIR
from arg_mine.data.loaders import get_gdelt_df
from arg_mine.api import classify, auth, session, errors
from arg_mine import utils

In [3]:
# Fetch the API credentials through the package's auth helper; the call
# yields two values, unpacked here as the user id and the API key.
# NOTE(review): presumably read from environment / .env variables — confirm in arg_mine.api.auth.
user_id, api_key = auth.load_auth_tokens()

In [4]:
# Locate the raw GDELT snippet CSV under the project's data directory.
csv_filename = "WebNewsEnglishSnippets.2020.csv"
csv_datapath = os.path.join(DATA_DIR, "raw", "2020-climate-change-narrative")
csv_filepath = os.path.join(csv_datapath, csv_filename)

# Load it into a DataFrame and preview the first rows.
url_df = get_gdelt_df(csv_filepath)
url_df.head()

Unnamed: 0,datetime,title,headline_image_url,content_url,snippit,timestamp
0,20200115111500,Liberal MPs back Science Minister Karen Andrew...,https://static.ffx.io/images/$zoom_0.2627%2C$m...,https://www.smh.com.au/politics/federal/libera...,the science in her interview with The Age and...,2020-01-15 11:15:00
1,20200117184500,Several crowd-pullers on day two of KLF,https://www.thehindu.com/news/cities/kozhikode...,https://www.thehindu.com/news/cities/kozhikode...,"Guha, who talked about patriotism and jingois...",2020-01-17 18:45:00
2,20200106233000,Seven arrested after gas rig protest,https://www.stourbridgenews.co.uk/resources/im...,https://www.stourbridgenews.co.uk/news/nationa...,three demands for the Scottish and UK Governm...,2020-01-06 23:30:00
3,20200101111500,Australia sending aid to wildfire towns as dea...,https://bloximages.newyork1.vip.townnews.com/h...,https://www.heraldmailmedia.com/news/nation/au...,this season the worst on record and reignited ...,2020-01-01 11:15:00
4,20200107101500,"A hot, dry country caught between fire and a c...",,http://global.chinadaily.com.cn/a/202001/07/WS...,", which is burned to generate electricity, wit...",2020-01-07 10:15:00


## Build the `classify` request

In [5]:
# Use the article URL from the first row as the classification target.
url = url_df.iloc[0].content_url

In [6]:
# Hand-built request against the classify endpoint, bypassing the package wrapper.
CLASSIFY_BASE_URL = "https://api.argumentsearch.com/en/classify"
timeout = 5  # passed to requests.post as the request timeout
topic = "climate change"

# Request body, sent as JSON; key order matches the original cell.
payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    topicRelevance="word2vec",
    predictStance=True,
    computeAttention=True,
    showOnlyArguments=False,
)

response = requests.post(CLASSIFY_BASE_URL, json=payload, timeout=timeout)
# Raise immediately on any 4xx/5xx status instead of continuing with a bad response.
response.raise_for_status()

In [7]:
# Decode the JSON body of the raw `requests` response.
out = response.json()

In [8]:
# Same query through the package wrapper. This rebinds `response`: it is no
# longer the requests.Response from the raw call but the wrapper's return
# value, which the following cells index with response['sentences'].
response = classify.classify_url_sentences(topic, url_df.content_url.values[0], user_id, api_key)

In [9]:
# Build a ClassifiedSentence from the first sentence record of the response.
classify.ClassifiedSentence.from_dict(url, topic, response['sentences'][0])

ClassifiedSentence(url='https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html', doc_id='657f9dd95eb97597e34d0c05b5a93ba6', topic='climate change', sentence_id='c68493474ee7757287eeff11ac1bd1ea', argument_confidence=0.8324097394943237, argument_label='argument', sentence_original='"So she\'s right - there is no point wasting time discussing climate change - because we are not going to stop it from happening," he said of the minister\'s comments.', sentence_preprocessed='"So she\'s right - there is no point wasting time discussing climate change - because we are not going to stop it from happening," he said of the minister\'s comments.', sort_confidence=0.9131057365821765, stance_confidence=0.9938017336700293, stance_label='pro')

In [10]:
# Show the raw record for that same first sentence, for comparison with the parsed object.
response['sentences'][0]

{'argumentConfidence': 0.8324097394943237,
 'argumentLabel': 'argument',
 'sentenceOriginal': '"So she\'s right - there is no point wasting time discussing climate change - because we are not going to stop it from happening," he said of the minister\'s comments.',
 'sentencePreprocessed': '"So she\'s right - there is no point wasting time discussing climate change - because we are not going to stop it from happening," he said of the minister\'s comments.',
 'sortConfidence': 0.9131057365821765,
 'stanceConfidence': 0.9938017336700293,
 'stanceLabel': 'pro'}

In [23]:
# The article behind row 3 is no longer reachable, so the API refuses the request.
url = url_df["content_url"].values[3]
try:
    response = classify.classify_url_sentences(topic, url, user_id, api_key)
except errors.Refused as refusal:
    # Show the refusal message instead of letting the cell fail.
    print(refusal)

ERROR:arg_mine.api.session:400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}


Refused: 400: Website could not be crawled or returned an empty result. Please contact an administrator.


## Run a batch of URLs

In [25]:
# Classify the first 10 article URLs in one call; the helper returns three
# collections, the last of which holds the URLs that were refused.
# NOTE(review): no user_id/api_key are passed here — presumably the helper loads them itself; confirm.
doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(topic, url_df.content_url.values[:10])

DEBUG:2020-06-18 20:41:50,415:arg_mine.api.classify: Attempting url 0, try #1
DEBUG:arg_mine.api.classify:Attempting url 0, try #1
DEBUG:2020-06-18 20:41:52,154:arg_mine.api.classify: Attempting url 1, try #1
DEBUG:arg_mine.api.classify:Attempting url 1, try #1
DEBUG:2020-06-18 20:41:54,654:arg_mine.api.classify: Attempting url 2, try #1
DEBUG:arg_mine.api.classify:Attempting url 2, try #1
DEBUG:2020-06-18 20:41:57,078:arg_mine.api.classify: Attempting url 3, try #1
DEBUG:arg_mine.api.classify:Attempting url 3, try #1
ERROR:arg_mine.api.session:400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
INFO:2020-06-18 20:41:58,205:arg_mine.api.classify: Skipping 3: https://www.heraldmailmedia.com/news/nation/australia-sending-aid-to-wildfire-towns-as-death-toll-rises/article_883fa793-6c0a-547e-8f77-b5964f1d7182.html
INFO:arg_mine.api.classify:Skipping 3: https://www.heraldmailmedia.com/news/nation/australia-sending-aid-to-wildfire-town

In [26]:
# Report how many URLs were refused, then display the refused URLs themselves.
print(len(refused_doc_list))
refused_doc_list

4


['https://www.heraldmailmedia.com/news/nation/australia-sending-aid-to-wildfire-towns-as-death-toll-rises/article_883fa793-6c0a-547e-8f77-b5964f1d7182.html',
 'https://www.seattlepi.com/news/world/article/Cooler-weather-brings-respite-in-Australian-14950114.php',
 'https://www.airdrietoday.com/national-business/starbucks-goals-for-sustainability-will-require-significant-consumer-buy-in-2037947',
 'https://www.heraldbulletin.com/news/nation_world/davos-chief-welcomes-views-of-trump-greta-thunberg-at-forum/article_24dbb151-7132-5c48-a7df-c13b67004ec0.html']

## Collect data for unit tests
Using the tools in the package, run queries that give known responses, and save those responses as JSON fixtures for the unit tests.

In [12]:
import pkg_resources
import json
from tests.fixtures import save_json_request_fixture, load_json_fixture
import logging

# Configure the root logger with the default handler. `logging.basicConfig()`
# returns None, so the logger object has to be obtained separately; assigning
# its return value (as before) left `logger` bound to None.
logging.basicConfig()
logger = logging.getLogger(__name__)

# Set to True to (re)write the JSON fixtures used by the unit tests.
SAVE_FIXTURES = False

In [13]:
# Fixture request: classify the first article, returning argument sentences only.
url = url_df.content_url.values[0]
payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    model="default",
    topicRelevance=classify.TopicRelevance.WORD2VEC,
    # NOTE(review): stance prediction is switched on, although the earlier note
    # here said stance should not be predicted without context — confirm intent.
    predictStance=True,
    computeAttention=False,  # not supported by BERT-based models (the default model)
    showOnlyArguments=True,  # keep only sentences classified as arguments
    userMetadata=url,
)

result = session.fetch(session.ApiUrl.CLASSIFY_BASE_URL, payload)

In [14]:
# List the top-level keys of the fetched result.
result.keys()

dict_keys(['metadata', 'sentences'])

In [15]:
# Write the request/response pair as a JSON fixture only when explicitly enabled.
# NOTE(review): only 'apiKey' is listed in drop_keys while the payload also
# carries 'userID' — confirm the user id is acceptable to store in the fixture.
if SAVE_FIXTURES:
    save_json_request_fixture("response_classify_only_args.json", payload, result, status_code=200, drop_keys=['apiKey'])

### Known-bad article request (the article server returns 404)

In [16]:
# Row 3's article gives a known 404 on the article server, so the API refuses it.
url = url_df.content_url.values[3]
print(url)

payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    model="default",
    topicRelevance=classify.TopicRelevance.WORD2VEC,
    # NOTE(review): stance prediction is switched on, although the earlier note
    # here said stance should not be predicted without context — confirm intent.
    predictStance=True,
    computeAttention=False,  # not supported by BERT-based models (the default model)
    showOnlyArguments=True,  # keep only sentences classified as arguments
    userMetadata=url,
)

result = None
try:
    result = session.fetch(session.ApiUrl.CLASSIFY_BASE_URL, payload)
except errors.Refused as refused:
    # `result` is still None here: fetch raised before the assignment happened.
    print(result, refused, sep="\n")

https://www.heraldmailmedia.com/news/nation/australia-sending-aid-to-wildfire-towns-as-death-toll-rises/article_883fa793-6c0a-547e-8f77-b5964f1d7182.html


ERROR:arg_mine.api.session:400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}


None
Refused: 400: Website could not be crawled or returned an empty result. Please contact an administrator.


In [17]:
# Replay the refused request with plain `requests` to capture the raw error body
# for the fixture. A timeout is set so the cell cannot hang indefinitely if the
# server stops responding (requests applies no timeout by default).
response = requests.post(
    session.ApiUrl.CLASSIFY_BASE_URL,
    json=payload,
    timeout=30,  # seconds; generous compared with the ~1-3 s round trips seen above
)
# Parse the body once and reuse it for both the display and the fixture.
error_response = response.json()
print(error_response)
if SAVE_FIXTURES:
    save_json_request_fixture("response_classify_refused_remote_404.json", payload, error_response, status_code=400, drop_keys=['apiKey'])

{'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}


### Bad parameters test

In [18]:
# Valid article URL, but an invalid model name: the server answers with a 500.
url = url_df.content_url.values[0]
print(url)

payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    model="i_am_a_ bad_model",
    userMetadata=url,
)

result = None
try:
    result = session.fetch(session.ApiUrl.CLASSIFY_BASE_URL, payload)
except errors.InternalGatewayError as gateway_error:
    # `result` is still None here: fetch raised before the assignment happened.
    print(result, gateway_error, sep="\n")

https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html


ERROR:arg_mine.api.session:500 : Server Error: INTERNAL SERVER ERROR for url: https://api.argumentsearch.com/en/classify, check payload contents?


None
InternalGatewayError: 500: Server Error: INTERNAL SERVER ERROR for url: https://api.argumentsearch.com/en/classify, check payload contents?


In [19]:
# Replay the bad-payload request with plain `requests` to record the real status
# code in the fixture. A timeout is set so the cell cannot hang indefinitely if
# the server stops responding (requests applies no timeout by default).
response = requests.post(
    session.ApiUrl.CLASSIFY_BASE_URL,
    json=payload,
    timeout=30,  # seconds; generous compared with the ~1-3 s round trips seen above
)
if SAVE_FIXTURES:
    # The fixture stores an empty string as the response body.
    save_json_request_fixture("response_classify_500_bad_payload.json", payload, "", status_code=response.status_code, drop_keys=['apiKey'])

### Timeout test

In [20]:
# Valid request with a deliberately tiny timeout, to trigger the NotResponding error path.
url = url_df.content_url.values[0]
print(url)

payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    model="default",
    topicRelevance=classify.TopicRelevance.WORD2VEC,
    # NOTE(review): stance prediction is switched on, although the earlier note
    # here said stance should not be predicted without context — confirm intent.
    predictStance=True,
    computeAttention=False,  # not supported by BERT-based models (the default model)
    showOnlyArguments=True,  # keep only sentences classified as arguments
    userMetadata=url,
)

result = None
try:
    result = session.fetch(session.ApiUrl.CLASSIFY_BASE_URL, payload, timeout=0.1)
except errors.NotResponding as not_responding:
    # `result` is still None here: fetch raised before the assignment happened.
    print(result, not_responding, sep="\n")

https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html
None
Server not responding, ConnectionError or Timeout (0.1 s)


In [21]:
# Same tiny timeout with plain `requests`, to see the underlying exception text.
try:
    response = requests.post(
        session.ApiUrl.CLASSIFY_BASE_URL,
        json=payload,
        timeout=0.1,
    )
except requests.Timeout as timeout_error:
    print(timeout_error)

HTTPSConnectionPool(host='api.argumentsearch.com', port=443): Max retries exceeded with url: /en/classify (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f9cba9c8940>, 'Connection to api.argumentsearch.com timed out. (connect timeout=0.1)'))
