# Argument Mining API query testing

Load the target data file and explore what the API query results look like.

In [2]:
# Reload local package definitions before each cell runs, so edits to the
# project's modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import time

from dotenv import find_dotenv, load_dotenv
import requests
import grequests
import pandas as pd
import json

from arg_mine import DATA_DIR
from arg_mine.data.loaders import get_gdelt_df
from arg_mine.api import classify, auth, session, errors
from arg_mine import utils

  curious_george.patch_all(thread=False, select=False)


In [4]:
# Load the env variables to get the API credentials (user id and API key);
# both are sent with every classify request below.
user_id, api_key = auth.load_auth_tokens()

In [5]:
# Locate the raw snippets CSV under the project's data directory.
csv_datapath = os.path.join(DATA_DIR, "raw", "2020-climate-change-narrative")
csv_filepath = os.path.join(csv_datapath, "WebNewsEnglishSnippets.2020.csv")

# Load the CSV through the project loader and preview the first rows;
# the `content_url` column is what the rest of the notebook queries.
url_df = get_gdelt_df(csv_filepath)
url_df.head()

Unnamed: 0,datetime,title,headline_image_url,content_url,snippit,timestamp
0,20200115111500,Liberal MPs back Science Minister Karen Andrew...,https://static.ffx.io/images/$zoom_0.2627%2C$m...,https://www.smh.com.au/politics/federal/libera...,the science in her interview with The Age and...,2020-01-15 11:15:00
1,20200117184500,Several crowd-pullers on day two of KLF,https://www.thehindu.com/news/cities/kozhikode...,https://www.thehindu.com/news/cities/kozhikode...,"Guha, who talked about patriotism and jingois...",2020-01-17 18:45:00
2,20200106233000,Seven arrested after gas rig protest,https://www.stourbridgenews.co.uk/resources/im...,https://www.stourbridgenews.co.uk/news/nationa...,three demands for the Scottish and UK Governm...,2020-01-06 23:30:00
3,20200101111500,Australia sending aid to wildfire towns as dea...,https://bloximages.newyork1.vip.townnews.com/h...,https://www.heraldmailmedia.com/news/nation/au...,this season the worst on record and reignited ...,2020-01-01 11:15:00
4,20200107101500,"A hot, dry country caught between fire and a c...",,http://global.chinadaily.com.cn/a/202001/07/WS...,", which is burned to generate electricity, wit...",2020-01-07 10:15:00


## Build the `classify` request

In [6]:
# Use the article URL from the first row as the single-document test case.
url = url_df["content_url"].iloc[0]

In [7]:
# Endpoint and request settings for the raw classify call.
CLASSIFY_BASE_URL = "https://api.argumentsearch.com/en/classify"
topic = "climate change"
timeout = 5  # seconds to wait on the server before requests gives up

# JSON body of the request: credentials, the page to crawl, and the
# classification options.
payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    topicRelevance="word2vec",
    predictStance=True,
    computeAttention=True,
    showOnlyArguments=False,
)

response = requests.post(url=CLASSIFY_BASE_URL, json=payload, timeout=timeout)
# fail loudly on any 4xx/5xx status instead of parsing an error body
response.raise_for_status()

In [8]:
# Decode the JSON response body and list its top-level keys.
out = response.json()
out.keys()

dict_keys(['metadata', 'sentences'])

In [9]:
# Classify the first article's sentences for `topic` through the package helper.
response = classify.classify_url_sentences(
    topic,
    url_df["content_url"].values[0],
    user_id,
    api_key,
)

In [10]:
# Build the package's ClassifiedSentence record from the first raw sentence dict.
classify.ClassifiedSentence.from_dict(url, topic, response['sentences'][0])

ClassifiedSentence(url='https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html', doc_id='657f9dd95eb97597e34d0c05b5a93ba6', topic='climate change', sentence_id='da903878c62343fb482bfad67a7523f1', argument_confidence=0.9836708698421717, argument_label='argument', sentence_original='Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.', sentence_preprocessed='Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.', sort_confidence=0.9907711706754092, stance_confidence=0.9978714715086467, stance_label='pro')

In [11]:
# Raw dict for the same sentence, for comparison with the ClassifiedSentence fields.
response['sentences'][0]

{'argumentConfidence': 0.9836708698421717,
 'argumentLabel': 'argument',
 'sentenceOriginal': 'Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.',
 'sentencePreprocessed': 'Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.',
 'sortConfidence': 0.9907711706754092,
 'stanceConfidence': 0.9978714715086467,
 'stanceLabel': 'pro'}

In [12]:
# Example of a link that is no longer valid: the API refuses it.
url = url_df["content_url"].values[3]
try:
    response = classify.classify_url_sentences(topic, url, user_id, api_key)
except errors.Refused as refused_error:
    # show the refusal message rather than a full traceback
    print(refused_error)

Refused: 400: Website could not be crawled or returned an empty result. Please contact an administrator.


## Run a batch of urls

In [13]:
# Number of candidate URLs available in the dataset.
url_df["content_url"].shape

(108459,)

In [14]:
# Time one run over the first 20 URLs.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(
    topic, url_df.content_url.values[:20]
)

# Fixed-point format: a bare "{:.3}" means 3 significant digits and switches
# to scientific notation (e.g. "1.23e+03") once a run exceeds 999 s.
print("iteration took {:.1f} s".format(time.perf_counter() - start_time))

DEBUG:2020-06-29 19:31:29,249:arg_mine.api.classify: Attempting url 1 of 20
DEBUG:2020-06-29 19:31:30,658:arg_mine.api.classify: Attempting url 2 of 20
DEBUG:2020-06-29 19:31:32,117:arg_mine.api.classify: Attempting url 3 of 20
DEBUG:2020-06-29 19:31:33,835:arg_mine.api.classify: Attempting url 4 of 20
DEBUG:2020-06-29 19:31:34,452:arg_mine.api.classify: Attempting url 5 of 20
DEBUG:2020-06-29 19:31:35,785:arg_mine.api.classify: Attempting url 6 of 20
DEBUG:2020-06-29 19:31:37,051:arg_mine.api.classify: Attempting url 7 of 20
DEBUG:2020-06-29 19:31:37,312:arg_mine.api.classify: Attempting url 8 of 20
DEBUG:2020-06-29 19:31:38,351:arg_mine.api.classify: Attempting url 9 of 20
DEBUG:2020-06-29 19:31:39,524:arg_mine.api.classify: Attempting url 10 of 20
DEBUG:2020-06-29 19:31:40,960:arg_mine.api.classify: Attempting url 11 of 20
DEBUG:2020-06-29 19:31:43,055:arg_mine.api.classify: Attempting url 12 of 20
DEBUG:2020-06-29 19:31:44,181:arg_mine.api.classify: Attempting url 13 of 20
DEBUG:20

iteration took 26.0 s


In [15]:
# How many URLs were refused, followed by the refused URLs themselves.
print(len(refused_doc_list))
refused_doc_list

7


['https://www.heraldmailmedia.com/news/nation/australia-sending-aid-to-wildfire-towns-as-death-toll-rises/article_883fa793-6c0a-547e-8f77-b5964f1d7182.html',
 'https://www.seattlepi.com/news/world/article/Cooler-weather-brings-respite-in-Australian-14950114.php',
 'https://www.airdrietoday.com/national-business/starbucks-goals-for-sustainability-will-require-significant-consumer-buy-in-2037947',
 'https://www.heraldbulletin.com/news/nation_world/davos-chief-welcomes-views-of-trump-greta-thunberg-at-forum/article_24dbb151-7132-5c48-a7df-c13b67004ec0.html',
 'https://ravallirepublic.com/news/local/article_cb291465-d430-5477-8b41-1bedda9beaea.html',
 'https://www.caledonianrecord.com/opinion/columns/shawn-shouldice-small-business-owners-prepare-to-dodge-bullets-during/article_9f1b6399-7ca9-59bc-8788-b4d05ec5825c.html',
 'https://www.nsnews.com/trudeau-meets-pallister-and-the-meng-hearing-in-the-news-for-jan-20-1.24056357']

## Try concurrent requests with `grequests`

`grequests` issues concurrent requests (asynchronously, via gevent), but has memory issues for long lists. We can chunk the async requests and write the outputs to storage to avoid this problem.

In [16]:
# Fetch the first 100 URLs concurrently, in chunks of 10.
responses = classify.fetch_concurrent(
    topic,
    url_list=url_df["content_url"].values[:100],
    chunk_size=10,
)

DEBUG:2020-06-29 19:31:55,308:arg_mine.api.classify: >>>> starting doc extraction: 0
DEBUG:2020-06-29 19:32:08,493:arg_mine.api.classify: iteration 0 took 13.2 s (10 docs)
DEBUG:2020-06-29 19:32:22,090:arg_mine.api.classify: iteration 1 took 13.6 s (10 docs)
DEBUG:2020-06-29 19:32:41,170:arg_mine.api.classify: iteration 2 took 19.1 s (10 docs)
DEBUG:2020-06-29 19:32:59,244:arg_mine.api.classify: iteration 3 took 18.1 s (10 docs)
DEBUG:2020-06-29 19:33:27,294:arg_mine.api.classify: iteration 4 took 28.0 s (10 docs)
ERROR:2020-06-29 19:34:02,163:arg_mine.api.classify: Request failed request:<grequests.AsyncRequest object at 0x7fc757a94460> 
 exception:HTTPSConnectionPool(host='api.argumentsearch.com', port=443): Max retries exceeded with url: /en/classify (Caused by ResponseError('too many 500 error responses')) 
NoneType: None
DEBUG:2020-06-29 19:34:02,165:arg_mine.api.classify: iteration 5 took 34.9 s (10 docs)
DEBUG:2020-06-29 19:34:28,025:arg_mine.api.classify: iteration 6 took 25.9 

In [17]:
# Convert the collected responses into per-document and per-sentence tables
# (presumably DataFrames; failed requests are logged as errors).
docs_df, sentences_df = classify.process_responses(responses)

ERROR:2020-06-29 19:35:23,027:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-06-29 19:35:23,030:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-06-29 19:35:23,033:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-06-29 19:35:23,034:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-06-29 19:35:23,037:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-06-29 19:35:23,039:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-06-29 19:35

In [18]:
# (rows, columns) of the extracted sentences table.
sentences_df.shape

(684, 11)

In [21]:
# Output directory name for the processed documents.
target_dir = "gdelt-climate-change-docs"
# Join the variable, not the literal string "target_dir", so the path ends in
# the actual directory name under DATA_DIR/processed.
data_path = os.path.join(DATA_DIR, "processed", target_dir)

# Check timing of different approaches

Does grequests give us a performance boost?
* time serial extraction vs using grequests

Does returning all sentences vs just arguments give us a performance hit?
* time extraction of 50 articles to see if API times are significantly different