# Argument Mining API query testing

Load the target data file and examine how the API query results are structured.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that local package
# definitions are reloaded for each cell (no kernel restart needed after
# editing the imported modules).
%load_ext autoreload
%autoreload 2

In [41]:
import os
import time

from dotenv import find_dotenv, load_dotenv
import requests
import grequests
import pandas as pd
import json

from arg_mine import DATA_DIR
from arg_mine.data.loaders import get_gdelt_df
from arg_mine.api import classify, auth, session, errors
from arg_mine import utils

In [3]:
# Load the API credentials (user ID and API key) that the classify requests
# below send along. Per the original note these come from environment
# variables; the helper's implementation is not shown in this notebook.
user_id, api_key = auth.load_auth_tokens()

In [4]:
# Build the path to the news-snippet CSV inside the project's raw data
# folder, load it through the GDELT loader, and preview the first rows.
csv_datapath = os.path.join(DATA_DIR, "raw", "2020-climate-change-narrative")
csv_filepath = os.path.join(
    csv_datapath,
    "WebNewsEnglishSnippets.2020.csv",
)
url_df = get_gdelt_df(csv_filepath)

url_df.head()

Unnamed: 0,datetime,title,headline_image_url,content_url,snippit,timestamp
0,20200115111500,Liberal MPs back Science Minister Karen Andrew...,https://static.ffx.io/images/$zoom_0.2627%2C$m...,https://www.smh.com.au/politics/federal/libera...,the science in her interview with The Age and...,2020-01-15 11:15:00
1,20200117184500,Several crowd-pullers on day two of KLF,https://www.thehindu.com/news/cities/kozhikode...,https://www.thehindu.com/news/cities/kozhikode...,"Guha, who talked about patriotism and jingois...",2020-01-17 18:45:00
2,20200106233000,Seven arrested after gas rig protest,https://www.stourbridgenews.co.uk/resources/im...,https://www.stourbridgenews.co.uk/news/nationa...,three demands for the Scottish and UK Governm...,2020-01-06 23:30:00
3,20200101111500,Australia sending aid to wildfire towns as dea...,https://bloximages.newyork1.vip.townnews.com/h...,https://www.heraldmailmedia.com/news/nation/au...,this season the worst on record and reignited ...,2020-01-01 11:15:00
4,20200107101500,"A hot, dry country caught between fire and a c...",,http://global.chinadaily.com.cn/a/202001/07/WS...,", which is burned to generate electricity, wit...",2020-01-07 10:15:00


## Build the `classify` request

In [5]:
# Use the first article's content URL as the single test case for the request.
url = url_df["content_url"].iloc[0]

In [6]:
# Endpoint and settings for one hand-built call to the classify API.
CLASSIFY_BASE_URL = "https://api.argumentsearch.com/en/classify"
timeout = 5  # seconds, passed straight to requests.post

topic = "climate change"

# Who is asking and what should be classified ...
payload = {
    "topic": topic,
    "userID": user_id,
    "apiKey": api_key,
    "targetUrl": url,
}
# ... followed by the classification options (appended, so key order is kept).
payload.update(
    {
        "topicRelevance": "word2vec",
        "predictStance": True,
        "computeAttention": True,
        "showOnlyArguments": False,
    }
)

response = requests.post(CLASSIFY_BASE_URL, json=payload, timeout=timeout)
# Fail loudly on any 4xx/5xx status instead of continuing with an error body.
response.raise_for_status()

In [7]:
# Decode the JSON body of the classify response and list its top-level keys.
out = response.json()
out.keys()

dict_keys(['metadata', 'sentences'])

In [8]:
# Repeat the query through the project's helper instead of raw `requests`.
# Note: this rebinds `response` (until now the requests.Response from the
# hand-built call); the following cells index the new value like a dict
# (response['sentences']).
# values[0] is the same first row that `url` was taken from.
response = classify.classify_url_sentences(topic, url_df.content_url.values[0], user_id, api_key)

In [9]:
# Parse the first returned sentence into a ClassifiedSentence, passing along
# the URL and topic it was classified against.
classify.ClassifiedSentence.from_dict(url, topic, response['sentences'][0])

ClassifiedSentence(url='https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html', doc_id='657f9dd95eb97597e34d0c05b5a93ba6', topic='climate change', sentence_id='da903878c62343fb482bfad67a7523f1', argument_confidence=0.9836708698421717, argument_label='argument', sentence_original='Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.', sentence_preprocessed='Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.', sort_confidence=0.9907711706754092, stance_confidence=0.9978714715086467, stance_label='pro')

In [10]:
# Show the raw API dict for that same first sentence.
response['sentences'][0]

{'argumentConfidence': 0.9836708698421717,
 'argumentLabel': 'argument',
 'sentenceOriginal': 'Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.',
 'sentencePreprocessed': 'Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.',
 'sortConfidence': 0.9907711706754092,
 'stanceConfidence': 0.9978714715086467,
 'stanceLabel': 'pro'}

In [11]:
# Example of a link that is no longer valid: the helper raises
# errors.Refused, which is printed here instead of failing the cell.
url = url_df.content_url.values[3]
try:
    response = classify.classify_url_sentences(topic, url, user_id, api_key)
except errors.Refused as refusal:
    print(refusal)

Refused: 400: Website could not be crawled or returned an empty result. Please contact an administrator.


## Run a batch of URLs

In [12]:
# Number of candidate URLs available in the dataset.
url_df.content_url.shape

(108459,)

In [13]:
# Time a batch classification of the first 20 URLs.
start_time = time.time()

url_batch = url_df.content_url.values[:20]
doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(topic, url_batch)

elapsed = time.time() - start_time
print("iteration took {:.3} s".format(elapsed))

DEBUG:2020-06-26 18:54:52,142:arg_mine.api.classify: Attempting url 1 of 20
DEBUG:2020-06-26 18:54:53,594:arg_mine.api.classify: Attempting url 2 of 20
DEBUG:2020-06-26 18:54:54,751:arg_mine.api.classify: Attempting url 3 of 20
DEBUG:2020-06-26 18:54:56,498:arg_mine.api.classify: Attempting url 4 of 20
DEBUG:2020-06-26 18:54:57,185:arg_mine.api.classify: Attempting url 5 of 20
DEBUG:2020-06-26 18:54:58,846:arg_mine.api.classify: Attempting url 6 of 20
DEBUG:2020-06-26 18:55:00,722:arg_mine.api.classify: Attempting url 7 of 20
DEBUG:2020-06-26 18:55:00,986:arg_mine.api.classify: Attempting url 8 of 20
DEBUG:2020-06-26 18:55:02,002:arg_mine.api.classify: Attempting url 9 of 20
DEBUG:2020-06-26 18:55:03,233:arg_mine.api.classify: Attempting url 10 of 20
DEBUG:2020-06-26 18:55:04,771:arg_mine.api.classify: Attempting url 11 of 20
DEBUG:2020-06-26 18:55:06,346:arg_mine.api.classify: Attempting url 12 of 20
DEBUG:2020-06-26 18:55:07,821:arg_mine.api.classify: Attempting url 13 of 20
DEBUG:20

iteration took 26.8 s


In [14]:
# Count of refused URLs from the batch, followed by the refused URLs themselves.
print(len(refused_doc_list))
refused_doc_list

7


['https://www.heraldmailmedia.com/news/nation/australia-sending-aid-to-wildfire-towns-as-death-toll-rises/article_883fa793-6c0a-547e-8f77-b5964f1d7182.html',
 'https://www.seattlepi.com/news/world/article/Cooler-weather-brings-respite-in-Australian-14950114.php',
 'https://www.airdrietoday.com/national-business/starbucks-goals-for-sustainability-will-require-significant-consumer-buy-in-2037947',
 'https://www.heraldbulletin.com/news/nation_world/davos-chief-welcomes-views-of-trump-greta-thunberg-at-forum/article_24dbb151-7132-5c48-a7df-c13b67004ec0.html',
 'https://ravallirepublic.com/news/local/article_cb291465-d430-5477-8b41-1bedda9beaea.html',
 'https://www.caledonianrecord.com/opinion/columns/shawn-shouldice-small-business-owners-prepare-to-dodge-bullets-during/article_9f1b6399-7ca9-59bc-8788-b4d05ec5825c.html',
 'https://www.nsnews.com/trudeau-meets-pallister-and-the-meng-hearing-in-the-news-for-jan-20-1.24056357']

## Try concurrent requests with `grequests`

In [15]:
# Concurrency experiment: send the first 30 URLs to the classify endpoint in
# chunks, issuing the requests of each chunk concurrently via grequests.
pool_size = 5    # handed to session.get_session and used to cap in-flight requests
chunk_size = 10  # number of URLs submitted per grequests.map call
url_list = url_df.content_url.values[:30]

# perf_counter is monotonic, so the measured durations cannot be skewed by
# system clock adjustments during the run.
start_time = time.perf_counter()

# One shared session for all requests.
# NOTE(review): pool_size is presumably the HTTP connection pool size of the
# session -- confirm against session.get_session.
s = session.get_session(pool_size=pool_size)
full_list = []  # responses (or None for failed requests) across all chunks

for i in range(0, len(url_list), chunk_size):
    iter_time = time.perf_counter()
    print(">>>> iteration: {}".format(i))
    chunk_urls = url_list[i:i + chunk_size]
    # Lazily build the unsent requests; grequests.map sends them.
    unsent_requests = (
        grequests.post(
            session.ApiUrl.CLASSIFY_BASE_URL,
            json=classify.bundle_payload(topic, u),
            session=s,
            allow_redirects=False,
        )
        for u in chunk_urls
    )
    # Throttle to pool_size concurrent requests so the number of in-flight
    # requests matches the session configured above (previously a hard-coded
    # size=100, which never throttled a 10-item chunk).
    output = grequests.map(
        unsent_requests,
        size=pool_size,
        exception_handler=classify.exception_handler,
    )
    full_list.extend(output)

    print("iteration took {:.3} s".format(time.perf_counter() - iter_time))

print("{} URLs took {:.3} s".format(len(url_list), time.perf_counter() - start_time))


>>>> iteration: 0
iteration took 12.2 s
>>>> iteration: 10
iteration took 14.1 s
>>>> iteration: 20
iteration took 18.9 s
30 URLs took 45.3 s


In [16]:
# All results collected from the chunked requests.
full_list

[<Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [400]>,
 <Response [200]>,
 <Response [200]>,
 <Response [400]>,
 <Response [400]>,
 <Response [400]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [400]>,
 <Response [400]>,
 <Response [200]>,
 <Response [400]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [400]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>]

In [17]:
# NOTE: `output` is the variable left over from the last pass of the chunk
# loop, so this inspects the first response of the final chunk, not
# full_list[0].
output[0].json().keys()

dict_keys(['metadata', 'sentences'])

In [29]:
# Take the third collected response for a closer look.
r = full_list[2]

In [30]:
# Raises for a 4xx/5xx status; no output here means the status was fine.
r.raise_for_status()

In [31]:
# Decode the selected response body as JSON.
json_response = r.json()

In [32]:
# Inspect the metadata section of the decoded response.
json_response['metadata']

{'computeAttention': False,
 'language': 'en',
 'modelVersion': 0.1,
 'predictStance': True,
 'removeDuplicates': True,
 'showOnlyArguments': True,
 'sortBy': 'argumentConfidence',
 'timeArgumentPrediction': 0.32140135765075684,
 'timeAttentionComputation': -1,
 'timeLogging': 0.1323702335357666,
 'timePreprocessing': 3.218650817871094e-05,
 'timeStancePrediction': -1,
 'timeTotal': 0.8731288909912109,
 'topic': 'climate change',
 'totalArguments': 5,
 'totalClassifiedSentences': 24,
 'totalContraArguments': 4,
 'totalNonArguments': 19,
 'totalProArguments': 1,
 'userMetadata': 'https://www.stourbridgenews.co.uk/news/national/18141364.seven-arrested-gas-rig-protest/'}

In [59]:
# Turn the raw concurrent responses into three collections (documents,
# sentences, refused documents -- going by the unpacked names). This rebinds
# the three names that the earlier batch run also assigned.
doc_list, sentence_list, refused_doc_list = classify.process_responses(full_list)

ERROR:2020-06-26 19:20:38,840:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-06-26 19:20:38,843:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-06-26 19:20:38,845:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-06-26 19:20:38,848:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-06-26 19:20:38,850:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-06-26 19:20:38,853:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-06-26 19:20

In [63]:
# Convert the document records to dicts via the project helper and tabulate them.
docs_df = pd.DataFrame(utils.dataclasses_to_dicts(doc_list))

In [64]:
# Display the per-document table.
docs_df

Unnamed: 0,doc_id,url,topic,model_version,language,time_argument_prediction,time_attention_computation,time_preprocessing,time_stance_prediction,time_logging,time_total,total_arguments,total_contra_arguments,total_pro_arguments,total_non_arguments,total_classified_sentences
0,657f9dd95eb97597e34d0c05b5a93ba6,https://www.smh.com.au/politics/federal/libera...,climate change,0.1,en,0.489759,-1,3.6e-05,-1,0.065924,0.745644,5,2,3,32,37
1,3c3e98232ca62ff3469234f554bea7b4,https://www.thehindu.com/news/cities/kozhikode...,climate change,0.1,en,0.368602,-1,4.1e-05,-1,0.129132,0.742825,1,1,0,27,28
2,db5ae573c6388b6fc794e81245baba90,https://www.stourbridgenews.co.uk/news/nationa...,climate change,0.1,en,0.321401,-1,3.2e-05,-1,0.13237,0.873129,5,4,1,19,24
3,cf84a621b674514c19fc2ad15ece8db7,http://global.chinadaily.com.cn/a/202001/07/WS...,climate change,0.1,en,0.369287,-1,3.8e-05,-1,0.044311,1.326333,13,8,5,15,28
4,cc5e8dcf8b787ea4fc0f7455a84559ac,https://www.leicestermercury.co.uk/news/uk-wor...,climate change,0.1,en,0.808165,-1,4.9e-05,-1,0.087824,1.130397,15,0,15,47,62
5,12ac4bc3da84ac75df28dffe4b13d85d,https://in.reuters.com/article/us-brazil-envir...,climate change,0.1,en,0.261803,-1,2.3e-05,-1,0.086771,0.527551,8,5,3,11,19
6,6fc1a513b9e2d64eb252bdea66b0cc5f,https://www.thehindu.com/sci-tech/science/indi...,climate change,0.1,en,0.471469,-1,2.9e-05,-1,0.064903,0.781918,7,2,5,31,38
7,8e5b639c0e49bf028c15acf9b88ca24a,https://antinuclear.net/2020/01/14/australia-c...,climate change,0.1,en,0.33946,-1,3.9e-05,-1,0.102486,1.041405,14,2,12,11,25
8,1de27aa91b19ad50ebc651ba36739855,http://www.baltimoresun.com/maryland/howard/bs...,climate change,0.1,en,0.041643,-1,1e-05,-1,0.060179,0.833888,0,0,0,3,3
9,aa9560eb90c7d5fec38b3b2fdf1fa43c,http://www.digitaljournal.com/news/world/clima...,climate change,0.1,en,0.220015,-1,2.1e-05,-1,0.110479,0.662259,8,7,1,8,16


In [None]:
# Run the first 30 URLs through the packaged concurrent fetcher, 10 per chunk.
responses = classify.fetch_concurrent(
    topic,
    url_list=url_df.content_url.values[:30],
    chunk_size=10,
)

DEBUG:2020-06-26 19:24:26,189:arg_mine.api.classify: >>>> iteration: 0
ERROR:2020-06-26 19:24:49,516:arg_mine.api.classify: Request failed request:<grequests.AsyncRequest object at 0x7fb146db7df0> 
 exception:HTTPSConnectionPool(host='api.argumentsearch.com', port=443): Max retries exceeded with url: /en/classify (Caused by ResponseError('too many 500 error responses')) 
NoneType: None
ERROR:2020-06-26 19:24:49,518:arg_mine.api.classify: Request failed request:<grequests.AsyncRequest object at 0x7fb142464d90> 
 exception:HTTPSConnectionPool(host='api.argumentsearch.com', port=443): Max retries exceeded with url: /en/classify (Caused by ResponseError('too many 500 error responses')) 
NoneType: None
ERROR:2020-06-26 19:24:49,520:arg_mine.api.classify: Request failed request:<grequests.AsyncRequest object at 0x7fb1424645b0> 
 exception:HTTPSConnectionPool(host='api.argumentsearch.com', port=443): Max retries exceeded with url: /en/classify (Caused by ResponseError('too many 500 error resp

iteration took 23.3 s


ERROR:2020-06-26 19:25:12,728:arg_mine.api.classify: Request failed request:<grequests.AsyncRequest object at 0x7fb146bfbac0> 
 exception:HTTPSConnectionPool(host='api.argumentsearch.com', port=443): Max retries exceeded with url: /en/classify (Caused by ResponseError('too many 500 error responses')) 
NoneType: None
ERROR:2020-06-26 19:25:12,729:arg_mine.api.classify: Request failed request:<grequests.AsyncRequest object at 0x7fb141e4d070> 
 exception:HTTPSConnectionPool(host='api.argumentsearch.com', port=443): Max retries exceeded with url: /en/classify (Caused by ResponseError('too many 500 error responses')) 
NoneType: None
ERROR:2020-06-26 19:25:12,730:arg_mine.api.classify: Request failed request:<grequests.AsyncRequest object at 0x7fb141e4d040> 
 exception:HTTPSConnectionPool(host='api.argumentsearch.com', port=443): Max retries exceeded with url: /en/classify (Caused by ResponseError('too many 500 error responses')) 
NoneType: None
ERROR:2020-06-26 19:25:12,731:arg_mine.api.cla

iteration took 23.2 s


In [19]:
# In the recorded run every entry is None: the logged errors report
# "too many 500 error responses" from the API for the failed requests.
responses

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]