# Argument Mining API query testing

Load the target datafile, and see how the query results work

In [1]:
# reload local package definitions for each cell
%load_ext autoreload
%autoreload 2

In [2]:
import os
import time

from dotenv import find_dotenv, load_dotenv
import requests
import grequests
import numpy as np
import pandas as pd
import json

from arg_mine import DATA_DIR
from arg_mine.data.loaders import get_gdelt_df
from arg_mine.api import classify, auth, session, errors
from arg_mine import utils

  curious_george.patch_all(thread=False, select=False)


In [3]:
# load the env variables to get the API key
user_id, api_key = auth.load_auth_tokens()

In [4]:
# Build the path to the raw snippet CSV under the project data directory.
csv_datapath = os.path.join(DATA_DIR, "raw", "2020-climate-change-narrative")
csv_filepath = os.path.join(csv_datapath, "WebNewsEnglishSnippets.2020.csv")

# Load the CSV through the project loader. The `content_url` and
# `labeled_argument` columns of this frame are the ones read by later cells.
url_df = get_gdelt_df(csv_filepath)
# Preview the first rows as a sanity check on the load.
url_df.head()

Unnamed: 0,datetime,title,headline_image_url,content_url,labeled_argument,timestamp
0,20200115111500,Liberal MPs back Science Minister Karen Andrew...,https://static.ffx.io/images/$zoom_0.2627%2C$m...,https://www.smh.com.au/politics/federal/libera...,the science in her interview with The Age and...,2020-01-15 11:15:00
1,20200117184500,Several crowd-pullers on day two of KLF,https://www.thehindu.com/news/cities/kozhikode...,https://www.thehindu.com/news/cities/kozhikode...,"Guha, who talked about patriotism and jingois...",2020-01-17 18:45:00
2,20200106233000,Seven arrested after gas rig protest,https://www.stourbridgenews.co.uk/resources/im...,https://www.stourbridgenews.co.uk/news/nationa...,three demands for the Scottish and UK Governm...,2020-01-06 23:30:00
3,20200101111500,Australia sending aid to wildfire towns as dea...,https://bloximages.newyork1.vip.townnews.com/h...,https://www.heraldmailmedia.com/news/nation/au...,this season the worst on record and reignited ...,2020-01-01 11:15:00
4,20200107101500,"A hot, dry country caught between fire and a c...",,http://global.chinadaily.com.cn/a/202001/07/WS...,", which is burned to generate electricity, wit...",2020-01-07 10:15:00


## Build the `classify` request

In [5]:
url = url_df.iloc[0].content_url

In [6]:
# Hand-built request against the classify endpoint, to inspect the raw response
# before switching to the `arg_mine.api.classify` helpers further down.
CLASSIFY_BASE_URL = "https://api.argumentsearch.com/en/classify"
# Passed to requests as the request timeout, in seconds.
timeout = 5

# Topic string sent with every classify request in this notebook.
topic = "climate change"


# JSON body of the request. The credentials travel inside the body, so do not
# print or log `payload`.
# NOTE(review): the meaning of the option flags below is defined by the remote
# API, not by this notebook -- confirm against the API documentation.
payload = {
    "topic": topic,
    "userID": user_id,
    "apiKey": api_key,
    "targetUrl": url,
    "topicRelevance": "word2vec",
    "predictStance": True,
    "computeAttention": True,
    "showOnlyArguments": False,
}

response = requests.post(
    CLASSIFY_BASE_URL,
    json=payload,
    timeout=timeout,
)
# Raise requests.HTTPError on a 4xx/5xx status instead of continuing with a bad body.
response.raise_for_status()

In [7]:
out = response.json()
out.keys()

dict_keys(['metadata', 'sentences'])

## try a single URL
Get the doc and sentence objects

In [8]:
response = classify.classify_url_sentences(topic, url_df.content_url.values[0], user_id, api_key)

In [9]:
classify.ClassifiedSentence.from_dict(url, topic, response['sentences'][0])

ClassifiedSentence(url='https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html', doc_id='657f9dd95eb97597e34d0c05b5a93ba6', topic='climate change', sentence_id='da903878c62343fb482bfad67a7523f1', argument_confidence=0.9836708698421717, argument_label='argument', sentence_original='Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.', sentence_preprocessed='Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.', sort_confidence=None, stance_confidence=0.9978714715086467, stance_label='pro')

In [10]:
response['sentences'][0]

{'argumentConfidence': 0.9836708698421717,
 'argumentLabel': 'argument',
 'sentenceOriginal': 'Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.',
 'sentencePreprocessed': 'Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.',
 'stanceConfidence': 0.9978714715086467,
 'stanceLabel': 'pro'}

In [11]:
len(response['sentences'])

5

In [12]:
# Good example of a link that is no longer valid: the API refuses it.
# Dedicated names are used so this demo does not rebind the notebook-wide `url`
# and `response` from the single-URL cells; otherwise, after a refusal, `url`
# would point at row 3 while `response` still held the result for row 0.
dead_url = url_df.content_url.values[3]
try:
    dead_response = classify.classify_url_sentences(topic, dead_url, user_id, api_key)
except errors.Refused as e:
    # Expected outcome for this URL; show the API's refusal message.
    print(e)

Refused: 400: Website could not be crawled or returned an empty result. Please contact an administrator.


## Run a batch of urls

In [13]:
url_df.content_url.shape

(108459,)

In [14]:
# Time the one-URL-at-a-time extraction over the first 20 URLs.
# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()

doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(topic, url_df.content_url.values[:20])

# Fixed-point format: a bare "{:.3}" means 3 significant digits and falls back
# to exponent notation (e.g. 1.23e+03) once the run exceeds 1000 seconds.
print("iteration took {:0.2f} s".format(time.perf_counter() - start_time))

DEBUG:2020-07-02 23:07:22,259:arg_mine.api.classify: Attempting url 1 of 20
DEBUG:2020-07-02 23:07:25,335:arg_mine.api.classify: Attempting url 2 of 20
DEBUG:2020-07-02 23:07:27,921:arg_mine.api.classify: Attempting url 3 of 20
DEBUG:2020-07-02 23:07:29,656:arg_mine.api.classify: Attempting url 4 of 20
DEBUG:2020-07-02 23:07:30,364:arg_mine.api.classify: Attempting url 5 of 20
DEBUG:2020-07-02 23:07:32,455:arg_mine.api.classify: Attempting url 6 of 20
DEBUG:2020-07-02 23:07:34,426:arg_mine.api.classify: Attempting url 7 of 20
DEBUG:2020-07-02 23:07:35,182:arg_mine.api.classify: Attempting url 8 of 20
DEBUG:2020-07-02 23:07:36,318:arg_mine.api.classify: Attempting url 9 of 20
DEBUG:2020-07-02 23:07:37,614:arg_mine.api.classify: Attempting url 10 of 20
DEBUG:2020-07-02 23:07:39,153:arg_mine.api.classify: Attempting url 11 of 20
DEBUG:2020-07-02 23:07:41,436:arg_mine.api.classify: Attempting url 12 of 20
DEBUG:2020-07-02 23:07:42,713:arg_mine.api.classify: Attempting url 13 of 20
DEBUG:20

iteration took 33.3 s


In [15]:
print(len(refused_doc_list))
refused_doc_list

7


['https://www.heraldmailmedia.com/news/nation/australia-sending-aid-to-wildfire-towns-as-death-toll-rises/article_883fa793-6c0a-547e-8f77-b5964f1d7182.html',
 'https://www.seattlepi.com/news/world/article/Cooler-weather-brings-respite-in-Australian-14950114.php',
 'https://www.airdrietoday.com/national-business/starbucks-goals-for-sustainability-will-require-significant-consumer-buy-in-2037947',
 'https://www.heraldbulletin.com/news/nation_world/davos-chief-welcomes-views-of-trump-greta-thunberg-at-forum/article_24dbb151-7132-5c48-a7df-c13b67004ec0.html',
 'https://ravallirepublic.com/news/local/article_cb291465-d430-5477-8b41-1bedda9beaea.html',
 'https://www.caledonianrecord.com/opinion/columns/shawn-shouldice-small-business-owners-prepare-to-dodge-bullets-during/article_9f1b6399-7ca9-59bc-8788-b4d05ec5825c.html',
 'https://www.nsnews.com/trudeau-meets-pallister-and-the-meng-hearing-in-the-news-for-jan-20-1.24056357']

## try concurrent requests with grequests

`grequests` issues concurrent asynchronous requests (it is built on gevent), but has memory issues for long lists. We can chunk the async requests and write the outputs to storage to avoid this problem.

In [16]:
responses = classify.fetch_concurrent(topic, url_list = url_df.content_url.values[:20], chunk_size=10)

DEBUG:2020-07-02 23:07:55,590:arg_mine.api.classify: >>>> starting doc extraction
DEBUG:2020-07-02 23:08:09,806:arg_mine.api.classify: iteration 0 took 14.214 s (10 docs)
DEBUG:2020-07-02 23:08:25,559:arg_mine.api.classify: iteration 1 took 15.752 s (10 docs)
DEBUG:2020-07-02 23:08:25,560:arg_mine.api.classify: 20 URLs took 29.970 s


In [17]:
docs_df, sentences_df, missing_urls = classify.process_responses(responses)

ERROR:2020-07-02 23:08:25,596:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-07-02 23:08:25,599:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-07-02 23:08:25,600:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-07-02 23:08:25,603:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-07-02 23:08:25,605:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-07-02 23:08:25,606:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
ERROR:2020-07-02 23:08

In [18]:
sentences_df.shape

(313, 11)

# Label the sentences as GT arguments
The given snippet contains the context surrounding the identified key phrase. Tokenizing the phrase will not work.
It is still unclear whether the ground-truth (GT) data can be claimed to be an argument; likely not.

> This final dataset covers worldwide English language online news coverage 2015-2020 mentioning "climate change" OR "global warming" OR "climate crisis" OR "greenhouse gas" OR "greenhouse gases" OR "carbon tax" totaling 6.3 million articles. [...]  
>Most importantly, for each match, a short snippet is shown that shows the first instance of one of the climate change phrases above in the article with the 100 characters before and after the appearance, truncated to the nearest word (if the 100th character before or after the phrase appears in the middle of a word, the window will be shrunk to the closest full word). Note that in the majority of cases the first match in the article is selected, but sometimes due to the nature of the finite automaton used to generate the snippets, a later match may be chosen from the article if it allows for a larger context window under certain circumstances.  
>Using a window of 100 characters before and after the match allows for brief non-consumptive snippets that show the context of the match and allow a better understanding of whether the article's mention of climate change was a cursory mention or central to the story and the argument, evidence  and context of the narrative within.

In [19]:
# load the extracted sentences
# These are previously extracted results stored under DATA_DIR/processed.
target_dir = "gdelt-climate-change-docs"
in_data_path = os.path.join(DATA_DIR, "processed", target_dir)

docs_filename = "gdelt_2020_docs.csv"
sentences_filename = "gdelt_2020_sentences.csv"

# load the files into dataframes
# NOTE: this rebinds `docs_df` and `sentences_df`, replacing the frames built
# from the 20-URL batch earlier in the notebook.
docs_df = pd.read_csv(os.path.join(in_data_path, docs_filename))
sentences_df = pd.read_csv(os.path.join(in_data_path, sentences_filename))

In [20]:
# writing this out to use for parsing later
# These are the six search phrases listed in the dataset description quoted in
# this section's markdown. Nothing else in this notebook reads the list yet.
keyword_list = [
    "climate change",
    "global warming",
    "climate crisis",
    "greenhouse gas",
    "greenhouse gases",
    "carbon tax"
]

In [21]:
# utility methods for finding the GT argument sentences we extract

def match_doc_id(url, docs_df):
    """Look up the `doc_id` recorded for `url` in `docs_df`.

    `docs_df` must have `url` and `doc_id` columns. When several rows share
    the URL, the first one wins. Raises IndexError if no row has that URL.
    """
    url_mask = docs_df['url'] == url
    matching_ids = docs_df.loc[url_mask, 'doc_id']
    return matching_ids.iloc[0]

def get_doc_sentences(doc_id, sentences_df):
    """Select the rows of `sentences_df` whose `doc_id` column equals `doc_id`.

    The original index labels are kept; an unknown `doc_id` yields an empty frame.
    """
    belongs_to_doc = sentences_df['doc_id'] == doc_id
    return sentences_df.loc[belongs_to_doc]

In [22]:
doc_id = match_doc_id(url_df.iloc[0]['content_url'], docs_df)
doc_sentences = get_doc_sentences(doc_id, sentences_df)

In [23]:
sentences_df['has_labeled_arg'] = False

In [24]:
sentences_df.columns

Index(['url', 'doc_id', 'topic', 'sentence_id', 'argument_confidence',
       'argument_label', 'sentence_original', 'sentence_preprocessed',
       'sort_confidence', 'stance_confidence', 'stance_label',
       'has_labeled_arg'],
      dtype='object')

In [25]:

# Worked example on the first article: flag the extracted sentences that contain
# a piece of the article's ground-truth (GT) snippet.
content_url = url_df.iloc[0]  # NOTE: this is the whole row, not only the URL string
# Read the snippet from the same positional row; `url_df['labeled_argument'][0]`
# is a label lookup and only matches when the index happens to start at 0.
snippit = content_url['labeled_argument']
doc_id = match_doc_id(content_url['content_url'], docs_df)
doc_sentences = get_doc_sentences(doc_id, sentences_df)

# tokenize the GT argument
# A missing (non-string) snippet yields no tokens instead of crashing below.
arg_tokens = snippit.split(".") if isinstance(snippit, str) else []
arg_tokens = [s.strip() for s in arg_tokens]
# Drop empty pieces: an empty token is contained in every sentence.
arg_tokens = [s for s in arg_tokens if s]

for token in arg_tokens:
    # regex=False: snippet text is literal. Characters such as "[" or "(" would
    # otherwise be compiled as a regex pattern and can raise re.error.
    matches = doc_sentences[doc_sentences.sentence_original.str.contains(token, na=False, regex=False)]['sentence_id']
    print(matches)
    if matches.empty:
        # No extracted sentence contains this piece of the snippet.
        continue
    # only look at the first match, and only within this document
    is_first_match = (sentences_df['doc_id'] == doc_id) & (sentences_df['sentence_id'] == matches.values[0])
    sentences_df.loc[is_first_match, 'has_labeled_arg'] = True

26    4a2fcf5b4ca0e45bcae7fcc6af5e2cd6
Name: sentence_id, dtype: object
3    9eac4f301da33af0eaf7e5a4b1a5c759
Name: sentence_id, dtype: object
23    e3496a47bcad630aea1f078b5202cd43
Name: sentence_id, dtype: object


In [26]:
token

'"Australians want us to get on with the job of meeting our'

In [27]:
sentences_df.loc[sentences_df.doc_id == doc_id]['has_labeled_arg']

0     False
1     False
2     False
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23     True
24    False
25    False
26     True
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
Name: has_labeled_arg, dtype: bool

In [28]:
# Keep only the rows whose article URL also appears in `docs_df`, i.e. the
# articles for which an extracted document record exists.
url_df_crop = url_df[url_df['content_url'].isin(docs_df.url.values)]

In [29]:
from arg_mine.data import labelers

In [32]:
labelers.label_doc_sentences_with_context(url_df_crop.iloc[0], docs_df, sentences_df)

Unnamed: 0,url,doc_id,topic,sentence_id,argument_confidence,argument_label,sentence_original,sentence_preprocessed,sort_confidence,stance_confidence,stance_label,has_labeled_arg
0,https://www.smh.com.au/politics/federal/libera...,657f9dd95eb97597e34d0c05b5a93ba6,climate change,da903878c62343fb482bfad67a7523f1,0.983671,argument,"Mr Morrison said Ms Andrews had ""well set out""...","Mr Morrison said Ms Andrews had ""well set out""...",0.990771,0.997871,pro,False
1,https://www.smh.com.au/politics/federal/libera...,657f9dd95eb97597e34d0c05b5a93ba6,climate change,194995c09a1bbb4fa2f1d53dabc88f76,0.950217,argument,"""What I have been seeking to stress, particula...","""What I have been seeking to stress, particula...",0.970347,0.990477,pro,False
2,https://www.smh.com.au/politics/federal/libera...,657f9dd95eb97597e34d0c05b5a93ba6,climate change,7582076e5fbb0316c7cff50bfea50857,0.932927,argument,"Mr Kelly said the only ""denial"" he had seen wa...","Mr Kelly said the only ""denial"" he had seen wa...",0.889054,0.845181,contra,False
3,https://www.smh.com.au/politics/federal/libera...,657f9dd95eb97597e34d0c05b5a93ba6,climate change,9eac4f301da33af0eaf7e5a4b1a5c759,0.774244,argument,"""Overwhelmingly, Australians accept the scienc...","""Overwhelmingly, Australians accept the scienc...",0.879241,0.984239,pro,True
4,https://www.smh.com.au/politics/federal/libera...,657f9dd95eb97597e34d0c05b5a93ba6,climate change,cb25d9b5f07e95fa139f19951515c0e4,0.650039,argument,"""Karen is correct when she says every second s...","""Karen is correct when she says every second s...",0.701024,0.752010,contra,False
...,...,...,...,...,...,...,...,...,...,...,...,...
31663,http://www.terradaily.com/afp/200122185236.7b9...,9e9c7cd2ec9e0a16afb31198eec12e81,climate change,d7a0378202ff2b0a4f4f6a61db7178fb,0.004751,no argument,The industry-led Alliance to End Plastic Waste...,The industry-led Alliance to End Plastic Waste...,0.004751,0.000000,,False
31664,http://www.terradaily.com/afp/200122185236.7b9...,9e9c7cd2ec9e0a16afb31198eec12e81,climate change,d0f2a40853f196e105761dd88f8fb90f,0.004536,no argument,"- Say one thing, do another -","- Say one thing, do another -",0.004536,0.000000,,False
31665,http://www.terradaily.com/afp/200122185236.7b9...,9e9c7cd2ec9e0a16afb31198eec12e81,climate change,1efc109bdddbb6e51e9b69cc0a1b0701,0.004422,no argument,All rights reserved.,All rights reserved.,0.004422,0.000000,,False
31666,http://www.terradaily.com/afp/200122185236.7b9...,9e9c7cd2ec9e0a16afb31198eec12e81,climate change,eef99c4932bfe46e71fa656019c0c55c,0.004049,no argument,PepsiCo said it could elaborate on the announc...,PepsiCo said it could elaborate on the announc...,0.004049,0.000000,,False


In [34]:
# Label every extracted sentence against its article's snippet, and time the run.
start_time = time.time()

# NOTE: this rebinds `sentences_df` to the labeler's return value, so re-running
# the cell feeds the already-labeled frame back in.
# NOTE(review): the recorded run of this cell stopped with
# "error: unterminated character set at position 0", which is a regex compile
# error. Presumably a snippet token containing "[" is being used as a regex
# pattern inside the labeler -- confirm in arg_mine.data.labelers.
sentences_df = labelers.label_gdelt_sentences(url_df, docs_df, sentences_df)


print("labeling took {:0.2f} s".format(time.time() - start_time))

DEBUG:2020-07-02 23:09:38,159:arg_mine.api.classify: No matches found for token in doc 3c3e98232ca62ff3469234f554bea7b4: 'The day's other speakers included writers Anand, Mukundan'
DEBUG:2020-07-02 23:09:38,166:arg_mine.api.classify: No matches found for token in doc db5ae573c6388b6fc794e81245baba90: ' three demands for the Scottish and UK Governments, including urging them to "tell the truth" about the impact of the fossil fuel industry on the climate crisis.'
DEBUG:2020-07-02 23:09:38,206:arg_mine.api.classify: No matches found for token in doc cc5e8dcf8b787ea4fc0f7455a84559ac: 'Pick one thing to help the planet, or pick several.'
DEBUG:2020-07-02 23:09:38,208:arg_mine.api.classify: No matches found for token in doc cc5e8dcf8b787ea4fc0f7455a84559ac: 'But please commit to changing today, so that we can all do our bit to slow down the climate crisis.'
DEBUG:2020-07-02 23:09:38,211:arg_mine.api.classify: No matches found for token in doc cc5e8dcf8b787ea4fc0f7455a84559ac: 'Start small.'


error: unterminated character set at position 0

# Check timing of different approaches

Does grequests give us a performance boost?
* time serial extraction vs using grequests

Does returning all sentences vs just arguments give us a performance hit?
* time extraction of 50 articles to see if API times are significantly different

In [None]:
num_docs = 50

In [None]:
# Baseline: time the one-URL-at-a-time extraction over the first `num_docs` URLs.
# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()

doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(topic, url_df.content_url.values[:num_docs])

# Fixed-point format: a bare "{:.3}" means 3 significant digits and falls back
# to exponent notation (e.g. 1.23e+03) once the run exceeds 1000 seconds.
print("iteration took {:0.2f} s".format(time.perf_counter() - start_time))

In [None]:
# Comparison: time the chunked concurrent fetch plus response processing over
# the same first `num_docs` URLs.
# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()

responses = classify.fetch_concurrent(topic, url_list = url_df.content_url.values[:num_docs], chunk_size=10)
# NOTE: this rebinds `docs_df` and `sentences_df`, replacing the frames loaded
# from the processed CSVs (and any labels added to them) earlier in the notebook.
docs_df, sentences_df, missing_urls = classify.process_responses(responses)

# Fixed-point format: a bare "{:.3}" means 3 significant digits and falls back
# to exponent notation (e.g. 1.23e+03) once the run exceeds 1000 seconds.
print("iteration took {:0.2f} s".format(time.perf_counter() - start_time))