# Argument Mining API query testing

Load the target datafile, and see how the query results work

In [1]:
# reload local package definitions for each cell
%load_ext autoreload
%autoreload 2

In [2]:
import os
import time

from dotenv import find_dotenv, load_dotenv
import requests
import grequests
import numpy as np
import pandas as pd
import json

from arg_mine import DATA_DIR
from arg_mine.data.loaders import get_gdelt_df
from arg_mine.api import classify, auth, session, errors
from arg_mine import utils

  curious_george.patch_all(thread=False, select=False)


In [3]:
# load the env variables to get the API key
user_id, api_key = auth.load_auth_tokens()

In [4]:
csv_datapath = os.path.join(DATA_DIR, "raw", "2020-climate-change-narrative")
csv_filepath = os.path.join(csv_datapath, "WebNewsEnglishSnippets.2020.csv")

url_df = get_gdelt_df(csv_filepath)
url_df.head()

Unnamed: 0,datetime,title,headline_image_url,content_url,labeled_argument,timestamp
0,20200115111500,Liberal MPs back Science Minister Karen Andrew...,https://static.ffx.io/images/$zoom_0.2627%2C$m...,https://www.smh.com.au/politics/federal/libera...,the science in her interview with The Age and...,2020-01-15 11:15:00
1,20200117184500,Several crowd-pullers on day two of KLF,https://www.thehindu.com/news/cities/kozhikode...,https://www.thehindu.com/news/cities/kozhikode...,"Guha, who talked about patriotism and jingois...",2020-01-17 18:45:00
2,20200106233000,Seven arrested after gas rig protest,https://www.stourbridgenews.co.uk/resources/im...,https://www.stourbridgenews.co.uk/news/nationa...,three demands for the Scottish and UK Governm...,2020-01-06 23:30:00
3,20200101111500,Australia sending aid to wildfire towns as dea...,https://bloximages.newyork1.vip.townnews.com/h...,https://www.heraldmailmedia.com/news/nation/au...,this season the worst on record and reignited ...,2020-01-01 11:15:00
4,20200107101500,"A hot, dry country caught between fire and a c...",,http://global.chinadaily.com.cn/a/202001/07/WS...,", which is burned to generate electricity, wit...",2020-01-07 10:15:00


## Build the `classify` request

In [5]:
url = url_df.iloc[0].content_url

In [6]:
CLASSIFY_BASE_URL = "https://api.argumentsearch.com/en/classify"
timeout = 5

topic = "climate change"


payload = {
    "topic": topic,
    "userID": user_id,
    "apiKey": api_key,
    "targetUrl": url,
    "topicRelevance": "word2vec",
    "predictStance": True,
    "computeAttention": True,
    "showOnlyArguments": False,
}

response = requests.post(
    CLASSIFY_BASE_URL,
    json=payload,
    timeout=timeout,
)
response.raise_for_status()

In [7]:
out = response.json()
out.keys()

dict_keys(['metadata', 'sentences'])

## try a single URL
Get the doc and sentence objects

In [8]:
response = classify.classify_url_sentences(topic, url_df.content_url.values[0], user_id, api_key)

In [9]:
classify.ClassifiedSentence.from_dict(url, topic, response['sentences'][0])

ClassifiedSentence(url='https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html', doc_id='657f9dd95eb97597e34d0c05b5a93ba6', topic='climate change', sentence_id='da903878c62343fb482bfad67a7523f1', argument_confidence=0.9836708698421717, argument_label='argument', sentence_original='Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.', sentence_preprocessed='Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.', sort_confidence=0.9907711706754092, stance_confidence=0.9978714715086467, stance_label='pro')

In [10]:
response['sentences'][0]

{'argumentConfidence': 0.9836708698421717,
 'argumentLabel': 'argument',
 'sentenceOriginal': 'Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.',
 'sentencePreprocessed': 'Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.',
 'sortConfidence': 0.9907711706754092,
 'stanceConfidence': 0.9978714715086467,
 'stanceLabel': 'pro'}

In [11]:
# good example of a link that is no longer valid
url = url_df.content_url.values[3]
try: 
    response = classify.classify_url_sentences(topic, url, user_id, api_key)
except errors.Refused as e:
    print(e)

Refused: 400: Website could not be crawled or returned an empty result. Please contact an administrator.


## Run a batch of urls

In [12]:
url_df.content_url.shape

(108459,)

In [15]:
start_time = time.time()

doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(topic, url_df.content_url.values[:20])

print("iteration took {:.3} s".format(time.time() - start_time))

DEBUG:2020-06-30 19:45:30,986:arg_mine.api.classify: Attempting url 1 of 20
DEBUG:2020-06-30 19:45:32,843:arg_mine.api.classify: Attempting url 2 of 20
DEBUG:2020-06-30 19:45:33,776:arg_mine.api.classify: Attempting url 3 of 20
DEBUG:2020-06-30 19:45:35,229:arg_mine.api.classify: Attempting url 4 of 20
DEBUG:2020-06-30 19:45:35,798:arg_mine.api.classify: Attempting url 5 of 20
DEBUG:2020-06-30 19:45:37,159:arg_mine.api.classify: Attempting url 6 of 20
DEBUG:2020-06-30 19:45:38,425:arg_mine.api.classify: Attempting url 7 of 20
DEBUG:2020-06-30 19:45:38,709:arg_mine.api.classify: Attempting url 8 of 20
DEBUG:2020-06-30 19:45:39,826:arg_mine.api.classify: Attempting url 9 of 20
DEBUG:2020-06-30 19:45:41,009:arg_mine.api.classify: Attempting url 10 of 20
DEBUG:2020-06-30 19:45:42,292:arg_mine.api.classify: Attempting url 11 of 20
DEBUG:2020-06-30 19:45:43,312:arg_mine.api.classify: Attempting url 12 of 20
DEBUG:2020-06-30 19:45:44,439:arg_mine.api.classify: Attempting url 13 of 20
DEBUG:20

iteration took 24.9 s


In [None]:
print(len(refused_doc_list))
refused_doc_list

## try concurrent requests with grequest

`grequest` does concurrent threaded requests, but has memory issues for long lists. We can chunk the async requests and write the outputs to storage to avoid this problem.

In [None]:
responses = classify.fetch_concurrent(topic, url_list = url_df.content_url.values[:20], chunk_size=10)

In [None]:
docs_df, sentences_df, missing_urls = classify.process_responses(responses)

In [None]:
sentences_df.shape

# Label the sentences as GT arguments
The given snippit contains context surrounding the given identified key phrase. Tokenizing the phrase will not work.
It is still unclear on whether or not the GT data can be claimed to be an argument; likely not.

> This final dataset covers worldwide English language online news coverage 2015-2020 mentioning "climate change" OR "global warming" OR "climate crisis" OR "greenhouse gas" OR "greenhouse gases" OR "carbon tax" totaling 6.3 million articles. [...]  
>Most importantly, for each match, a short snippet is shown that shows the first instance of one of the climate change phrases above in the article with the 100 characters before and after the appearance, truncated to the nearest word (if the 100th character before or after the phrase appears in the middle of a word, the window will be shrunk to the closest full word). Note that in the majority of cases the first match in the article is selected, but sometimes due to the nature of the finite automaton used to generate the snippets, a later match may be chosen from the article if it allows for a larger context window under certain circumstances.  
>Using a window of 100 characters before and after the match allows for brief non-consumptive snippets that show the context of the match and allow a better understanding of whether the article's mention of climate change was a cursory mention or central to the story and the argument, evidence  and context of the narrative within.

In [70]:
# load the extracted sentences
target_dir = "gdelt-climate-change-docs"
in_data_path = os.path.join(DATA_DIR, "processed", target_dir)

docs_filename = "gdelt_2020_docs.csv"
sentences_filename = "gdelt_2020_sentences.csv"

# load the files into dataframes
docs_df = pd.read_csv(os.path.join(in_data_path, docs_filename))
sentences_df = pd.read_csv(os.path.join(in_data_path, sentences_filename))

In [71]:
# writing this out to use for parsing later
keyword_list = [
    "climate change",
    "global warming",
    "climate crisis",
    "greenhouse gas",
    "greenhouse gases",
    "carbon tax"
]

In [72]:
# utility methods for finding the GT argument sentences we extract

def match_doc_id(url, docs_df):
    """Get the document id from the ground truth dataset"""
    return docs_df[url == docs_df['url']]['doc_id'].iloc[0]

def get_doc_sentences(doc_id, sentences_df):
    """Return all sentences for a given doc_id"""
    return sentences_df[sentences_df.doc_id == doc_id]

In [73]:
doc_id = match_doc_id(url_df.iloc[0]['content_url'], docs_df)
doc_sentences = get_doc_sentences(doc_id, sentences_df)

In [74]:
sentences_df['has_labeled_arg'] = False

In [102]:
content_url = url_df.iloc[0]
snippit = url_df['labeled_argument'][0]
doc_id = match_doc_id(content_url['content_url'], docs_df)
doc_sentences = get_doc_sentences(doc_id, sentences_df)

# tokenize the GT argument
arg_tokens = snippit.split(".") if isinstance(snippit, str) else None
arg_tokens = [s.strip() for s in arg_tokens]

for token in arg_tokens:
    matches = doc_sentences[doc_sentences.sentence_original.str.contains(token, na=False)]['sentence_id']
    print(matches)

    sentences_df[sentences_df['sentence_id'] == matches.values[0]]['has_labeled_arg'] = True

Series([], Name: sentence_id, dtype: object)


IndexError: index 0 is out of bounds for axis 0 with size 0

In [108]:
token

'the science in her interview with The Age and The Sydney Morning Herald on Wednesday'

In [107]:
doc_sentences[doc_sentences.sentence_original.str.contains(token, na=False)]

Unnamed: 0,url,doc_id,topic,sentence_id,argument_confidence,argument_label,sentence_original,sentence_preprocessed,sort_confidence,stance_confidence,stance_label,has_labeled_arg


In [105]:
doc_sentences.sentence_original.str.contains("Liberal backbenchers")

0     False
1     False
2     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17     True
18    False
19    False
20    False
21    False
22    False
24    False
25    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
Name: sentence_original, dtype: bool

In [93]:
token

' "Australians want us to get on with the job of meeting our '

In [91]:
arg_tokens

[' the science in her interview with The Age and The Sydney Morning Herald on Wednesday',
 ' "Overwhelmingly, Australians accept the science of climate change and they now have lived experience of the climatic conditions this summer," he said',
 ' "Australians want us to get on with the job of meeting our ']

In [89]:
sentences_df.loc[sentences_df.doc_id == doc_id]['has_labeled_arg']

0     False
1     False
2     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
24    False
25    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
Name: has_labeled_arg, dtype: bool

In [None]:
#%%timeit -r 1 -n 1
# this takes forever!! why does it take so long?

# can we find the ground truth argument sentences for a given doc?
sentences_df['has_labeled_arg'] = False

content_url = url_df.iloc[0]
# for content_url in url_df['content_url']:
# get the doc sentences

doc_id = match_doc_id(content_url['content_url'], docs_df)
doc_sentences = get_doc_sentences(doc_id, sentences_df)

# tokenize the GT argument
arg_tokens = [sentence.split(".") if isinstance(sentence, str) else None for sentence in url_df['labeled_argument'] ]

# if arg_tokens is None:
#     continue
# then iterate over sentences to see if the arg is contained in the sentence
for token in arg_tokens:
    try:
        sentences_df.loc[sentences_df.doc_id == doc_id, 'has_labeled_arg'] = (
            sentences_df.loc[sentences_df.doc_id == doc_id, 'has_labeled_arg'] ^ 
            doc_sentences.sentence_original.str.contains(token, na=False)
        )
    except TypeError as e:
        print(token)

In [None]:
np.where(arg_tokens==None)

In [None]:
arg_tokens

# Check timing of different approaches

Does grequests give us a performance boost?
* time serial extraction vs using grequests

Does returning all sentences vs just arguments give us a performance hit?
* time extraction of 50 articles to see if API times are significantly different

In [None]:
num_docs = 50

In [None]:
start_time = time.time()

doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(topic, url_df.content_url.values[:num_docs])

print("iteration took {:.3} s".format(time.time() - start_time))

In [None]:
start_time = time.time()

responses = classify.fetch_concurrent(topic, url_list = url_df.content_url.values[:num_docs], chunk_size=10)
docs_df, sentences_df, missing_urls = classify.process_responses(responses)

print("iteration took {:.3} s".format(time.time() - start_time))