# Argument Mining API query testing

Load the target datafile, and see how the query results work

In [1]:
# reload local package definitions for each cell
%load_ext autoreload
%autoreload 2

In [2]:
import os
import time

from dotenv import find_dotenv, load_dotenv
import requests
import grequests
import numpy as np
import pandas as pd
import json

from arg_mine import DATA_DIR
from arg_mine.api import classify, auth, session, errors
from arg_mine import utils
from arg_mine.data import labelers, loaders

  curious_george.patch_all(thread=False, select=False)


In [3]:
# load the env variables to get the API key
user_id, api_key = auth.load_auth_tokens()

In [4]:
csv_datapath = os.path.join(DATA_DIR, "raw", "2020-climate-change-narrative")
csv_filepath = os.path.join(csv_datapath, "WebNewsEnglishSnippets.2020.csv")

url_df = loaders.get_gdelt_df(csv_filepath)
url_df.head()

INFO:2020-07-06 23:56:58,290:arg_mine.data.loaders: reading data from: /opt/workspace/data/raw/2020-climate-change-narrative/WebNewsEnglishSnippets.2020.csv


Unnamed: 0,datetime,title,headline_image_url,content_url,topic_context,timestamp
0,20200115111500,Liberal MPs back Science Minister Karen Andrew...,https://static.ffx.io/images/$zoom_0.2627%2C$m...,https://www.smh.com.au/politics/federal/libera...,the science in her interview with The Age and...,2020-01-15 11:15:00
1,20200117184500,Several crowd-pullers on day two of KLF,https://www.thehindu.com/news/cities/kozhikode...,https://www.thehindu.com/news/cities/kozhikode...,"Guha, who talked about patriotism and jingois...",2020-01-17 18:45:00
2,20200106233000,Seven arrested after gas rig protest,https://www.stourbridgenews.co.uk/resources/im...,https://www.stourbridgenews.co.uk/news/nationa...,three demands for the Scottish and UK Governm...,2020-01-06 23:30:00
3,20200101111500,Australia sending aid to wildfire towns as dea...,https://bloximages.newyork1.vip.townnews.com/h...,https://www.heraldmailmedia.com/news/nation/au...,this season the worst on record and reignited ...,2020-01-01 11:15:00
4,20200107101500,"A hot, dry country caught between fire and a c...",,http://global.chinadaily.com.cn/a/202001/07/WS...,", which is burned to generate electricity, wit...",2020-01-07 10:15:00


## Build the `classify` request

In [5]:
url = url_df.iloc[0].content_url

In [7]:
CLASSIFY_BASE_URL = "https://api.argumentsearch.com/en/classify"
timeout = 5

topic = "climate change"


payload = {
    "topic": topic,
    "userID": user_id,
    "apiKey": api_key,
    "targetUrl": url,
    "topicRelevance": "word2vec",
    "predictStance": True,
    "computeAttention": True,
    "showOnlyArguments": False,
}

response = requests.post(
    CLASSIFY_BASE_URL,
    json=payload,
    timeout=timeout,
)
response.raise_for_status()

In [8]:
out = response.json()
out.keys()

dict_keys(['metadata', 'sentences'])

## try a single URL
Get the doc and sentence objects

In [9]:
response = classify.classify_url_sentences(topic, url_df.content_url.values[0], user_id, api_key)

In [10]:
classify.ClassifiedSentence.from_dict(url, topic, response['sentences'][0])

ClassifiedSentence(url='https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html', doc_id='657f9dd95eb97597e34d0c05b5a93ba6', topic='climate change', sentence_id='da903878c62343fb482bfad67a7523f1', argument_confidence=0.9836708698421717, argument_label='argument', sentence_original='Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.', sentence_preprocessed='Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.', sort_confidence=None, stance_confidence=0.9978714715086467, stance_label='pro')

In [11]:
response['sentences'][0]

{'argumentConfidence': 0.9836708698421717,
 'argumentLabel': 'argument',
 'sentenceOriginal': 'Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.',
 'sentencePreprocessed': 'Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.',
 'stanceConfidence': 0.9978714715086467,
 'stanceLabel': 'pro'}

In [12]:
len(response['sentences'])

5

In [13]:
# good example of a link that is no longer valid
url = url_df.content_url.values[3]
try: 
    response = classify.classify_url_sentences(topic, url, user_id, api_key)
except errors.Refused as e:
    print(e)

Refused: 400: Website could not be crawled or returned an empty result. Please contact an administrator.


## Run a batch of urls

In [14]:
url_df.content_url.shape

(108459,)

In [15]:
start_time = time.time()

doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(topic, url_df.content_url.values[:20])

print("iteration took {:.3} s".format(time.time() - start_time))

DEBUG:2020-07-07 00:05:37,048:arg_mine.api.classify: Attempting url 1 of 20
DEBUG:2020-07-07 00:05:38,572:arg_mine.api.classify: Attempting url 2 of 20
DEBUG:2020-07-07 00:05:39,410:arg_mine.api.classify: Attempting url 3 of 20
DEBUG:2020-07-07 00:05:40,905:arg_mine.api.classify: Attempting url 4 of 20
DEBUG:2020-07-07 00:05:41,495:arg_mine.api.classify: Attempting url 5 of 20
DEBUG:2020-07-07 00:05:43,384:arg_mine.api.classify: Attempting url 6 of 20
DEBUG:2020-07-07 00:05:44,629:arg_mine.api.classify: Attempting url 7 of 20
DEBUG:2020-07-07 00:05:44,967:arg_mine.api.classify: Attempting url 8 of 20
DEBUG:2020-07-07 00:05:46,014:arg_mine.api.classify: Attempting url 9 of 20
DEBUG:2020-07-07 00:05:47,179:arg_mine.api.classify: Attempting url 10 of 20
DEBUG:2020-07-07 00:05:48,472:arg_mine.api.classify: Attempting url 11 of 20
DEBUG:2020-07-07 00:05:49,418:arg_mine.api.classify: Attempting url 12 of 20
DEBUG:2020-07-07 00:05:50,448:arg_mine.api.classify: Attempting url 13 of 20
DEBUG:20

iteration took 24.7 s


In [16]:
print(len(refused_doc_list))
refused_doc_list

7


['https://www.heraldmailmedia.com/news/nation/australia-sending-aid-to-wildfire-towns-as-death-toll-rises/article_883fa793-6c0a-547e-8f77-b5964f1d7182.html',
 'https://www.seattlepi.com/news/world/article/Cooler-weather-brings-respite-in-Australian-14950114.php',
 'https://www.airdrietoday.com/national-business/starbucks-goals-for-sustainability-will-require-significant-consumer-buy-in-2037947',
 'https://www.heraldbulletin.com/news/nation_world/davos-chief-welcomes-views-of-trump-greta-thunberg-at-forum/article_24dbb151-7132-5c48-a7df-c13b67004ec0.html',
 'https://ravallirepublic.com/news/local/article_cb291465-d430-5477-8b41-1bedda9beaea.html',
 'https://www.caledonianrecord.com/opinion/columns/shawn-shouldice-small-business-owners-prepare-to-dodge-bullets-during/article_9f1b6399-7ca9-59bc-8788-b4d05ec5825c.html',
 'https://www.nsnews.com/trudeau-meets-pallister-and-the-meng-hearing-in-the-news-for-jan-20-1.24056357']

## try concurrent requests with grequest

`grequest` does concurrent threaded requests, but has memory issues for long lists. We can chunk the async requests and write the outputs to storage to avoid this problem.

In [17]:
responses = classify.fetch_concurrent(topic, url_list = url_df.content_url.values[:20], chunk_size=10)

DEBUG:2020-07-07 00:06:01,867:arg_mine.api.classify: >>>> starting doc extraction
DEBUG:2020-07-07 00:06:13,361:arg_mine.api.classify: iteration 0 took 11.488 s (10 docs)
DEBUG:2020-07-07 00:06:27,932:arg_mine.api.classify: iteration 1 took 14.569 s (10 docs)
DEBUG:2020-07-07 00:06:27,933:arg_mine.api.classify: 20 URLs took 26.067 s


In [18]:
docs_df, sentences_df, missing_urls = classify.process_responses(responses)

DEBUG:2020-07-07 00:06:27,977:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
DEBUG:2020-07-07 00:06:27,980:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
DEBUG:2020-07-07 00:06:27,981:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
DEBUG:2020-07-07 00:06:27,982:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
DEBUG:2020-07-07 00:06:27,988:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
DEBUG:2020-07-07 00:06:27,990:arg_mine.api.classify: 400 : {'error': 'Website could not be crawled or returned an empty result. Please contact an administrator.'}
DEBUG:2020-07-07 00:06

In [19]:
sentences_df.shape

(312, 11)

# Label the sentences as GT arguments
The given snippit contains context surrounding the given identified key phrase. Tokenizing the phrase will not work.
It is still unclear on whether or not the GT data can be claimed to be an argument; likely not.

> This final dataset covers worldwide English language online news coverage 2015-2020 mentioning "climate change" OR "global warming" OR "climate crisis" OR "greenhouse gas" OR "greenhouse gases" OR "carbon tax" totaling 6.3 million articles. [...]  
>Most importantly, for each match, a short snippet is shown that shows the first instance of one of the climate change phrases above in the article with the 100 characters before and after the appearance, truncated to the nearest word (if the 100th character before or after the phrase appears in the middle of a word, the window will be shrunk to the closest full word). Note that in the majority of cases the first match in the article is selected, but sometimes due to the nature of the finite automaton used to generate the snippets, a later match may be chosen from the article if it allows for a larger context window under certain circumstances.  
>Using a window of 100 characters before and after the match allows for brief non-consumptive snippets that show the context of the match and allow a better understanding of whether the article's mention of climate change was a cursory mention or central to the story and the argument, evidence  and context of the narrative within.

In [20]:
# load the extracted sentences
target_dir = "gdelt-climate-change-docs"
in_data_path = os.path.join(DATA_DIR, "processed", target_dir)

docs_filename = "gdelt_2020_docs.csv"
sentences_filename = "gdelt_2020_sentences.csv"

# load the files into dataframes
docs_df = pd.read_csv(os.path.join(in_data_path, docs_filename))
sentences_df = pd.read_csv(os.path.join(in_data_path, sentences_filename))
sentences_df.dropna(subset=['sentence_original'], inplace=True)

In [21]:
# writing this out to use for parsing later
keyword_list = [
    "climate change",
    "global warming",
    "climate crisis",
    "greenhouse gas",
    "greenhouse gases",
    "carbon tax"
]

In [22]:
doc_id = labelers.match_doc_id(url_df.iloc[0]['content_url'], docs_df)
doc_sentences = labelers.get_doc_sentences(doc_id, sentences_df)

In [23]:
# manually try the labeling algorithm:

sentences_df['has_labeled_arg'] = False

sentences_df.columns


content_url = url_df.iloc[0]
snippit = url_df['labeled_argument'][0]
doc_id = labelers.match_doc_id(content_url['content_url'], docs_df)
doc_sentences = labelers.get_doc_sentences(doc_id, sentences_df)

# tokenize the GT argument
arg_tokens = snippit.split(".") if isinstance(snippit, str) else None
arg_tokens = [s.strip() for s in arg_tokens]

for token in arg_tokens:
    matches = doc_sentences[doc_sentences.sentence_original.str.contains(token, na=False)]['sentence_id']
    print(matches)
    # only look at the first match
    sentences_df.loc[sentences_df['sentence_id'] == matches.values[0], 'has_labeled_arg'] = True

KeyError: 'labeled_argument'

In [None]:
token

In [None]:
sentences_df.loc[sentences_df.doc_id == doc_id]['has_labeled_arg']

In [None]:
url_df_crop = url_df[url_df['content_url'].isin(docs_df.url.values)]

In [None]:
labelers.label_doc_sentences_with_context(url_df_crop.iloc[0], docs_df, sentences_df)

In [None]:
doc_hash = utils.unique_hash("https://www.theglobeandmail.com/featured-reports/article-whats-the-right-university-for-you/")
doc_hash

In [None]:
docs_df[docs_df.doc_id == doc_hash]

In [None]:
labelers.get_doc_sentences(doc_hash, sentences_df)['sentence_original']

In [None]:
doc_sentences.sentence_original.isna().sum()

In [None]:
start_time = time.time()

sentences_df = labelers.label_gdelt_context(url_df, docs_df, sentences_df)


print("labeling took {:0.2f} s".format(time.time() - start_time))

In [None]:
token = ", science, sustainable design engineering and veterinary medicine."
token

In [None]:
doc_sentences = labelers.get_doc_sentences('716931eb57b75ab7161622f9e2d03f6d', sentences_df)
np.where(doc_sentences.sentence_original.str.contains(token))

In [None]:
doc_sentences

In [None]:
doc_sentences.sentence_original.iloc[72]

In [None]:
token

In [None]:
doc_sentences.iloc[477]

In [None]:
doc_sentences.iloc[477].sentence_original.str.contains(token)

In [None]:
doc_sentences.sentence_original.iloc[475:479].values

In [None]:
sentences_df

# Check timing of different approaches

Does grequests give us a performance boost?
* time serial extraction vs using grequests

Does returning all sentences vs just arguments give us a performance hit?
* time extraction of 50 articles to see if API times are significantly different

In [None]:
num_docs = 50

In [None]:
start_time = time.time()

doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(topic, url_df.content_url.values[:num_docs])

print("iteration took {:.2f} s".format(time.time() - start_time))

In [None]:
start_time = time.time()

responses = classify.fetch_concurrent(topic, url_list = url_df.content_url.values[:num_docs], chunk_size=10)
docs_df, sentences_df, missing_urls = classify.process_responses(responses)

print("iteration took {:.2f} s".format(time.time() - start_time))