# Argument Mining API query testing

Load the target datafile, and see how the query results work

In [1]:
# reload local package definitions for each cell
# (lets edits to the local arg_mine package be picked up without restarting the kernel)
%load_ext autoreload
%autoreload 2

In [2]:
import os
import time

from dotenv import find_dotenv, load_dotenv
import requests
import grequests
import numpy as np
import pandas as pd
import json

from arg_mine import DATA_DIR
from arg_mine.api import classify, auth, session, errors
from arg_mine import utils
from arg_mine.data import labelers, loaders

  curious_george.patch_all(thread=False, select=False)


In [3]:
# load the env variables to get the API key
# (user_id and api_key are sent with the classify requests further down)
user_id, api_key = auth.load_auth_tokens()

In [4]:
# Locate the raw GDELT snippet CSV under DATA_DIR/raw and load it into url_df.
raw_dataset_dir = "2020-climate-change-narrative"
raw_csv_name = "WebNewsEnglishSnippets.2020.csv"

csv_datapath = os.path.join(DATA_DIR, "raw", raw_dataset_dir)
csv_filepath = os.path.join(DATA_DIR, "raw", raw_dataset_dir, raw_csv_name)

url_df = loaders.get_gdelt_df(csv_filepath)
# preview the first rows
url_df.head()

INFO:2020-07-09 04:34:19,481:arg_mine.data.loaders: reading data from: /opt/workspace/data/raw/2020-climate-change-narrative/WebNewsEnglishSnippets.2020.csv


Unnamed: 0,datetime,title,headline_image_url,content_url,topic_context,timestamp
0,20200115111500,Liberal MPs back Science Minister Karen Andrew...,https://static.ffx.io/images/$zoom_0.2627%2C$m...,https://www.smh.com.au/politics/federal/libera...,the science in her interview with The Age and...,2020-01-15 11:15:00
1,20200117184500,Several crowd-pullers on day two of KLF,https://www.thehindu.com/news/cities/kozhikode...,https://www.thehindu.com/news/cities/kozhikode...,"Guha, who talked about patriotism and jingois...",2020-01-17 18:45:00
2,20200106233000,Seven arrested after gas rig protest,https://www.stourbridgenews.co.uk/resources/im...,https://www.stourbridgenews.co.uk/news/nationa...,three demands for the Scottish and UK Governm...,2020-01-06 23:30:00
3,20200101111500,Australia sending aid to wildfire towns as dea...,https://bloximages.newyork1.vip.townnews.com/h...,https://www.heraldmailmedia.com/news/nation/au...,this season the worst on record and reignited ...,2020-01-01 11:15:00
4,20200107101500,"A hot, dry country caught between fire and a c...",,http://global.chinadaily.com.cn/a/202001/07/WS...,", which is burned to generate electricity, wit...",2020-01-07 10:15:00


## Build the `classify` request

In [5]:
# URL of the first GDELT row; used as the target of the manual request below
url = url_df["content_url"].iloc[0]

In [6]:
# Hand-built request against the classify endpoint, to see the raw API behaviour
# before going through the arg_mine wrappers.
CLASSIFY_BASE_URL = "https://api.argumentsearch.com/en/classify"
topic = "climate change"
timeout = 5  # passed straight to requests.post as its timeout

# request body; the credentials come from the auth cell above
payload = dict(
    topic=topic,
    userID=user_id,
    apiKey=api_key,
    targetUrl=url,
    topicRelevance="word2vec",
    predictStance=True,
    computeAttention=True,
    showOnlyArguments=False,
)

response = requests.post(CLASSIFY_BASE_URL, json=payload, timeout=timeout)
# fail loudly on any HTTP error status instead of parsing an error body
response.raise_for_status()

In [7]:
# decode the JSON body of the manual request and list its top-level keys
out = response.json()
out.keys()

dict_keys(['metadata', 'sentences'])

## try a single URL
Get the doc and sentence objects

In [8]:
# classify the first URL through the arg_mine wrapper.
# NOTE: this rebinds `response` -- from here on it is the wrapper's return value
# (indexed with ['sentences'] in the following cells), not the requests.Response above.
response = classify.classify_url_sentences(topic, url_df.content_url.values[0], user_id, api_key)

In [9]:
# build a ClassifiedSentence object from the first returned sentence dict
classify.ClassifiedSentence.from_dict(url, topic, response['sentences'][0])

ClassifiedSentence(url='https://www.smh.com.au/politics/federal/liberals-speak-out-to-back-science-minister-on-climate-change-action-20200115-p53rs1.html', doc_id='657f9dd95eb97597e34d0c05b5a93ba6', topic='climate change', sentence_id='da903878c62343fb482bfad67a7523f1', argument_confidence=0.9836708698421717, argument_label='argument', sentence_original='Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.', sentence_preprocessed='Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.', sort_confidence=None, stance_confidence=0.9978714715086467, stance_label='pro')

In [10]:
# the raw sentence dict that was passed to ClassifiedSentence.from_dict
response['sentences'][0]

{'argumentConfidence': 0.9836708698421717,
 'argumentLabel': 'argument',
 'sentenceOriginal': 'Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.',
 'sentencePreprocessed': 'Mr Morrison said Ms Andrews had "well set out" the government policy and signalled again that he would bring forward future policies to adapt to a changing climate while also doing more to reduce greenhouse gas emissions.',
 'stanceConfidence': 0.9978714715086467,
 'stanceLabel': 'pro'}

In [11]:
# number of sentence entries returned for this URL
len(response['sentences'])

5

In [12]:
# good example of a link that is no longer valid: the classify call raises
# errors.Refused for it, which is caught and shown rather than left as a traceback
dead_link_index = 3
url = url_df.content_url.values[dead_link_index]
try:
    response = classify.classify_url_sentences(topic, url, user_id, api_key)
except errors.Refused as refusal:
    print(refusal)

Refused: 400: Website could not be crawled or returned an empty result. Please contact an administrator.


## Run a batch of urls

In [13]:
# total number of URLs available in the GDELT frame
url_df.content_url.shape

(108459,)

In [14]:
# Classify the first 20 URLs with the serial collector and report the wall time.
start_time = time.time()

doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(
    topic,
    url_df.content_url.values[:20],
)

batch_seconds = time.time() - start_time
print("iteration took {:.3} s".format(batch_seconds))

DEBUG:2020-07-09 04:34:26,676:arg_mine.api.classify: Attempting url 1 of 20
DEBUG:2020-07-09 04:34:28,331:arg_mine.api.classify: Attempting url 2 of 20
DEBUG:2020-07-09 04:34:29,683:arg_mine.api.classify: Attempting url 3 of 20
DEBUG:2020-07-09 04:34:31,342:arg_mine.api.classify: Attempting url 4 of 20
DEBUG:2020-07-09 04:34:31,914:arg_mine.api.classify: Attempting url 5 of 20
DEBUG:2020-07-09 04:34:34,270:arg_mine.api.classify: Attempting url 6 of 20
DEBUG:2020-07-09 04:34:36,128:arg_mine.api.classify: Attempting url 7 of 20
DEBUG:2020-07-09 04:34:36,568:arg_mine.api.classify: Attempting url 8 of 20
DEBUG:2020-07-09 04:34:37,645:arg_mine.api.classify: Attempting url 9 of 20
DEBUG:2020-07-09 04:34:38,849:arg_mine.api.classify: Attempting url 10 of 20
DEBUG:2020-07-09 04:34:40,264:arg_mine.api.classify: Attempting url 11 of 20
DEBUG:2020-07-09 04:34:41,626:arg_mine.api.classify: Attempting url 12 of 20
DEBUG:2020-07-09 04:34:42,728:arg_mine.api.classify: Attempting url 13 of 20
DEBUG:20

iteration took 26.9 s


In [15]:
# how many of the 20 URLs ended up in refused_doc_list, followed by the URLs themselves
print(len(refused_doc_list))
refused_doc_list

7


['https://www.heraldmailmedia.com/news/nation/australia-sending-aid-to-wildfire-towns-as-death-toll-rises/article_883fa793-6c0a-547e-8f77-b5964f1d7182.html',
 'https://www.seattlepi.com/news/world/article/Cooler-weather-brings-respite-in-Australian-14950114.php',
 'https://www.airdrietoday.com/national-business/starbucks-goals-for-sustainability-will-require-significant-consumer-buy-in-2037947',
 'https://www.heraldbulletin.com/news/nation_world/davos-chief-welcomes-views-of-trump-greta-thunberg-at-forum/article_24dbb151-7132-5c48-a7df-c13b67004ec0.html',
 'https://ravallirepublic.com/news/local/article_cb291465-d430-5477-8b41-1bedda9beaea.html',
 'https://www.caledonianrecord.com/opinion/columns/shawn-shouldice-small-business-owners-prepare-to-dodge-bullets-during/article_9f1b6399-7ca9-59bc-8788-b4d05ec5825c.html',
 'https://www.nsnews.com/trudeau-meets-pallister-and-the-meng-hearing-in-the-news-for-jan-20-1.24056357']

## try concurrent requests with grequests

`grequests` sends requests concurrently, but has memory issues for long lists. We can chunk the async requests and write the outputs to storage to avoid this problem.

In [16]:
# fetch the same first 20 URLs through the concurrent path, 10 URLs per chunk
responses = classify.fetch_concurrent(topic, url_list = url_df.content_url.values[:20], chunk_size=10)

DEBUG:2020-07-09 04:34:53,692:arg_mine.api.classify: >>>> starting doc extraction
DEBUG:2020-07-09 04:35:07,321:arg_mine.api.classify: iteration 0 took 13.627 s (10 docs)
DEBUG:2020-07-09 04:35:22,510:arg_mine.api.classify: iteration 1 took 15.188 s (10 docs)
DEBUG:2020-07-09 04:35:22,512:arg_mine.api.classify: 20 URLs took 28.820 s


In [17]:
# turn the raw responses into a document table and a sentence table
# (missing_urls presumably lists the URLs that produced no document -- confirm in classify.process_responses)
docs_df, sentences_df, missing_urls = classify.process_responses(responses)

In [18]:
# (rows, columns) of the sentence table from the concurrent run
sentences_df.shape

(312, 11)

# Label the sentences as ground-truth (GT) arguments
The given snippet contains the context surrounding the identified key phrase. Tokenizing the phrase will not work.
It is still unclear whether the GT data can be claimed to be an argument; likely not.

> This final dataset covers worldwide English language online news coverage 2015-2020 mentioning "climate change" OR "global warming" OR "climate crisis" OR "greenhouse gas" OR "greenhouse gases" OR "carbon tax" totaling 6.3 million articles. [...]  
>Most importantly, for each match, a short snippet is shown that shows the first instance of one of the climate change phrases above in the article with the 100 characters before and after the appearance, truncated to the nearest word (if the 100th character before or after the phrase appears in the middle of a word, the window will be shrunk to the closest full word). Note that in the majority of cases the first match in the article is selected, but sometimes due to the nature of the finite automaton used to generate the snippets, a later match may be chosen from the article if it allows for a larger context window under certain circumstances.  
>Using a window of 100 characters before and after the match allows for brief non-consumptive snippets that show the context of the match and allow a better understanding of whether the article's mention of climate change was a cursory mention or central to the story and the argument, evidence  and context of the narrative within.

In [19]:
# Load the previously extracted documents and sentences from DATA_DIR/processed.
# NOTE: this rebinds docs_df and sentences_df, replacing the frames built from
# the concurrent run earlier in the notebook.
target_dir = "gdelt-climate-change-docs"
docs_filename = "gdelt_2020_docs_n1000.csv"
sentences_filename = "gdelt_2020_sentences_n1000.csv"

in_data_path = os.path.join(DATA_DIR, "processed", target_dir)
docs_csv = os.path.join(in_data_path, docs_filename)
sentences_csv = os.path.join(in_data_path, sentences_filename)

docs_df = pd.read_csv(docs_csv)
# rows without sentence text cannot be matched against snippets, so drop them at load time
sentences_df = pd.read_csv(sentences_csv).dropna(subset=['sentence_original'])

In [20]:
# The search phrases quoted in the dataset description above, written out
# here so they can be used for parsing later.
keyword_list = list(
    (
        "climate change",
        "global warming",
        "climate crisis",
        "greenhouse gas",
        "greenhouse gases",
        "carbon tax",
    )
)

In [21]:
# look up the document id for the first GDELT URL, then pull that document's sentences
doc_id = labelers.match_doc_id(url_df.iloc[0]['content_url'], docs_df)
doc_sentences = labelers.get_doc_sentences(doc_id, sentences_df)

In [22]:
# re-display the first GDELT rows for reference (topic_context holds the snippet text)
url_df.head()

Unnamed: 0,datetime,title,headline_image_url,content_url,topic_context,timestamp
0,20200115111500,Liberal MPs back Science Minister Karen Andrew...,https://static.ffx.io/images/$zoom_0.2627%2C$m...,https://www.smh.com.au/politics/federal/libera...,the science in her interview with The Age and...,2020-01-15 11:15:00
1,20200117184500,Several crowd-pullers on day two of KLF,https://www.thehindu.com/news/cities/kozhikode...,https://www.thehindu.com/news/cities/kozhikode...,"Guha, who talked about patriotism and jingois...",2020-01-17 18:45:00
2,20200106233000,Seven arrested after gas rig protest,https://www.stourbridgenews.co.uk/resources/im...,https://www.stourbridgenews.co.uk/news/nationa...,three demands for the Scottish and UK Governm...,2020-01-06 23:30:00
3,20200101111500,Australia sending aid to wildfire towns as dea...,https://bloximages.newyork1.vip.townnews.com/h...,https://www.heraldmailmedia.com/news/nation/au...,this season the worst on record and reignited ...,2020-01-01 11:15:00
4,20200107101500,"A hot, dry country caught between fire and a c...",,http://global.chinadaily.com.cn/a/202001/07/WS...,", which is burned to generate electricity, wit...",2020-01-07 10:15:00


In [23]:
# Manually try the labeling algorithm on the first GDELT row: flag the extracted
# sentences that contain a fragment of that row's context snippet.

# start from a clean slate so re-running the cell gives the same result
sentences_df['has_labeled_arg'] = False

content_url = url_df.iloc[0]  # the full first row (not only its URL)
# read the snippet from the same row object, so row and snippet cannot disagree
# if the frame's index labels ever differ from its positions
snippit = content_url['topic_context']
doc_id = labelers.match_doc_id(content_url['content_url'], docs_df)
doc_sentences = labelers.get_doc_sentences(doc_id, sentences_df)

# tokenize the GT argument: split on periods and trim whitespace.
# A missing snippet (NaN) yields no tokens instead of None, and empty fragments
# are dropped because an empty string is a substring of every sentence.
arg_tokens = snippit.split(".") if isinstance(snippit, str) else []
arg_tokens = [s.strip() for s in arg_tokens if s.strip()]

for token in arg_tokens:
    # regex=False: the fragment is free text, so match it literally rather than
    # letting characters such as "(", "?" or "$" be interpreted as a pattern
    is_match = doc_sentences.sentence_original.str.contains(token, regex=False, na=False)
    matches = doc_sentences.loc[is_match, 'sentence_id']
    print(matches)
    if matches.empty:
        # this fragment does not appear in any extracted sentence; nothing to flag
        continue
    # only look at the first match
    sentences_df.loc[sentences_df['sentence_id'] == matches.values[0], 'has_labeled_arg'] = True

10    4a2fcf5b4ca0e45bcae7fcc6af5e2cd6
Name: sentence_id, dtype: object
11    9eac4f301da33af0eaf7e5a4b1a5c759
Name: sentence_id, dtype: object
12    e3496a47bcad630aea1f078b5202cd43
Name: sentence_id, dtype: object


In [24]:
# `token` still holds the last snippet fragment processed by the labeling loop
token

'"Australians want us to get on with the job of meeting our'

In [25]:
# has_labeled_arg flags for this document's sentences after the manual pass
sentences_df.loc[sentences_df.doc_id == doc_id]['has_labeled_arg']

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10     True
11     True
12     True
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
Name: has_labeled_arg, dtype: bool

In [26]:
# keep only the GDELT rows whose URL also appears in docs_df
has_extracted_doc = url_df['content_url'].isin(docs_df.url.values)
url_df_crop = url_df[has_extracted_doc]

In [27]:
# run the packaged labeler on the first GDELT row that has an extracted document
labelers.label_doc_sentences_with_context(url_df_crop.iloc[0], docs_df, sentences_df)

Unnamed: 0,url,doc_id,topic,sentence_id,argument_confidence,argument_label,sentence_original,sentence_preprocessed,sort_confidence,stance_confidence,stance_label,has_labeled_arg
0,https://www.smh.com.au/politics/federal/libera...,657f9dd95eb97597e34d0c05b5a93ba6,climate change,869465292decf0d6a241c72742397f7a,0.004798,no argument,Liberals speak out to back Science Minister on...,Liberals speak out to back Science Minister on...,,0.000000,,False
1,https://www.smh.com.au/politics/federal/libera...,657f9dd95eb97597e34d0c05b5a93ba6,climate change,c05ac4a4bd407c85710ca74f05b3fe91,0.028969,no argument,Liberal backbenchers have stepped up calls for...,Liberal backbenchers have stepped up calls for...,,0.000000,,False
2,https://www.smh.com.au/politics/federal/libera...,657f9dd95eb97597e34d0c05b5a93ba6,climate change,80819a7cf88de4b13b9969a7c7ada4d5,0.061553,no argument,The vocal support counters a smaller group of ...,The vocal support counters a smaller group of ...,,0.000000,,False
3,https://www.smh.com.au/politics/federal/libera...,657f9dd95eb97597e34d0c05b5a93ba6,climate change,1957469b4190ddce406a6e0359a36086,0.010571,no argument,Liberal MPs are speaking out about the need fo...,Liberal MPs are speaking out about the need fo...,,0.000000,,False
4,https://www.smh.com.au/politics/federal/libera...,657f9dd95eb97597e34d0c05b5a93ba6,climate change,da903878c62343fb482bfad67a7523f1,0.983671,argument,"Mr Morrison said Ms Andrews had ""well set out""...","Mr Morrison said Ms Andrews had ""well set out""...",,0.997871,pro,False
...,...,...,...,...,...,...,...,...,...,...,...,...
31429,http://www.terradaily.com/afp/200122185236.7b9...,9e9c7cd2ec9e0a16afb31198eec12e81,climate change,eef99c4932bfe46e71fa656019c0c55c,0.004049,no argument,PepsiCo said it could elaborate on the announc...,PepsiCo said it could elaborate on the announc...,,0.000000,,False
31430,http://www.terradaily.com/afp/200122185236.7b9...,9e9c7cd2ec9e0a16afb31198eec12e81,climate change,1efc109bdddbb6e51e9b69cc0a1b0701,0.004422,no argument,All rights reserved.,All rights reserved.,,0.000000,,False
31431,http://www.terradaily.com/afp/200122185236.7b9...,9e9c7cd2ec9e0a16afb31198eec12e81,climate change,9d1bcdb2ed3c0816aa1c6e167ed1a246,0.003629,no argument,copyright 2018 Agence France-Presse.,copyright 2018 Agence France-Presse.,,0.000000,,False
31432,http://www.terradaily.com/afp/200122185236.7b9...,9e9c7cd2ec9e0a16afb31198eec12e81,climate change,074fb03ed4cb76dd4f79c9870460644d,0.004962,no argument,Sections of the information displayed on this ...,Sections of the information displayed on this ...,,0.000000,,False


In [28]:
# hash one specific article URL so its document can be looked up by id
globe_article_url = "https://www.theglobeandmail.com/featured-reports/article-whats-the-right-university-for-you/"
doc_hash = utils.unique_hash(globe_article_url)
doc_hash

'716931eb57b75ab7161622f9e2d03f6d'

In [29]:
# the docs_df row whose doc_id equals that hash
docs_df[docs_df.doc_id == doc_hash]

Unnamed: 0,doc_id,url,topic,model_version,language,time_argument_prediction,time_attention_computation,time_preprocessing,time_stance_prediction,time_logging,time_total,total_arguments,total_contra_arguments,total_pro_arguments,total_non_arguments,total_classified_sentences
39,716931eb57b75ab7161622f9e2d03f6d,https://www.theglobeandmail.com/featured-repor...,climate change,0.1,en,7.527666,-1,0.000358,-1,0.07955,8.555593,2,0,2,641,643


In [30]:
# original sentence text for every sentence extracted from that document
labelers.get_doc_sentences(doc_hash, sentences_df)['sentence_original']

1409    The Globe and Mail’s 2020 Canadian University ...
1410    For incoming students, choosing a postsecondar...
1411    What are you really looking to get out of your...
1412    Do you want to go to a bigger campus across th...
1413    Whether you’re looking to prioritize the stude...
                              ...                        
2047    If you want to write a letter to the editor, p...
2048    Welcome to The Globe and Mail’s comment commun...
2049    This is a space where subscribers can engage w...
2050    Non-subscribers can read and sort comments but...
2051                             Click here to subscribe.
Name: sentence_original, Length: 642, dtype: object

In [31]:
# count sentences with missing text; doc_sentences is still the first document's
# sentences from the manual labeling cell, not the document looked up by doc_hash
doc_sentences.sentence_original.isna().sum()

0

In [32]:
# GDELT rows that have no context snippet (topic_context is NaN)
url_df[url_df.topic_context.isna()]

Unnamed: 0,datetime,title,headline_image_url,content_url,topic_context,timestamp
120,20200116010000,Thailand mobile app to boost reporting of huma...,https://499ioen9wh92k2blb3elevg9-wpengine.netd...,https://www.nyoooz.com/features/technology/tha...,,2020-01-16 01:00:00
226,20200108131500,Why women are ditching their razor during Janu...,https://www.swindonadvertiser.co.uk/resources/...,https://www.swindonadvertiser.co.uk/news/18143...,,2020-01-08 13:15:00
355,20200122093000,A new UN report warns that the current energy ...,https://assets.weforum.org/editor/RE1MItbJmvtJ...,https://www.weforum.org/agenda/2020/01/this-ch...,,2020-01-22 09:30:00
498,20200119033000,Germany sets 2038 coal power deadline,https://i0.wp.com/www.energy-reporters.com/wp-...,https://www.energy-reporters.com/policy/german...,,2020-01-19 03:30:00
500,20200107110000,Chris Hemsworth pledges $1 million to fight th...,http://cnnphilippines.com/.imaging/mte/demo-cn...,https://cnnphilippines.com:443/world/2020/1/7/...,,2020-01-07 11:00:00
...,...,...,...,...,...,...
107878,20200103074500,Australian Navy begins rescue of people trappe...,https://www.prokerala.com/news/photos/imgs/120...,https://www.prokerala.com/news/articles/a99332...,,2020-01-03 07:45:00
107903,20200107110000,Chris Hemsworth pledges $1 million to fight th...,https://bloximages.newyork1.vip.townnews.com/h...,https://www.henryherald.com/news/world_nation/...,,2020-01-07 11:00:00
108112,20200114060000,Victoria announces bushfire inquiry as Morriso...,https://sl.sbs.com.au/public/image/file/f8e730...,https://www.sbs.com.au/news/victoria-announces...,,2020-01-14 06:00:00
108313,20200108094500,"Prince Charles to visit Israel, Bethlehem and ...",https://www.pinkvilla.com/files/styles/fbimage...,https://www.pinkvilla.com/entertainment/hollyw...,,2020-01-08 09:45:00


In [34]:
# Label every sentence against the GDELT context snippets and report how long it took.
# NOTE: sentences_df is rebound to the labeler's return value.
start_time = time.time()
sentences_df = labelers.label_gdelt_context(url_df, docs_df, sentences_df)
labeling_seconds = time.time() - start_time

print("labeling took {:0.2f} s".format(labeling_seconds))

labeling took 9.14 s


In [35]:
# number of sentences flagged in has_labeled_arg
sentences_df.has_labeled_arg.sum()

1153

# Check timing of different approaches

**Does grequests give us a performance boost?**
* time serial extraction vs using grequests

Using `classify.collect_sentences_by_topic()`, a serial method, on 50 URLs gives an average return time of 1min 13s ± 6.65 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

Using `classify.fetch_concurrent` followed by `classify.process_responses` on 50 urls should be parallel, and takes 1min 32s ± 12.9 s per loop (mean ± std. dev. of 7 runs, 1 loop each).

I don't know why using grequests is slower than doing it serially, but it definitely is slower when it should be ~2x faster.


**Does returning all sentences vs just arguments give us a performance hit?**
* time extraction of 50 articles to see if API times are significantly different

In [39]:
# number of URLs used in each timing run below
num_docs = 50

In [42]:
%%timeit
# Serial baseline: classify the first num_docs URLs with collect_sentences_by_topic.
# The manual timer prints each run's wall time next to the %%timeit summary.
start_time = time.time()

doc_list, sentence_list, refused_doc_list = classify.collect_sentences_by_topic(topic, url_df.content_url.values[:num_docs])

print("iteration took {:.2f} s".format(time.time() - start_time))

DEBUG:2020-07-09 19:05:43,742:arg_mine.api.classify: Attempting url 1 of 50
DEBUG:2020-07-09 19:05:45,806:arg_mine.api.classify: Attempting url 2 of 50
DEBUG:2020-07-09 19:05:48,334:arg_mine.api.classify: Attempting url 3 of 50
DEBUG:2020-07-09 19:05:49,871:arg_mine.api.classify: Attempting url 4 of 50
DEBUG:2020-07-09 19:05:50,473:arg_mine.api.classify: Attempting url 5 of 50
DEBUG:2020-07-09 19:05:52,714:arg_mine.api.classify: Attempting url 6 of 50
DEBUG:2020-07-09 19:05:54,700:arg_mine.api.classify: Attempting url 7 of 50
DEBUG:2020-07-09 19:05:55,095:arg_mine.api.classify: Attempting url 8 of 50
DEBUG:2020-07-09 19:05:56,193:arg_mine.api.classify: Attempting url 9 of 50
DEBUG:2020-07-09 19:05:57,774:arg_mine.api.classify: Attempting url 10 of 50
DEBUG:2020-07-09 19:06:00,806:arg_mine.api.classify: Attempting url 11 of 50
DEBUG:2020-07-09 19:06:02,034:arg_mine.api.classify: Attempting url 12 of 50
DEBUG:2020-07-09 19:06:03,182:arg_mine.api.classify: Attempting url 13 of 50
DEBUG:20

iteration took 93.41 s


DEBUG:2020-07-09 19:07:18,981:arg_mine.api.classify: Attempting url 2 of 50
DEBUG:2020-07-09 19:07:19,805:arg_mine.api.classify: Attempting url 3 of 50
DEBUG:2020-07-09 19:07:20,934:arg_mine.api.classify: Attempting url 4 of 50
DEBUG:2020-07-09 19:07:21,533:arg_mine.api.classify: Attempting url 5 of 50
DEBUG:2020-07-09 19:07:23,480:arg_mine.api.classify: Attempting url 6 of 50
DEBUG:2020-07-09 19:07:24,778:arg_mine.api.classify: Attempting url 7 of 50
DEBUG:2020-07-09 19:07:25,072:arg_mine.api.classify: Attempting url 8 of 50
DEBUG:2020-07-09 19:07:26,374:arg_mine.api.classify: Attempting url 9 of 50
DEBUG:2020-07-09 19:07:27,889:arg_mine.api.classify: Attempting url 10 of 50
DEBUG:2020-07-09 19:07:29,324:arg_mine.api.classify: Attempting url 11 of 50
DEBUG:2020-07-09 19:07:30,349:arg_mine.api.classify: Attempting url 12 of 50
DEBUG:2020-07-09 19:07:31,780:arg_mine.api.classify: Attempting url 13 of 50
DEBUG:2020-07-09 19:07:32,959:arg_mine.api.classify: Attempting url 14 of 50
DEBUG:2

iteration took 87.03 s


DEBUG:2020-07-09 19:08:45,607:arg_mine.api.classify: Attempting url 2 of 50
DEBUG:2020-07-09 19:08:46,513:arg_mine.api.classify: Attempting url 3 of 50
DEBUG:2020-07-09 19:08:49,610:arg_mine.api.classify: Attempting url 4 of 50
DEBUG:2020-07-09 19:08:50,188:arg_mine.api.classify: Attempting url 5 of 50
DEBUG:2020-07-09 19:08:52,126:arg_mine.api.classify: Attempting url 6 of 50
DEBUG:2020-07-09 19:08:53,343:arg_mine.api.classify: Attempting url 7 of 50
DEBUG:2020-07-09 19:08:53,620:arg_mine.api.classify: Attempting url 8 of 50
DEBUG:2020-07-09 19:08:54,743:arg_mine.api.classify: Attempting url 9 of 50
DEBUG:2020-07-09 19:08:56,175:arg_mine.api.classify: Attempting url 10 of 50
DEBUG:2020-07-09 19:08:57,332:arg_mine.api.classify: Attempting url 11 of 50
DEBUG:2020-07-09 19:08:58,308:arg_mine.api.classify: Attempting url 12 of 50
DEBUG:2020-07-09 19:08:59,407:arg_mine.api.classify: Attempting url 13 of 50
DEBUG:2020-07-09 19:09:00,688:arg_mine.api.classify: Attempting url 14 of 50
DEBUG:2

iteration took 72.61 s


DEBUG:2020-07-09 19:09:58,263:arg_mine.api.classify: Attempting url 2 of 50
DEBUG:2020-07-09 19:09:59,111:arg_mine.api.classify: Attempting url 3 of 50
DEBUG:2020-07-09 19:10:00,249:arg_mine.api.classify: Attempting url 4 of 50
DEBUG:2020-07-09 19:10:01,093:arg_mine.api.classify: Attempting url 5 of 50
DEBUG:2020-07-09 19:10:03,047:arg_mine.api.classify: Attempting url 6 of 50
DEBUG:2020-07-09 19:10:04,290:arg_mine.api.classify: Attempting url 7 of 50
DEBUG:2020-07-09 19:10:04,562:arg_mine.api.classify: Attempting url 8 of 50
DEBUG:2020-07-09 19:10:05,527:arg_mine.api.classify: Attempting url 9 of 50
DEBUG:2020-07-09 19:10:06,735:arg_mine.api.classify: Attempting url 10 of 50
DEBUG:2020-07-09 19:10:08,039:arg_mine.api.classify: Attempting url 11 of 50
DEBUG:2020-07-09 19:10:08,985:arg_mine.api.classify: Attempting url 12 of 50
DEBUG:2020-07-09 19:10:10,079:arg_mine.api.classify: Attempting url 13 of 50
DEBUG:2020-07-09 19:10:11,140:arg_mine.api.classify: Attempting url 14 of 50
DEBUG:2

iteration took 68.82 s


DEBUG:2020-07-09 19:11:07,022:arg_mine.api.classify: Attempting url 2 of 50
DEBUG:2020-07-09 19:11:07,855:arg_mine.api.classify: Attempting url 3 of 50
DEBUG:2020-07-09 19:11:09,111:arg_mine.api.classify: Attempting url 4 of 50
DEBUG:2020-07-09 19:11:09,705:arg_mine.api.classify: Attempting url 5 of 50
DEBUG:2020-07-09 19:11:10,989:arg_mine.api.classify: Attempting url 6 of 50
DEBUG:2020-07-09 19:11:12,316:arg_mine.api.classify: Attempting url 7 of 50
DEBUG:2020-07-09 19:11:12,606:arg_mine.api.classify: Attempting url 8 of 50
DEBUG:2020-07-09 19:11:13,809:arg_mine.api.classify: Attempting url 9 of 50
DEBUG:2020-07-09 19:11:15,038:arg_mine.api.classify: Attempting url 10 of 50
DEBUG:2020-07-09 19:11:16,305:arg_mine.api.classify: Attempting url 11 of 50
DEBUG:2020-07-09 19:11:17,232:arg_mine.api.classify: Attempting url 12 of 50
DEBUG:2020-07-09 19:11:18,211:arg_mine.api.classify: Attempting url 13 of 50
DEBUG:2020-07-09 19:11:19,320:arg_mine.api.classify: Attempting url 14 of 50
DEBUG:2

iteration took 79.17 s


DEBUG:2020-07-09 19:12:26,255:arg_mine.api.classify: Attempting url 2 of 50
DEBUG:2020-07-09 19:12:27,088:arg_mine.api.classify: Attempting url 3 of 50
DEBUG:2020-07-09 19:12:28,435:arg_mine.api.classify: Attempting url 4 of 50
DEBUG:2020-07-09 19:12:29,129:arg_mine.api.classify: Attempting url 5 of 50
DEBUG:2020-07-09 19:12:30,410:arg_mine.api.classify: Attempting url 6 of 50
DEBUG:2020-07-09 19:12:31,638:arg_mine.api.classify: Attempting url 7 of 50
DEBUG:2020-07-09 19:12:31,899:arg_mine.api.classify: Attempting url 8 of 50
DEBUG:2020-07-09 19:12:32,940:arg_mine.api.classify: Attempting url 9 of 50
DEBUG:2020-07-09 19:12:34,283:arg_mine.api.classify: Attempting url 10 of 50
DEBUG:2020-07-09 19:12:35,559:arg_mine.api.classify: Attempting url 11 of 50
DEBUG:2020-07-09 19:12:36,513:arg_mine.api.classify: Attempting url 12 of 50
DEBUG:2020-07-09 19:12:37,461:arg_mine.api.classify: Attempting url 13 of 50
DEBUG:2020-07-09 19:12:38,554:arg_mine.api.classify: Attempting url 14 of 50
DEBUG:2

iteration took 68.51 s


DEBUG:2020-07-09 19:13:34,925:arg_mine.api.classify: Attempting url 2 of 50
DEBUG:2020-07-09 19:13:35,718:arg_mine.api.classify: Attempting url 3 of 50
DEBUG:2020-07-09 19:13:37,194:arg_mine.api.classify: Attempting url 4 of 50
DEBUG:2020-07-09 19:13:37,788:arg_mine.api.classify: Attempting url 5 of 50
DEBUG:2020-07-09 19:13:39,171:arg_mine.api.classify: Attempting url 6 of 50
DEBUG:2020-07-09 19:13:40,378:arg_mine.api.classify: Attempting url 7 of 50
DEBUG:2020-07-09 19:13:40,657:arg_mine.api.classify: Attempting url 8 of 50
DEBUG:2020-07-09 19:13:44,339:arg_mine.api.classify: Attempting url 9 of 50
DEBUG:2020-07-09 19:13:45,538:arg_mine.api.classify: Attempting url 10 of 50
DEBUG:2020-07-09 19:13:46,813:arg_mine.api.classify: Attempting url 11 of 50
DEBUG:2020-07-09 19:13:47,771:arg_mine.api.classify: Attempting url 12 of 50
DEBUG:2020-07-09 19:13:48,753:arg_mine.api.classify: Attempting url 13 of 50
DEBUG:2020-07-09 19:13:49,860:arg_mine.api.classify: Attempting url 14 of 50
DEBUG:2

iteration took 69.46 s


DEBUG:2020-07-09 19:14:44,560:arg_mine.api.classify: Attempting url 2 of 50
DEBUG:2020-07-09 19:14:45,382:arg_mine.api.classify: Attempting url 3 of 50
DEBUG:2020-07-09 19:14:46,852:arg_mine.api.classify: Attempting url 4 of 50
DEBUG:2020-07-09 19:14:47,429:arg_mine.api.classify: Attempting url 5 of 50
DEBUG:2020-07-09 19:14:49,666:arg_mine.api.classify: Attempting url 6 of 50
DEBUG:2020-07-09 19:14:50,945:arg_mine.api.classify: Attempting url 7 of 50
DEBUG:2020-07-09 19:14:51,240:arg_mine.api.classify: Attempting url 8 of 50
DEBUG:2020-07-09 19:14:52,246:arg_mine.api.classify: Attempting url 9 of 50
DEBUG:2020-07-09 19:14:53,408:arg_mine.api.classify: Attempting url 10 of 50
DEBUG:2020-07-09 19:14:54,592:arg_mine.api.classify: Attempting url 11 of 50
DEBUG:2020-07-09 19:14:55,603:arg_mine.api.classify: Attempting url 12 of 50
DEBUG:2020-07-09 19:14:56,633:arg_mine.api.classify: Attempting url 13 of 50
DEBUG:2020-07-09 19:14:57,705:arg_mine.api.classify: Attempting url 14 of 50
DEBUG:2

iteration took 68.05 s
1min 13s ± 6.65 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [43]:
%%timeit
# Concurrent variant: fetch the first num_docs URLs with fetch_concurrent, then
# convert the responses with process_responses; both steps are inside the timed region.
start_time = time.time()
# NOTE(review): the recorded log for this cell reports 10-doc iterations even though
# chunk_size=100 is passed -- confirm that fetch_concurrent honours chunk_size
chunk_size=100

responses = classify.fetch_concurrent(topic, url_list = url_df.content_url.values[:num_docs], chunk_size=chunk_size)
docs_df, sentences_df, missing_urls = classify.process_responses(responses)

print("iteration took {:.2f} s".format(time.time() - start_time))

DEBUG:2020-07-09 19:15:51,216:arg_mine.api.classify: >>>> starting doc extraction
DEBUG:2020-07-09 19:16:03,793:arg_mine.api.classify: iteration 0 took 12.575 s (10 docs)
DEBUG:2020-07-09 19:16:17,453:arg_mine.api.classify: iteration 1 took 13.658 s (10 docs)
DEBUG:2020-07-09 19:16:35,129:arg_mine.api.classify: iteration 2 took 17.675 s (10 docs)
DEBUG:2020-07-09 19:16:50,754:arg_mine.api.classify: iteration 3 took 15.623 s (10 docs)
DEBUG:2020-07-09 19:17:15,452:arg_mine.api.classify: iteration 4 took 24.696 s (10 docs)
DEBUG:2020-07-09 19:17:15,453:arg_mine.api.classify: 50 URLs took 84.237 s
DEBUG:2020-07-09 19:17:15,544:arg_mine.api.classify: >>>> starting doc extraction


iteration took 84.32 s


DEBUG:2020-07-09 19:17:30,977:arg_mine.api.classify: iteration 0 took 15.433 s (10 docs)
DEBUG:2020-07-09 19:17:47,554:arg_mine.api.classify: iteration 1 took 16.575 s (10 docs)
DEBUG:2020-07-09 19:18:05,442:arg_mine.api.classify: iteration 2 took 17.886 s (10 docs)
DEBUG:2020-07-09 19:18:23,343:arg_mine.api.classify: iteration 3 took 17.900 s (10 docs)
DEBUG:2020-07-09 19:18:48,018:arg_mine.api.classify: iteration 4 took 24.674 s (10 docs)
DEBUG:2020-07-09 19:18:48,020:arg_mine.api.classify: 50 URLs took 92.477 s
DEBUG:2020-07-09 19:18:48,092:arg_mine.api.classify: >>>> starting doc extraction


iteration took 92.55 s


DEBUG:2020-07-09 19:19:00,686:arg_mine.api.classify: iteration 0 took 12.592 s (10 docs)
DEBUG:2020-07-09 19:19:14,728:arg_mine.api.classify: iteration 1 took 14.041 s (10 docs)
DEBUG:2020-07-09 19:19:32,576:arg_mine.api.classify: iteration 2 took 17.847 s (10 docs)
DEBUG:2020-07-09 19:19:49,459:arg_mine.api.classify: iteration 3 took 16.881 s (10 docs)
DEBUG:2020-07-09 19:20:13,387:arg_mine.api.classify: iteration 4 took 23.926 s (10 docs)
DEBUG:2020-07-09 19:20:13,389:arg_mine.api.classify: 50 URLs took 85.297 s
DEBUG:2020-07-09 19:20:13,467:arg_mine.api.classify: >>>> starting doc extraction


iteration took 85.37 s


DEBUG:2020-07-09 19:20:25,932:arg_mine.api.classify: iteration 0 took 12.465 s (10 docs)
DEBUG:2020-07-09 19:20:39,477:arg_mine.api.classify: iteration 1 took 13.544 s (10 docs)
DEBUG:2020-07-09 19:20:57,156:arg_mine.api.classify: iteration 2 took 17.678 s (10 docs)
DEBUG:2020-07-09 19:21:12,967:arg_mine.api.classify: iteration 3 took 15.809 s (10 docs)
DEBUG:2020-07-09 19:21:37,433:arg_mine.api.classify: iteration 4 took 24.465 s (10 docs)
DEBUG:2020-07-09 19:21:37,435:arg_mine.api.classify: 50 URLs took 83.968 s
DEBUG:2020-07-09 19:21:37,507:arg_mine.api.classify: >>>> starting doc extraction


iteration took 84.04 s


DEBUG:2020-07-09 19:21:49,757:arg_mine.api.classify: iteration 0 took 12.250 s (10 docs)
DEBUG:2020-07-09 19:22:04,790:arg_mine.api.classify: iteration 1 took 15.032 s (10 docs)
DEBUG:2020-07-09 19:22:22,922:arg_mine.api.classify: iteration 2 took 18.131 s (10 docs)
DEBUG:2020-07-09 19:22:39,874:arg_mine.api.classify: iteration 3 took 16.951 s (10 docs)
DEBUG:2020-07-09 19:23:07,462:arg_mine.api.classify: iteration 4 took 27.587 s (10 docs)
DEBUG:2020-07-09 19:23:07,464:arg_mine.api.classify: 50 URLs took 89.957 s
DEBUG:2020-07-09 19:23:07,546:arg_mine.api.classify: >>>> starting doc extraction


iteration took 90.04 s


DEBUG:2020-07-09 19:23:20,875:arg_mine.api.classify: iteration 0 took 13.328 s (10 docs)
DEBUG:2020-07-09 19:23:35,110:arg_mine.api.classify: iteration 1 took 14.234 s (10 docs)
DEBUG:2020-07-09 19:23:52,844:arg_mine.api.classify: iteration 2 took 17.732 s (10 docs)
DEBUG:2020-07-09 19:24:10,392:arg_mine.api.classify: iteration 3 took 17.547 s (10 docs)
DEBUG:2020-07-09 19:24:34,566:arg_mine.api.classify: iteration 4 took 24.173 s (10 docs)
DEBUG:2020-07-09 19:24:34,567:arg_mine.api.classify: 50 URLs took 87.021 s
DEBUG:2020-07-09 19:24:34,637:arg_mine.api.classify: >>>> starting doc extraction


iteration took 87.09 s


DEBUG:2020-07-09 19:24:47,718:arg_mine.api.classify: iteration 0 took 13.080 s (10 docs)
DEBUG:2020-07-09 19:25:01,780:arg_mine.api.classify: iteration 1 took 14.061 s (10 docs)
DEBUG:2020-07-09 19:25:19,404:arg_mine.api.classify: iteration 2 took 17.622 s (10 docs)
DEBUG:2020-07-09 19:25:35,142:arg_mine.api.classify: iteration 3 took 15.737 s (10 docs)
DEBUG:2020-07-09 19:25:57,484:arg_mine.api.classify: iteration 4 took 22.340 s (10 docs)
DEBUG:2020-07-09 19:25:57,486:arg_mine.api.classify: 50 URLs took 82.849 s
DEBUG:2020-07-09 19:25:57,557:arg_mine.api.classify: >>>> starting doc extraction


iteration took 82.92 s


DEBUG:2020-07-09 19:26:11,560:arg_mine.api.classify: iteration 0 took 14.002 s (10 docs)
DEBUG:2020-07-09 19:26:25,407:arg_mine.api.classify: iteration 1 took 13.845 s (10 docs)
DEBUG:2020-07-09 19:26:43,091:arg_mine.api.classify: iteration 2 took 17.682 s (10 docs)
DEBUG:2020-07-09 19:27:00,526:arg_mine.api.classify: iteration 3 took 17.434 s (10 docs)
DEBUG:2020-07-09 19:28:00,279:arg_mine.api.classify: iteration 4 took 59.751 s (10 docs)
DEBUG:2020-07-09 19:28:00,280:arg_mine.api.classify: 50 URLs took 122.723 s


iteration took 122.79 s
1min 32s ± 12.9 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
