# Pipeline to make dataset from Hypothesis.is

### Install the libraries of need

In [1]:
#!pip3 install elasticsearch
#!pip install pandas requests spacy seaborn

In [112]:
import pandas as pd
import os, json
import requests
import seaborn as sns
import spacy
import hashlib
import matplotlib.pyplot as plt
import numpy as np

### [Optional] Fetch the data (run this only if no data is available in the folder or changes in Hypothes.is are expected)

In [3]:
# Let's see the sample data first.
data = {
    'user': "Ezloplop",
    'group_text': "BehSci",
    'group': "Jk8bYJdN",
    'api_key': "my_api_key"
}

url_search = "https://api.hypothes.is/api/search"
url_ = "https://hypothes.is/groups/Jk8bYJdN/behsci"

In [4]:
res = requests.get(url=url_search, params={'group':'Jk8bYJdN', 'limit': 200}, headers={'Authorization': f"Bearer {data['api_key']}"})

In [5]:
json = res.json()
pd.DataFrame(json['rows']).head(5)

Unnamed: 0,id,created,updated,user,uri,text,tags,group,permissions,target,document,links,user_info,flagged,hidden
0,EFWynpMTEeyO3G9alGUJkA,2022-02-21T12:37:42.860894+00:00,2022-08-29T10:32:00.921505+00:00,acct:jackiekrauss@hypothes.is,https://securingdemocracy.gmfus.org/rt-deutsch...,"Schafer, B. (2021, October 5). RT Deutsch Find...","[is:blog, lang:en, RT Deutsch, anti-vaccinatio...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://securingdemocracy.gmfus.o...,{'title': ['RT Deutsch Finds a Home with Anti-...,{'html': 'https://hypothes.is/a/EFWynpMTEeyO3G...,{'display_name': None},False,False
1,dR7DsExZEeyc5Se_QXZPbw,2021-11-23T12:32:44.190592+00:00,2022-08-29T10:31:48.676007+00:00,acct:jackiekrauss@hypothes.is,https://www.newscientist.com/article/2298169-w...,"Sparkes, M. (2021, November 19). Wikipedia tes...","[is:news, lang:en, wikipedia, Artificial intel...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.newscientist.com/arti...,{'title': ['Wikipedia tests AI for spotting co...,{'html': 'https://hypothes.is/a/dR7DsExZEeyc5S...,{'display_name': None},False,False
2,1h_c6CztEeyMa1dlOOluOw,2021-10-14T12:54:15.213834+00:00,2022-08-29T10:31:42.805199+00:00,acct:jackiekrauss@hypothes.is,https://www.theguardian.com/world/2021/oct/11/...,"Henley, J. (2021, October 11). UK’s high Covid...","[is:news, lang:en, COVID-19, United Kingdom, E...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.theguardian.com/world...,{'title': ['UK’s high Covid case rates buck tr...,{'html': 'https://hypothes.is/a/1h_c6CztEeyMa1...,{'display_name': None},False,False
3,LVkBykCTEeypZte95Ybu5Q,2021-11-08T12:55:40.654202+00:00,2022-08-29T10:31:42.592685+00:00,acct:jackiekrauss@hypothes.is,https://twitter.com/BenPBradshaw/status/145545...,"Ben Bradshaw. (2021, November 2). More than 50...","[is:tweet, lang:en, COVID-19, parliament, Unit...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://twitter.com/BenPBradshaw/...,{'title': ['Ben Bradshaw on Twitter']},{'html': 'https://hypothes.is/a/LVkBykCTEeypZt...,{'display_name': None},False,False
4,TV8KuEuTEey7rLN9r-8Mdw,2021-11-22T12:54:17.160769+00:00,2022-08-29T10:31:42.583167+00:00,acct:jackiekrauss@hypothes.is,https://twitter.com/BlakesWort/status/14587947...,"Blake. (2021, November 11). Wie ungeheuer pein...","[is:tweet, lang:de, COVID-19, vaccine mandate,...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://twitter.com/BlakesWort/st...,{'title': ['Blake on Twitter']},{'html': 'https://hypothes.is/a/TV8KuEuTEey7rL...,{'display_name': None},False,False


### Hypothesis.is stores the data by user, so we'll use 'users' info to get data.

In [8]:
users = requests.get(url=f"https://api.hypothes.is/api/groups/{data['group']}/members")

In [9]:
users.json()[1]

{'authority': 'hypothes.is',
 'userid': 'acct:stefanherzog@hypothes.is',
 'username': 'stefanherzog',
 'display_name': 'Stefan Herzog'}

In [10]:
data_per_user = []
for user in users.json():
    print(user)
    user_id = user['userid']
    user_batch = []
    for i in range(0, 5000, 200):
        res = requests.get(url=url_search,
                           params={'group':'Jk8bYJdN', 'user':f'{user_id}', 'limit': 200, 'offset':i},
                           headers={'Authorization': f"Bearer {data['api_key']}"})
        user_batch.append(res.json())
    data_per_user.append(user_batch)

{'authority': 'hypothes.is', 'userid': 'acct:amyhcurtis@hypothes.is', 'username': 'amyhcurtis', 'display_name': None}
{'authority': 'hypothes.is', 'userid': 'acct:stefanherzog@hypothes.is', 'username': 'stefanherzog', 'display_name': 'Stefan Herzog'}
{'authority': 'hypothes.is', 'userid': 'acct:Hahn@hypothes.is', 'username': 'Hahn', 'display_name': None}
{'authority': 'hypothes.is', 'userid': 'acct:Marlene_Wulf@hypothes.is', 'username': 'Marlene_Wulf', 'display_name': None}
{'authority': 'hypothes.is', 'userid': 'acct:edampf@hypothes.is', 'username': 'edampf', 'display_name': None}
{'authority': 'hypothes.is', 'userid': 'acct:lewan@hypothes.is', 'username': 'lewan', 'display_name': None}
{'authority': 'hypothes.is', 'userid': 'acct:gailelhalaby@hypothes.is', 'username': 'gailelhalaby', 'display_name': None}
{'authority': 'hypothes.is', 'userid': 'acct:LeaGlaubig@hypothes.is', 'username': 'LeaGlaubig', 'display_name': None}
{'authority': 'hypothes.is', 'userid': 'acct:Danaeioak@hypothes

In [11]:
total_anns = []
for batch in data_per_user:
    for elem in batch:
        total_anns += elem['rows']

In [13]:
# Store the dataframe into json file
total_anns_df = pd.DataFrame(total_anns)
total_anns_df.to_json('hypothesis_v1__12-03-22.jsonl', orient='records', lines=True)

## 1. Load Data from File

In [2]:
# dataframe of our dataset
df = pd.read_json('hypothesis_v1__12-03-22.jsonl', orient='records', lines=True)

In [3]:
df

Unnamed: 0,id,created,updated,user,uri,text,tags,group,permissions,target,document,links,user_info,flagged,hidden
0,rnlcIho0EeuKgFfKPBbajQ,2020-10-29T22:18:33.169969+00:00,2020-10-29T22:18:33.169969+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,https://twitter.com/i/web/status/1306171100544...,[has:context],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,{'title': ['The impact of Covid-19 on media – ...,{'html': 'https://hypothes.is/a/rnlcIho0EeuKgF...,{'display_name': None},False,False
1,qHdm3ho0EeuTJufnuQnIqQ,2020-10-29T22:18:23.010549+00:00,2020-10-29T22:18:23.010549+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,2020-09-16,[has:date],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,{'title': ['The impact of Covid-19 on media – ...,{'html': 'https://hypothes.is/a/qHdm3ho0EeuTJu...,{'display_name': None},False,False
2,oUZLeBo0EeuthtuX8fq6yQ,2020-10-29T22:18:11.003006+00:00,2020-10-29T22:18:11.003006+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,The impact of Covid-19 on media – rise of info...,"[is:youtube, is:webinar, disinformation, misin...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,{'title': ['The impact of Covid-19 on media – ...,{'html': 'https://hypothes.is/a/oUZLeBo0Eeutht...,{'display_name': None},False,False
3,gHTeKBo0EeubG0_nm_NNpQ,2020-10-29T22:17:15.887213+00:00,2020-10-29T22:17:15.887213+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,,[ann:summary],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,{'title': ['The impact of Covid-19 on media – ...,{'html': 'https://hypothes.is/a/gHTeKBo0EeubG0...,{'display_name': None},False,False
4,fAZg8Bo0Eeu39uviYF17AQ,2020-10-29T22:17:08.501454+00:00,2020-10-29T22:17:08.501454+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,,[ann:title],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,{'title': ['The impact of Covid-19 on media – ...,{'html': 'https://hypothes.is/a/fAZg8Bo0Eeu39u...,{'display_name': None},False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44263,IqqGzNZdEeu7qW8TVqQlWA,2021-06-26T09:01:46.663954+00:00,2021-06-26T09:01:46.663954+00:00,acct:lucyparfitt16@hypothes.is,https://poseidon01.ssrn.com/delivery.php?ID=05...,2021-03-31,[has:date],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://poseidon01.ssrn.com/deliv...,{'title': ['Microsoft Word - Parry et al.docx']},{'html': 'https://hypothes.is/a/IqqGzNZdEeu7qW...,{'display_name': None},False,False
44264,GzcHitZdEeu3utsN1zIlXA,2021-06-26T09:01:34.146545+00:00,2021-06-26T09:01:34.146545+00:00,acct:lucyparfitt16@hypothes.is,https://poseidon01.ssrn.com/delivery.php?ID=05...,"Parry, H. M., Tut, G., Faustini, S., Stephens,...","[is:preprint, lang:en, COVID-19, vaccine, immu...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://poseidon01.ssrn.com/deliv...,{'title': ['Microsoft Word - Parry et al.docx']},{'html': 'https://hypothes.is/a/GzcHitZdEeu3ut...,{'display_name': None},False,False
44265,tkTclNZcEeu7dhcV5i6TGA,2021-06-26T08:58:44.804883+00:00,2021-06-26T08:58:44.804883+00:00,acct:lucyparfitt16@hypothes.is,https://poseidon01.ssrn.com/delivery.php?ID=05...,,[ann:summary],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://poseidon01.ssrn.com/deliv...,{'title': ['Microsoft Word - Parry et al.docx']},{'html': 'https://hypothes.is/a/tkTclNZcEeu7dh...,{'display_name': None},False,False
44266,p5oL9tZcEeuXK79Bg_WtoA,2021-06-26T08:58:20.188404+00:00,2021-06-26T08:58:20.188404+00:00,acct:lucyparfitt16@hypothes.is,https://poseidon01.ssrn.com/delivery.php?ID=05...,,[ann:title],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://poseidon01.ssrn.com/deliv...,{'title': ['Microsoft Word - Parry et al.docx']},{'html': 'https://hypothes.is/a/p5oL9tZcEeuXK7...,{'display_name': None},False,False


In [4]:
# what is inside of our dataset?
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44268 entries, 0 to 44267
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           44268 non-null  object
 1   created      44268 non-null  object
 2   updated      44268 non-null  object
 3   user         44268 non-null  object
 4   uri          44268 non-null  object
 5   text         44268 non-null  object
 6   tags         44268 non-null  object
 7   group        44268 non-null  object
 8   permissions  44268 non-null  object
 9   target       44268 non-null  object
 10  document     44268 non-null  object
 11  links        44268 non-null  object
 12  user_info    44268 non-null  object
 13  flagged      44268 non-null  bool  
 14  hidden       44268 non-null  bool  
dtypes: bool(2), object(13)
memory usage: 4.5+ MB


### 2. Data exploration

What are interesting insights to extract and what fields might need corrections?


In [5]:
# Get one sample, which is one annotation.
sample = df.sample(n=1)

In [6]:
sample.iloc[0].to_dict()

{'id': 'axk04ne5Eeu_VfsJ2bo4Tg',
 'created': '2021-02-25T22:33:01.337475+00:00',
 'updated': '2021-02-25T22:33:01.337475+00:00',
 'user': 'acct:marta_radosevic@hypothes.is',
 'uri': 'https://www.thelancet.com/journals/lanplh/article/PIIS2542-5196(20)30252-7/fulltext',
 'text': 'https://twitter.com/SciBeh/status/1334804877462138887',
 'tags': ['has:context'],
 'group': 'Jk8bYJdN',
 'permissions': {'read': ['group:__world__'],
  'admin': ['acct:marta_radosevic@hypothes.is'],
  'update': ['acct:marta_radosevic@hypothes.is'],
  'delete': ['acct:marta_radosevic@hypothes.is']},
 'target': [{'source': 'https://www.thelancet.com/journals/lanplh/article/PIIS2542-5196(20)30252-7/fulltext'}],
 'document': {'title': ['Neoliberal economics, planetary health, and the COVID-19 pandemic: a Marxist ecofeminist analysis']},
 'links': {'html': 'https://hypothes.is/a/axk04ne5Eeu_VfsJ2bo4Tg',
  'incontext': 'https://hyp.is/axk04ne5Eeu_VfsJ2bo4Tg/www.thelancet.com/journals/lanplh/article/PIIS2542-5196(20)30

**Observation**: "text" seems to be missing in many cases, perhaps it is included in another column such as target

In [7]:
# Checking if the values of 'text' are reliable.
df['text'].value_counts()[:10]

              21698
2020-05         113
2020-07          75
2020-06          72
2020-04          66
2021-11-26       43
2020-04-30       35
2020-08          32
2020-05-06       30
2020-05-05       30
Name: text, dtype: int64

In [8]:
df.query('text != ""')[['id', 'text']]

Unnamed: 0,id,text
0,rnlcIho0EeuKgFfKPBbajQ,https://twitter.com/i/web/status/1306171100544...
1,qHdm3ho0EeuTJufnuQnIqQ,2020-09-16
2,oUZLeBo0EeuthtuX8fq6yQ,The impact of Covid-19 on media – rise of info...
5,uAs1rhozEeuVV7-ike6oNw,"Online Research Tools and Techniques. (2020, ..."
6,FaDHPhozEeuXjAsKbE0ErA,https://twitter.com/i/web/status/1305920211992...
...,...,...
44259,nXwluNZdEeu3vcv8gBb4GQ,2021-04-01
44260,lxgYJtZdEeuQ8F-gZ-Ae9w,Long Covid: Snapshot poll finds more than 1m p...
44263,IqqGzNZdEeu7qW8TVqQlWA,2021-03-31
44264,GzcHitZdEeu3utsN1zIlXA,"Parry, H. M., Tut, G., Faustini, S., Stephens,..."


**Check**: the total number of tags and the unique number of them.

In [9]:
all_tags = df.tags.sum()

In [10]:
len(all_tags)

145633

In [11]:
# Counting hashable objects
from collections import Counter

In [12]:
unique_tags = Counter(all_tags)

In [13]:
len(unique_tags) # Total number of unique tags existing in the dataset.

16474

In [14]:
unique_tags

Counter({'has:context': 5401,
         'has:date': 7688,
         'is:youtube': 111,
         'is:webinar': 23,
         'disinformation': 85,
         'misinformation': 708,
         'online': 110,
         'technology': 98,
         'reliability': 37,
         'infodemic': 44,
         'conspiracy': 47,
         'healthcare': 245,
         'public health': 595,
         'COVID-19': 7364,
         'media': 153,
         'lang:en': 8534,
         'ann:summary': 7083,
         'ann:title': 6841,
         'research': 824,
         'ethics': 46,
         'methodology': 17,
         'data': 693,
         'funding': 101,
         'application': 26,
         'is:interview': 2,
         'response': 318,
         'consequence': 42,
         'global': 167,
         'science': 460,
         'community': 140,
         'history': 21,
         'judgement': 16,
         'vaccine': 1950,
         'economy': 292,
         'lottery': 3,
         'webinar': 127,
         'poster': 1,
         'publish':

**Check**: create the column with only terms and the annotation fields - is, has, ann, lang. 

In [15]:
def create_tag_columns(row):
    row['is:'] = []
    row['has:'] = []
    row['ann:'] = []
    row['lang:'] = []
    row['terms_tags'] = []
    for tag in row['tags']:
        #tags = tags.replace('[','').replace(']','').split(',')
        #for tag in tags:
        if 'is:' in tag:
            row['is:'].append(tag.split(':')[1])
        elif 'has:' in tag:
            row['has:'].append(tag.split(':')[1])
        elif 'ann:' in tag:
            row['ann:'].append(tag.split(':')[1])
        elif 'lang:' in tag:
            row['lang:'].append(tag.split(':')[1])
        else:
            row['terms_tags'].append(tag)
    return row

def create_type_columns(row):
    for col in ['is:', 'has:', 'ann:']:
        for val in row[col]:
            row[f"{col.replace(':','')}_{val}"] = True
    return row

In [16]:
df = df.apply(lambda row: create_tag_columns(row), axis=1)

In [17]:
# Check if the 'source' field in 'target' column is equal to 'uri'
df['target__source'] = df['target'].apply(lambda target: target[0]['source'])
df['source_is_uri'] = df.apply(lambda row: row['uri'] == row['target__source'], axis=1)

In [18]:
df.query('source_is_uri == False') 

Unnamed: 0,id,created,updated,user,uri,text,tags,group,permissions,target,...,user_info,flagged,hidden,is:,has:,ann:,lang:,terms_tags,target__source,source_is_uri


**Check**: Confirmed that 'source' is equal to 'uri'.

In [19]:
# What is the difference between 'target', 'uri', 'links'? 
print("target 0: ", df['target'][0], "\n")
print("uri 0:", df['uri'][0], "\n")
print("links 0:", df['links'][0])

df['links'][0]

target 0:  [{'source': 'https://www.youtube.com/watch?v=QapwrR9C3Z4'}] 

uri 0: https://www.youtube.com/watch?v=QapwrR9C3Z4 

links 0: {'html': 'https://hypothes.is/a/rnlcIho0EeuKgFfKPBbajQ', 'incontext': 'https://hyp.is/rnlcIho0EeuKgFfKPBbajQ/www.youtube.com/watch?v=QapwrR9C3Z4', 'json': 'https://hypothes.is/api/annotations/rnlcIho0EeuKgFfKPBbajQ'}


{'html': 'https://hypothes.is/a/rnlcIho0EeuKgFfKPBbajQ',
 'incontext': 'https://hyp.is/rnlcIho0EeuKgFfKPBbajQ/www.youtube.com/watch?v=QapwrR9C3Z4',
 'json': 'https://hypothes.is/api/annotations/rnlcIho0EeuKgFfKPBbajQ'}

**Check** : the url in source field and "uri" are the same. "link" is the link to the annotation in the knowledge base.

## 3. Data preprocessing
To make the integrated data structure, data should be checked and cleaned.

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44268 entries, 0 to 44267
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              44268 non-null  object
 1   created         44268 non-null  object
 2   updated         44268 non-null  object
 3   user            44268 non-null  object
 4   uri             44268 non-null  object
 5   text            44268 non-null  object
 6   tags            44268 non-null  object
 7   group           44268 non-null  object
 8   permissions     44268 non-null  object
 9   target          44268 non-null  object
 10  document        44268 non-null  object
 11  links           44268 non-null  object
 12  user_info       44268 non-null  object
 13  flagged         44268 non-null  bool  
 14  hidden          44268 non-null  bool  
 15  is:             44268 non-null  object
 16  has:            44268 non-null  object
 17  ann:            44268 non-null  object
 18  lang: 

In [21]:
# A function to generate id from another column
def generate_id(string: str):
    return hashlib.sha1(string.encode("utf-8")).hexdigest()

In [22]:
# Check if the column 'id' is the row 'id', that we can use as a key for the annotation rows.
len(df) == len(df['id'].unique())

True

In [23]:
# What is the example of the 'target' column.
df['target'][0]

[{'source': 'https://www.youtube.com/watch?v=QapwrR9C3Z4'}]

In [24]:
# Check if all rows in 'target' column have 'source' field.
for i in df['target']: 
    if i[0]['source'] == None: 
        print(i)                 # There is no output, every row has source field. 

**Observations**: There is no null value in our dataset, but the empty value exists as '{}'

In [25]:
df['document']

0        {'title': ['The impact of Covid-19 on media – ...
1        {'title': ['The impact of Covid-19 on media – ...
2        {'title': ['The impact of Covid-19 on media – ...
3        {'title': ['The impact of Covid-19 on media – ...
4        {'title': ['The impact of Covid-19 on media – ...
                               ...                        
44263     {'title': ['Microsoft Word - Parry et al.docx']}
44264     {'title': ['Microsoft Word - Parry et al.docx']}
44265     {'title': ['Microsoft Word - Parry et al.docx']}
44266     {'title': ['Microsoft Word - Parry et al.docx']}
44267    {'title': ['f4d9b9_fddbfb2a0c05461cb4bdce2892f...
Name: document, Length: 44268, dtype: object

In [26]:
df['document'].value_counts()

{'title': ['ReconfigBehSci on Twitter']}                                                     1814
{'title': ['COVID-19 and the Labor Market']}                                                  837
{'title': ['Twitter']}                                                                        293
{'title': ['Tweet / Twitter']}                                                                235
{'title': ['Carl T. Bergstrom on Twitter']}                                                   120
                                                                                             ... 
{'title': ['The C.D.C. Waited ‘Its Entire Existence for This Moment.’ What Went Wrong?']}       1
{'title': ['Reform retractions to make them more transparent']}                                 1
{'title': ['About | OpenReview']}                                                               1
{'title': ['CRediT - Contributor Roles Taxonomy']}                                              1
{'title': ['f4d9b9_f

In [27]:
# Check if 'document' column has only one field 'title'. 
count = 1
b = 0
for i in df['document']:
    if i.get("title") == None: 
        b = b + 1
    if len(i) > 1: 
        count = count + 1
print(b, ": the number of the documents without title.")
print(count, ": the maximum number of field in document values.")

112 : the number of the documents without title.
1 : the maximum number of field in document values.


**Check**: 'document' has only one field called 'title'.

In [28]:
# A function to generate new column for non-null valued 'text_' column 
def extract_text_from_target(row):
    if row["text"] != "":
        row["text_"] = row["text"]
    elif "selector" in row["target"][0]:
        row["text_"] = [s["exact"] for s in row["target"][0]["selector"] if s["type"] == "TextQuoteSelector"][0]
    else:
        row["text_"] = ""
    return row

In [29]:
df = df.apply(lambda row: extract_text_from_target(row), axis=1)

In [30]:
df[["text", "text_"]]

Unnamed: 0,text,text_
0,https://twitter.com/i/web/status/1306171100544...,https://twitter.com/i/web/status/1306171100544...
1,2020-09-16,2020-09-16
2,The impact of Covid-19 on media – rise of info...,The impact of Covid-19 on media – rise of info...
3,,"In recent years, disinformation and misinforma..."
4,,The impact of Covid-19 on media – rise of info...
...,...,...
44263,2021-03-31,2021-03-31
44264,"Parry, H. M., Tut, G., Faustini, S., Stephens,...","Parry, H. M., Tut, G., Faustini, S., Stephens,..."
44265,,BackgroundAge is the major risk factor for mor...
44266,,1BNT162b2 vaccination in people over 80 years ...


For the better usage, process the values of 'document' column from pandas.Series to string.

In [31]:
# Fill in the 'document' column value from the 'target' column field 'exact', convert dict -> str
def extract_text_from_document(row):
    if row["document"].get("title"):
        row["document"] = row["document"]["title"][0]
    elif "selector" in row["target"][0]:
        row["document"] = [s["exact"] for s in row["target"][0]["selector"] if s["type"] == "TextQuoteSelector"][0]
    else:
        row["document"] = ""
    return row

In [32]:
df = df.apply(lambda row: extract_text_from_document(row), axis = 1)

In [33]:
# Check the result
df['document'] 

0        The impact of Covid-19 on media – rise of info...
1        The impact of Covid-19 on media – rise of info...
2        The impact of Covid-19 on media – rise of info...
3        The impact of Covid-19 on media – rise of info...
4        The impact of Covid-19 on media – rise of info...
                               ...                        
44263                    Microsoft Word - Parry et al.docx
44264                    Microsoft Word - Parry et al.docx
44265                    Microsoft Word - Parry et al.docx
44266                    Microsoft Word - Parry et al.docx
44267          f4d9b9_fddbfb2a0c05461cb4bdce2892f3cad0.pdf
Name: document, Length: 44268, dtype: object

**Question**: Which column to use for the key of this dataset? 'document' or 'uri'?

In [34]:
# unique value of 'document' column and 'uri' have different number.
len(df['document'].unique()) == len(df['uri'].unique())
print(len(df['document'].unique()), ": Nr. unique value for 'document'")
print(len(df['uri'].unique()), ": Nr. unique value for uri")

7596 : Nr. unique value for 'document'
9001 : Nr. unique value for uri


**Observations**: More unique values in uri than document, decide to go with 'document'.

cf. "Why multiple URLs redirect to the same websites?"

https://moz.com/community/q/topic/58696/i-have-multiple-urls-that-redirect-to-the-same-website-is-this-an-issue/3


In [35]:
# Generate document id from 'document' values.
df['doc_id'] = df['document'].apply(lambda doc: generate_id(doc))

In [36]:
df.head(5)

Unnamed: 0,id,created,updated,user,uri,text,tags,group,permissions,target,...,hidden,is:,has:,ann:,lang:,terms_tags,target__source,source_is_uri,text_,doc_id
0,rnlcIho0EeuKgFfKPBbajQ,2020-10-29T22:18:33.169969+00:00,2020-10-29T22:18:33.169969+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,https://twitter.com/i/web/status/1306171100544...,[has:context],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,...,False,[],[context],[],[],[],https://www.youtube.com/watch?v=QapwrR9C3Z4,True,https://twitter.com/i/web/status/1306171100544...,aa6c8c407d3e5b746cc4ea9889baf7a28eafb181
1,qHdm3ho0EeuTJufnuQnIqQ,2020-10-29T22:18:23.010549+00:00,2020-10-29T22:18:23.010549+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,2020-09-16,[has:date],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,...,False,[],[date],[],[],[],https://www.youtube.com/watch?v=QapwrR9C3Z4,True,2020-09-16,aa6c8c407d3e5b746cc4ea9889baf7a28eafb181
2,oUZLeBo0EeuthtuX8fq6yQ,2020-10-29T22:18:11.003006+00:00,2020-10-29T22:18:11.003006+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,The impact of Covid-19 on media – rise of info...,"[is:youtube, is:webinar, disinformation, misin...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,...,False,"[youtube, webinar]",[],[],[en],"[disinformation, misinformation, online, techn...",https://www.youtube.com/watch?v=QapwrR9C3Z4,True,The impact of Covid-19 on media – rise of info...,aa6c8c407d3e5b746cc4ea9889baf7a28eafb181
3,gHTeKBo0EeubG0_nm_NNpQ,2020-10-29T22:17:15.887213+00:00,2020-10-29T22:17:15.887213+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,,[ann:summary],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,...,False,[],[],[summary],[],[],https://www.youtube.com/watch?v=QapwrR9C3Z4,True,"In recent years, disinformation and misinforma...",aa6c8c407d3e5b746cc4ea9889baf7a28eafb181
4,fAZg8Bo0Eeu39uviYF17AQ,2020-10-29T22:17:08.501454+00:00,2020-10-29T22:17:08.501454+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,,[ann:title],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,...,False,[],[],[title],[],[],https://www.youtube.com/watch?v=QapwrR9C3Z4,True,The impact of Covid-19 on media – rise of info...,aa6c8c407d3e5b746cc4ea9889baf7a28eafb181


In [37]:
# Check if the generated id is unique values of document column
len(df['doc_id'].unique()) == len(df['document'].unique())

True

In [38]:
# Get the sample using doc_id
mysample = df.query('doc_id == "4debd0b8816961d9ee0fb81d9fd47884a3d23346"')
mysample

Unnamed: 0,id,created,updated,user,uri,text,tags,group,permissions,target,...,hidden,is:,has:,ann:,lang:,terms_tags,target__source,source_is_uri,text_,doc_id
10074,GKTDUOxqEeqdGgO3KOC83Q,2020-09-01T15:45:01.004518+00:00,2020-09-01T15:45:01.004518+00:00,acct:gailelhalaby@hypothes.is,https://www.imperial.ac.uk/stories/intersectin...,https://twitter.com/ImperialMed/status/1300375...,[has:context],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.imperial.ac.uk/storie...,...,False,[],[context],[],[],[],https://www.imperial.ac.uk/stories/intersectin...,True,https://twitter.com/ImperialMed/status/1300375...,4debd0b8816961d9ee0fb81d9fd47884a3d23346
10075,-vsbTOxpEeq_kMdMttrjaA,2020-09-01T15:44:11.268133+00:00,2020-09-01T15:44:11.268133+00:00,acct:gailelhalaby@hypothes.is,https://www.imperial.ac.uk/stories/intersectin...,2020\n,[has:date-approx],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.imperial.ac.uk/storie...,...,False,[],[date-approx],[],[],[],https://www.imperial.ac.uk/stories/intersectin...,True,2020\n,4debd0b8816961d9ee0fb81d9fd47884a3d23346
10076,4ydJtOxpEeqb8PfadgIuVw,2020-09-01T15:43:31.179769+00:00,2020-09-01T15:43:31.179769+00:00,acct:gailelhalaby@hypothes.is,https://www.imperial.ac.uk/stories/intersectin...,Two intersecting pandemics. (n.d.). Retrieved ...,"[is:webpage, lang:en, racism, COVID-19, corona...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.imperial.ac.uk/storie...,...,False,[webpage],[],[],[en],"[racism, COVID-19, coronavirus, pandemic, ethn...",https://www.imperial.ac.uk/stories/intersectin...,True,Two intersecting pandemics. (n.d.). Retrieved ...,4debd0b8816961d9ee0fb81d9fd47884a3d23346
10077,AOIWzuxpEeqMhLOdtJbP9A,2020-09-01T15:37:11.588640+00:00,2020-09-01T15:37:11.588640+00:00,acct:gailelhalaby@hypothes.is,https://www.imperial.ac.uk/stories/intersectin...,,[ann:summary],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.imperial.ac.uk/storie...,...,False,[],[],[summary],[],[],https://www.imperial.ac.uk/stories/intersectin...,True,How COVID-19 is laying bare structural racism ...,4debd0b8816961d9ee0fb81d9fd47884a3d23346
10078,_ESrkOxoEeqZZ8M7psIcqA,2020-09-01T15:37:03.663208+00:00,2020-09-01T15:37:03.663208+00:00,acct:gailelhalaby@hypothes.is,https://www.imperial.ac.uk/stories/intersectin...,,[ann:title],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.imperial.ac.uk/storie...,...,False,[],[],[title],[],[],https://www.imperial.ac.uk/stories/intersectin...,True,Two intersecting pandemics,4debd0b8816961d9ee0fb81d9fd47884a3d23346


## 4. Data structure

There are three data structures to create from this pipeline. 

1. data from hypothesis as DataFrame -> each row is an annotation. To get a document, we need to groupby doc_id.
2. data from hypothesis transformed into "documents": a list of documents, where one document has a subfield "annotations", where annotations are. -> OUR REFERENCE
3. objects to ingest in Elastic Search: one document => one dict => one annotation -> we want to create these objects out of (2) 

**"Steps" to get the document defined above (after cleaning steps)**
1. Generate doc_id
2. Group dataset by doc_id
3. The top level of the document object comes from properties across rows (groupby) meaning that all the rows in that group have these same properties.
4. The object annotations contains all the info coming from individual rows within the group (1 row = 1 dict in the list)

### Generate document (2)

In [424]:
'''
2) OUR REFERENCE document format.

documents = [
    {
        "_id": "id",  # 
        "document_uri": "uri", #from df
        "document ": "document", # from df,
        "tags": [], # groupby uri -> concatenated lists of tags 
        "annotations": [ # groupby uri -> each row is dict for the columns "text", "tags", "target", "links"
            {
                "ann_id": "id from the row"
                "tags": []
                "target": [],
                "text": str,
                "links": dict
            }, {

            } 
        ]
    }
]
'''

'\n2) purpose data format\n\ndocuments = [\n    {\n        "_id": "id",  # \n        "document_uri": "uri", #from df\n        "document ": "document", # from df,\n        "tags": [], # groupby uri -> concatenated lists of tags \n        "annotations": [ # groupby uri -> each row is dict for the columns "text", "tags", "target", "links"\n            {\n                "ann_id": "id from the row"\n                "tags": []\n                "target": [],\n                "text": str,\n                "links": dict\n            }, {\n\n            } \n        ]\n    }\n]\n'

1. Original dataframe.

In [148]:
# Give the id column the name ann_id.
df = df.rename(columns={'id': 'ann_id'})

In [149]:
df.columns

Index(['ann_id', 'created', 'updated', 'user', 'uri', 'text', 'tags', 'group',
       'permissions', 'target', 'document', 'links', 'user_info', 'flagged',
       'hidden', 'text_', 'doc_id'],
      dtype='object')

In [58]:
# Delete the columns that were added during data exploration, and keep the original columns
# Generate multi-index dataframe, "df1" 
df = df.drop(columns = ["is:", "has:",  "ann:", "lang:", "source_is_uri", "target__source", "terms_tags"])
df1 = df.set_index(['doc_id', 'ann_id'])


In [150]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 44268 entries, ('aa6c8c407d3e5b746cc4ea9889baf7a28eafb181', 'rnlcIho0EeuKgFfKPBbajQ') to ('551363c260f71f9d57f2d08eb66a8e21a9aa2802', '-eARsNZZEeu-amPoGgaXSg')
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   created      44268 non-null  object
 1   updated      44268 non-null  object
 2   user         44268 non-null  object
 3   uri          44268 non-null  object
 4   text         44268 non-null  object
 5   tags         44268 non-null  object
 6   group        44268 non-null  object
 7   permissions  44268 non-null  object
 8   target       44268 non-null  object
 9   document     44268 non-null  object
 10  links        44268 non-null  object
 11  user_info    44268 non-null  object
 12  flagged      44268 non-null  bool  
 13  hidden       44268 non-null  bool  
 14  text_        44268 non-null  object
dtypes: bool(2), object(13)
memory usage: 6.4+ MB


In [151]:
df1.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,created,updated,user,uri,text,tags,group,permissions,target,document,links,user_info,flagged,hidden,text_
doc_id,ann_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
aa6c8c407d3e5b746cc4ea9889baf7a28eafb181,rnlcIho0EeuKgFfKPBbajQ,2020-10-29T22:18:33.169969+00:00,2020-10-29T22:18:33.169969+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,https://twitter.com/i/web/status/1306171100544...,[has:context],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,The impact of Covid-19 on media – rise of info...,{'html': 'https://hypothes.is/a/rnlcIho0EeuKgF...,{'display_name': None},False,False,https://twitter.com/i/web/status/1306171100544...
aa6c8c407d3e5b746cc4ea9889baf7a28eafb181,qHdm3ho0EeuTJufnuQnIqQ,2020-10-29T22:18:23.010549+00:00,2020-10-29T22:18:23.010549+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,2020-09-16,[has:date],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,The impact of Covid-19 on media – rise of info...,{'html': 'https://hypothes.is/a/qHdm3ho0EeuTJu...,{'display_name': None},False,False,2020-09-16
aa6c8c407d3e5b746cc4ea9889baf7a28eafb181,oUZLeBo0EeuthtuX8fq6yQ,2020-10-29T22:18:11.003006+00:00,2020-10-29T22:18:11.003006+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,The impact of Covid-19 on media – rise of info...,"[is:youtube, is:webinar, disinformation, misin...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,The impact of Covid-19 on media – rise of info...,{'html': 'https://hypothes.is/a/oUZLeBo0Eeutht...,{'display_name': None},False,False,The impact of Covid-19 on media – rise of info...
aa6c8c407d3e5b746cc4ea9889baf7a28eafb181,gHTeKBo0EeubG0_nm_NNpQ,2020-10-29T22:17:15.887213+00:00,2020-10-29T22:17:15.887213+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,,[ann:summary],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,The impact of Covid-19 on media – rise of info...,{'html': 'https://hypothes.is/a/gHTeKBo0EeubG0...,{'display_name': None},False,False,"In recent years, disinformation and misinforma..."
aa6c8c407d3e5b746cc4ea9889baf7a28eafb181,fAZg8Bo0Eeu39uviYF17AQ,2020-10-29T22:17:08.501454+00:00,2020-10-29T22:17:08.501454+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,,[ann:title],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,The impact of Covid-19 on media – rise of info...,{'html': 'https://hypothes.is/a/fAZg8Bo0Eeu39u...,{'display_name': None},False,False,The impact of Covid-19 on media – rise of info...
b0204dd848c1f25b0506d61889bff67127024f7f,uAs1rhozEeuVV7-ike6oNw,2020-10-29T22:11:39.622491+00:00,2020-10-29T22:11:39.622491+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=wGWqBtDkOFs,"Online Research Tools and Techniques. (2020, ...","[is:webinar, is:youtube, online, research, eth...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=w...,Online Research Tools and Techniques,{'html': 'https://hypothes.is/a/uAs1rhozEeuVV7...,{'display_name': None},False,False,"Online Research Tools and Techniques. (2020, ..."
b0204dd848c1f25b0506d61889bff67127024f7f,FaDHPhozEeuXjAsKbE0ErA,2020-10-29T22:07:07.184512+00:00,2020-10-29T22:07:07.184512+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=wGWqBtDkOFs,https://twitter.com/i/web/status/1305920211992...,[has:context],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=w...,Online Research Tools and Techniques,{'html': 'https://hypothes.is/a/FaDHPhozEeuXjA...,{'display_name': None},False,False,https://twitter.com/i/web/status/1305920211992...
b0204dd848c1f25b0506d61889bff67127024f7f,EFakYBozEeuw3TO-taovEg,2020-10-29T22:06:58.254626+00:00,2020-10-29T22:06:58.254626+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=wGWqBtDkOFs,2020-09-15,[has:date],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=w...,Online Research Tools and Techniques,{'html': 'https://hypothes.is/a/EFakYBozEeuw3T...,{'display_name': None},False,False,2020-09-15
b0204dd848c1f25b0506d61889bff67127024f7f,ChBWPBozEeuqI0dyN6mJpw,2020-10-29T22:06:47.797225+00:00,2020-10-29T22:06:47.797225+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=wGWqBtDkOFs,If COVID-19 is the 9/11 moment for global publ...,[ann:summary],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=w...,Online Research Tools and Techniques,{'html': 'https://hypothes.is/a/ChBWPBozEeuqI0...,{'display_name': None},False,False,If COVID-19 is the 9/11 moment for global publ...
b0204dd848c1f25b0506d61889bff67127024f7f,BQ6GDhozEeudISeH9k9VOA,2020-10-29T22:06:39.294849+00:00,2020-10-29T22:06:39.294849+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=wGWqBtDkOFs,,[ann:title],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=w...,Online Research Tools and Techniques,{'html': 'https://hypothes.is/a/BQ6GDhozEeudIS...,{'display_name': None},False,False,Online Research Tools and Techniques


2. Define a function to generate our reference document format.

In [61]:
def generate_document(df: pd.DataFrame) -> dict:
    
    """
    :param df: This dataframe is actually a "grouped-by" dataframe, meaning that doc_id, uri, document and tags are the same for all rows
    :return:
    """
    
    assert len(df['doc_id'].unique()) == 1
    assert len(df['document'].unique()) == 1

    document = {
        "_id": df['doc_id'].iloc[0],
        "document_uri": df["uri"].iloc[0],
        "document": df['document'].iloc[0],
        "tags": [],
        "annotations": [], 
        "created": df['created'].iloc[0], 
        "updated": df['updated'].iloc[0], 
        "user": df['user'].iloc[0], 
        "group": df['group'].iloc[0], 
        "permissions": df['permissions'].iloc[0], 
        "user_info": df['user_info'].iloc[0], 
        "flagged": df['flagged'].iloc[0], 
        "hidden": df['hidden'].iloc[0]
        
    }

    for i, row in df.iterrows():
        annotation = {}
        # this is at level of document (first level)
        document["tags"] += row["tags"]

        # This is at level of annotation (second level)
        annotation["text"] =  row["text_"]
        annotation["tags"] = row["tags"]
        annotation["ann_id"] = row["ann_id"]
        annotation["target"] = row["target"]
        annotation["links"] = row["links"]
        document["annotations"].append(annotation)

    return document


In [62]:
df.columns

Index(['ann_id', 'created', 'updated', 'user', 'uri', 'text', 'tags', 'group',
       'permissions', 'target', 'document', 'links', 'user_info', 'flagged',
       'hidden', 'text_', 'doc_id'],
      dtype='object')

In [63]:
# Generate documents format from dataframe. 
documents = []

for doc_id in df['doc_id'].unique():
    sub_df = df.query("doc_id == @doc_id")
    documents.append(generate_document(sub_df))


In [64]:
# How many document exist in the list documents?
len(documents)

7596

In [66]:
df.head(5)

Unnamed: 0,ann_id,created,updated,user,uri,text,tags,group,permissions,target,document,links,user_info,flagged,hidden,text_,doc_id
0,rnlcIho0EeuKgFfKPBbajQ,2020-10-29T22:18:33.169969+00:00,2020-10-29T22:18:33.169969+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,https://twitter.com/i/web/status/1306171100544...,[has:context],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,The impact of Covid-19 on media – rise of info...,{'html': 'https://hypothes.is/a/rnlcIho0EeuKgF...,{'display_name': None},False,False,https://twitter.com/i/web/status/1306171100544...,aa6c8c407d3e5b746cc4ea9889baf7a28eafb181
1,qHdm3ho0EeuTJufnuQnIqQ,2020-10-29T22:18:23.010549+00:00,2020-10-29T22:18:23.010549+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,2020-09-16,[has:date],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,The impact of Covid-19 on media – rise of info...,{'html': 'https://hypothes.is/a/qHdm3ho0EeuTJu...,{'display_name': None},False,False,2020-09-16,aa6c8c407d3e5b746cc4ea9889baf7a28eafb181
2,oUZLeBo0EeuthtuX8fq6yQ,2020-10-29T22:18:11.003006+00:00,2020-10-29T22:18:11.003006+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,The impact of Covid-19 on media – rise of info...,"[is:youtube, is:webinar, disinformation, misin...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,The impact of Covid-19 on media – rise of info...,{'html': 'https://hypothes.is/a/oUZLeBo0Eeutht...,{'display_name': None},False,False,The impact of Covid-19 on media – rise of info...,aa6c8c407d3e5b746cc4ea9889baf7a28eafb181
3,gHTeKBo0EeubG0_nm_NNpQ,2020-10-29T22:17:15.887213+00:00,2020-10-29T22:17:15.887213+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,,[ann:summary],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,The impact of Covid-19 on media – rise of info...,{'html': 'https://hypothes.is/a/gHTeKBo0EeubG0...,{'display_name': None},False,False,"In recent years, disinformation and misinforma...",aa6c8c407d3e5b746cc4ea9889baf7a28eafb181
4,fAZg8Bo0Eeu39uviYF17AQ,2020-10-29T22:17:08.501454+00:00,2020-10-29T22:17:08.501454+00:00,acct:amyhcurtis@hypothes.is,https://www.youtube.com/watch?v=QapwrR9C3Z4,,[ann:title],Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,The impact of Covid-19 on media – rise of info...,{'html': 'https://hypothes.is/a/fAZg8Bo0Eeu39u...,{'display_name': None},False,False,The impact of Covid-19 on media – rise of info...,aa6c8c407d3e5b746cc4ea9889baf7a28eafb181


3. Create json file out of the documents

**ATTENTION!** this block will overwrite the existing file

In [68]:
import json
with open('hypothesis_documents_v1.jsonl', 'w') as f:
    for doc in documents:
        try:
            f.write(json.dumps(doc, ensure_ascii=False, default = str) + '\n')
        except:
            print(doc)
            raise Exception

---


### [OPTIONAL] Reloading data from file

In [152]:
documents_df = pd.read_json('hypothesis_documents_v1.jsonl', orient='records', lines=True)

In [154]:
documents_df.head(5)

Unnamed: 0,_id,document_uri,document,tags,annotations,created,updated,user,group,permissions,user_info,flagged,hidden
0,aa6c8c407d3e5b746cc4ea9889baf7a28eafb181,https://www.youtube.com/watch?v=QapwrR9C3Z4,The impact of Covid-19 on media – rise of info...,"[has:context, has:date, is:youtube, is:webinar...",[{'text': 'https://twitter.com/i/web/status/13...,2020-10-29T22:18:33.169969+00:00,2020-10-29T22:18:33.169969+00:00,acct:amyhcurtis@hypothes.is,Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",{'display_name': None},False,False
1,b0204dd848c1f25b0506d61889bff67127024f7f,https://www.youtube.com/watch?v=wGWqBtDkOFs,Online Research Tools and Techniques,"[is:webinar, is:youtube, online, research, eth...",[{'text': 'Online Research Tools and Techniqu...,2020-10-29T22:11:39.622491+00:00,2020-10-29T22:11:39.622491+00:00,acct:amyhcurtis@hypothes.is,Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",{'display_name': None},False,False
2,28f2841bd9c943268ae8e3a3abb6b54ce52f994d,https://www.youtube.com/watch?v=97iJIwBQ5qE,COVID-19: The 9/11 Moment for Global Public He...,"[is:youtube, is:interview, COVID-19, public he...",[{'text': 'COVID-19: The 9/11 Moment for Globa...,2020-10-29T21:44:17.929432+00:00,2020-10-29T21:44:17.929432+00:00,acct:amyhcurtis@hypothes.is,Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",{'display_name': None},False,False
3,b371d7336559881079f7219ccf8a3eb487a061d4,http://www.youtube.com/playlist?list=PLOA0aRJ9...,ORWG virtual meeting 08/09/2020,"[is:youtube, webinar, lang:en, poster, publish...",[{'text': 'ORWG Virtual Meeting 08/09/2020 htt...,2020-10-29T21:34:40.377021+00:00,2020-10-29T21:34:40.377021+00:00,acct:amyhcurtis@hypothes.is,Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",{'display_name': None},False,False
4,319a613babccf3c205ddc5dd0d07d4e49cd8a11f,https://www.youtube.com/watch?v=zTrIl52jV0s,"Long covid: diagnosis, management, prognosis","[is:webinar, COVID-19, long, symptom, syndrome...","[{'text': 'Long covid: Diagnosis, management, ...",2020-10-29T21:19:05.041279+00:00,2020-10-29T21:19:05.041279+00:00,acct:amyhcurtis@hypothes.is,Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",{'display_name': None},False,False


In [155]:
documents_df.query('_id == "90d53d57067cde496a9aa6597d1b33ccd20e060c"')['annotations'].iloc[0]

[{'text': '2020-06',
  'tags': ['has:date-approx'],
  'ann_id': 'vI2h2tMpEeqD5ytBmJ66hA',
  'target': [{'source': 'https://www.nber.org/papers/w27401'}],
  'links': {'html': 'https://hypothes.is/a/vI2h2tMpEeqD5ytBmJ66hA',
   'incontext': 'https://hyp.is/vI2h2tMpEeqD5ytBmJ66hA/www.nber.org/papers/w27401',
   'json': 'https://hypothes.is/api/annotations/vI2h2tMpEeqD5ytBmJ66hA'}},
 {'text': 'Aksoy, C. G., Eichengreen, B., & Saka, O. (2020). The Political Scar of Epidemics (Working Paper No. 27401; Working Paper Series). National Bureau of Economic Research. https://doi.org/10.3386/w27401\n',
  'tags': ['is:article',
   'lang:en',
   'COVID-19',
   'politics',
   'confidence',
   'health care',
   'economy',
   'government',
   'model',
   'young'],
  'ann_id': 'tcbO5NMpEeq5watiL2fFhA',
  'target': [{'source': 'https://www.nber.org/papers/w27401'}],
  'links': {'html': 'https://hypothes.is/a/tcbO5NMpEeq5watiL2fFhA',
   'incontext': 'https://hyp.is/tcbO5NMpEeq5watiL2fFhA/www.nber.org/papers

-----

------
### Generate document (3) to ingest into ElasticSearch


document_es = {

    "_id": doc_id + "_" + ann_id, (composition of original doc_id and ann_id)
    "parent_doc_id": doc_id, (the id of the document to which this annotation belongs)
    "document_uri": as they are in the original document
    "document": as they are in the original document
    "ann_id": annotations["ann_id"],
    "tags": annotations["tags"], (not tags from the first level, for that specific annotation)
    rest of the fields of that annotation as they are
    ...

}




In [156]:
df.columns

Index(['ann_id', 'created', 'updated', 'user', 'uri', 'text', 'tags', 'group',
       'permissions', 'target', 'document', 'links', 'user_info', 'flagged',
       'hidden', 'text_', 'doc_id'],
      dtype='object')

In [70]:
# function to generate the 3rd data format.
def generate_document3(df: pd.DataFrame) -> dict:
    """
    
    :param df: This dataframe is actually a "grouped-by" dataframe, meaning that doc_id, uri, document and tags are the same for all rows
    :return:
    
    """
    assert len(df['doc_id'].unique()) == 1
    assert len(df['document'].unique()) == 1

    document = {
        "ann_id": df['doc_id'].iloc[0] + "_" + df['ann_id'].iloc[0],
        "parent_doc_id": df['doc_id'].iloc[0],
        "document_uri": df["uri"].iloc[0],
        "document": df['document'].iloc[0],
        "tags": df['tags'].iloc[0], 
        "created": df['created'].iloc[0], 
        "updated": df['updated'].iloc[0], 
        "user": df["user"].iloc[0],
        "text": df["text"].iloc[0],
        "group": df["group"].iloc[0], 
        "permissions": df["permissions"].iloc[0], 
        "target": df["target"].iloc[0], 
        "links": df["links"].iloc[0], 
        "user_info": df["user_info"].iloc[0], 
        "flagged": df["flagged"].iloc[0], 
        "hidden": df["hidden"].iloc[0], 
        
    }

    
    return document


In [71]:
documents_es = []

for doc_id in df['doc_id'].unique():
    sub_df = df.query("doc_id == @doc_id")
    documents_es.append(generate_document3(sub_df))


In [72]:
documents_es[:5]

[{'ann_id': 'aa6c8c407d3e5b746cc4ea9889baf7a28eafb181_rnlcIho0EeuKgFfKPBbajQ',
  'parent_doc_id': 'aa6c8c407d3e5b746cc4ea9889baf7a28eafb181',
  'document_uri': 'https://www.youtube.com/watch?v=QapwrR9C3Z4',
  'document': 'The impact of Covid-19 on media – rise of infodemics?',
  'tags': ['has:context'],
  'created': '2020-10-29T22:18:33.169969+00:00',
  'updated': '2020-10-29T22:18:33.169969+00:00',
  'user': 'acct:amyhcurtis@hypothes.is',
  'text': 'https://twitter.com/i/web/status/1306171100544602112',
  'group': 'Jk8bYJdN',
  'permissions': {'read': ['group:__world__'],
   'admin': ['acct:amyhcurtis@hypothes.is'],
   'update': ['acct:amyhcurtis@hypothes.is'],
   'delete': ['acct:amyhcurtis@hypothes.is']},
  'target': [{'source': 'https://www.youtube.com/watch?v=QapwrR9C3Z4'}],
  'links': {'html': 'https://hypothes.is/a/rnlcIho0EeuKgFfKPBbajQ',
   'incontext': 'https://hyp.is/rnlcIho0EeuKgFfKPBbajQ/www.youtube.com/watch?v=QapwrR9C3Z4',
   'json': 'https://hypothes.is/api/annotations/

In [73]:
len(documents_es)

7596

In [74]:
# Create json file out of the document
import json
with open('hypothesis_documents_v2.jsonl', 'w') as f:
    for doc in documents_es:
        try:
            f.write(json.dumps(doc, ensure_ascii=False, default = str) + '\n')
        except:
            print(doc)
            raise Exception

## 5. Search by Elasticsearch with our Dataset

### Initiate a client instance and call an API

1. Run the docker application in local 
2. Create the virtual environment 
3. In terminal, copy the command line below and run the elasticsearch

In [None]:
# [Example] Creating virtual environment - untoggle and use it for the first time 
# python3 -m venv .venv
# source .venv/bin/activate

In [166]:
# [Terminal command]
# docker run --rm -p 9200:9200 -p 9300:9300 -e "xpack.security.enabled=false" -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:8.3.3

In [777]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
es.info().body

{'name': '3a5749076feb',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'ZpsjaBDXQJOVUshrVGrWbg',
 'version': {'number': '8.3.3',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '801fed82df74dbe537f89b71b098ccaff88d2c56',
  'build_date': '2022-07-23T19:30:09.227964828Z',
  'build_snapshot': False,
  'lucene_version': '9.2.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

### Object to ingest into Elastic Search

In [778]:
df3 = (
    pd.read_json('hypothesis_documents_v2.jsonl', orient='records', lines=True)
    .dropna()
    .sample(5000, random_state=42)
    .reset_index()
)

In [779]:
df3.columns

Index(['index', 'ann_id', 'parent_doc_id', 'document_uri', 'document', 'tags',
       'created', 'updated', 'user', 'text', 'group', 'permissions', 'target',
       'links', 'user_info', 'flagged', 'hidden'],
      dtype='object')

In [780]:
df3 = df3.drop(['index'], axis = 1)

In [733]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ann_id         5000 non-null   object
 1   parent_doc_id  5000 non-null   object
 2   document_uri   5000 non-null   object
 3   document       5000 non-null   object
 4   tags           5000 non-null   object
 5   created        5000 non-null   object
 6   updated        5000 non-null   object
 7   user           5000 non-null   object
 8   text           5000 non-null   object
 9   group          5000 non-null   object
 10  permissions    5000 non-null   object
 11  target         5000 non-null   object
 12  links          5000 non-null   object
 13  user_info      5000 non-null   object
 14  flagged        5000 non-null   object
 15  hidden         5000 non-null   object
dtypes: object(16)
memory usage: 625.1+ KB


Create 'date' column to fill when there are 'has:date' or 'has:date-approx' in 'tags' -> use it later in date_range search

In [781]:
tags_to_check = ['has:date', 'has:date-approx']

def get_ann_by_tag(row, tag_values):
    for tag in tag_values:
        if tag in row['tags']:
            row['date'] = row['text']
        else:
            row['date'] = ""
    return row

In [782]:
df3 = df3.apply(lambda row: get_ann_by_tag(row, tags_to_check), axis=1)

In [783]:
df3[['text', 'date', 'tags']]

Unnamed: 0,text,date,tags
0,"Ferguson, A. M., Cameron, D., & Inzlicht, M. (...",,"[is:preprint, lang:en, empathy, motivation, ef..."
1,https://twitter.com/CBSMiami/status/1424825696...,,[has:context]
2,"‘I’m sorry, but it’s too late’: Alabama doctor...",,"[is:webpage, lang:en, COVID-19, vaccine, vacci..."
3,https://twitter.com/ScienceMagazine/status/130...,,[has:context]
4,2020-05-26,,[]
...,...,...,...
4995,2020-04-25,,[has:date]
4996,"Blayac, Thierry, Dimitri Dubois, Sebastien Duc...",,"[is:preprint, lang:en, population preference, ..."
4997,2020-06,2020-06,[has:date-approx]
4998,Covid-19 latest updates: Florida to offer vouc...,,"[is:webpage, lang:en, COVID-19, school, parent..."


In [738]:
# Check how many rows with 'date' value
df3[df3['date'] != ""][['text', 'date']]

Unnamed: 0,text,date
110,2020-02-17,2020-02-17
224,2020-06,2020-06
289,2020-06-10\n,2020-06-10\n
294,1988-07,1988-07
344,2021-10,2021-10
...,...,...
4737,04-2020,04-2020
4742,2021-06,2021-06
4793,2020-05,2020-05
4969,2020-07,2020-07


Process mixed formatted values in 'date' column.

In [815]:
def date_format(date):
    if date != "":
        date = date.replace('\n', '')                # replace the Nonetype character to empty string.
        if len(date) == 4:                           # original format was YYYY 
            date = date + "-01-01" 
        if len(date) == 7 and date[0] == 0: 
            date = date[3:] + "-" + date[:2] + "-01" # original format was MM-YYYY
        if len(date) == 7 and date[0] != 0:          # original format was YYYY-MM
            date = date + "-01"
        if len(date) > 7:                            # original format had another character
            date = date.split(" ", 1)[0]
            if len(date) < 7: 
                date = date + "-01-01"
        return date
    else: 
        return date

In [816]:
df3['date'] = df3['date'].apply(lambda date: date_format(date))

In [817]:
Counter(df3['date'])

Counter({'': 4909,
         '2020-02-17': 1,
         '2020-06-01': 16,
         '2020-06-10': 1,
         '1988-07-01': 1,
         '2021-10-01': 2,
         '2020-05-01': 17,
         '2020-01-01': 1,
         '2020-03-01': 2,
         '2013-01-01': 1,
         '2019-07-01': 1,
         '2020-07-20': 1,
         '2007-01-01': 1,
         '04-2020-01': 2,
         '2020-07-01': 10,
         '2020-04-01': 11,
         '2021-04-30': 1,
         '2020-11-01': 1,
         '2018-07-01': 1,
         '2021-01-01': 1,
         '2016-07-01': 1,
         '2020-06-04': 1,
         '2003-12-01': 1,
         '2020-12-01': 1,
         '2022-01-07': 1,
         '2021-08-01': 1,
         '2018-08-01': 1,
         '1967-09-01': 1,
         '2017-02-01': 1,
         '2021-05-01': 1,
         '2021-11-01': 1,
         '2021-07-01': 1,
         '2010-01-01': 1,
         '2012-07-01': 1,
         '2018-10-01': 1,
         '2020-10-01': 1,
         '2021-09-01': 1,
         '2017-01-01': 1,
         '2021-

### 1. Creating index to use in Elastic search

If running the code below for the first time, untoggle the last line so that you can "create" index.

In [818]:
mappings = {
    
    "properties": {
        "ann_id" : {"type": "keyword"}, 
        "parent_doc_id" : {"type": "keyword"}, 
        "document_uri" : {"type": "keyword"}, 
        "document" : {"type": "keyword"}, 
        "tags": {"type": "keyword"},
        "created" : {"type": "date"}, 
        "updated": {"type": "date"}, 
        "user": {"type": "keyword"}, 
        "text" : {"type": "text", "analyzer": "standard"}, 
        "date" : {"type": "date",
                "ignore_malformed": True},
        "group": {"type": "keyword"}, 
        "permissions": {"type": "nested"}, 
        "target": {"type": "nested"}, 
        "links": {"type": "nested"}, 
        "user_info": {"type": "object"}, 
        "flagged": {"type": "text", "analyzer": "standard"}, 
        "hidden": {"type": "text", "analyzer": "standard"}
        
    
    }
}


es.options(ignore_status=[400,404]).indices.delete(index='hypothesis_v1') # delete if you've already created index with the same name before
es.indices.create(index= "hypothesis_v1", mappings = mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'hypothesis_v1'})

In [819]:
df3["user_info"].iloc[90]

{'display_name': None}

### Assigning field data types

1. text: general fields belong here, and analyzer parameter also should be specified e.g.standard, english, french..
2. keyword:  used for structured content such as IDs, email addresses, hostnames, status codes, zip codes, or tags.
3. date: logged data column belongs here.
4. nested: it is an array of objects fields, where object is a json data. Json objects store data hierarchically. Those columns with inner json objects belong here.
5. OTHER data type has not been used, however, it needs to be discussed later.

### 2. Add data to the index created above

In [820]:
from elasticsearch.helpers import bulk

bulk_data = []
for i,row in df3.iterrows():
    bulk_data.append(
        {
            "_index": "hypothesis_v1",
            "_id": row['ann_id'],
            "_source": {
                "parent_doc_id": row["parent_doc_id"],
                "document_uri": row["document_uri"],
                "document": row["document"], 
                "tags": row["tags"], 
                "created": row["created"], 
                "updated": row["updated"], 
                "user": row["user"], 
                "text": row["text"], 
                "date": row["date"],
                "group": row["group"], 
                "permissions": row["permissions"], 
                "target": row["target"], 
                "links": row["links"], 
                "user_info": row["user_info"], 
                "flagged": row["flagged"], 
                "hidden": row["hidden"]
                
                
            }
        }
    )
bulk(es, bulk_data)

(5000, [])

Check if it is working by counting the number of items in the index, the 'count' should be 5000.

In [821]:
es.indices.refresh(index="hypothesis_v1")
es.cat.count(index="hypothesis_v1", format="json")

ListApiResponse([{'epoch': '1682603921', 'timestamp': '13:58:41', 'count': '5000'}])

In [822]:
sample = df3.sample(n=1)

### 3. Search the data with Elastic Search

1. Search annotation with the ann_id given

In [745]:
resp = es.search(
    index="hypothesis_v1",
    body={
        "query": {
            "bool": {
                "filter": {
                    "match_phrase": {
                        "_id": "4011af8ea429e3c113c7328a721f6a2af2fd188f_L5lt6s5MEeqm_pesYHJVVQ",
                    }
                },
                },
        },            
    }
)
resp

  resp = es.search(


ObjectApiResponse({'took': 5, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 0.0, 'hits': [{'_index': 'hypothesis_v1', '_id': '4011af8ea429e3c113c7328a721f6a2af2fd188f_L5lt6s5MEeqm_pesYHJVVQ', '_score': 0.0, '_ignored': ['date'], '_source': {'parent_doc_id': '4011af8ea429e3c113c7328a721f6a2af2fd188f', 'document_uri': 'https://osf.io/preprints/socarxiv/b5zae/', 'document': 'COVID-19_Insights from Innovation Economists', 'tags': ['has:date'], 'created': '2020-07-25T07:55:19.812600+00:00', 'updated': '2020-07-25T07:55:19.812600+00:00', 'user': 'acct:ErikStuchly@hypothes.is', 'text': '2020-04-14', 'date': '', 'group': 'Jk8bYJdN', 'permissions': {'read': ['group:__world__'], 'admin': ['acct:ErikStuchly@hypothes.is'], 'update': ['acct:ErikStuchly@hypothes.is'], 'delete': ['acct:ErikStuchly@hypothes.is']}, 'target': [{'source': 'https://osf.io/preprints/socarxiv/b5zae/'}], 'links': {'html'

In [346]:
sample.iloc[0].to_dict()

{'ann_id': '6612a686969a966757bf2ba58c689ac8d832b1da_1qjj8LsOEeqcGZewIHnrAg',
 'parent_doc_id': '6612a686969a966757bf2ba58c689ac8d832b1da',
 'document_uri': 'https://www.nature.com/articles/d41586-020-01918-0?utm_source=twt_nnc&utm_medium=social&utm_campaign=naturenews&sf235555174=1',
 'document': 'Four tools that help researchers working in collaborations to see the big picture',
 'tags': ['is:article',
  'lang:en',
  'Research',
  'Tools',
  'Project management',
  'Project-management tools',
  'Trello',
  'Team management',
  'Organisation',
  'Jira'],
 'created': '2020-06-30T20:18:19.066921+00:00',
 'updated': '2021-01-18T11:51:22.832844+00:00',
 'user': 'acct:Grace1999@hypothes.is',
 'text': 'Nowogrodzki. A., (2020). Four tools that help researchers working in collaborations to see the big picture.nature. Retrieved from: https://www.nature.com/articles/d41586-020-01918 utm_source=twt_nnc&utm_medium=social&utm_campaign=naturenews&sf235555174=1',
 'group': 'Jk8bYJdN',
 'permissions'

2. Search annotation with the given condition.

In [361]:
# Search user, document value for the condition - coming after "query"
res = es.search(index="hypothesis_v1", query={"terms": {"tags": ["is:preprint", "limitation"]}})
for doc in res['hits']['hits']:
    print("%s) %s" % (doc['_source']['user'], doc['_source']['document']))
res['hits']['hits']

acct:edampf@hypothes.is) Motivational effects on empathic choices
acct:Marlene_Wulf@hypothes.is) How good is good enough for COVID19 apps? The influence of benefits, accuracy, and privacy on willingness to adopt
acct:jackiekrauss@hypothes.is) Immunogenicity and efficacy of heterologous ChadOx1/BNT162b2 vaccination
acct:SIYANYE@hypothes.is) Wearing face masks strongly confuses counterparts in reading emotions
acct:ErikStuchly@hypothes.is) https://doi.org/10.1101/2020.07.23.20160762
acct:Marlene_Wulf@hypothes.is) Uncertainty Visualization
acct:Marlene_Wulf@hypothes.is) Supply and Demand in Disaggregated Keynesian Economies with an Application to the Covid-19 Crisis
acct:edampf@hypothes.is) Derivation of the effective reproduction number R for COVID-19 in relation to mobility restrictions and confinement
acct:Marlene_Wulf@hypothes.is) Do Students have the Means to Learn During the Coronavirus Pandemic? Student Demands for Distance Learning in a Suddenly Digital Landscape
acct:Marlene_Wulf

[{'_index': 'hypothesis_v1',
  '_id': '435eff83fea41211d052e171162287461336a7e1_OQTuJJ8zEeqMOzdbL4Zzzg',
  '_score': 1.0,
  '_source': {'parent_doc_id': '435eff83fea41211d052e171162287461336a7e1',
   'document_uri': 'https://psyarxiv.com/s7qph/',
   'document': 'Motivational effects on empathic choices',
   'tags': ['is:preprint',
    'lang:en',
    'empathy',
    'motivation',
    'effort',
    'choices',
    'psychology',
    'behavior',
    'decision making',
    'moral character'],
   'created': '2020-05-26T09:28:13.544937+00:00',
   'updated': '2020-06-19T08:30:32.814737+00:00',
   'user': 'acct:edampf@hypothes.is',
   'text': 'Ferguson, A. M., Cameron, D., & Inzlicht, M. (2020, May 15). Motivational effects on empathic choices. https://doi.org/10.31234/osf.io/s7qph',
   'group': 'Jk8bYJdN',
   'permissions': {'read': ['group:__world__'],
    'admin': ['acct:edampf@hypothes.is'],
    'update': ['acct:edampf@hypothes.is'],
    'delete': ['acct:edampf@hypothes.is']},
   'target': [{

### 4. Update, delete data and delete indices in ElasticSearch

In progress.

In [680]:
# Delete data from the search engine.
es.delete(index="hypothesis_v1", id="4011af8ea429e3c113c7328a721f6a2af2fd188f_L5lt6s5MEeqm_pesYHJVVQ")

ObjectApiResponse({'_index': 'hypothesis_v1', '_id': '4011af8ea429e3c113c7328a721f6a2af2fd188f_L5lt6s5MEeqm_pesYHJVVQ', '_version': 2, 'result': 'deleted', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 5000, '_primary_term': 1})

In [None]:
# Update data from the search engine
# es is equal to Elasticsearch
# es.update(index='hypothesis_v1',
#                 body={"doc": {"stanford": 1, "parsed_sents": parsed }})

### [Optional] Reviewing the existing data


In [212]:
knowledge_base_feed = pd.read_json('hypothesis_documents_v2.jsonl', orient='records', lines=True)

In [213]:
knowledge_base_feed.head(5)

Unnamed: 0,ann_id,parent_doc_id,document_uri,document,tags,created,updated,user,text,group,permissions,target,links,user_info,flagged,hidden
0,aa6c8c407d3e5b746cc4ea9889baf7a28eafb181_rnlcI...,aa6c8c407d3e5b746cc4ea9889baf7a28eafb181,https://www.youtube.com/watch?v=QapwrR9C3Z4,The impact of Covid-19 on media – rise of info...,[has:context],2020-10-29T22:18:33.169969+00:00,2020-10-29T22:18:33.169969+00:00,acct:amyhcurtis@hypothes.is,https://twitter.com/i/web/status/1306171100544...,Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=Q...,{'html': 'https://hypothes.is/a/rnlcIho0EeuKgF...,{'display_name': None},False,False
1,b0204dd848c1f25b0506d61889bff67127024f7f_uAs1r...,b0204dd848c1f25b0506d61889bff67127024f7f,https://www.youtube.com/watch?v=wGWqBtDkOFs,Online Research Tools and Techniques,"[is:webinar, is:youtube, online, research, eth...",2020-10-29T22:11:39.622491+00:00,2020-10-29T22:11:39.622491+00:00,acct:amyhcurtis@hypothes.is,"Online Research Tools and Techniques. (2020, ...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=w...,{'html': 'https://hypothes.is/a/uAs1rhozEeuVV7...,{'display_name': None},False,False
2,28f2841bd9c943268ae8e3a3abb6b54ce52f994d_5XkVb...,28f2841bd9c943268ae8e3a3abb6b54ce52f994d,https://www.youtube.com/watch?v=97iJIwBQ5qE,COVID-19: The 9/11 Moment for Global Public He...,"[is:youtube, is:interview, COVID-19, public he...",2020-10-29T21:44:17.929432+00:00,2020-10-29T21:44:17.929432+00:00,acct:amyhcurtis@hypothes.is,COVID-19: The 9/11 Moment for Global Public He...,Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=9...,{'html': 'https://hypothes.is/a/5XkVbhovEeueTa...,{'display_name': None},False,False
3,b371d7336559881079f7219ccf8a3eb487a061d4_jUCjB...,b371d7336559881079f7219ccf8a3eb487a061d4,http://www.youtube.com/playlist?list=PLOA0aRJ9...,ORWG virtual meeting 08/09/2020,"[is:youtube, webinar, lang:en, poster, publish...",2020-10-29T21:34:40.377021+00:00,2020-10-29T21:34:40.377021+00:00,acct:amyhcurtis@hypothes.is,ORWG Virtual Meeting 08/09/2020 https://www.yo...,Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'http://www.youtube.com/playlist?l...,{'html': 'https://hypothes.is/a/jUCjBBouEeuQFC...,{'display_name': None},False,False
4,319a613babccf3c205ddc5dd0d07d4e49cd8a11f_X8R4i...,319a613babccf3c205ddc5dd0d07d4e49cd8a11f,https://www.youtube.com/watch?v=zTrIl52jV0s,"Long covid: diagnosis, management, prognosis","[is:webinar, COVID-19, long, symptom, syndrome...",2020-10-29T21:19:05.041279+00:00,2020-10-29T21:19:05.041279+00:00,acct:amyhcurtis@hypothes.is,"Long covid: Diagnosis, management, prognosis. ...",Jk8bYJdN,"{'read': ['group:__world__'], 'admin': ['acct:...",[{'source': 'https://www.youtube.com/watch?v=z...,{'html': 'https://hypothes.is/a/X8R4ihosEeuTB8...,{'display_name': None},False,False


In [120]:
interesting_tags = [tag for tag in knowledge_base_feed['tags'].sum() if "is:" in tag]


In [121]:
Counter(interesting_tags).most_common(10)

[('is:article', 735),
 ('is:preprint', 708),
 ('is:news', 527),
 ('is:webpage', 457),
 ('is:twitter', 225),
 ('is:blog', 110),
 ('is:tweet', 84),
 ('is:other', 58),
 ('is:youtube', 33),
 ('is:report', 27)]

In [122]:
interesting_tags = [tag for tag in knowledge_base_feed['tags'].sum() if "has:" in tag]
Counter(interesting_tags).most_common(10)


[('has:context', 2395),
 ('has:date', 1221),
 ('has:date-approx', 138),
 ('has:problem', 36),
 ('has:update', 6),
 ('has:supplement', 1),
 ('has:contet', 1),
 ('has:', 1),
 ('has:summary', 1),
 ('has:passed', 1)]

In [123]:
interesting_tags = [tag for tag in knowledge_base_feed['tags'].sum() if "ann:" in tag]
Counter(interesting_tags).most_common(10)


[('ann:doi', 306),
 ('ann:summary', 195),
 ('ann:title', 163),
 ('ann:text', 66),
 ('ann:arxiv', 6),
 ('ann:abstract', 1)]

In [124]:
[tag for tag in list(set(knowledge_base_feed['tags'].sum())) if "date" in tag]

['has:date',
 'mask mandate',
 'vaccine candidate',
 'vaccine mandate',
 'mandate',
 'candidate vaccine',
 'has date',
 'candidate',
 'Mask mandates',
 'has:date-approx',
 'update',
 'updated probability',
 'has:update',
 'majority for vaccine mandate',
 'expiry date']

In [125]:
tags_to_check = ['has:update', 'has:date', 'has date', 'update', 'expiry date', 'has:date-approx']

tags_to_check = ['is:tweet', "is:twitter"]


def get_ann_by_tag(row, tag_values):
    for tag in tag_values:
        if tag in row['tags']:
            row['date-like'] = True
            #row['date'] = tag
        #else:
            #row['date-like'] = False
            #row['date'] = None
    return row

In [126]:
_ = knowledge_base_feed.apply(lambda row: get_ann_by_tag(row, tags_to_check), axis=1)

In [127]:
_[_['date-like'] == True][['text', 'tags']]

Unnamed: 0,text,tags
93,Mahan Ghafari | ماهان غفاری on Twitter. (n.d.)...,"[is:twitter, lang:en, COVID-19, infection, epi..."
98,"Dr Nisreen Alwan 🌻 [@Dr2NisreenAlwan]. (2021, ...","[is:twitter, lang:en, COVID-19, is:video, vide..."
99,"Shalin Naik [@shalinhnaik]. (2021, October 14)...","[is:twitter, lang:en, COVID-19, vaccine, podca..."
100,"Luka Mesin [@LukaMesin]. (2021, November 10). ...","[is:twitter, lang:en, COVID-19, Europe, vaccin..."
101,"Eric Feigl-Ding [@DrEricDing]. (2021, November...","[is:twitter, lang:en, COVID-19, is:video, is:a..."
...,...,...
6569,"Sandra Ciesek. (2021, December 8). Unsere erst...","[is:tweet, lang:de, COVID-19, Omicron, Delta v..."
6583,"ZDF Bayern. (2021, November 9). Die 7-Tage-Inz...","[is:tweet, lang:de, COVID-19, Germany, inciden..."
6601,"John Bye [@_johnbye]. (2021, October 6). The n...","[is:tweet, lang:en, COVID-19, sceptic, All Par..."
6609,"Dr Dan Goyal. (2022, March 15). What’s been ha...","[is:tweet, lang:en, COVID-19, policy, schism, ..."


There are 309 annotations which have date information.

### Creating queries to perform to document

In [256]:
from typing import List

queries = [
    {
        "description": "search by id",
        "query": {
            "bool": {
                "must": {
                    "match_phrase": {
                        "_id": "4011af8ea429e3c113c7328a721f6a2af2fd188f_L5lt6s5MEeqm_pesYHJVVQ",
                    }
                },
            },
        }
    },
    {
        "description": "search documents by list of values in tags",
        "query": {
            "bool": {
                "must": [
                    {
                        "terms": {
                            "tags": "<LIST_OF_VALUES>"
                        }
                    }
                ]
            }
        }
    },
    {
        "description": "search document by document type",
        "query": {
            "bool": {
                "must": [
                    {
                        "terms": {
                            "tags": ["is:<TYPE>"]
                        }
                    }
                ]
            }
        }
    },
    {
        "description": "search documents in date range (where date is in 'text' field when the tags contain 'has:date' or 'has:date-approx'",
        "query": {
            "range": {
                "text": {
                    "gte": "<BEGIN_DATE>",
                    "lte": "<END_DATE>"
                }
            }
        }

    }
]

# One example of queries
_queries = [
    {
        "query": {
            "bool": {
                "must": [
                    {
                        "range": {
                            "text": {
                                "gte": "2022-01-01",
                                "lte": "2022-12-31"
                            }
                        }
                    },
                    {
                        "terms": {
                            "tags": ["tag1", "tag2", "tag3"]
                        }
                    }
                ]
            }
        }
    }

]

In [765]:
# Definining subqueries
query_text = {
    "match": {
        "text": {
            "query": "input_text", 
            "operator": "and"
        } 
    }
}
# Is it 'match' or 'match_phrase' or something else?

query_date_range = {
    
    "range": {
        "date": {    
            "gte": "begin_date",
            "lte": "end_date"
        }
    }
    
}
# Is it 'text' or something else? 

query_type = {
    "term" : { "tags" : "TYPE" }
}

query_ann_type = {   
    "term" : { "tags" : "ANN_TYPE" }
}

query_has_property = {
    "term" : { "tags" : "HAS_PROPERTY" }
}

temp_keywords = ["str1", "str2", "str3"]
# query_keywords = [{"term": {"tags": keyword}} for keyword in temp_keywords] -> it is already inside the function search_documents


In [823]:
def search_documents(text: str = None , date_range: List[str] = None , type_: str = None, ann_type = None, has_property: str = None,
                     keywords: List[str] = None) -> list:
    """
    Building a query for elastic search consisting of six subqueries, depending on what's provided

    :param text: free text to be search in the fields 'text' or 'document'
    :param date_range: a list with two values [begin_date, end_date]. When provided, we need to filter documents that have date and whose date is in the range
    :param type_: relates to documents which have a tag "is:<TYPE>", where "<TYPE>" is the type provided
    :param ann_type: relates to documents which have a tag "ann:<ANN_TYPE>", where "<ANN_TYPE>" is the ann_type provided
    :param has_property: relates to documents which have a tag "has:<HAS_PROPERTY>", where "<HAS_PROPERTY>" is the has_property provided
    :param keywords: list of terms expected to be found in the "tags" field ONLY and they appear without any specification (not "is:", "has:, "ann:")
    :return: a list of results with all relevant documents for that query
    """
    
    query = {
        "query": {
            "bool": {
                "filter": [], 
                "must": []
            }
        }
    }

    if text:
        query_text['match']['text']['query'] = text
        query['query']['bool']['filter'].append(query_text)

        
    if date_range:
        
        if len(date_range) > 1:
            query_date_range['range']['date']['gte'] = date_range[0]
            query_date_range['range']['date']['lte'] = date_range[1]
            query['query']['bool']['filter'].append(query_date_range)
            
        elif len(date_range) == 1: 
            query_date_range['range']['date']['gte'] = date_range[0] # only one date -> putting it to a starting date.
            query_date_range['range']['date']['lte'] = '3000-01-01'
            query['query']['bool']['filter'].append(query_date_range)
            
            print(query)
            
        else: 
            pass
    
        
    if type_:
        query_type['term']['tags'] = "is:"+ type_
        query['query']['bool']['filter'].append(query_type)
        
    if ann_type: 
        query_ann_type['term']['tags'] = "ann:"+ann_type
        query['query']['bool']['filter'].append(query_ann_type)
    
    if has_property: 
        query_has_property['term']['tags'] = "has:"+has_property
        query['query']['bool']['filter'].append(query_has_property)
        
    if keywords: 
        temp_keywords = keywords
        query_keywords = [{"term": {"tags": keyword}} for keyword in temp_keywords]
        query['query']['bool']['must'].extend(query_keywords)
    
    res = es.search(index="hypothesis_v1", query = query['query'])
    
    for doc in res['hits']['hits']:
         print("%s) %s" % (doc['_source']['user'], doc['_source']['document']))
    

    return res['hits']['hits']



### Test examples of elasticsearch

In [585]:
df3.sample(n = 1).iloc[0].to_dict()

{'ann_id': '498f05aedc003c1f67f103eae99a632a21ddb445_GdkZ_qmWEeqLPcsGTFzJ1w',
 'parent_doc_id': '498f05aedc003c1f67f103eae99a632a21ddb445',
 'document_uri': 'https://ispmbern.github.io/covid-19/living-review/',
 'document': 'covid-19',
 'tags': [],
 'created': '2020-06-08T14:41:13.068839+00:00',
 'updated': '2020-06-08T14:41:13.068839+00:00',
 'user': 'acct:Marlene_Wulf@hypothes.is',
 'text': '2020-05-26',
 'group': 'Jk8bYJdN',
 'permissions': {'read': ['group:__world__'],
  'admin': ['acct:Marlene_Wulf@hypothes.is'],
  'update': ['acct:Marlene_Wulf@hypothes.is'],
  'delete': ['acct:Marlene_Wulf@hypothes.is']},
 'target': [{'source': 'https://ispmbern.github.io/covid-19/living-review/'}],
 'links': {'html': 'https://hypothes.is/a/GdkZ_qmWEeqLPcsGTFzJ1w',
  'incontext': 'https://hyp.is/GdkZ_qmWEeqLPcsGTFzJ1w/ispmbern.github.io/covid-19/living-review/',
  'json': 'https://hypothes.is/api/annotations/GdkZ_qmWEeqLPcsGTFzJ1w'},
 'user_info': {'display_name': None},
 'flagged': 'False',
 'hi

In [575]:
# Search 'term' in 'text' field
search_documents(text = 'Search')

acct:zoe_ikeotuonye@hypothes.is) r/BehSciAsk - Workshop hackathon: ReSearch Engine: Search Engine for SciBeh’s knowledge base & beyond
acct:Marlene_Wulf@hypothes.is) Rivals in the dark: How competition influences search in decisions under uncertainty
acct:Marlene_Wulf@hypothes.is) r/BehSciResearch - How do people search for, avoid and share information during COVID-19?
acct:Marlene_Wulf@hypothes.is) Vaughan Bell en Twitter: "A brief guide for psychologists wanting to find research on the role of psychology relevant to COVID-19.

You need to search for studies in the same way you search for studies normally. However, some pointers to sites and key words might be useful..." / Twitter
acct:Marlene_Wulf@hypothes.is) Shalin Naik on Twitter
acct:Marlene_Wulf@hypothes.is) How search engines disseminate information about COVID-19 and why they should do better
acct:zoe_ikeotuonye@hypothes.is) Kolina Koltai, PhD on Twitter


[{'_index': 'hypothesis_v1',
  '_id': 'b2e9f1ea0f584be34c8a731b4531a812bda1b0be_peiW1n3LEeuUzQ9wYGfE_Q',
  '_score': 0.0,
  '_source': {'parent_doc_id': 'b2e9f1ea0f584be34c8a731b4531a812bda1b0be',
   'document_uri': 'https://www.reddit.com/r/BehSciAsk/comments/jkz7jx/workshop_hackathon_research_engine_search_engine/',
   'document': 'r/BehSciAsk - Workshop hackathon: ReSearch Engine: Search Engine for SciBeh’s knowledge base & beyond',
   'tags': ['is:blog',
    'COVID-19',
    'resource',
    'policymakers',
    'research',
    'knowledge base',
    'academia',
    'lang:en'],
   'created': '2021-03-05T15:58:37.969234+00:00',
   'updated': '2021-03-05T16:08:21.420160+00:00',
   'user': 'acct:zoe_ikeotuonye@hypothes.is',
   'text': 'r/BehSciAsk - Workshop hackathon: ReSearch Engine: Search Engine for SciBeh’s knowledge base & beyond. (n.d.). Reddit. Retrieved 5 March 2021, from https://www.reddit.com/r/BehSciAsk/comments/jkz7jx/workshop_hackathon_research_engine_search_engine/\n',
   '

In [636]:
# Search document with a given 'tag'. 
search_documents(keywords = ['COVID-19', 'vaccination'])

acct:zoe_ikeotuonye@hypothes.is) ‘I’m sorry, but it’s too late’: Alabama doctor tells unvaccinated, dying COVID patients
acct:SIYANYE@hypothes.is) Modest International Law: COVID-19, International Legal Responses, and Depoliticization
acct:SIYANYE@hypothes.is) The explosion of new coronavirus tests that could help to end the pandemic
acct:Marlene_Wulf@hypothes.is) How good is good enough for COVID19 apps? The influence of benefits, accuracy, and privacy on willingness to adopt
acct:jackiekrauss@hypothes.is) Immunogenicity and efficacy of heterologous ChadOx1/BNT162b2 vaccination
acct:zoe_ikeotuonye@hypothes.is) Congressman Matt Gaetz Unveils Legislation to Ban Federal Support for Vaccine Passports
acct:SIYANYE@hypothes.is) Wearing face masks strongly confuses counterparts in reading emotions
acct:Grace1999@hypothes.is) 85 kids, counselors infected with coronavirus in YMCA camp outbreak, GA officials say
acct:zoe_ikeotuonye@hypothes.is) The Latest: More cases in virus cluster in souther

[{'_index': 'hypothesis_v1',
  '_id': 'a3be7436686d3bb6fd1f1e9ab1dc036e9fd6aa43_KL5N5PU7EeuZZqfcWLEJlA',
  '_score': 1.0,
  '_source': {'parent_doc_id': 'a3be7436686d3bb6fd1f1e9ab1dc036e9fd6aa43',
   'document_uri': 'https://www.al.com/news/2021/07/im-sorry-but-its-too-late-alabama-doctor-on-treating-unvaccinated-dying-covid-patients.html',
   'document': '‘I’m sorry, but it’s too late’: Alabama doctor tells unvaccinated, dying COVID patients',
   'tags': ['is:webpage',
    'lang:en',
    'COVID-19',
    'vaccine',
    'vaccination',
    'death',
    'patient',
    'healthcare'],
   'created': '2021-08-04T15:46:39.989240+00:00',
   'updated': '2021-08-04T15:46:39.989240+00:00',
   'user': 'acct:zoe_ikeotuonye@hypothes.is',
   'text': '‘I’m sorry, but it’s too late’: Alabama doctor tells unvaccinated, dying COVID patients. (2021, July 21). Al. https://www.al.com/news/2021/07/im-sorry-but-its-too-late-alabama-doctor-on-treating-unvaccinated-dying-covid-patients.html\n',
   'group': 'Jk8b

In [824]:
# Search document with a date range - Not working. 
search_documents(date_range = ['2020'])

{'query': {'bool': {'filter': [{'range': {'date': {'gte': '2020', 'lte': '3000-01-01'}}}], 'must': []}}}
acct:Lu17Cheryl@hypothes.is) Myths about COVID-19 vaccination - HackMD
acct:amyhcurtis@hypothes.is) EMEs and COVID-19: Shutting Down in a World of Informal and Tiny Firms
acct:tadedvorak@hypothes.is) Face Masks Considerably Reduce COVID-19 Cases in Germany: A Synthetic Control Method Approach
acct:cheyennechooi@hypothes.is) What Lies Beneath: Tackling Vaccine Hesitancy
acct:amyhcurtis@hypothes.is) A Model of Asset Price Spirals and Aggregate Demand Amplification of a "Covid-19" Shock
acct:NatasjaDerbyMcCabe@hypothes.is) CoVaxxy
acct:katietaylor_99@hypothes.is) Data Gaps and the Policy Response to the Novel Coronavirus
acct:ErikStuchly@hypothes.is) BBC Radio 4 - The Political School, Episode 1
acct:amyhcurtis@hypothes.is) The Impact of COVID-19 on Student Experiences and Expectations: Evidence from a Survey
acct:amyhcurtis@hypothes.is) Pandemic Shocks and Fiscal-Monetary Policies in 

[{'_index': 'hypothesis_v1',
  '_id': 'af92dc8b0576b8bb72803880146f48328c466101_yB8aUHLfEeu0Ju90gjQpuA',
  '_score': 0.0,
  '_source': {'parent_doc_id': 'af92dc8b0576b8bb72803880146f48328c466101',
   'document_uri': 'https://hackmd.io/ovEzSQWcRp2bctQn8MYElQ',
   'document': 'Myths about COVID-19 vaccination - HackMD',
   'tags': ['has:date-approx'],
   'created': '2021-02-19T18:25:02.540845+00:00',
   'updated': '2021-02-19T18:25:02.540845+00:00',
   'user': 'acct:Lu17Cheryl@hypothes.is',
   'text': '2020-02-17',
   'date': '2020-02-17',
   'group': 'Jk8bYJdN',
   'permissions': {'read': ['group:__world__'],
    'admin': ['acct:Lu17Cheryl@hypothes.is'],
    'update': ['acct:Lu17Cheryl@hypothes.is'],
    'delete': ['acct:Lu17Cheryl@hypothes.is']},
   'target': [{'source': 'https://hackmd.io/ovEzSQWcRp2bctQn8MYElQ'}],
   'links': {'html': 'https://hypothes.is/a/yB8aUHLfEeu0Ju90gjQpuA',
    'incontext': 'https://hyp.is/yB8aUHLfEeu0Ju90gjQpuA/hackmd.io/ovEzSQWcRp2bctQn8MYElQ',
    'json': 

In [607]:
# Search document by type
search_documents(type_ = "twitter")

acct:Marlene_Wulf@hypothes.is) (1) Politics&LifeSciences on Twitter: "Pleased to announce our newest Research Tool Report on First View: "https://t.co/MDkOCaFmQN: A tool for biopolitical researchers, policymakers, &amp; citizens" by Glass &amp; Balachandran | @PsychTable @CUP_PoliSci @glenngeher @ml_fisher #evolution #research @tjw51 https://t.co/IMh6xIxr9T" / Twitter
acct:Marlene_Wulf@hypothes.is) Whitney R. Robinson on Twitter: "1/ An #EpiTwitter 🧵 about theory... https://t.co/rSjfkHG21r" / Twitter
acct:NatasjaDerbyMcCabe@hypothes.is) JoHo on Twitter
acct:zoe_ikeotuonye@hypothes.is) dbRaevn on Twitter
acct:NatasjaDerbyMcCabe@hypothes.is) Marino van Zelst🌱 on Twitter
acct:Marlene_Wulf@hypothes.is) Alex Holcombe en Twitter: "Many journals are fast-tracking relevant articles. But with cases growing by 30%+ a day in many places, better model is @Meta_Psy or @F1000Research where reviews appear as soon as they are written, rather than waiting for the slowest reviewer. @Meta_Psy uses @hypot

[{'_index': 'hypothesis_v1',
  '_id': '16f2accd3eaaf1c865f6ebebe768200b1d0a0709_xoronqpKEeqSK9uFG3LdiA',
  '_score': 0.0,
  '_source': {'parent_doc_id': '16f2accd3eaaf1c865f6ebebe768200b1d0a0709',
   'document_uri': 'https://twitter.com/PLSJournal/status/1270038273965150212',
   'document': '(1) Politics&LifeSciences on Twitter: "Pleased to announce our newest Research Tool Report on First View: "https://t.co/MDkOCaFmQN: A tool for biopolitical researchers, policymakers, &amp; citizens" by Glass &amp; Balachandran | @PsychTable @CUP_PoliSci @glenngeher @ml_fisher #evolution #research @tjw51 https://t.co/IMh6xIxr9T" / Twitter',
   'tags': ['is:twitter',
    'lang:en',
    'research tool',
    'report',
    'first view',
    'tool',
    'biopolitical reasearcher',
    'policy maker',
    'citizen'],
   'created': '2020-06-09T12:14:32.291773+00:00',
   'updated': '2020-06-09T12:14:32.291773+00:00',
   'user': 'acct:Marlene_Wulf@hypothes.is',
   'text': '(1) Politics&LifeSciences on Twitte

In [650]:
# Search document if they contains all the tags input
search_documents(keywords = ['COVID-19', 'pandemic', 'is:news','testing', 'tracking'])

acct:chaeyeonlim@hypothes.is) We’re About to Lose Track of the Pandemic


[{'_index': 'hypothesis_v1',
  '_id': 'a6a594acbfa3f4902a8f01934d2704e7b12ca04f_xw9VLGQUEeyYZuNtUs4cWQ',
  '_score': 25.02397,
  '_source': {'parent_doc_id': 'a6a594acbfa3f4902a8f01934d2704e7b12ca04f',
   'document_uri': 'https://www.theatlantic.com/ideas/archive/2021/12/were-about-to-lose-track-of-the-pandemic/621097/',
   'document': 'We’re About to Lose Track of the Pandemic',
   'tags': ['is:news',
    'lang:en',
    'COVID-19',
    'variant',
    'omicron',
    'pandemic',
    'transmission',
    'infection',
    'health crisis',
    'public health',
    'PCR test',
    'testing',
    'data reporting',
    'hospitalization',
    'US',
    'death rate',
    'hospitalization rate',
    'data tracker',
    'tracking',
    'holiday',
    'case',
    'new case',
    'winter'],
   'created': '2021-12-23T17:21:34.292161+00:00',
   'updated': '2022-08-26T14:29:46.520195+00:00',
   'user': 'acct:chaeyeonlim@hypothes.is',
   'text': 'Kissane, E. (2021, December 23). We’re About to Lose Trac

**Problem** This returns even though only one of the tags exist in the annotation.

In [538]:
# search_documents(keywords = ['vaccination', 'COVID-19'])

acct:zoe_ikeotuonye@hypothes.is) ‘I’m sorry, but it’s too late’: Alabama doctor tells unvaccinated, dying COVID patients
acct:SIYANYE@hypothes.is) Modest International Law: COVID-19, International Legal Responses, and Depoliticization
acct:SIYANYE@hypothes.is) The explosion of new coronavirus tests that could help to end the pandemic
acct:Marlene_Wulf@hypothes.is) How good is good enough for COVID19 apps? The influence of benefits, accuracy, and privacy on willingness to adopt
acct:jackiekrauss@hypothes.is) Immunogenicity and efficacy of heterologous ChadOx1/BNT162b2 vaccination
acct:zoe_ikeotuonye@hypothes.is) Congressman Matt Gaetz Unveils Legislation to Ban Federal Support for Vaccine Passports
acct:SIYANYE@hypothes.is) Wearing face masks strongly confuses counterparts in reading emotions
acct:Grace1999@hypothes.is) 85 kids, counselors infected with coronavirus in YMCA camp outbreak, GA officials say
acct:zoe_ikeotuonye@hypothes.is) The Latest: More cases in virus cluster in souther

[{'_index': 'hypothesis_v1',
  '_id': 'a3be7436686d3bb6fd1f1e9ab1dc036e9fd6aa43_KL5N5PU7EeuZZqfcWLEJlA',
  '_score': 1.0,
  '_source': {'parent_doc_id': 'a3be7436686d3bb6fd1f1e9ab1dc036e9fd6aa43',
   'document_uri': 'https://www.al.com/news/2021/07/im-sorry-but-its-too-late-alabama-doctor-on-treating-unvaccinated-dying-covid-patients.html',
   'document': '‘I’m sorry, but it’s too late’: Alabama doctor tells unvaccinated, dying COVID patients',
   'tags': ['is:webpage',
    'lang:en',
    'COVID-19',
    'vaccine',
    'vaccination',
    'death',
    'patient',
    'healthcare'],
   'created': '2021-08-04T15:46:39.989240+00:00',
   'updated': '2021-08-04T15:46:39.989240+00:00',
   'user': 'acct:zoe_ikeotuonye@hypothes.is',
   'text': '‘I’m sorry, but it’s too late’: Alabama doctor tells unvaccinated, dying COVID patients. (2021, July 21). Al. https://www.al.com/news/2021/07/im-sorry-but-its-too-late-alabama-doctor-on-treating-unvaccinated-dying-covid-patients.html\n',
   'group': 'Jk8b