# Use tweets for evaluating the performances of an IR system

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import pymongo

## Create dataset and ground truth

In [3]:
db = pymongo.MongoClient()['twitter']['tweets']

In [4]:
tweets = list(db.find())

In [5]:
corpus = dict([(tweet['id'], tweet['text']) for tweet in tweets])

In [7]:
list(corpus.items())[:3]

[('1463847474049789952',
  'In case you missed it: Daily Covid-19 caseloads per 100,000 citizens in the UK now rank behind more than 10 European countries, including Belgium and Greece https://t.co/phSyAbo6Ux https://t.co/Swy51Zuf0B'),
 ('1463839920204787716',
  'English football is set for a radical makeover that would include powers to block change of ownership of clubs, a veto for supporters over key decisions and a redistributive levy on Premier League player transfers, under proposals by a review of the sport https://t.co/f2Xr75Jdhq'),
 ('1463836848917204998',
  'Good Morning New York. Here are the Top Stories Today.\nhttps://t.co/zHRPaGT1ib https://t.co/qvKD7BflG7')]

In [8]:
entities, domains = {}, {}
metadata = []
for i, tweet in enumerate(tweets):
    if 'context_annotations' in tweet.keys():
        for annotation in tweet['context_annotations']:
            domain, entity = annotation['domain'], annotation['entity']
            if domain['id'] not in domains:
                domains[domain['id']] = domain
            if entity['id'] not in entities:
                entities[entity['id']] = entity
            metadata.append({
                'entity': entity['id'], 'domain': domain['id'], 'tweet': tweet['id']
            })
M = pd.DataFrame(metadata)

In [9]:
E = pd.DataFrame(entities).T
D = pd.DataFrame(domains).T

In [10]:
E.head(2)

Unnamed: 0,id,name,description
1220701888179359745,1220701888179359745,COVID-19,
10000277815,10000277815,English Premier League Soccer,Action from English Premier League Soccer matc...


In [11]:
D.head(2)

Unnamed: 0,id,name,description
123,123,Ongoing News Story,Ongoing News Stories like 'Brexit'
3,3,TV Shows,Television shows from around the world


In [12]:
M.head(2)

Unnamed: 0,entity,domain,tweet
0,1220701888179359745,123,1463847474049789952
1,10000277815,3,1463839920204787716


## Expected results

In [13]:
entity_queries = list(set(E['name'].values))
domain_queries = list(set(D['name'].values))
indexed_tweets = set([int(x) for x in list(E.id.values) + list(D.id.values)])

In [14]:
def get_entity_results(query):
    etweets = set()
    entity_ids = E[E['name']==query].id.values
    for eid in entity_ids:
        etweets = etweets.union(set(M[M.entity==eid].tweet.values))
    return etweets

def get_domain_results(query):
    etweets = set()
    domain_ids = D[D['name']==query].id.values
    for eid in domain_ids:
        etweets = etweets.union(set(M[M.domain==eid].tweet.values))
    return etweets

In [15]:
len(get_domain_results('TV Shows'))

2042

In [16]:
list(get_domain_results('TV Shows'))[:3]

['1483393732279881729', '1465496068527960070', '1468504535249612800']

In [17]:
search_base = [(x, corpus[x]) for x in set(M.tweet.values)]

In [18]:
len(search_base)

22350

In [19]:
esize = M.groupby('entity').count()
esize['query'] = [E.loc[x]['name'] for x in esize.index.values]
dsize = pd.DataFrame(M.groupby('domain').tweet.nunique(), columns=['tweet'])
dsize['query'] = [D.loc[x]['name'] for x in dsize.index.values]

In [23]:
esize.sort_values('tweet', ascending=False).head(20)

Unnamed: 0_level_0,domain,tweet,query
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
781974596148793345,4427,4427,Business & finance
1220701888179359745,3985,3985,COVID-19
1066114840832356353,3074,3074,The Telegraph
1113138554496942080,3071,3071,The Daily Telegraph
1113097508572426242,2983,2983,Daily Mirror
1066122339568386048,2819,2819,The Guardian
857212166100754432,1964,1964,Boris Johnson
825047692124442624,1661,1661,Food
864931126132985856,1198,1198,Vladimir Putin
826817907946450944,714,714,Food Blogs


In [24]:
dsize.sort_values('tweet', ascending=False).head(20)

Unnamed: 0_level_0,tweet,query
domain,Unnamed: 1_level_1,Unnamed: 2_level_1
47,10721,Brand
10,6368,Person
65,5525,Interests and Hobbies Vertical
123,3980,Ongoing News Story
45,3331,Brand Vertical
35,2961,Politician
66,2681,Interests and Hobbies Category
67,2573,Interests and Hobbies
3,2042,TV Shows
46,1988,Brand Category


### Example

In [26]:
query_answers = get_entity_results('COVID-19')
y_true = [1 if x in query_answers else 0 for x, _ in search_base]

In [27]:
index_true = [i for i, x in enumerate(y_true) if x == 1]
for i in index_true[:5]:
    print(search_base[i])

('1466659258687229957', 'Booster jabs “massively” strengthen the body’s defences against Covid https://t.co/ctWOFrLUdK')
('1476481949606850560', 'Hospitals urged to discharge as many patients as possible to make space for Covid admissions https://t.co/vDoDOvpXJq https://t.co/cXAGSUnAYA')
('1481762738548486147', "Downing Street staff held a 'lockdown-breaking party' on the eve of Prince Philip's funeral https://t.co/btt23FHRfT")
('1476373047930146818', 'Up to three Covid jabs a year could be needed for protection, data suggests https://t.co/rl7XFIxUz1')
('1471452866120335361', 'Club DJ, 47, had bizarre symptom before testing positive for Covid\n\nhttps://t.co/dVE4VECWRJ https://t.co/xd4fSO8Iqq')


In [None]:
len(index_true)

## The naive information system
- Create the index as BOW and Inverted index
- Compute pseudo cosine similarity
- Transform the query
- Perform evaluation

In [28]:
from nltk.tokenize import TweetTokenizer
from collections import defaultdict, Counter

In [29]:
from nltk.corpus import stopwords
from string import punctuation

In [30]:
stopw = set(stopwords.words('english'))
tokenizer = TweetTokenizer()
tokenize = lambda text: [word for word in tokenizer.tokenize(text.lower()) 
                         if word not in punctuation and word not in stopw and not word.startswith('http')] 

In [31]:
BOW = defaultdict(lambda: defaultdict(lambda: 0))
I = defaultdict(set)

In [32]:
for i, text in search_base:
    for token in tokenize(text):
        BOW[i][token] += 1
        I[token].add(i)

### TFIDF from scratch

In [34]:
N = len(search_base)
IDF = dict([(word, np.log(N / len(posting))) for word, posting in I.items()])
TFIDF = {}
for document, bow in BOW.items():
    size = sum(bow.values())
    tfidf_bow = dict([(word, (tf / size) * IDF[word]) for word, tf in bow.items()])
    TFIDF[document] = tfidf_bow

In [35]:
list(TFIDF.items())[0]

('1466659258687229957',
 {'booster': 0.4767557527881229,
  'jabs': 0.5202397244669298,
  '“': 0.3411993707852379,
  'massively': 0.9321434419481771,
  '”': 0.34721096395349105,
  'strengthen': 0.822282213081366,
  'body': 0.5780475095444456,
  '’': 0.1786938892137286,
  'defences': 0.7711996507047669,
  'covid': 0.24186916823231766})

### Search

In [36]:
def query(query_text):
    counts = Counter(tokenize(query_text)).most_common()
    qtfidf = [(w, tf * IDF[w]) for w, tf in counts if w in IDF.keys()]
    return qtfidf

In [37]:
query('COVID-19')

[('covid', 2.4186916823231766), ('19', 4.65328943433229)]

### Pseudo cosine similarity
$$
\cos(\theta) = \frac{{\textbf x} \cdot {\textbf y}}{{\mid\mid {\textbf x} \mid\mid \mid\mid {\textbf y} \mid\mid}} = 
\frac{\sum\limits_{i=1}^{N} a_i b_i}{\sqrt{\sum\limits_{i=1}^{N} a_{i}^{2}} \sqrt{\sum\limits_{i=1}^{N} b_{i}^{2}}}
$$

In [57]:
def cos(bow1, bow2):
    sq1 = np.sqrt(sum([np.power(x, 2) for x in bow1.values()]))
    sq2 = np.sqrt(sum([np.power(x, 2) for x in bow2.values()]))
    k = 0
    for word, score in bow1.items():
        try:
            k += score * bow2[word]
        except KeyError:
            pass
    score = k / (sq1 * sq2)
    if pd.isna(score):
        score = 0
    return score

In [58]:
list(TFIDF.keys())[:4]

['1466659258687229957',
 '1476481949606850560',
 '1467801537023983618',
 '1489548886419156993']

In [59]:
print(TFIDF['1466323790753845253'])
print(TFIDF['1474394435656065042'])

{'prime': 0.481057491296492, 'minister': 0.4348154911929283, 'criticises': 0.8068671450986402, 'facebook': 0.5632554965367834, 'social': 0.5235458106930185, 'media': 0.548198210688846, 'companies': 0.5429614121371144, 'migrant': 0.637699544031533, 'trafficking': 0.6718744734037386, 'adverts': 0.8915969311373605}
{'🥂': 0.2711336673421811, 'winston': 0.28761191327011626, 'churchill': 0.26525232680044064, 'hailed': 0.22641449440282982, 'pint': 0.24877408087250544, 'bottle': 0.2280691168024282, 'champagne': 0.21921631532817787, '“': 0.22012862631305666, 'ideal': 0.26525232680044064, 'size': 0.23792013775569212, '”': 0.2240070735183813, 'declaring': 0.27833184641683306, 'enough': 0.1751488426248756, 'two': 0.13143259184907366, 'lunch': 0.1962179344295932, 'one': 0.11762562624810395, 'dinner': 0.17231460534128415, 'sale': 0.1855452168709806, 'wartime': 0.32305101935618435, 'prime': 0.15517983590209417, 'minister': 0.14026306167513813, '’': 0.05764319006894471, 'favoured': 0.28761191327011626

In [60]:
cos(TFIDF['1466323790753845253'], TFIDF['1474394435656065042'])

0.057594817449885274

### Search

In [61]:
def search(query_text):
    q = dict(query(query_text))
    outcomes = []
    for doc_id, text in search_base:
        outcomes.append((doc_id, cos(q, BOW[doc_id])))
    return outcomes

In [62]:
out = search('COVID-19')

  score = k / (sq1 * sq2)


In [63]:
out[:4]

[('1466659258687229957', 0.14584418522526704),
 ('1476481949606850560', 0.14584418522526704),
 ('1467801537023983618', 0.0),
 ('1489548886419156993', 0.0)]

### Output selection or ranking?

In [64]:
ground_truth = get_entity_results('COVID-19')

In [78]:
TP, FP, FN, TN, answers = 0, 0, 0, 0, []
threshold = np.percentile(np.array([y for x, y in out]), 97)
for doc_id, score in out:
    if score >= threshold:
        answers.append((doc_id, score))
        if doc_id in ground_truth:
            TP += 1
        else:
            FP += 1
    else:
        if doc_id in ground_truth:
            FN += 1
        else:
            TN += 1

In [79]:
print('precision', TP / (TP + FP))
print('recall', TP / (TP + FN))

precision 0.9630484988452656
recall 0.20954773869346735


In [80]:
print(TP, FP, FN, TN)

834 32 3146 18338


In [81]:
print((TP + TN) / (TP + TN + FP + FN))

0.8578076062639821


In [83]:
for i, (doc_id, score) in enumerate(answers):
    if not doc_id in ground_truth:
        print(doc_id, score)
        print(corpus[doc_id])

1484116996719472640 0.29576543565613694
#homesunderthehammer Martin Roberts breaks silence on future on show after 19 years
https://t.co/PXr7NtTrx3
1499316200693764098 0.17401312244502845
In case you missed it: Roman Abramovich has put Chelsea football club up for sale after 19 years of ownership, waiving £1.5bn it owes him and pledging to donate proceeds of the sale to victims of the war in Ukraine https://t.co/UJym4fpC2U
1485937183244656653 0.29576543565613694
19 injured including three kids as London bus smashes into shop https://t.co/z9jK5ATVBW https://t.co/FSZjJiYer1
1497530548771766273 0.16476678778574827
🔴The MoD have said that the bulk of Russian forces involved in the advance on Ukraine's capital Kyiv were now 30 km (19 miles) from the city centre

Track the key verified areas hit in Ukraine by the Russian military here ⬇️
https://t.co/5C3Liu3ljR https://t.co/Qd5whiLiSs
1496462367642824709 0.25614038083958635
Married ex-Amazon delivery driver who stalked Emma Raducanu, 19, is 

In [84]:
A = set([x for x, y in answers])
c = 0
for doc_id in ground_truth:
    if not doc_id in A:
        print(corpus[doc_id])
        c += 1
    if c > 10:
        break

Officials are drawing up plans for the introduction of vaccine passports to slow the spread of Omicron https://t.co/OHCZk6Ti5z
RT @DailyMailUK: Spain bans unvaccinated Brit travellers aged 12 and over amid Omicron fears https://t.co/andLMWDAWe
Industry leaders fear new Covid isolation rules could spark staffing crisis https://t.co/ZHpQtbCYOO
OECD warns Omicron may intensify supply shortages and inflation https://t.co/iATBoFiCxi
Your rights on working from home explained https://t.co/rkTMIgZEV6
Lockdown no excuse for failure to protect Arthur Labinjo-Hughes, says MP https://t.co/nzk9JBabZd
Prices boom for luxury homes outside London as buyers look for more space amid Covid pandemic https://t.co/irtaZObJlF
Facemasks are likely to remain a legal requirement on public transport and indoors but work from home guidance and vaccine passports are expected to be scrapped at the end of the month https://t.co/K2QnvIJ4Zf
Pub operators say they are expecting one of the best sales over a weekend sin