In [1]:
import numpy as np
import pandas as pd
import torch
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
sys.path.append('../')
from utils import *
from graphUtils import *

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

# Dataset


In [3]:
import pickle
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load these artifacts when they come from a trusted source.
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file handle afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


all_claims = _load_pickle('../../data/corona/corona_allClaims')
user_claims = _load_pickle('../../data/corona/user_claims.pkl')
all_tables = _load_pickle('../../data/corona/corona_tables')

In [4]:
from tqdm import tqdm
import networkx as nx

# Header name for each 1-based cell position inside a table row.
table_columns = {1:'table',2:'country',3:'january',4:'february',5:'march',6:'april',7:'may',8:'june'}
G = nx.Graph()
K = 3  # second argument handed to find_all_n_grams

i = 0
nodes_labels = {}
row_ids = {}   # row node name -> space-joined row text
id_rows = {}   # space-joined row text -> row node name

# Every table row becomes a 'Row' node; each n-gram of a non-empty cell becomes
# a 'Token' node linked to both its row and its column.
for i, row in enumerate(tqdm(all_tables), start=1):
    row_name = 'RW' + str(i)
    row_text = ' '.join(row)
    G.add_node(row_name, label=row_name, type='Row')
    row_ids[row_name] = row_text
    id_rows[row_text] = row_name

    for j, cl in enumerate(row, start=1):
        col_name = table_columns[j]
        if cl == '':
            continue
        if not G.has_node(col_name):
            G.add_node(col_name, label=col_name, type='Column')
        for gram in find_all_n_grams(str(cl), K):
            tg = gram.replace(' ', '_')
            G.add_node(tg, label=tg, type='Token')
            G.add_edge(row_name, tg)
            G.add_edge(col_name, tg)
            


i = 0
claim_ids = {}   # claim node name -> claim text
id_claim = {}    # claim text -> claim node name
all_claims.update(user_claims)  # user claims are indexed together with the others

# Every claim becomes a 'Claim' node linked to those of its n-grams that are
# already nodes of the graph; claims never introduce new n-gram nodes.
for i, claim in enumerate(tqdm(all_claims), start=1):
    text = claim
    claim_name = 'Claim' + str(i)
    G.add_node(claim_name, label=claim_name, type='Claim')
    claim_ids[claim_name] = claim
    id_claim[claim] = claim_name

    for gram in find_all_n_grams(text, K):
        tg = gram.replace(' ', '_')
        if G.has_node(tg) and not G.has_edge(claim_name, tg):
            G.add_edge(claim_name, tg)
        

100%|██████████| 1158/1158 [00:01<00:00, 702.94it/s]
100%|██████████| 7058/7058 [00:01<00:00, 4096.01it/s]


In [5]:
# claim node name -> list of row node names whose text contains the claim's key.
ground_truth = {}

for claim_name, claim_text in claim_ids.items():
    matched_rows = []
    ground_truth[claim_name] = matched_rows

    if claim_text in user_claims:
        # User claims carry several references; each one is reversed and then
        # searched for as a substring of the row text.
        for r in user_claims[claim_text]:
            needle = ' '.join(reversed(r))
            matched_rows.extend(
                row_name for row_text, row_name in id_rows.items() if needle in row_text
            )
    else:
        # Other claims are matched on the first two entries of their record.
        needle = ' '.join(all_claims[claim_text][0:2])
        matched_rows.extend(
            row_name for row_text, row_name in id_rows.items() if needle in row_text
        )

# Method0: BM25

In [7]:
from gensim.summarization.bm25 import get_bm25_weights
from gensim.summarization.bm25 import BM25

In [8]:
# BM25 index over the tables; every table row is passed as one document.
corpus = list(all_tables)
results = BM25(corpus)

In [9]:
import numpy as np
# claim node name -> [(row, score), ...] ordered from best to worst BM25 score.
query_BM25 = {}
for query in tqdm(ground_truth):
    text = claim_ids[query]
    scores = results.get_scores(text.split())
    # Row indices sorted by descending score.
    order = np.argsort(np.array(scores))[::-1]
    ranked = []
    for idx in order:
        ranked.append((corpus[idx], scores[idx]))
    query_BM25[query] = ranked

100%|██████████| 7058/7058 [00:15<00:00, 455.40it/s]


In [10]:
# Claim-node names of the user-written claims, built once instead of being
# rebuilt as a list for every query at every cutoff.
user_claim_names = {id_claim[c] for c in user_claims}

# Report MRR / MAP / HAS_POSITIVE of the BM25 ranking at several cutoffs KK.
# User-written claims and claims without gold rows are left out.
for KK in [1,2,3,5,10,20,50,50000]:
    i = 0  # number of evaluated queries
    MAP, MR, hasP = 0,0,0

    for query in query_BM25:
        if query not in ground_truth or len(ground_truth[query])==0: continue
        if query in user_claim_names: continue

        i+=1
        # Cut the ranking first so only the top-KK rows are mapped to row names.
        preds = [id_rows[' '.join(f)] for (f,j) in query_BM25[query][0:KK]]
        golds = list(ground_truth[query])

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    if i == 0:
        # Avoid a ZeroDivisionError when every query was filtered out.
        print('No queries to evaluate')
    else:
        print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)



#################### 1 ###########################

MRR: 0.1412037037037037 MAP: 0.1388888888888889 HAS POSITIVE: 0.1412037037037037

#################### 2 ###########################

MRR: 0.21643518518518517 MAP: 0.21357783564814814 HAS POSITIVE: 0.2916666666666667

#################### 3 ###########################

MRR: 0.2578124999999971 MAP: 0.25444878472221955 HAS POSITIVE: 0.4157986111111111

#################### 5 ###########################

MRR: 0.31111111111111095 MAP: 0.3073929398148139 HAS POSITIVE: 0.6514756944444444

#################### 10 ###########################

MRR: 0.33897018298060017 MAP: 0.3352407591122872 HAS POSITIVE: 0.8310185185185185

#################### 20 ###########################

MRR: 0.33958778160395503 MAP: 0.3359237541769066 HAS POSITIVE: 0.8392650462962963

#################### 50 ###########################

MRR: 0.3398344050608905 MAP: 0.3361788879824261 HAS POSITIVE: 0.8470775462962963

#################### 50000 #########################

# Method1: Unsupervised SentenceBERT

In [11]:
from gensim.parsing.preprocessing import remove_stopwords

# Vocabulary = names of 'Token' nodes that consist of a single '_'-free part.
vocabs = {
    node
    for node in G.nodes()
    if G.nodes()[node]['type'] == 'Token' and len(node.split('_')) == 1
}

def return_filtered(text):
    """Normalize `text`, strip stopwords and keep only tokens found in `vocabs`.

    Returns the surviving tokens in their original order, each followed by one
    space (a non-empty result therefore ends with a space); '' if none survive.
    """
    cleaned = remove_stopwords(normalize_text(text))
    return ''.join(token + ' ' for token in word_tokenize(cleaned) if token in vocabs)

In [11]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m252.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting torchvision
  Downloading torchvision-0.15.1-cp38-cp38-manylinux1_x86_64.whl (33.8 MB)
[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/33.8 MB[0m [31m16.4 kB/s[0m eta [36m0:33:13[0m

In [7]:
from sentence_transformers import SentenceTransformer

ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
# Load the sentence-embedding checkpoint by name.
# NOTE: the name `model` is rebound further down (Keras network, then Doc2Vec),
# so cells that call `model.encode` only work before those cells have run.
model = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

In [None]:
# Marker tokens written before each column name ([COL]) and each cell value
# ([VAL]) when a table row is serialized to text.
special_tokens_dict = {'additional_special_tokens': ['[COL]','[VAL]']}

In [None]:
# Register the marker tokens with the tokenizer of the model's first module and
# resize that module's token embeddings to the new tokenizer length.
word_embedding_model = model._first_module()   # Your models.Transformer object
st_tokenizer = word_embedding_model.tokenizer
st_tokenizer.add_special_tokens(special_tokens_dict)
word_embedding_model.auto_model.resize_token_embeddings(len(st_tokenizer))

In [None]:
# Serialize every table row as ' [COL] <column name> [VAL] <cell>' segments.
table_content = []
for t in all_tables:
  # Build the string from scratch for every row. The original appended to a
  # `text` variable that was never reset, so each entry also contained all
  # previous rows plus whatever `text` held from an earlier cell.
  text = ''.join(
    ' [COL] ' + str(table_columns[c+1]) + ' [VAL] ' + str(t[c])
    for c in range(0,len(t))
  )
  table_content.append(text)

In [None]:
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

# Despite its name, `claim_embs` holds the embeddings of the serialized table
# rows in `table_content`, one per row and in the same order.
claim_embs = model.encode(list(table_content), show_progress_bar=True)

In [None]:
# claim node name -> [(row node name, cosine similarity), ...] best first.
# The dict was originally spelled `pred_quey`, while the evaluation cell reads
# `pred_query`; it is now created under the name the evaluation uses.
pred_query = {}

# Built once: position rv in this list corresponds to claim_embs[rv], because
# both follow the iteration order of all_tables.
row_names = list(row_ids.keys())

for claim in tqdm(list(all_claims.keys()),position=0):
  m_emb = model.encode(claim)

  # One call scores the claim against every row embedding at once.
  sims = cosine_similarity(m_emb.reshape(1, -1), claim_embs)[0]
  pred_query[id_claim[claim]] = sorted(zip(row_names, sims),key=lambda dist:dist[1],reverse=True)

# Backward-compatible alias for the original (misspelled) name.
pred_quey = pred_query
  

In [None]:
# Claim-node names of the user-written claims, built once instead of being
# rebuilt as a list for every query at every cutoff.
user_claim_names = {id_claim[c] for c in user_claims}

# Report MRR / MAP / HAS_POSITIVE of the embedding ranking at several cutoffs.
# Only user-written claims that have gold rows are evaluated.
for KK in [1,5,20,500]:
    i = 0  # number of evaluated queries
    MAP, MR, hasP = 0,0,0

    for query in pred_query:
        if query not in ground_truth or len(ground_truth[query])==0: continue
        if query not in user_claim_names: continue

        i+=1
        preds = [f for (f,j) in pred_query[query][0:KK]]
        golds = list(ground_truth[query])

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    if i == 0:
        # Avoid a ZeroDivisionError when every query was filtered out.
        print('No queries to evaluate')
    else:
        print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)


# Method2: Supervised SentenceBERT

In [None]:
from sentence_transformers import SentenceTransformer
# Second handle on the same checkpoint name; unlike `model`, this name is not
# rebound by later cells in the notebook.
SBmodel = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

In [None]:
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# row text -> sentence embedding of that row text.
# Encoded with SBmodel rather than `model`: `model` is rebound to a Keras
# network further down, so re-running this cell afterwards would fail, and
# SBmodel is also the encoder used for the queries at prediction time.
table_embs = {}
for row in tqdm(id_rows,position=0):
  table_embs[row] = SBmodel.encode(row)

In [None]:
# Training samples: one [cosine similarity, label] pair per (claim, row), with
# label 1 when the row is a gold row of the claim and 0 otherwise.
query_score = []

for claim in tqdm(list(all_claims.keys()),position=0):
  # SBmodel matches the encoder used for table_embs and at prediction time.
  m_emb = SBmodel.encode(claim).reshape(1, -1)

  # ground_truth is keyed by claim node name, not by claim text, so the text
  # has to be translated through id_claim (indexing by text raised KeyError).
  gold_rows = set(ground_truth[id_claim[claim]])

  for rw in id_rows:
    sim = cosine_similarity(m_emb,table_embs[rw].reshape(1, -1))[0][0]
    query_score.append([sim, 1 if id_rows[rw] in gold_rows else 0])

In [None]:
import numpy as np

# Column 0 is the similarity feature (kept 2-D for the network input),
# column 1 is the 0/1 label.
dataset = np.asarray(query_score)
X = dataset[:, :1]
y = dataset[:, 1]

In [None]:
import keras
from keras import losses,optimizers

# Small feed-forward classifier mapping the single similarity feature to a
# match probability. NOTE: this rebinds `model`, which previously referred to
# the SentenceTransformer.
model = Sequential()
model.add(Dense(20, input_dim=1, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# `learning_rate` replaces the deprecated `lr` keyword of the Keras optimizers.
model.compile(loss=keras.losses.binary_crossentropy, optimizer=keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])

In [None]:
# Samples labelled 1 are weighted 500x relative to samples labelled 0
# (presumably to offset how rare matching pairs are — confirm).
class_weight = {0: 1.,1: 500.}

# Fit the classifier on the similarity feature X and the 0/1 labels y.
model.fit(X, y, epochs=50, batch_size=2048,class_weight=class_weight)

In [None]:
# claim node name -> [(row text, predicted match probability), ...] best first.
# Only user-written claims are scored.
query_predictions = {}

# Built once instead of rebuilding a list for every query.
user_claim_names = {id_claim[c] for c in user_claims}

for query in tqdm(ground_truth,position=0):
  if query not in user_claim_names: continue

  text = claim_ids[query]
  m_emb = SBmodel.encode(text).reshape(1, -1)

  seen = list(id_rows)
  data = [cosine_similarity(m_emb,table_embs[rw].reshape(1, -1))[0][0] for rw in seen]

  # The network was fit on an (n, 1) feature matrix, so predict receives the
  # same column shape instead of a flat 1-D array.
  res = model.predict(np.array(data).reshape(-1, 1), verbose=0)

  scores = [(seen[k],res[k][0]) for k in range(0,len(res))]

  query_predictions[query] = sorted(scores, key=lambda dist: dist[1],reverse = True)

In [None]:
# Report MRR / MAP / HAS_POSITIVE of the supervised ranking at several cutoffs.
# query_predictions is keyed by claim node name, exactly like ground_truth, so
# both are indexed with `query` directly (the original went through id_claim,
# which is keyed by claim text and raised KeyError).
# query_predictions only ever contains user-written claims, so the original
# "skip user claims" filter removed every query; all stored queries that have
# gold rows are evaluated instead.
for KK in [1,5,20,500]:
    i = 0  # number of evaluated queries
    MAP, MR, hasP = 0,0,0

    for query in query_predictions:
        if query not in ground_truth or len(ground_truth[query])==0: continue

        i+=1
        # Predictions store row text; id_rows maps it back to the row node name.
        preds = [id_rows[f] for (f,j) in query_predictions[query][0:KK]]
        golds = list(ground_truth[query])

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    if i == 0:
        # Avoid a ZeroDivisionError when nothing could be evaluated.
        print('No queries to evaluate')
    else:
        print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)

# Method3: Reranking

In [None]:
# Aliases for the two rankings combined by the re-ranker: the supervised
# SentenceBERT predictions and the BM25 scores (same objects, not copies).
corona_SB = query_predictions
corona_BM25 = query_BM25 

In [None]:
import dlib
# Container whose `relevant` / `nonrelevant` lists receive the training vectors.
data = dlib.ranking_pair()

In [None]:
import time
st = time.time()

# Start from an empty container so re-running this cell does not append the
# same samples a second time.
# NOTE(review): all queries share one ranking_pair, so the trainer also
# contrasts relevant rows of one claim with non-relevant rows of another; dlib
# also accepts a `ranking_pairs` list with one pair per query.
data = dlib.ranking_pair()

for query in tqdm(ground_truth,position=0):
    # Both rankings are needed; corona_SB only covers the claims that were
    # scored by the supervised model.
    if query not in corona_BM25 or query not in corona_SB: continue

    # row text -> 1-based rank in each list. setdefault keeps the first
    # occurrence, which is what list.index() returned, without the repeated
    # linear scans.
    bm_rank, sb_rank = {}, {}
    for pos, (cells, _) in enumerate(corona_BM25[query], start=1):
        bm_rank.setdefault(' '.join(cells), pos)
    for pos, (sb_text, _) in enumerate(corona_SB[query], start=1):
        sb_rank.setdefault(sb_text, pos)

    gold_rows = set(ground_truth[query])
    for r in row_ids:
        # Both rankings are keyed by row text, so the row name is translated
        # through row_ids for both lookups (the SB list was originally searched
        # with the row name itself, which it never contains).
        row_text = row_ids[r]
        features = dlib.vector([bm_rank[row_text], sb_rank[row_text]])
        if r in gold_rows:
            data.relevant.append(features)
        else:
            data.nonrelevant.append(features)

trainer = dlib.svm_rank_trainer()
trainer.c = 1000

rank = trainer.train(data)
print(time.time()-st)

In [None]:
st = time.time()
i=0  # number of re-ranked queries
# claim node name -> [(row node name, learned ranking score), ...] best first.
rerank_corona = {}

for query in tqdm(ground_truth,position=0):
    # Both rankings are needed; corona_SB only covers the claims that were
    # scored by the supervised model.
    if query not in corona_BM25 or query not in corona_SB: continue
    i+=1

    # row text -> 1-based rank in each list; setdefault keeps the first
    # occurrence, matching what list.index() returned.
    bm_rank, sb_rank = {}, {}
    for pos, (cells, _) in enumerate(corona_BM25[query], start=1):
        bm_rank.setdefault(' '.join(cells), pos)
    for pos, (sb_text, _) in enumerate(corona_SB[query], start=1):
        sb_rank.setdefault(sb_text, pos)

    temp = []
    for r in row_ids:
        # Both rankings are keyed by row text, hence the row_ids translation
        # for the SB lookup as well as the BM25 one.
        row_text = row_ids[r]
        temp.append((r,rank(dlib.vector([bm_rank[row_text], sb_rank[row_text]]))))
    rerank_corona[query] = sorted(temp, key=lambda dist: dist[1],reverse = True)

if i == 0:
    # Avoid a ZeroDivisionError when no query had both rankings.
    print('No queries were re-ranked')
else:
    print((time.time()-st)/i)

In [None]:
# Report MRR / MAP / HAS_POSITIVE of the re-ranked lists at several cutoffs.
# NOTE(review): the original skipped user-written claims here, but the
# re-ranker depends on corona_SB, which only holds predictions for user-written
# claims, so that filter left nothing to evaluate. Every re-ranked query with
# gold rows is evaluated instead; confirm this is the intended split.
for KK in [1,5,20,500]:
    i = 0  # number of evaluated queries
    MAP, MR, hasP = 0,0,0

    for query in rerank_corona:
        if query not in ground_truth or len(ground_truth[query])==0: continue

        i+=1
        preds = [f for (f,j) in rerank_corona[query][0:KK]]
        golds = list(ground_truth[query])

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    if i == 0:
        # Avoid a ZeroDivisionError when nothing could be evaluated.
        print('No queries to evaluate')
    else:
        print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)

# Method4: Doc2Vec

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# One space-joined string per table row, then one TaggedDocument per string
# tagged with its position in the list.
data = [' '.join(str(m) for m in query) for query in all_tables]
tagged_data = [
    TaggedDocument(words=word_tokenize(doc), tags=[str(idx)])
    for idx, doc in enumerate(data)
]

In [None]:
%env PYTHONHASHSEED=0
max_epochs = 10
vec_size = 50

model = Doc2Vec(size=vec_size, min_count=10, dm =0, workers=1, window=4,seed=0, epochs=max_epochs
                )

model.build_vocab(tagged_data)

model.train(tagged_data, total_examples=model.corpus_count,epochs=model.epochs)


print("Model Saved")

In [None]:
import numpy as np
# `from utils import *` does not bind the module name itself, so `utils.` below
# raised NameError; import the module explicitly.
import utils

# claim node name -> ranking returned by utils.cosine_distance for that claim.
query_d2v = {}
for query in tqdm(ground_truth):
    text = claim_ids[query]

    query_d2v[query] = utils.cosine_distance(model,text,id_rows,500)

In [None]:
# Report MRR / MAP / HAS_POSITIVE of the Doc2Vec ranking at several cutoffs.
# Fixes relative to the original cell:
#   * query_d2v and ground_truth are both keyed by claim node name, so they are
#     indexed with `query` directly (claim_ids[query] / row_ids[query] were the
#     wrong keys);
#   * the undefined `id_review` is replaced by `id_rows`;
#   * the reciprocal-rank total goes into `MR`; `MRR += ...` tried to add a
#     number to the metric function itself.
for KK in [1,5,20,500]:
    i = 0  # number of evaluated queries
    MAP, MR, hasP = 0,0,0

    for query in query_d2v:
        # The empty-gold check matches the other evaluation cells.
        if query not in ground_truth or len(ground_truth[query])==0: continue
        i+=1
        # Assumes utils.cosine_distance yields (row text, score) pairs keyed
        # like id_rows — TODO confirm against utils.
        preds = [id_rows[f] for (f,j) in query_d2v[query]][0:KK]
        golds = list(ground_truth[query])

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)

    print('\n#################### ' + str(KK) + ' ###########################\n')
    if i == 0:
        # Avoid a ZeroDivisionError when nothing could be evaluated.
        print('No queries to evaluate')
    else:
        print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)