# Data Preprocessing

### Please download and extract STS dataset from here: https://gluebenchmark.com/tasks

In [2]:
import csv
import os
from tqdm import tqdm 
import time

In [10]:
from nltk import tokenize
from gensim.parsing.preprocessing import remove_stopwords
from tqdm import tqdm
import nltk
import networkx as nx
ps = nltk.stem.PorterStemmer()

In [5]:
import sys
sys.path.append('../')
from utils import *
from graphUtils import *

In [7]:
ground_truth = {}
first = set()
second = set()

for f in (['test','dev','train']):

    tsv_file = open(f"../../../MatchingText/STS-B/original/sts-{f}.tsv")
    read_tsv = csv.reader(tsv_file, delimiter="\t")

    for row in read_tsv:
        if len(row) != 7: continue
        if float(row[4])>= 2:
            first.add(normalize_text(row[5]))
            second.add(normalize_text(row[6]))
            if row[5] not in ground_truth:
                ground_truth[normalize_text(row[5])] = [normalize_text(row[6])]
            else:
                ground_truth[normalize_text(row[5])].append(normalize_text(row[6]))

In [8]:
len(ground_truth)

5004

In [9]:
len(first), len(second)

(5004, 4987)

# 1. TDM

In [13]:
from nltk.corpus import wordnet

G=nx.Graph()
K = 2
i = 0
nodes_labels = {}
claim_ids = {}
id_claim = {}

for claim in tqdm(first):
    node = remove_stopwords(normalize_text(claim))
    i+=1

    node_name = str('CLM'+str(i))
    G.add_node(node_name , label= node_name, type='Claim')
    claim_ids[node_name] = claim
    id_claim[claim] = node_name
                
        
    n_grams = [gr.replace(' ','_') for gr in find_all_n_grams(node,K)]
    n_grams = sorted(n_grams, key=lambda dist: len(dist),reverse = True)
    
    for tg in n_grams:
        token = tg
            
        G.add_node(token,label=token, type='Token')
            
        if not G.has_edge(node_name,token): G.add_edge(node_name,token)

100%|██████████| 5004/5004 [00:01<00:00, 3279.10it/s]


In [14]:
i = 0
fact_ids = {}
id_fact = {}
node_maps = []

for fact in tqdm(second):
    node = remove_stopwords(normalize_text(fact))
    i += 1
    name = str('FCT'+ str(i))
    
    fact_ids[name] = fact
    id_fact[fact] = name
    
    G.add_node(name,label = name, type='Fact')
    
    n_grams = [gr.replace(' ','_') for gr in find_all_n_grams(node,K)]
    n_grams = sorted(n_grams, key=lambda dist: len(dist),reverse = True)
    
    for tg in n_grams:
        token = tg
        
        if not G.has_node(token): continue

        if not G.has_edge(name,token):            G.add_edge(name,token)


100%|██████████| 4987/4987 [00:01<00:00, 3282.99it/s]


In [15]:
len(G.nodes()), len(G.edges())

(44176, 105647)

# Expansion

In [None]:
import conceptnet_lite
conceptnet_lite.connect("conceptnet.db")

In [None]:
from conceptnet_lite import Label, edges_for
from tqdm import tqdm

for node in tqdm(G.copy().nodes()):
    if G.nodes()[node]['type'] != 'Token': continue
    
    try:
        for e in edges_for(Label.get(text=G.nodes()[node]['label'].replace('_',' '), language='en').concepts, same_language=True):
            if e.start.text == node:
                new_node = e.end.text
            else:
                new_node = e.start.text
            rel = e.relation.name
            
            for n in normalize_text(new_node).split():
                if not G.has_node(n):
                    G.add_node(n, label = n, type = 'Token')
            G.add_edge(node,n,type= rel)
    except:
        continue
        
for n in G.copy().nodes():
    if G.degree()[n] < 2:
        G.remove_node(n)

In [None]:
len(G.nodes()),len(G.edges())

# Random Walks

In [None]:
import random
def random_walk(node,l):
    res = ''
    
    p = 0
    chosen = node
    
    res += chosen

    while (p<l):
        chosen = random.sample([n for n in nx.neighbors(G,chosen)],1)[0]
        #if G.nodes[chosen]['type'] in ['Claim','Fact','Token','node']:
        res += ' ' + chosen
        p+=1
        
    return res


def generate_random_walks(k,l):
    rws = []
    
    for i in tqdm(range(0,k),position=0):
        for node in G.nodes():
            if len([n for n in nx.neighbors(G,node)]) == 0:
                continue
            #if G.nodes[node]['type'] in ['Claim','Fact','node']:
            rws.append(random_walk(node,l))
    return rws


In [None]:
docs = []
random_paths = generate_random_walks(100,l=25)
for p in random_paths:
    docs.append(p)

In [None]:
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from nltk.tokenize import word_tokenize

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
from tqdm import tqdm 
tagged_data = []
for d in tqdm(docs,position=0):
    tagged_data.append(word_tokenize(d))

In [None]:
%env PYTHONHASHSEED=0
max_epochs = 10
vec_size = 100

model = Word2Vec(size=vec_size, min_count=0, window=20, sg=1, seed=0, workers = 4)

model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

print("Model is Ready")

In [None]:
claim_facts = {}
for claim in tqdm(ground_truth,position=0):
    if claim not in id_claim: continue
    cl_id = id_claim[claim]
    filtered_facts = {}
    
    if cl_id not in model.wv: continue
    claim_facts[cl_id] = distance_w2v (model,cl_id,fact_ids,50000)

In [None]:
for KK in [1,5,20,30000]: 
    i = 0
    precision,recall,fs = 0,0,0
    MAP, MR, hasP = 0,0,0

    for claim in claim_facts:
        
        i+=1
        preds = [fact_ids[f] for (f,j) in claim_facts[claim]][0:KK]
        golds = ground_truth[claim_ids[claim]]

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)


    print('\n#################### ' + str(KK) + ' ###########################\n')
    print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)

# 2. SenteneBERT

In [None]:
from sentence_transformers import SentenceTransformer
SBmodel = SentenceTransformer('bert-base-nli-mean-tokens')

In [90]:
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
sentences_embs_f = SBmodel.encode([s for s in fact_ids.values()],show_progress_bar=True)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=117.0, style=ProgressStyle(description_widt…




In [91]:
# import time
st = time.time()
pred_sentences = {}


for claim in tqdm(ground_truth,position=0):
    m_emb = SBmodel.encode(claim)
  
    temp = []
    for s in range(0,len(fact_ids)):
        temp.append(([r for r in fact_ids.keys()][s],cosine_similarity(m_emb.reshape(1, -1),sentences_embs_f[rv].reshape(1, -1))[0][0]))
    pred_sentences[claim] = sorted(temp,key=lambda dist:dist[1],reverse=True)
  
#print(time.time()-st)

100%|██████████| 3753/3753 [1:27:12<00:00,  1.39s/it]


In [92]:
for KK in [1,5,20,500]: 
    i = 0
    precision,recall,fs = 0,0,0
    MAP, MR, hasP = 0,0,0

    for sent in pred_sentences:
        if sent not in ground_truth: continue
        i+=1
        preds = [fact_ids[f] for (f,j) in pred_sentences[sent]][0:KK]
        golds = [f for f in ground_truth[sent]]

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)
        
    print('\n#################### ' + str(KK) + ' ###########################\n')
    print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)


#################### 1 ###########################

MRR: 0.8206767918998135 MAP: 0.8205435651478817 HAS POSITIVE: 0.8206767918998135

#################### 5 ###########################

MRR: 0.8814681588062885 MAP: 0.8813349320543566 HAS POSITIVE: 0.9656274980015987

#################### 20 ###########################

MRR: 0.8849612105883561 MAP: 0.8848723927537349 HAS POSITIVE: 0.9944044764188649

#################### 500 ###########################

MRR: 0.8850912109314689 MAP: 0.8850023930968477 HAS POSITIVE: 0.9994670929922729


# 3. ReRank

In [105]:
from gensim.summarization.bm25 import get_bm25_weights
from gensim.summarization.bm25 import BM25

In [106]:
corpus = [sent.split() for sent in id_fact]
results = BM25(corpus)
sentences_full = {}
for r in id_fact:    sentences_full[r] = r

In [107]:
import numpy as np
sentences_BM25 = {}
for sent in tqdm(ground_truth):
    m_id = id_claim[sent]
    text = sent
    scores = results.get_scores((text).split())
    arr = np.array(scores)
    topK = arr.argsort()[::-1]
    sentences_BM25[m_id] = [(id_fact[' '.join(corpus[idx])],scores[idx]) for idx in topK]


100%|██████████| 5004/5004 [01:01<00:00, 81.33it/s] 


In [109]:
from sentence_transformers import SentenceTransformer
SBmodel = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

Some weights of the model checkpoint at /home/pignal/.cache/torch/sentence_transformers/sbert.net_models_bert-base-nli-stsb-mean-tokens/0_BERT were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [110]:
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense

In [111]:
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [112]:
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

embeds_sent = {}
embeds = {}
for sent in tqdm(id_fact,position=0):
    embeds [sent] = SBmodel.encode(rv)
    
    embeds_sent [sent] = []
    
    sents = nltk.tokenize.sent_tokenize(sent)
    for s in sents:
        embeds_sent[sent].append(SBmodel.encode(s))


100%|██████████| 4987/4987 [09:36<00:00,  8.65it/s]


In [113]:
K = 5

score = []
for claim in tqdm(ground_truth,position=0):
  m_emb = SBmodel.encode(claim)
    
  for s in id_fact:
    temp = []
    for sent in embeds_sent[s]:
        temp.append(cosine_similarity(m_emb.reshape(1, -1),sent.reshape(1, -1))[0][0])
    temp = sorted(temp,reverse=True)[0:3]
    
    temp.append(cosine_similarity(m_emb.reshape(1, -1),embeds[review].reshape(1, -1))[0][0])
    
    while len(temp) < K:        temp.append(0)
        

    
    if id_fact[sent] in ground_truth[claim]: temp.append(1)
    else: temp.append(0)
        
    score.append(temp)



100%|██████████| 5004/5004 [3:04:48<00:00,  2.22s/it]  


In [114]:
import numpy as np

dataset = np.array(score)
X = dataset[:,0:4]
y = dataset[:,5]

In [115]:
import keras
from keras import losses,optimizers

model = Sequential()
model.add(Dense(20, input_dim=4, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss=keras.losses.binary_crossentropy, optimizer=keras.optimizers.Adam(lr=1e-3), metrics=['accuracy'])

In [116]:
st = time.time()
class_weight = {0: 1.,1: 50.}

model.fit(X, y, epochs=50, batch_size=2048)
print(time.time()-st)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
1362.0752198696136


In [117]:
K = 5

predictions = {}

for claim in tqdm(ground_truth,position=0):
  m_emb = SBmodel.encode(claim)

  seen = []
  data,scores = [],[]

  for sent in id_fact:
    seen.append(id_fact[sent])

    temp = []
    for sent in embeds_sent[sent]:
        temp.append(cosine_similarity(m_emb.reshape(1, -1),sent.reshape(1, -1))[0][0])
    temp = sorted(temp,reverse=True)[0:3]
    
    temp.append(cosine_similarity(m_emb.reshape(1, -1),review_embeds[review].reshape(1, -1))[0][0])
    
    while len(temp) < K-1:        temp.append(0)
    
    
    
    data.append(temp)

  res = model.predict(np.array(data))
        
  for i in range(0,len(res)):
      scores.append((seen[i],res[i][0]))
        
  predictions[claim] = sorted(scores, key=lambda dist: dist[1])

100%|██████████| 5004/5004 [3:19:42<00:00,  2.39s/it]  


In [119]:
import dlib

In [120]:
predictions_SB = predictions
predictions_BM25 = sentences_BM25

In [121]:
data = dlib.ranking_pair()

In [122]:
st = time.time()
for claim in tqdm(ground_truth,position=0):
    if claim not in predictions_SB or id_claim[claim] not in predictions_BM25: continue

    m_BM = [i for (i,j) in predictions_BM25[ id_claim[claim]]]
    m_SB = [i for (i,j) in predictions_SB[claim]]
    
    
    for r in fact_ids:
        if fact_ids[r] in ground_truth[claim]: 
            data.relevant.append(dlib.vector([m_BM.index(r)+1, m_SB.index(r)+1]))
        else:  
            data.nonrelevant.append(dlib.vector([m_BM.index(r)+1, m_SB.index(r)+1]))

trainer = dlib.svm_rank_trainer()
trainer.c = 1000

rank = trainer.train(data)

100%|██████████| 5004/5004 [42:17<00:00,  1.97it/s] 


In [123]:
st = time.time()
rerank = {}

for claim in tqdm(ground_truth,position=0):
    if claim not in predictions_SB or id_claim[claim] not in predictions_BM25: continue
    m_BM = [i for (i,j) in predictions_BM25[ id_claim[claim]]]
    m_SB = [i for (i,j) in predictions_SB[claim]]
    
    temp = []
    for r in fact_ids:
        temp.append((r,rank(dlib.vector([m_BM.index(r)+1, m_SB.index(r)+1]))))
    temp = sorted(temp, key=lambda dist: dist[1],reverse = True)
    rerank[claim] = temp

100%|██████████| 5004/5004 [40:42<00:00,  2.05it/s]


In [124]:
for KK in [1,5,20,200]: 
    i = 0
    precision,recall,fs = 0,0,0
    MAP, MR, hasP = 0,0,0

    for sent in rerank:
        if sent not in ground_truth: continue
        i+=1
        preds = [fact_ids[f] for (f,j) in rerank[sent]][0:KK]
        golds = [f for f in ground_truth[sent]]

        MAP += MAP_K(golds,preds)
        MR += MRR(golds,preds)
        hasP += HAS_POSITIVE(golds,preds)
        
    print('\n#################### ' + str(KK) + ' ###########################\n')
    print('MRR:',MR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)



#################### 1 ###########################

MRR: 0.7144284572342127 MAP: 0.7143285371702638 HAS POSITIVE: 0.7144284572342127

#################### 5 ###########################

MRR: 0.7887023714361845 MAP: 0.7886024513722356 HAS POSITIVE: 0.8990807354116707

#################### 20 ###########################

MRR: 0.7962447867284447 MAP: 0.7961698466804831 HAS POSITIVE: 0.9670263788968825

#################### 200 ###########################

MRR: 0.7970618005843749 MAP: 0.7969868605364132 HAS POSITIVE: 0.9950039968025579
