In [None]:
!pip install torch==1.2.0

In [None]:
!pip install sentence_transformers==0.3.5.1

In [None]:
import os
# Restrict this process to a single GPU. Both variables are set here, ahead of
# the `torch` import further down in this cell.
# NOTE(review): PCI_BUS_ID ordering is presumably chosen so that index "3"
# matches the numbering reported by `nvidia-smi`; adjust the index per machine.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="3"

from STSDataReaderBinaryPositives import STSDataReaderBinaryPositives
from MNRLoss import MNRLoss
from MNRShuffler import ShuffledSentencesDataset, ShuffledSentenceTransformer, ModelExampleBasedShuffler
from torch.utils.data import DataLoader
import math
import os
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import *
import pandas as pd
import logging
import csv
import random
import shutil
import numpy as np
import torch
import pyltr


from tqdm import tqdm_notebook as tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers.util import pytorch_cos_sim

import pickle

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above, which can mask real problems on newer versions.
warnings.filterwarnings('ignore')

# Emit INFO-level log records, timestamped, through sentence_transformers'
# LoggingHandler (presumably so log lines cooperate with tqdm progress bars —
# TODO confirm).
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

# Prepare data

In [None]:
def _vclaims_dir_to_frame(vclaims_dir):
    """Load every per-claim JSON file in `vclaims_dir` into one DataFrame.

    Each file is read as a pandas Series and becomes one row. Three text
    columns are appended, with the parts separated by ' [SEP] ':
    `joined` (vclaim + title), `joined_sub` (vclaim + subtitle) and
    `joined_all` (vclaim + title + subtitle).
    """
    # sorted(): os.listdir() returns entries in arbitrary order, which made the
    # row order of the generated TSV depend on the file system.
    claims = [
        pd.read_json(os.path.join(vclaims_dir, file_name), typ='series')
        for file_name in tqdm(sorted(os.listdir(vclaims_dir)))
    ]
    frame = pd.concat(claims, axis=1).T
    frame = frame.assign(joined=lambda x: x['vclaim'] + ' [SEP] ' + x['title'])
    frame = frame.assign(joined_sub=lambda x: x['vclaim'] + ' [SEP] ' + x['subtitle'])
    frame = frame.assign(joined_all=lambda x: x['vclaim'] + ' [SEP] ' + x['title'] + ' [SEP] ' + x['subtitle'])
    return frame


def _merge_tsvs(in_paths, out_path, header='infer'):
    """Concatenate the TSV files in `in_paths` row-wise and write the result to `out_path`.

    `header` is forwarded to `pd.read_csv` (pass None for header-less files).
    All inputs are read before `out_path` is written, so `out_path` may also be
    one of the inputs.
    """
    merged = pd.concat([pd.read_csv(path, sep='\t', header=header) for path in in_paths])
    merged = merged.reset_index(drop=True)
    merged.to_csv(out_path, sep='\t', index=False)


# --- Verified claims: original training set, then the semi-supervised set ---
_vclaims_dir_to_frame('datasets/CLEF/train/vclaims').to_csv(
    'datasets/CLEF/train/vclaims.tsv', sep='\t', index=False)
_vclaims_dir_to_frame('datasets/CLEF/semi-supervised-data-clef-format/vclaims').to_csv(
    'datasets/CLEF/train/new_vclaims.tsv', sep='\t', index=False)

# new_vclaims.tsv becomes (original claims + semi-supervised claims).
_merge_tsvs(['datasets/CLEF/train/vclaims.tsv', 'datasets/CLEF/train/new_vclaims.tsv'],
            'datasets/CLEF/train/new_vclaims.tsv')
verclaims = pd.read_csv('datasets/CLEF/train/new_vclaims.tsv', sep='\t')

# --- Tweets (input claims): original train/dev + semi-supervised ---
# The merged file is written with a header row, so it is read back with the
# default header handling and the columns are then renamed.
_merge_tsvs(['datasets/CLEF/train/tweets-train-dev.tsv',
             'datasets/CLEF/semi-supervised-data-clef-format/tweets-all.tsv'],
            'datasets/CLEF/train/new_tweets-train-dev.tsv', header=None)
tweets = pd.read_csv('datasets/CLEF/train/new_tweets-train-dev.tsv', sep='\t')
tweets.columns = ['iclaim_id', 'iclaim']

# --- Training relevance judgements joined with tweet and claim text ---
_merge_tsvs(['datasets/CLEF/train/qrels-train.tsv',
             'datasets/CLEF/semi-supervised-data-clef-format/qrels-train-30.tsv'],
            'datasets/CLEF/train/new_qrels-train.tsv', header=None)
qrels = pd.read_csv('datasets/CLEF/train/new_qrels-train.tsv', sep='\t')
qrels.columns = ['iclaim_id', 'pad', 'vclaim_id', 'relevance']
qrels = qrels.join(tweets.set_index('iclaim_id'), on='iclaim_id')
qrels = qrels.join(verclaims.set_index('vclaim_id'), on='vclaim_id')
qrels.to_csv('datasets/CLEF/train/semi_train_part.tsv', sep='\t', index=False)

# Rows 800..998 are held out as the dev part; every other row is the train part.
pd.concat([qrels.iloc[:800], qrels.iloc[999:]
          ]).to_csv('datasets/CLEF/train/train_part.tsv', sep='\t', index=False)
qrels.iloc[800:999].to_csv('datasets/CLEF/train/dev_part.tsv', sep='\t', index=False)

# --- Official dev judgements, joined with tweet text only ---
# The same frame is written under two names, both of which are read later.
qrels = pd.read_csv('datasets/CLEF/train/qrels-dev.tsv', sep='\t', header=None)
qrels.columns = ['iclaim_id', 'pad', 'vclaim_id', 'relevance']
qrels = qrels.join(tweets.set_index('iclaim_id'), on='iclaim_id')
qrels.to_csv('datasets/CLEF/train/semi_dev_part.tsv', sep='\t', index=False)
qrels.to_csv('datasets/CLEF/train/test.tsv', sep='\t', index=False)

# --- Test judgements joined with the (de-duplicated) test tweets ---
tweets_test = pd.read_csv('datasets/CLEF/test/subtask-2a--english/tweets-test.tsv', sep='\t').drop_duplicates(subset=['iclaim_id'])
tweets_test.columns = ['iclaim_id', 'iclaim']
qrels_test = pd.read_csv('datasets/CLEF/predictions/qrels-test.tsv', sep='\t', header=None)
qrels_test.columns = ['iclaim_id', 'pad', 'vclaim_id', 'relevance']
qrels_test = qrels_test.join(tweets_test.set_index('iclaim_id'), on='iclaim_id')
qrels_test.to_csv('datasets/CLEF/train/semi_test.tsv', sep='\t', index=False)

# Train SBERT

In [None]:
def _seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random number generators with `seed`."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


def build_ir_evaluator(queries_path, corpus_path='datasets/CLEF/train/vclaims.tsv', max_corpus_size=30000):
    """Build an InformationRetrievalEvaluator from a joined qrels TSV and the claim corpus.

    Args:
        queries_path: TSV with a header row; column 0 is the query id, column 2
            the comma-separated relevant vclaim ids, column 4 the query text.
        corpus_path: TSV with `vclaim_id` and `joined_all` columns.
        max_corpus_size: upper bound on corpus size; relevant claims are always
            kept and randomly shuffled distractors fill the remaining slots.

    Returns:
        The configured `evaluation.InformationRetrievalEvaluator`.
    """
    ir_queries = {}
    ir_relevant_docs = {}
    ir_needed_ids = set()

    # NOTE(review): the file is written by pandas `to_csv`, whose default
    # quoting wraps fields containing quote characters; the raw split below
    # does not undo that — confirm the tweet texts are unaffected.
    with open(queries_path, encoding='utf8') as fIn:
        next(fIn)  # skip the header row
        for line in fIn:
            fields = line.strip().split('\t')
            qid, query, relevant_ids = fields[0], fields[4], fields[2].split(',')
            ir_queries[qid] = query
            ir_relevant_docs[qid] = set(relevant_ids)
            ir_needed_ids.update(relevant_ids)

    ir_corpus = {}
    distractors = {}
    corpus_frame = pd.read_csv(corpus_path, sep='\t')
    for _, row in corpus_frame.iterrows():
        doc_id, text = str(row['vclaim_id']), str(row['joined_all'])
        if doc_id in ir_needed_ids:
            ir_corpus[doc_id] = text
        else:
            distractors[doc_id] = text

    # Fill up with randomly ordered non-relevant claims.
    distractor_ids = list(distractors.keys())
    random.shuffle(distractor_ids)
    for doc_id in distractor_ids[0:max(0, max_corpus_size - len(ir_corpus))]:
        ir_corpus[doc_id] = distractors[doc_id]

    return evaluation.InformationRetrievalEvaluator(ir_queries, ir_corpus, ir_relevant_docs,
                                                    map_at_k=[1,3,5,10,2000], mrr_at_k=[1,3], accuracy_at_k=[1])


for seed in [4]:
    for start in [2]:
        for alpha_thr in [0.9]:
            # Seed up front so the distractor shuffles in build_ir_evaluator are
            # reproducible; the generators are re-seeded right before fit().
            _seed_everything(seed)

            model = ShuffledSentenceTransformer('stsb-bert-base')

            sts_reader_pos = STSDataReaderBinaryPositives('datasets/CLEF/',
                    s1_col_idx=4, s2_col_idx=12, score_col_idx=3,normalize_scores=False, thr=0.6, get_positives=False)

            train_batch_size = 8
            num_epochs = 8

            train_data_MNR = ShuffledSentencesDataset(sts_reader_pos.get_examples('train/semi_train_part.tsv'), model
                                                     , ref_alpha=alpha_thr, start_ref_epoch=start)
            train_dataloader_MNR = DataLoader(train_data_MNR, shuffle=False, batch_size=train_batch_size)
            # Rescaling is enabled whenever alpha_thr differs from 1.
            use_rescale = alpha_thr != 1
            train_loss_MNR = MNRLoss(model=model, norm_dim=1, tau=None, use_rescale=use_rescale)

            # Dev evaluator first, then test: the order fixes the shuffle sequence.
            ir_evaluator = build_ir_evaluator(os.path.join('datasets/CLEF/', 'train/semi_dev_part.tsv'))
            ir_evaluator_test = build_ir_evaluator(os.path.join('datasets/CLEF/', 'train/semi_test.tsv'))

            # Both evaluators run during training; the first (dev) score is the main score.
            seq_evaluator = evaluation.SequentialEvaluator([ir_evaluator, ir_evaluator_test],
                                                           main_score_function=lambda scores: scores[0])

            shuffler = ModelExampleBasedShuffler(group_size=4, allow_same=True)
            # Warm up over 10% of the total number of training steps.
            warmup_steps = math.ceil(len(train_data_MNR)*num_epochs/train_batch_size*0.1)
            model_save_path = 'checkpoints_/model_1e-5_6_{}_{}_{}'.format(alpha_thr, start, seed)
            # Start from an empty output directory.
            shutil.rmtree(model_save_path, ignore_errors=True)

            _seed_everything(seed)

            model.fit(train_objectives=[(train_dataloader_MNR, train_loss_MNR)],
                      evaluator=seq_evaluator,
                      epochs=num_epochs,
                      evaluation_steps=1000,
                      warmup_steps=warmup_steps,
                      output_path=model_save_path,
                      optimizer_params={'alpha_lr':0.4, 'lr': 1e-5},
                      shuffler=shuffler,
                      shuffle_idxs=[0]
                     )

            # Reload the checkpoint written to model_save_path and score it once more.
            model = ShuffledSentenceTransformer(model_save_path)

            # The evaluators write their CSV into this directory; create it so a
            # missing folder does not fail the run after training has finished.
            os.makedirs('evaluations/', exist_ok=True)
            ir_evaluator_test.csv_file = "1e-5_6_{}_{}_{}_test.csv".format(alpha_thr, start, seed)
            ir_evaluator.csv_file = "1e-5_6_{}_{}_{}.csv".format(alpha_thr, start, seed)
            ir_evaluator_test(model, output_path='evaluations/')
            ir_evaluator(model, output_path='evaluations/')

# Train TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class TFIDF:
    """Adapter that gives a fitted TF-IDF vectorizer an `encode` method.

    This lets the vectorizer be passed to code that calls
    `model.encode(texts, show_progress_bar=..., batch_size=..., convert_to_tensor=...)`.
    """

    def __init__(self, vectorizer):
        # `vectorizer` must already be fitted and expose `transform(texts)`
        # returning a sparse matrix.
        self.model = vectorizer

    def encode(self, query, show_progress_bar=False, batch_size=32, convert_to_tensor=True):
        """Vectorize `query` (an iterable of texts) into a dense float32 tensor.

        `show_progress_bar`, `batch_size` and `convert_to_tensor` are accepted
        only for call compatibility and are ignored; they now have defaults so
        `encode(texts)` works on its own.

        Returns:
            torch.Tensor of shape (len(query), vocabulary size), dtype float32.
        """
        # toarray() yields a plain ndarray (todense() returns np.matrix).
        dense = self.model.transform(query).toarray()
        return torch.as_tensor(dense, dtype=torch.float32)

# Fit a sublinear-TF TF-IDF model on the dev evaluator's corpus texts and score
# it with the same evaluator.
# Repeat with different data (e. g. with line['joined']) to produce the other
# TF-IDF variants.
# NOTE(review): the reranking section loads 'tfidf.pkl', 'tfidf_joined.pkl' and
# 'tfidf_joined_all.pkl'; only 'tfidf.pkl' is written here — save each variant
# under its own name when repeating.
vectorizer = TfidfVectorizer(sublinear_tf=True).fit(ir_evaluator.corpus)
model = TFIDF(vectorizer)
ir_evaluator(model)

# Persist the fitted vectorizer.
with open('checkpoints_/tfidf.pkl', 'wb') as pickle_out:
    pickle.dump(vectorizer, pickle_out)

# Reranking

In [None]:
def _load_tfidf(pickle_path, corpus):
    """Load a pickled, fitted vectorizer and transform `corpus` with it.

    Returns:
        (vectorizer, matrix) where matrix is `vectorizer.transform(corpus)`.
    """
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # checkpoints that were produced locally.
    with open(pickle_path, 'rb') as f:
        vectorizer = pickle.load(f)
    return vectorizer, vectorizer.transform(corpus)


verclaims = pd.read_csv('datasets/CLEF/train/vclaims.tsv', sep='\t')
queries = pd.read_csv('datasets/CLEF/train/dev_part.tsv', sep='\t')
queries_test = pd.read_csv('datasets/CLEF/train/test.tsv', sep='\t')
# One row per tweet, without gold labels (used for prediction-only features).
qrels_test = queries_test[['iclaim_id', 'iclaim']].drop_duplicates(subset=['iclaim_id'])
tweets_test = pd.read_csv('datasets/CLEF/test/subtask-2a--english/tweets-test.tsv', sep='\t').drop_duplicates(subset=['iclaim_id'])
# Same renaming as in the data-preparation cell; get_features reads the
# 'iclaim' column, so the text column must carry that name.
tweets_test.columns = ['iclaim_id', 'iclaim']

# Three corpus views of the verified claims.
corpus = verclaims['vclaim'].values
corpus_joined = verclaims['joined'].values
corpus_joined_all = verclaims['joined_all'].values.astype(str)

# TF-IDF models, one per corpus view, with the corpus pre-vectorized.
tf_idf, tf_idf_embeds = _load_tfidf('checkpoints_/tfidf.pkl', corpus)
tf_idf_joined, tf_idf_joined_embeds = _load_tfidf('checkpoints_/tfidf_joined.pkl', corpus_joined)
tf_idf_joined_all, tf_idf_joined_all_embeds = _load_tfidf('checkpoints_/tfidf_joined_all.pkl', corpus_joined_all)

# Sentence encoders with their pre-computed corpus embeddings.
# 'model1'/'model2'/'model3' are placeholders for trained checkpoint paths.
embedder_base = SentenceTransformer('model1')
embeds_base_joined = embedder_base.encode(corpus_joined)
embedder_large_joined = SentenceTransformer('model2')
embeds_large_joined = embedder_large_joined.encode(corpus_joined)
embedder_large_joined_all = SentenceTransformer('model3')
embeds_large_joined_all = embedder_large_joined_all.encode(corpus_joined_all)


def format_str(rel, qid, features, doc_id, query_id):
    """Format one example as a LETOR/SVMrank-style line.

    The result is `"<rel> qid:<qid> 1:<f1> 2:<f2> ... #<doc_id>|<query_id>"`,
    with features numbered from 1 and the two ids carried in the trailing
    comment.

    Args:
        rel: relevance label.
        qid: query group id.
        features: iterable of feature values; each is rendered with `str()`.
        doc_id: document id; any type is accepted and rendered as text.
        query_id: query id; any type is accepted and rendered as text.

    Returns:
        The formatted line, without a trailing newline.
    """
    feature_tokens = ' '.join(
        '{}:{}'.format(index, str(value)) for index, value in enumerate(features, start=1)
    )
    # Formatting (instead of string '+') keeps non-str ids from raising.
    return '{} qid:{} {} #{}|{}'.format(rel, qid, feature_tokens, doc_id, query_id)


def _descending_ranks(scores):
    """Return each element's 0-based rank when `scores` is ordered from highest to lowest."""
    return np.argsort(scores)[::-1].argsort()


def get_features(queries, file, max_wrote=30, train_mode=True):
    """Write LETOR-format reranking features for every query to `file`.

    For each query, every verified claim in the global `verclaims` frame gets
    12 features: a (similarity score, rank) pair from each of the three
    sentence encoders and each of the three TF-IDF models loaded at module
    level. Lines are produced with `format_str`.

    Args:
        queries: DataFrame with `iclaim_id` and `iclaim` columns, plus
            `vclaim_id` (the gold claim) when `train_mode` is True.
        file: output path; overwritten.
        max_wrote: number of label-0 candidates written per query, taken in
            increasing order of the rank under `embedder_large_joined`.
        train_mode: when True, the gold claim is written first with label 1 and
            excluded from the label-0 candidates.

    Returns:
        None. The result is the written file.
    """
    gold_ids = queries['vclaim_id'].values if train_mode else None
    queries_ids = queries['iclaim_id'].values
    query_texts = queries['iclaim'].values
    embeds_base_queries = embedder_base.encode(query_texts)
    embeds_large_joined_all_queries = embedder_large_joined_all.encode(query_texts)
    embeds_large_joined_queries = embedder_large_joined.encode(query_texts)
    vclaim_ids = verclaims['vclaim_id'].values

    with open(file, 'w') as f:
        # Wrapping the array (not the enumerate) lets tqdm show a total.
        for qid, query in enumerate(tqdm(query_texts)):
            # Cosine similarity of this query against the whole corpus, per model.
            scores_embeds_large_joined = pytorch_cos_sim([embeds_large_joined_queries[qid]],
                                                         embeds_large_joined)[0].numpy()
            scores_embeds_large_joined_all = pytorch_cos_sim([embeds_large_joined_all_queries[qid]],
                                                             embeds_large_joined_all)[0].numpy()
            scores_embeds_base = pytorch_cos_sim([embeds_base_queries[qid]],
                                                 embeds_base_joined)[0].numpy()
            scores_joined_all = cosine_similarity(tf_idf_joined_all.transform([query]), tf_idf_joined_all_embeds)[0]
            scores_joined = cosine_similarity(tf_idf_joined.transform([query]), tf_idf_joined_embeds)[0]
            scores = cosine_similarity(tf_idf.transform([query]), tf_idf_embeds)[0]

            # One row per claim; columns alternate score, rank for each model.
            features = np.stack([
                scores_embeds_large_joined, _descending_ranks(scores_embeds_large_joined),
                scores_embeds_large_joined_all, _descending_ranks(scores_embeds_large_joined_all),
                scores_embeds_base, _descending_ranks(scores_embeds_base),
                scores_joined_all, _descending_ranks(scores_joined_all),
                scores_joined, _descending_ranks(scores_joined),
                scores, _descending_ranks(scores),
            ], axis=1)

            features_dict = dict(zip(vclaim_ids, features))

            gold_id = None
            if train_mode:
                gold_id = gold_ids[qid]
                f.write(format_str('1', qid, features_dict[gold_id], gold_id, queries_ids[qid]))
                f.write('\n')

            # Column 1 is the rank under embedder_large_joined (0 = best).
            top_hits = sorted(features_dict.items(), key=lambda item: item[1][1])
            wrote = 0
            for claim_id, claim_features in top_hits:
                if wrote >= max_wrote:
                    break
                if not train_mode or claim_id != gold_id:
                    f.write(format_str('0', qid, claim_features, claim_id, queries_ids[qid]))
                    f.write('\n')
                    wrote += 1
                    
# (query frame, output path, has gold labels) for each LETOR feature file.
feature_jobs = [
    (queries, 'datasets/CLEF/train/new_reranking_dev_part.txt', True),
    (queries_test, 'datasets/CLEF/train/new_reranking_test.txt', True),
    (qrels_test, 'datasets/CLEF/train/new_reranking_test_pred.txt', False),
    (tweets_test, 'datasets/CLEF/train/new_reranking_test_pred_TEST.txt', False),
]
for query_frame, features_path, has_gold in feature_jobs:
    get_features(query_frame, features_path, max_wrote=100, train_mode=has_gold)

# Load the LETOR feature files: the dev part is used for training the reranker,
# the test part for held-out evaluation.
with open('datasets/CLEF/train/new_reranking_dev_part.txt') as trainfile, \
    open('datasets/CLEF/train/new_reranking_test.txt') as valifile:
    TX, Ty, Tqids, Tcom = pyltr.data.letor.read_dataset(trainfile)
    EX, Ey, Eqids, Ecom = pyltr.data.letor.read_dataset(valifile)


# Queries with id <= k train the reranker; the rest are used for early stopping.
# The qids are read back as strings, so they are compared as integers: a string
# comparison against '169' is lexicographic (e.g. '2' > '169') and does not
# give the intended numeric split.
k = 169
train_mask = Tqids.astype(int) <= k
VX, Vy, Vqids = TX[~train_mask], Ty[~train_mask], Tqids[~train_mask]
TX, Ty, Tqids = TX[train_mask], Ty[train_mask], Tqids[train_mask]


# Average precision at cutoff 5 is both the training objective and the
# validation metric.
metric = pyltr.metrics.AP(k=5)

# Only needed if you want to perform validation (early stopping & trimming)
monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=400)

model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=1500,
    learning_rate=0.02,
    max_features=0.3,
    query_subsample=0.3,
    max_leaf_nodes=12,
    max_depth=3,
    min_samples_leaf=1,
    verbose=1,
    random_state=57
)

In [None]:
# Train the reranker, with early stopping driven by the validation monitor.
model.fit(TX, Ty, Tqids, monitor=monitor)

# Score the held-out feature file.
with open('datasets/CLEF/train/new_reranking_test.txt') as valifile:
    EX, Ey, Eqids, Ecom = pyltr.data.letor.read_dataset(valifile)
Epred = model.predict(EX)
# Print the metric explicitly: it is not the cell's last expression, so its
# value would otherwise be computed and silently dropped.
print('Held-out mean AP@5:', metric.calc_mean(Eqids, Ey, Epred))

print(model.feature_importances_)

In [None]:
# Score the prediction-only feature file and write a run file with the columns
# tweet_id, Q0, vclaim_id, rank, score, tag (tab-separated, no header).
with open('datasets/CLEF/train/new_reranking_test_pred.txt') as testfile:
    EX, Ey, Eqids, Ecom = pyltr.data.letor.read_dataset(testfile)
scores = model.predict(EX)

# Every (claim, tweet) pair must occur exactly once.
assert len(set(Ecom)) == len(Ecom)

# Each comment is "<vclaim_id>|<tweet_id>".
split_comments = [comment.split('|') for comment in Ecom]
doc_ids = [doc_id for doc_id, _ in split_comments]
q_ids = [q_id for _, q_id in split_comments]

n_rows = len(q_ids)
ans = pd.DataFrame({
    'tweet_id': q_ids,
    'Q0': ['Q0'] * n_rows,
    'vclaim_id': doc_ids,
    'rank': ['1'] * n_rows,  # constant placeholder value in every row
    'score': scores,
    'tag': ['lambdamart'] * n_rows,
})

# Group rows by tweet, best score first within each tweet.
ans = ans.sort_values(['tweet_id', 'score'], ascending=False)
ans.to_csv('datasets/CLEF/predictions/lambdamart_test_semi.tsv', sep='\t', index=False, header=None)

In [None]:
!python datasets/CLEF/predictions/scorer/main.py -g datasets/CLEF/predictions/gold_test.tsv -p datasets/CLEF/predictions/lambdamart_test_semi.tsv

In [None]:
# Score the official test-tweet feature file and write a run file with the
# columns tweet_id, Q0, vclaim_id, rank, score, tag (tab-separated, no header).
with open('datasets/CLEF/train/new_reranking_test_pred_TEST.txt') as testfile:
    EX, Ey, Eqids, Ecom = pyltr.data.letor.read_dataset(testfile)
scores = model.predict(EX)

# Every (claim, tweet) pair must occur exactly once.
assert len(set(Ecom)) == len(Ecom)

# Each comment is "<vclaim_id>|<tweet_id>".
split_comments = [comment.split('|') for comment in Ecom]
doc_ids = [doc_id for doc_id, _ in split_comments]
q_ids = [q_id for _, q_id in split_comments]

n_rows = len(q_ids)
ans = pd.DataFrame({
    'tweet_id': q_ids,
    'Q0': ['Q0'] * n_rows,
    'vclaim_id': doc_ids,
    'rank': ['1'] * n_rows,  # constant placeholder value in every row
    'score': scores,
    'tag': ['lambdamart'] * n_rows,
})

# Group rows by tweet, best score first within each tweet.
ans = ans.sort_values(['tweet_id', 'score'], ascending=False)
ans.to_csv('datasets/CLEF/predictions/lambdamart_test_semi_TEST.tsv', sep='\t', index=False, header=None)

In [None]:
!python datasets/CLEF/predictions/scorer/main.py -g datasets/CLEF/predictions/qrels-test.tsv -p datasets/CLEF/predictions/lambdamart_test_semi_TEST.tsv