# TREC 2021 Deep Learning Track (Information Retrieval)<br>Task2<br>Passages Ranking
Auther: Jingxuan Liu<br>
Id: 173098550

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from sklearn.model_selection import train_test_split
import json
pd.options.mode.chained_assignment = None

In [2]:
def get_passage(passage_id):
    (string1, string2, bundlenum, position) = passage_id.split('_')
    assert string1 == 'msmarco' and string2 == 'passage'

    with open(f'./msmarco_v2_passage/msmarco_passage_{bundlenum}', 'rt', encoding='utf8') as in_fh:
        in_fh.seek(int(position))
        json_string = in_fh.readline()
        passage = json.loads(json_string)
        assert passage['pid'] == passage_id
        return passage
    
def get_document(document_id):
    (string1, string2, bundlenum, position) = document_id.split('_')
    assert string1 == 'msmarco' and string2 == 'doc'

    with open(f'./msmarco_v2_doc/msmarco_doc_{bundlenum}', 'rt', encoding='utf8') as in_fh:
        in_fh.seek(int(position))
        json_string = in_fh.readline()
        document = json.loads(json_string)
        assert document['docid'] == document_id
        return document

passage = get_passage('msmarco_passage_41_45753370')
print(passage.keys())
passage['passage']

dict_keys(['pid', 'passage', 'spans', 'docid'])


'TANF funds should target the families with the lowest incomes and should be used primarily for cash assistance. Congress also should replace the work participation rate with access measures to ensure that states serve families in need and with performance measures based on employment and earnings outcomes.'

# Prepare Training Data

In [3]:
train_qrels_df = pd.read_csv('./train/passv2_train_qrels.tsv', names = ['0','file','1'], header = None,delimiter = "\t")
train_queries_df = pd.read_csv('./train/passv2_train_queries.tsv', delimiter = "\t" , header=None, names = ['id','topic'])
train_top100_df = pd.read_csv('./train/passv2_train_top100.txt', delimiter = " ", names = ['id','used','file','rank','score','username'])

Dictionary for Passage 

In [4]:
passages = train_qrels_df['file'].values
passages_index = dict()
for pas in passages:
    passage = get_passage(pas)
    if passage['docid'] != "":
        passages_index[pas] = passage['docid']
print(f'Found {len(passages_index)} passage number.')
dict(list(passages_index.items())[0:5])

Found 245838 passage number.


{'msmarco_passage_08_840101254': 'msmarco_doc_08_1219973188',
 'msmarco_passage_01_444503625': 'msmarco_doc_01_1169432284',
 'msmarco_passage_20_461843390': 'msmarco_doc_18_1691014504',
 'msmarco_passage_00_837399976': 'msmarco_doc_00_1646189544',
 'msmarco_passage_08_12770678': 'msmarco_doc_07_1345903696'}

Dictionary for Id and Topic

In [5]:
topics_index = train_queries_df.set_index('id').to_dict()['topic']
print(f'Found {len(topics_index)} doc number.')
dict(list(topics_index.items())[0:10])

Found 277144 doc number.


{121352: 'define extreme',
 510633: 'tattoo fixers how much does it cost',
 674172: 'what is a bank transit number',
 570009: 'what are the four major groups of elements',
 54528: 'blood clots in urine after menopause',
 507001: 'symptoms of an enlarged heart in dogs',
 466926: 'number of times congress voted to repeal aca',
 224811: 'how does a firefly light up',
 918533: 'what was introduced to the human diet in what year',
 80926: 'can you use wallapop on your computer'}

Dictionary for Document and Title

In [6]:
title_df = pd.read_csv('./train/docv2_train_qrels.tsv', names = ['0','file','1'], header = None,delimiter = "\t")
docs = title_df['file'].values
documents_index = dict()
for doc in docs:
    document = get_document(doc)
    if document['title'] != "":
        documents_index[doc] = document['title']
print(f'Found {len(documents_index)} doc number.')
dict(list(documents_index.items())[0:5])

Found 267649 doc number.


{'msmarco_doc_10_1691063043': 'French and the francophonie in Canada',
 'msmarco_doc_05_72507775': 'Westminster, California (CA 92683) profile: population, maps, real estate, averages, homes, statistics, relocation, travel, jobs, hospitals, schools, crime, moving, houses, news, sex offenders',
 'msmarco_doc_19_673141443': 'Westminster, California - Wikipedia',
 'msmarco_doc_19_673231526': 'Westminster, Massachusetts - Wikipedia',
 'msmarco_doc_19_673209131': 'Westminster, Maryland - Wikipedia'}

In [7]:
train_top100_df['id'] = train_top100_df.id.map(topics_index)
train_top100_df['file'] = train_top100_df.file.map(passages_index)
train_top100_df['title'] = train_top100_df.file.map(documents_index)
train_top100_df = train_top100_df.dropna()
train_top100_df['x_train'] = train_top100_df['id']+" "+train_top100_df['title']
train_top100_df.head(5)

Unnamed: 0,id,used,file,rank,score,username,title,x_train
93,ECT is a treatment that is used for,Q0,msmarco_doc_42_1017960230,94,10.552398,Anserini,"Effects of ECT, ECT Side Effects | HealthyPlace",ECT is a treatment that is used for Effects ...
113,The vitamin that prevents beriberi is,Q0,msmarco_doc_06_1452720146,14,12.9991,Anserini,"Beriberi: Overview, Causes, and Symptoms",The vitamin that prevents beriberi is Beribe...
164,The vitamin that prevents beriberi is,Q0,msmarco_doc_28_275379947,65,12.0319,Anserini,"Beriberi - Simple English Wikipedia, the free ...",The vitamin that prevents beriberi is Beribe...
327,dosimetry medical definition,Q0,msmarco_doc_01_1997961934,28,11.1158,Anserini,What is Medical Dosimetry?,dosimetry medical definition What is Medical ...
533,shu means what,Q0,msmarco_doc_05_1004011146,34,8.9978,Anserini,What does shu mean?,shu means what What does shu mean?


# Tokenize and Pad Data for Training Data

In [8]:
x = train_top100_df['x_train'].values
max = 0 
s = ''
for i in x:
    if len(i)>max:
        max = len(i)
        s= i
print(i)

_________ justice is designed to repair the harm to victim, the community and the offender caused by the offender criminal act. question 19 options: Restorative justice - Wikipedia


In [9]:
max_feature = 20000
max_text_length = 30

x = train_top100_df['x_train'].values
y = train_top100_df['score'].values

x_tokenizer = text.Tokenizer(max_feature)
x_tokenizer.fit_on_texts(list(x))
x_tokenized = x_tokenizer.texts_to_sequences(x)
x_train_val = sequence.pad_sequences(x_tokenized, maxlen = max_text_length)
x_train_val

array([[   0,    0,    0, ...,   88,   80, 3372],
       [   0,    0,    0, ...,   35,    9,   47],
       [   0,    0,    0, ...,    3,   69,  207],
       ...,
       [   0,    0,    0, ..., 5424, 3822,  119],
       [   0,    0,    0, ...,  348,   69,   38],
       [   0,    0,    0, ..., 1456, 2801,   13]], dtype=int32)

# Prepare Embedding Matrix using Pre-trained GloVe Embeddings Data

In [10]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip -q glove.6B.zip

embedding_dim = 100
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print(f'Found {len(embeddings_index)} word vectors.')

Found 400000 word vectors.


In [11]:
embedding_matrix = np.zeros((max_feature, embedding_dim))
for word, index in x_tokenizer.word_index.items():
    if index > max_feature - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.15180001,  0.38409001,  0.89340001, ..., -0.27123001,
         0.22157   ,  0.92111999],
       [-0.54263997,  0.41475999,  1.03219998, ..., -1.29690003,
         0.76217002,  0.46349001],
       ...,
       [ 0.25466001,  0.11273   , -0.91995001, ...,  0.66464001,
        -0.38611999, -0.23083   ],
       [ 0.80694002, -0.8786    ,  0.22487   , ..., -1.27059996,
        -0.84288001, -0.25398999],
       [-1.45879996, -0.472     ,  0.90109003, ..., -0.23966999,
         0.51719999, -0.80901998]])

# Build the Model

Add Embedding Layer

In [12]:
model = Sequential()
model.add(Embedding(max_feature,
                    embedding_dim,
                    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                    trainable=False))
model.add(Dropout(0.2))

Build Rest Model

In [13]:
filters = 250
kernel_size = 3
hidden_dims = 250

model.add(Conv1D(filters,
                 kernel_size,
                 padding= 'valid'))
model.add(MaxPooling1D())
model.add(Conv1D(filters,
                 5,
                 padding = 'valid',
                 activation = 'relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims, activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation = 'relu'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         2000000   
_________________________________________________________________
dropout (Dropout)            (None, None, 100)         0         
_________________________________________________________________
conv1d (Conv1D)              (None, None, 250)         75250     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 250)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 250)         312750    
_________________________________________________________________
global_max_pooling1d (Global (None, 250)               0         
_________________________________________________________________
dense (Dense)                (None, 250)               6

In [14]:
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the Model

In [15]:
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y, 
                                                 test_size = 0.15, random_state = 1)

In [16]:
model.fit(x_train, y_train, 
         batch_size = 32, 
         epochs = 10, 
         validation_data = (x_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b458e310>

# Evaluate Model<br>Part1: passages ranking

In [17]:
test_queries_df = pd.read_csv('./test/2021_queries.tsv', delimiter = "\t" , header=None, names = ['id','topic'])
test_queries_df.head(5)

Unnamed: 0,id,topic
0,787021,what is produced by muscle
1,1049187,who recorded be my baby
2,1049519,who said no one can make you feel inferior
3,788054,what is ptf
4,2082,At about what age do adults normally begin to ...


In [18]:
new_topics_index = test_queries_df.set_index('id').to_dict()['topic']
topic_values = test_queries_df['id'].values
docs = train_qrels_df['file'].values

Mapping Passage Documents to Each Topics

In [19]:
rows = []
for t in topic_values[0:300]:
    for d in docs[0:100]:
        rows.append([t, d])

In [20]:
rank_df = pd.DataFrame(rows, columns = ['id', 'file'])
test_df = rank_df.copy()
test_df['topic'] = test_df.id.map(new_topics_index)
test_df['docid'] = test_df.file.map(passages_index)
test_df['title'] = test_df.docid.map(documents_index)
test_df = test_df.dropna()
test_df['x_test'] = test_df['topic'] + " " + test_df['title']

In [21]:
x_test = test_df['x_test'].values
x_test_tokenized = x_tokenizer.texts_to_sequences(x_test)
x_testing = sequence.pad_sequences(x_test_tokenized, maxlen = max_text_length)
x_testing

array([[   0,    0,    0, ..., 3695, 3399, 4002],
       [   0,    0,    0, ..., 3614,   74,   13],
       [   0,    0,    0, ...,    4, 1487,  558],
       ...,
       [   0,    0,    0, ...,  459,   65,  207],
       [   0,    0,    0, ...,  223, 7873,  632],
       [   0,    0,    0, ...,   44,    6,  230]], dtype=int32)

In [22]:
y_testing = model.predict(x_testing, verbose=1, batch_size=32)



In [23]:
test_df['score'] = [float(x) for x in y_testing]
test_df = test_df.drop(columns=['x_test','title','topic','docid'])
rank_df = test_df.sort_values(by=['id', 'score'], ascending = False)

In [24]:
rank_df['rank'] = rank_df.groupby('id')['score'].rank(ascending=False).astype(int)
np.savetxt(r'./passage_ranking.txt', rank_df.values, fmt='%s')
rank_df

Unnamed: 0,id,file,score,rank
24695,1136769,msmarco_passage_07_641172265,15.686891,1
24637,1136769,msmarco_passage_10_737790036,15.311656,2
24622,1136769,msmarco_passage_04_74703297,15.035100,3
24648,1136769,msmarco_passage_19_248839014,14.566853,4
24614,1136769,msmarco_passage_04_613515887,14.427825,5
...,...,...,...,...
471,2082,msmarco_passage_53_273579196,11.953814,92
452,2082,msmarco_passage_08_740029949,11.950468,93
468,2082,msmarco_passage_01_507777427,11.680532,94
409,2082,msmarco_passage_02_556351008,11.669019,95


# Part2: top100_reranking
<rb>I used passv2_tran_top100.txt for reranking

In [25]:
test_top100_df = pd.read_csv('./train/passv2_train_top100.txt', delimiter = " ", names = ['id','used','file','rank','score','username'])
test_top100_df.head(10)

Unnamed: 0,id,used,file,rank,score,username
0,5,Q0,msmarco_passage_49_25899182,1,12.1278,Anserini
1,5,Q0,msmarco_passage_06_781809452,2,11.9428,Anserini
2,5,Q0,msmarco_passage_09_146319807,3,11.7703,Anserini
3,5,Q0,msmarco_passage_18_567713921,4,11.5883,Anserini
4,5,Q0,msmarco_passage_30_434058059,5,11.588299,Anserini
5,5,Q0,msmarco_passage_39_274709263,6,11.5319,Anserini
6,5,Q0,msmarco_passage_65_807511329,7,11.5129,Anserini
7,5,Q0,msmarco_passage_66_132124624,8,11.512899,Anserini
8,5,Q0,msmarco_passage_65_453740470,9,11.4108,Anserini
9,5,Q0,msmarco_passage_53_503988399,10,11.2986,Anserini


In [26]:
test_top100_df['topic'] = test_top100_df.id.map(topics_index)
test_top100_df['docid'] = test_top100_df.file.map(passages_index)
test_top100_df['title'] = test_top100_df.docid.map(documents_index)
test_top100_df = test_top100_df.dropna()
test_top100_df['x_test'] = test_top100_df['topic']+" "+test_top100_df['title']

In [27]:
x_top100_testing = test_top100_df['x_test'].values
x_top100_test_tokenized = x_tokenizer.texts_to_sequences(x_top100_testing)
x_top100_testing = sequence.pad_sequences(x_top100_test_tokenized, maxlen = max_text_length)
y_top100_testing = model.predict(x_top100_testing, verbose=1, batch_size=32)



In [28]:
rerank_top100_df = test_top100_df.drop(columns =['x_test','title','docid','topic',
                                                 'username','rank','score','used'])
rerank_top100_df['score'] = [float(x) for x in y_top100_testing]
rerank_top100_df = rerank_top100_df.sort_values(by=['id','score'], ascending = False)

In [29]:
rerank_top100_df['rank'] = rerank_top100_df.groupby('id')['score'].rank(ascending=False).astype(int)
np.savetxt(r'./passage_top100_reranking.txt', rerank_top100_df.values, fmt='%s')
rerank_top100_df

Unnamed: 0,id,file,score,rank
27713486,1185868,msmarco_passage_20_598584736,19.330769,1
27713373,1185865,msmarco_passage_11_701770901,13.365574,1
27713216,1185862,msmarco_passage_06_840296102,15.564853,1
27712998,1185855,msmarco_passage_01_180296442,9.139565,1
27712787,1185849,msmarco_passage_17_166239036,8.547846,1
...,...,...,...,...
533,54,msmarco_passage_05_514104820,9.709645,1
327,31,msmarco_passage_02_6149215,9.243010,1
164,16,msmarco_passage_32_177724543,11.951515,1
113,16,msmarco_passage_06_754042363,11.867783,2
