# 8. Querying

In [1]:
#Load the results from feature_generation_documents.ipynb, we will need them for computation
#Note: pkl files are excluded from git for being to large, so you have to run other script once
import pandas as pd
import numpy as np
import gensim
tfidf = pd.read_pickle('../1_document_representation/pickle/tfidf.pkl')
BIM25 = pd.read_pickle('../1_document_representation/pickle/BIM25.pkl')
BIM25_alt = pd.read_pickle('../1_document_representation/pickle/BIM25_alt.pkl')
unigram_LM = pd.read_pickle('../1_document_representation/pickle/unigramLM.pkl')
idf = pd.read_pickle('../1_document_representation/pickle/idf.pkl')
#We need this line to find the collection_vocabulary.py here, else we cannot load the col.pkl object
import sys
sys.path.append('../0_Collection_and_Inverted_Index/')
from collection_vocabulary import Collection
col=Collection()
#Weighted Document Embeddings
documents_fasttext = pd.read_pickle('../1_document_representation/pickle/documents_fasttext.pkl')
documents_word2vec = pd.read_pickle('../1_document_representation/pickle/documents_word2vec.pkl')

#For embeddings weighting of queries:
fasttext_embeddings = pd.read_pickle('../1_document_representation/pickle/fasttext_embeddings.pkl')
word2vec_embeddings = pd.read_pickle('../1_document_representation/pickle/word2vec_embeddings.pkl')



### Sample single term queries

Let's look at the same single term query  - "cancer". And compare the results of the three retrieval models.

In [2]:
#TFIDF
a= tfidf.loc['cancer'].sort_values(ascending=False).head(10) # if you transpose you can directly select by the index term  > tf.transpose().cancer
a

MED-1718    0.695754
MED-4227    0.695754
MED-4050    0.695754
MED-5355    0.695754
MED-4433    0.695754
MED-5353    0.695754
MED-5352    0.695754
MED-3378    0.695754
MED-3447    0.695754
MED-1599    0.695754
Name: cancer, dtype: float64

In [3]:
# Unigram LM
b= unigram_LM.loc['cancer'].sort_values(ascending=False).head(10)
b

MED-3703    0.081339
MED-2137    0.061650
MED-2174    0.057772
MED-4391    0.052210
MED-890     0.048745
MED-5184    0.048281
MED-3551    0.047909
MED-3555    0.047347
MED-2258    0.045462
MED-3699    0.044962
Name: cancer, dtype: float64

In [4]:
c= BIM25.loc['cancer'].sort_values(ascending=False).head(10)
c

MED-3703    0.923341
MED-2174    0.891008
MED-2137    0.887197
MED-3555    0.884823
MED-3699    0.883993
MED-3551    0.881528
MED-2258    0.877689
MED-4391    0.877334
MED-890     0.876812
MED-5353    0.876768
Name: cancer, dtype: float64

Obviously, there is very little overlap in the top 10 retrieved documents. Only the top-ranked doc of the probabilisitic ranking models matches.

Now, let's get the query representations and compute the scores for each document

In [2]:
#Now to get the queries
train_queries = pd.read_csv('../nfcorpus/train.all.queries', sep='\t', header=None)
train_queries.columns = ['id', 'text']
dev_queries = pd.read_csv('../nfcorpus/dev.all.queries', sep='\t', header=None)
dev_queries.columns = ['id', 'text']
test_queries = pd.read_csv('../nfcorpus/test.all.queries', sep='\t', header=None)
test_queries.columns = ['id', 'text']

#And the relevance scores given
train_rel = pd.read_csv('../nfcorpus/train.2-1-0.qrel', sep='\t', header=None)
print(train_rel.describe())
test_rel = pd.read_csv('../nfcorpus/test.2-1-0.qrel', sep='\t', header=None)
print(test_rel.describe())
dev_rel = pd.read_csv('../nfcorpus/dev.2-1-0.qrel', sep='\t', header=None)
#As we can see, column 1 is always 0, so drop it
train_rel = train_rel.drop([1], axis=1)
dev_rel = dev_rel.drop([1], axis=1)
test_rel = test_rel.drop([1], axis=1)
train_rel.columns = ['qid', 'docid', 'rel']
dev_rel.columns = ['qid', 'docid', 'rel']
test_rel.columns = ['qid', 'docid', 'rel']

#The corpus also divides documents into train, dev and test, so we need to stick to that as well
#(in order to get comparable results)
train_docs = pd.read_csv('../nfcorpus/train.docs', sep='\t', header=None)
train_docs.columns = ['id', 'text']
dev_docs = pd.read_csv('../nfcorpus/dev.docs', sep='\t', header=None)
dev_docs.columns = ['id', 'text']
test_docs = pd.read_csv('../nfcorpus/test.docs', sep='\t', header=None)
test_docs.columns = ['id', 'text']

              1              3
count  110575.0  110575.000000
mean        0.0       1.038698
std         0.0       0.192874
min         0.0       1.000000
25%         0.0       1.000000
50%         0.0       1.000000
75%         0.0       1.000000
max         0.0       2.000000
             1             3
count  12334.0  12334.000000
mean       0.0      1.046700
std        0.0      0.211004
min        0.0      1.000000
25%        0.0      1.000000
50%        0.0      1.000000
75%        0.0      1.000000
max        0.0      2.000000


For tfidf with cosine similarity, we need the tfidf vectors for each query (using idf of our document corpus and tf for each query):

In [6]:
#you can skip this if you already did it once, just start loading the matrices from pkl files
def get_query_term_matrix(queries, col):
    query_term_matrix = []
    for query in queries.itertuples():
        tf_vector =[]
        for word in col.vocabulary:
            n= query.text.count(word)
            tf_vector.append(n)
        query_term_matrix.append(tf_vector)
    return pd.DataFrame(data=query_term_matrix,index=queries.id,columns=col.vocabulary)

In [7]:
#let's compute the term_matrix for our query texts
train_matrix = get_query_term_matrix(train_queries, col)
dev_matrix = get_query_term_matrix(dev_queries, col)
test_matrix = get_query_term_matrix(test_queries, col)

In [8]:
test_matrix.head()

Unnamed: 0_level_0,'hort,+,-,--a,--all,--have,--mainly,--of,--showed,--the,...,zooplankton,zoxazolamine,zr,zu,zuccarini,zucchini,zugesetztem,zusatzstoffe-online,zygote,zymography
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PLAIN-1008,0,0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PLAIN-1018,0,0,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PLAIN-102,0,1,54,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PLAIN-1028,0,0,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PLAIN-1039,0,0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
#Another speed up, save the matrices
train_matrix.to_pickle('pickle/train_matrix.pkl')
dev_matrix.to_pickle('pickle/dev_matrix.pkl')
test_matrix.to_pickle('pickle/test_matrix.pkl')

## If you already ran this once, you can simply load the matrices instead of computing them

In [3]:
train_matrix = pd.read_pickle('pickle/train_matrix.pkl')
dev_matrix = pd.read_pickle('pickle/dev_matrix.pkl')
test_matrix = pd.read_pickle('pickle/test_matrix.pkl')

In [10]:
# As seen in the tf idf computation of the documents, we will use an inverted matrix
train_matrix_inverted = train_matrix.transpose()
dev_matrix_inverted = dev_matrix.transpose()
test_matrix_inverted = test_matrix.transpose()

In [11]:
def compute_query_tfidf(inv_query_matrix, idf):
    #TF
    # nominator part
    nominator=inv_query_matrix.mask(inv_query_matrix!=0,other=(np.log10(inv_query_matrix)+1))
    # denominator part
    most_frequent_term=inv_query_matrix.max(axis=0) # determine most frequent term in each query
    denominator= np.log10(most_frequent_term)
    denominator+=1
    tf=nominator.div(denominator, axis=1)
    tfidf_query= tf.mul(idf, axis=0) # we multiply the tf scores in every query with the corresponding idf scores
    return tfidf_query

In [12]:
#Now, let's get the tfidf scores for each query, please ignore the error
train_tfidf = compute_query_tfidf(train_matrix_inverted, idf)
dev_tfidf = compute_query_tfidf(dev_matrix_inverted, idf)
test_tfidf = compute_query_tfidf(test_matrix_inverted, idf)

  after removing the cwd from sys.path.


In [13]:
#Save those as pkl as well
train_tfidf.to_pickle('pickle/train_tfidf.pkl')
dev_tfidf.to_pickle('pickle/dev_tfidf.pkl')
test_tfidf.to_pickle('pickle/test_tfidf.pkl')

## Again, load it to save time

In [8]:
#Or load them, if they already exist
train_tfidf = pd.read_pickle('pickle/train_tfidf.pkl')
dev_tfidf = pd.read_pickle('pickle/dev_tfidf.pkl')
test_tfidf = pd.read_pickle('pickle/test_tfidf.pkl')

In [5]:
test_tfidf.head()

id,PLAIN-1008,PLAIN-1018,PLAIN-102,PLAIN-1028,PLAIN-1039,PLAIN-1050,PLAIN-1066,PLAIN-1078,PLAIN-1088,PLAIN-1098,...,PLAIN-91,PLAIN-913,PLAIN-924,PLAIN-934,PLAIN-946,PLAIN-956,PLAIN-966,PLAIN-977,PLAIN-987,PLAIN-997
'hort,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+,0.0,0.0,0.750049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-,2.529077,2.433363,2.366582,2.628729,2.621103,2.5047,2.604743,2.726138,2.44186,2.918655,...,1.971896,1.79211,2.682064,1.856777,2.744308,2.022176,2.561169,2.587771,2.434983,2.680533
--a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--all,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### As for the embeddings, we also need to weight them with their tfidf for each query, in order to compute similarity between document and queries:

In [15]:
def get_weighted_embeddings(embeddings, tfidf_embed):
    sum_of_tfidf_weights=tfidf_embed.sum(axis=0)#vector containing the normalizing constant for each doc
    embeddings_dict={}
    # we have to make use of the following workaround to avoid memory errors
    # 1. calculate 100d embeddings vector for each doc/query and store it in dictionary
    # 2. recreate a a dataframe containg the embeddings for all docs/queries from the dictionary
    for doc in tfidf_embed.columns:
        if doc not in embeddings_dict.keys():
            embedding=(tfidf_embed[doc].mask(tfidf_embed[doc]!=0, other=(tfidf_embed[doc]*embeddings)).sum(axis=0))/sum_of_tfidf_weights[doc]
            embeddings_dict[doc]=embedding
        else:
            print('calculated embeddings successfully and stored them in dictionary')
    weighted_embedding = pd.DataFrame.from_dict(embeddings_dict)
    return weighted_embedding

In [16]:
#Fasttext Embeddings
train_queries_fasttext = get_weighted_embeddings(fasttext_embeddings, train_tfidf)
dev_queries_fasttext = get_weighted_embeddings(fasttext_embeddings, dev_tfidf)
test_queries_fasttext = get_weighted_embeddings(fasttext_embeddings, test_tfidf)

#Save them for later speed up
train_queries_fasttext.to_pickle('pickle/train_queries_fasttext.pkl')
dev_queries_fasttext.to_pickle('pickle/dev_queries_fasttext.pkl')
test_queries_fasttext.to_pickle('pickle/test_queries_fasttext.pkl')

In [17]:
#Word2vec Embeddings
train_queries_word2vec = get_weighted_embeddings(word2vec_embeddings, train_tfidf)
dev_queries_word2vec = get_weighted_embeddings(word2vec_embeddings, dev_tfidf)
test_queries_word2vec = get_weighted_embeddings(word2vec_embeddings, test_tfidf)

#Save them as well
train_queries_word2vec.to_pickle('pickle/train_queries_word2vec.pkl')
dev_queries_word2vec.to_pickle('pickle/dev_queries_word2vec.pkl')
test_queries_word2vec.to_pickle('pickle/test_queries_word2vec.pkl')

### Speed up: Load the weighted embeddings, instead of computing them

In [4]:
#Fasttext Embeddings
train_queries_fasttext = pd.read_pickle('pickle/train_queries_fasttext.pkl')
dev_queries_fasttext = pd.read_pickle('pickle/dev_queries_fasttext.pkl')
test_queries_fasttext = pd.read_pickle('pickle/test_queries_fasttext.pkl')

#Word2Vec Embeddings
train_queries_word2vec = pd.read_pickle('pickle/train_queries_word2vec.pkl')
dev_queries_word2vec = pd.read_pickle('pickle/dev_queries_word2vec.pkl')
test_queries_word2vec = pd.read_pickle('pickle/test_queries_word2vec.pkl')

## Finally, put it all together to compute the scores for every query document pair

In [9]:
def compute_scores(queries, documents, rel, queries_tfidf, queries_fasttext, queries_word2vec):
    #Get the documents defined in the nfcorpous
    doc_keys = documents.id
    tfidf_part = tfidf.loc[:, doc_keys]
    BIM25_part = BIM25.loc[:, doc_keys]
    BIM25_alt_part = BIM25_alt.loc[:, doc_keys]
    unigram_LM_part = unigram_LM.loc[:, doc_keys]
    documents_fasttext_part = documents_fasttext.loc[:, doc_keys]
    documents_word2vec_part = documents_word2vec.loc[:, doc_keys]
    list_of_df = []
    query_keys = queries['id']
    print('Computing', len(query_keys), 'queries on', len(doc_keys), 'documents')
    
    #Get the cosine between queries and docs (much faster than inside the loop)
    cosine = cosine_similarity(queries_tfidf.transpose(), tfidf_part.transpose())
    #Also, cosine between the embeddings:
    fasttext = cosine_similarity(queries_fasttext.transpose(), documents_fasttext_part.transpose())
    word2vec = cosine_similarity(queries_word2vec.transpose(), documents_word2vec_part.transpose())
    print('Cosines computed, start iterating...')
    i = 0
    for key in query_keys:
        text = str(queries.loc[queries['id'] == key].text)
        tfidf_scores = tfidf_part.loc[text.split()].sum()
        bim25_scores = BIM25_part.loc[text.split()].sum()
        bim25_alt_scores = BIM25_alt_part.loc[text.split()].sum()
        unigram_scores = unigram_LM_part.loc[text.split()].product()
        total = pd.DataFrame()
        total['tfidf'] = tfidf_scores
        total['bim25'] = bim25_scores
        total['bim25_alt'] = bim25_alt_scores
        total['unigram'] = unigram_scores
        total['cosine'] = cosine[i]
        total['fasttext'] = fasttext[i]
        total['word2vec'] = word2vec[i]
        total['qid'] = key.replace('PLAIN-', '')
        #Rel only contains 1 and 2, everything that is not in there is set to 0
        total['rel'] = 0
        rel_temp = rel.loc[(rel['qid'] == key)]
        for row in rel_temp.itertuples():
            total.at[row.docid, 'rel'] = row.rel
        total.set_index(np.arange(len(doc_keys)))
        total.rename(columns={'': 'docid'}, inplace=True)
        list_of_df.append(total)
        i+=1
        if (i%100 == 0):
            print(i, 'queries computed')
    scores = pd.concat(list_of_df)
    print(i, 'queries computed')
    #Sanity check: should be same
    print(len(scores))
    print(len(doc_keys)*len(query_keys))
    return scores

In [10]:
from scipy.spatial import distance
def cosine_similarity(query, docs):
    cos_similarity = 1-distance.cdist(query, docs, metric='cosine')
    return cos_similarity

In [11]:
train_scores = compute_scores(train_queries, train_docs, train_rel, train_tfidf, train_queries_fasttext, train_queries_word2vec)
dev_scores = compute_scores(dev_queries, dev_docs, dev_rel, dev_tfidf, dev_queries_fasttext, dev_queries_word2vec)
test_scores = compute_scores(test_queries, test_docs, test_rel, test_tfidf, test_queries_fasttext, test_queries_word2vec)

Computing 2594 queries on 3612 documents
Cosines computed, start iterating...
100 queries computed
200 queries computed
300 queries computed
400 queries computed
500 queries computed
600 queries computed
700 queries computed
800 queries computed
900 queries computed
1000 queries computed
1100 queries computed
1200 queries computed
1300 queries computed
1400 queries computed
1500 queries computed
1600 queries computed
1700 queries computed
1800 queries computed
1900 queries computed
2000 queries computed
2100 queries computed
2200 queries computed
2300 queries computed
2400 queries computed
2500 queries computed
2594 queries computed
9369528
9369528
Computing 325 queries on 3193 documents
Cosines computed, start iterating...
100 queries computed
200 queries computed
300 queries computed
325 queries computed
1037725
1037725
Computing 325 queries on 3162 documents
Cosines computed, start iterating...
100 queries computed
200 queries computed
300 queries computed
325 queries computed
10276

In [12]:
train_scores.head()

Unnamed: 0,tfidf,bim25,bim25_alt,unigram,cosine,fasttext,word2vec,qid,rel
MED-10,0.0,0.0,0.0,2.3145690000000002e-23,0.015284,0.888365,0.878828,10,0
MED-14,0.0,0.0,0.0,2.3145690000000002e-23,0.010135,0.870183,0.860293,10,0
MED-118,0.0,0.0,0.0,2.3145690000000002e-23,0.024806,0.934861,0.932443,10,0
MED-301,0.0,0.0,0.0,2.3145690000000002e-23,0.019779,0.92305,0.906691,10,0
MED-306,0.0,0.0,0.0,2.3145690000000002e-23,0.030273,0.955033,0.946509,10,0


In [13]:
test_scores.describe()

Unnamed: 0,tfidf,bim25,bim25_alt,unigram,cosine,fasttext,word2vec,rel
count,1027650.0,1027650.0,1027650.0,1027650.0,1027650.0,1027650.0,1027650.0,1027650.0
mean,0.1121619,0.1650738,0.1594483,2.65621e-08,0.01158282,0.9107293,0.8926954,0.01256264
std,0.3264698,0.5390886,0.5161146,8.205315e-06,0.01065434,0.04365257,0.04860128,0.1163006
min,0.0,0.0,0.0,1.4853279999999998e-44,0.0,0.5786127,0.5770608,0.0
25%,0.0,0.0,0.0,5.475276e-35,0.002505088,0.8860722,0.8633588,0.0
50%,0.0,0.0,0.0,2.436599e-31,0.009550024,0.9182195,0.9002982,0.0
75%,0.0,0.0,0.0,1.934189e-25,0.01772193,0.9433185,0.9297699,0.0
max,10.7151,19.83245,18.21768,0.006253765,0.1456515,0.9930003,0.9915299,2.0


For reduced task: create files according to rank lib documentation: https://sourceforge.net/p/lemur/wiki/RankLib%20File%20Format/

In [14]:
#Create csv for Ranklib, code taken from answer here: https://stackoverflow.com/questions/37439533/pandas-custom-file-format
feature_columns = ['tfidf','bim25','bim25_alt','unigram','cosine','fasttext','word2vec']
cols2id = {col:str(i+1) for i,col in enumerate(feature_columns)}

def f(x):
    if x.name in feature_columns:
        return cols2id[x.name] + ':' + x.astype(str)
    elif x.name == 'qid':
        return 'qid:' + x.astype(str)
    else:
        return x

(train_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/train.csv', sep=' ', index=False, header=None)
)
(dev_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/dev.csv', sep=' ', index=False, header=None)
)
(test_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/test.csv', sep=' ', index=False, header=None)
)

### To get a baseline for each of our features, we also build models just using one of them:

In [15]:
feature_columns = ['tfidf']
cols2id = {col:str(i+1) for i,col in enumerate(feature_columns)}

def f(x):
    if x.name in feature_columns:
        return cols2id[x.name] + ':' + x.astype(str)
    elif x.name == 'qid':
        return 'qid:' + x.astype(str)
    else:
        return x

(train_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/train_tfidf.csv', sep=' ', index=False, header=None)
)
(dev_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/dev_tfidf.csv', sep=' ', index=False, header=None)
)
(test_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/test_tfidf.csv', sep=' ', index=False, header=None)
)

In [16]:
feature_columns = ['bim25']
cols2id = {col:str(i+1) for i,col in enumerate(feature_columns)}

def f(x):
    if x.name in feature_columns:
        return cols2id[x.name] + ':' + x.astype(str)
    elif x.name == 'qid':
        return 'qid:' + x.astype(str)
    else:
        return x

(train_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/train_bm25.csv', sep=' ', index=False, header=None)
)
(dev_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/dev_bm25.csv', sep=' ', index=False, header=None)
)
(test_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/test_bm25.csv', sep=' ', index=False, header=None)
)

In [17]:
feature_columns = ['bim25_alt']
cols2id = {col:str(i+1) for i,col in enumerate(feature_columns)}

def f(x):
    if x.name in feature_columns:
        return cols2id[x.name] + ':' + x.astype(str)
    elif x.name == 'qid':
        return 'qid:' + x.astype(str)
    else:
        return x

(train_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/train_bm25_alt.csv', sep=' ', index=False, header=None)
)
(dev_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/dev_bm25_alt.csv', sep=' ', index=False, header=None)
)
(test_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/test_bm25_alt.csv', sep=' ', index=False, header=None)
)

In [18]:
feature_columns = ['unigram']
cols2id = {col:str(i+1) for i,col in enumerate(feature_columns)}

def f(x):
    if x.name in feature_columns:
        return cols2id[x.name] + ':' + x.astype(str)
    elif x.name == 'qid':
        return 'qid:' + x.astype(str)
    else:
        return x

(train_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/train_unigram.csv', sep=' ', index=False, header=None)
)
(dev_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/dev_unigram.csv', sep=' ', index=False, header=None)
)
(test_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/test_unigram.csv', sep=' ', index=False, header=None)
)

In [19]:
feature_columns = ['cosine']
cols2id = {col:str(i+1) for i,col in enumerate(feature_columns)}

def f(x):
    if x.name in feature_columns:
        return cols2id[x.name] + ':' + x.astype(str)
    elif x.name == 'qid':
        return 'qid:' + x.astype(str)
    else:
        return x

(train_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/train_cosine.csv', sep=' ', index=False, header=None)
)
(dev_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/dev_cosine.csv', sep=' ', index=False, header=None)
)
(test_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/test_cosine.csv', sep=' ', index=False, header=None)
)

In [20]:
feature_columns = ['fasttext']
cols2id = {col:str(i+1) for i,col in enumerate(feature_columns)}

def f(x):
    if x.name in feature_columns:
        return cols2id[x.name] + ':' + x.astype(str)
    elif x.name == 'qid':
        return 'qid:' + x.astype(str)
    else:
        return x

(train_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/train_fasttext.csv', sep=' ', index=False, header=None)
)
(dev_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/dev_fasttext.csv', sep=' ', index=False, header=None)
)
(test_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/test_fasttext.csv', sep=' ', index=False, header=None)
)

In [21]:
feature_columns = ['word2vec']
cols2id = {col:str(i+1) for i,col in enumerate(feature_columns)}

def f(x):
    if x.name in feature_columns:
        return cols2id[x.name] + ':' + x.astype(str)
    elif x.name == 'qid':
        return 'qid:' + x.astype(str)
    else:
        return x

(train_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/train_word2vec.csv', sep=' ', index=False, header=None)
)
(dev_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/dev_word2vec.csv', sep=' ', index=False, header=None)
)
(test_scores.apply(lambda x: f(x))[['rel','qid'] + feature_columns]
  .to_csv('../3_ranklib_framework/baseline/test_word2vec.csv', sep=' ', index=False, header=None)
)