In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import seaborn as sns
# Seed NumPy's global RNG. The corpus is later drawn with DataFrame.sample
# without an explicit random_state, which (per pandas' documented default)
# falls back to this global state, so the seed is what makes that sample
# repeatable on a fresh Restart & Run All.
np.random.seed(42)

In [2]:
# Load the three input tables.
# recommendationPairs.csv is read with explicit column names: a leading
# 'seed' column (used as the index) followed by rec0 .. rec13.
header = ['seed'] + [f'rec{i}' for i in range(14)]
conts = pd.read_csv('../data/documentContents.csv')
recs = pd.read_csv(
    '../data/recommendationPairs.csv',
    index_col=0,
    names=header,
)
full = pd.read_csv('../data/out.csv')

In [3]:
# Clean the golden-set documents in a single chain so that no column is ever
# assigned on a filtered slice (the original pattern triggers pandas'
# SettingWithCopyWarning):
#   1. rename the source column to 'text' (the key is the column name as
#      spelled in the CSV, typo included),
#   2. drop rows without text,
#   3. strip the first two and last two characters of every text,
#   4. record the stripped length and keep only texts longer than 150 chars,
#   5. cast the document id to int.
# NOTE: this cell overwrites `conts`; re-running it without re-running the
# load cell strips another two characters from each end of every text.
conts = (
    conts
    .rename(columns={'Abstract/Review/Summarry':'text'})
    .dropna(subset=['text'])
    .assign(text=lambda d: d['text'].str[2:-2])
    .assign(text_len=lambda d: d['text'].str.len())
    .loc[lambda d: d['text_len'] > 150]
    .astype({'zbMATH_ID': int})
)
# Same rows, indexed by document id for label-based lookups.
golden_lookup = conts.set_index('zbMATH_ID')

In [4]:
# Build the id -> text lookup for the full collection in a single chain.
# Compared with step-by-step column assignment this
#   * never writes into a filtered/sliced frame (no SettingWithCopyWarning),
#   * drops rows whose text is missing before measuring length, where
#     calling len() on a NaN would raise TypeError.
# Rows are kept only when the text is longer than 150 characters; the result
# is indexed by the integer document id and has columns text, text_len.
lookup = (
    full
    .rename(columns={'de':'id'})[['id','text']]
    .dropna(subset=['text'])
    .assign(text_len=lambda d: d['text'].str.len())
    .loc[lambda d: d['text_len'] > 150]
    .astype({'id': int})
    .set_index('id')
)

In [5]:
# Show the filtered lookup table (pandas renders a truncated preview).
lookup

Unnamed: 0_level_0,text,text_len
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3928129,Two fold-over 14 point designs of the \(2^ 4\t...,415
3928148,By analogy with the univariate case the author...,583
3928149,This is a study of asymptotic properties of va...,384
3928151,"Let \(\{\) X(t)\(\}\), \(t\in {\mathbb{Z}}\), ...",1155
3928154,In this paper the authors study the prediction...,655
...,...,...
7500464,Summary: We study determinants of the square-t...,346
7500465,"Summary: In this study, we use the union of th...",344
7500466,"Summary: In this work, the Catalan transformat...",582
7500467,Summary: A numerical treatment via a differenc...,544


In [6]:
def create_pairs(df):
    """Map every row label of ``df`` to the set of ids found in that row.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per seed; the row label is used as the seed key and the
        cells hold recommended ids, with NaN in unused cells.

    Returns
    -------
    dict
        ``{row_label: set_of_int_ids}``; NaN cells are ignored and the
        remaining values are cast to ``int``.
    """
    return {
        seed: set(row.dropna().astype(int).to_list())
        for seed, row in tqdm(df.iterrows())
    }
    

In [7]:
# {seed id -> set of recommended document ids}, built from the recommendation table.
relevant_docs = create_pairs(recs)

80it [00:00, 1717.84it/s]


In [8]:
def get_contents_queries(df,lookup):
    """Collect the text of every seed document that exists in ``lookup``.

    Parameters
    ----------
    df : mapping
        Only the keys (seed ids) are used; the values are ignored.
    lookup : pandas.DataFrame
        Frame indexed by document id with a ``text`` column.

    Returns
    -------
    dict
        ``{seed_id: text}`` for the seeds present in ``lookup.index``.
        Seeds that are absent are printed and left out.
    """
    known_ids = lookup.index
    seed_texts = {}
    for seed, _ in df.items():
        if seed in known_ids:
            text = lookup.loc[seed].text
            # A duplicated index label yields a Series of texts; keep the first.
            seed_texts[seed] = text.iloc[0] if isinstance(text, pd.Series) else text
        else:
            print(seed)
    return seed_texts
        

In [9]:
def get_contents_recs(df,lookup):
    """Collect the text of every recommended document that exists in ``lookup``.

    Parameters
    ----------
    df : mapping
        Values are iterables of recommended ids; the keys are ignored.
    lookup : pandas.DataFrame
        Frame indexed by document id with a ``text`` column.

    Returns
    -------
    dict
        ``{rec_id: text}`` for the ids present in ``lookup.index``.
        Ids that are absent are skipped silently.
    """
    known_ids = lookup.index
    rec_texts = {}
    # Walk all recommended ids across all seeds as one flat stream.
    all_rec_ids = (rec_id for _, rec_ids in df.items() for rec_id in rec_ids)
    for rec_id in all_rec_ids:
        if rec_id in known_ids:
            text = lookup.loc[rec_id].text
            # A duplicated index label yields a Series of texts; keep the first.
            if isinstance(text, pd.Series):
                text = text.iloc[0]
            rec_texts[rec_id] = text
    return rec_texts

In [10]:
# Texts of all recommended documents that are present in the golden lookup.
corpus_golden = get_contents_recs(relevant_docs,golden_lookup)

In [11]:
# Texts of the seed documents; seeds missing from the golden lookup are printed and skipped.
queries = get_contents_queries(relevant_docs,golden_lookup)

In [12]:
def get_contents_corpus(n,lookup,random_state=None):
    """Draw ``n`` random documents from ``lookup`` as an ``{id: text}`` dict.

    Parameters
    ----------
    n : int
        Number of rows to sample (without replacement).
    lookup : pandas.DataFrame
        Frame indexed by document id with a ``text`` column. The index may
        have any name (or none); ids are read from the index itself.
    random_state : optional
        Forwarded to ``DataFrame.sample``. With the default ``None`` the
        call is the same as ``lookup.sample(n)``.

    Returns
    -------
    dict
        ``{document_id: text}`` for the sampled rows. If the index contains
        duplicate ids, the dict holds one entry per distinct id.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when ``n`` is larger than the
        number of rows in ``lookup``.
    """
    sampled = lookup.sample(n, random_state=random_state)
    # Take the ids from the index directly rather than via reset_index() and
    # a hard-coded 'id' column, which fails for any other index name.
    return dict(zip(sampled.index, sampled['text']))
        

In [13]:
# Sample 50,000 documents from the full lookup table as {id: text}.
# DataFrame.sample raises ValueError if the table has fewer rows than that.
corpus = get_contents_corpus(50000,lookup)

In [14]:
# Add the recommended (golden) documents to the sampled corpus in place;
# on an id collision the golden text replaces the sampled one.
corpus.update(corpus_golden)

In [15]:
# NOTE(review): this echoes the entire dict into the cell output, which bloats
# the saved notebook; consider showing len(corpus) or a handful of items.
corpus

{2610621: 'Beweis des von \\textit{G. Pólya} als Aufgabe aufgestellten Satzes: Aus der Existenz von \n\\[\n\\lim_{n=\\infty}(u_1+u_2+\\cdots+u_{n-1}+cu_n)=\\alpha\n\\]\n folgt stets die Existenz von \n\\[\n\\lim_{n=\\infty}(u_1+u_2+\\cdots+u_{n-1}+u_n)=\\alpha\n\\]\n dann und nur dann, wenn \\(c=0\\), oder \\({\\mathfrak R}(c)>\\frac 12\\) ist.',
 6927204: 'Summary: Due to the impact of the nonlinear factor caused by large azimuth misalignment, the conventional gyrocompass alignment method is hard to favorably meet the requirement of alignment speed under the condition of large azimuth misalignment of INS. In order to solve this problem, an improved gyrocompass alignment method is presented in this paper. The improved method is designed based on the nonlinear model for large azimuth misalignment and performed by opening the azimuth loop. The influence of the nonlinear factor on gyrocompass alignment will be reduced when opening the azimuth loop. Simulation and experimental results show

In [16]:
import pickle

# Persist the three artifacts to the current working directory, one pickle
# file per object, written in this order.
for filename, payload in (
    ('corpus.pickle', corpus),
    ('relevant_docs.pickle', relevant_docs),
    ('queries.pickle', queries),
):
    with open(filename, 'wb') as file:
        pickle.dump(payload, file)