In [2]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import seaborn as sns
np.random.seed(42)

In [5]:
# Document tables: one with the golden abstracts/reviews, one with the full dump.
conts = pd.read_csv('../data/zbmath_doc_contents.csv')
full = pd.read_csv('../data/out.csv')

# Recommendation pairs: column names are supplied here via `names`; the first
# column ('seed') becomes the index, followed by 14 recommendation slots.
header = ['seed', *(f'rec{i}' for i in range(14))]
recs = pd.read_csv('../data/recommendationPairs.csv', names=header, index_col=0)

In [6]:
# Build the golden lookup: cleaned text per zbMATH document id.
# Written as a single chain so no column is ever assigned onto a filtered
# slice (which is what triggers pandas' SettingWithCopyWarning).
# NOTE: `conts` is rebound here, so re-running this cell without reloading the
# CSV trims another two characters from both ends of every text.
conts = (
    conts
    .rename(columns={'Abstract/Review/Summarry': 'text'})
    .dropna(subset=['text'])
    # Drop the first and last two characters of every text
    # (presumably wrapper delimiters from the export -- TODO confirm).
    .assign(text=lambda frame: frame['text'].str[2:-2])
    .assign(text_len=lambda frame: frame['text'].str.len())
    # Keep only texts longer than 150 characters.
    .loc[lambda frame: frame['text_len'] > 150]
    .astype({'zbMATH_ID': int})
)
golden_lookup = conts.set_index('zbMATH_ID')

In [9]:
# Build the general lookup table (id -> text) from the full dump.
# Written as a single chain so no column is ever assigned onto a filtered
# slice (which is what triggers pandas' SettingWithCopyWarning).
lookup = (
    full
    .rename(columns={'de': 'id'})
    .loc[:, ['id', 'text']]
    # str() makes the length well-defined for non-string cells too: a missing
    # value becomes the 3-character 'nan' and is dropped by the filter below.
    .assign(text_len=lambda frame: frame['text'].apply(lambda value: len(str(value))))
    # Keep only texts longer than 150 characters.
    .loc[lambda frame: frame['text_len'] > 150]
    .astype({'id': int})
    .set_index('id')
)

In [10]:
# Inspect the filtered lookup table (index: id; columns: text, text_len).
lookup

Unnamed: 0_level_0,text,text_len
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3928129,Two fold-over 14 point designs of the \(2^ 4\t...,415
3928148,By analogy with the univariate case the author...,583
3928149,This is a study of asymptotic properties of va...,384
3928151,"Let \(\{\) X(t)\(\}\), \(t\in {\mathbb{Z}}\), ...",1155
3928154,In this paper the authors study the prediction...,655
...,...,...
7500464,Summary: We study determinants of the square-t...,346
7500465,"Summary: In this study, we use the union of th...",344
7500466,"Summary: In this work, the Catalan transformat...",582
7500467,Summary: A numerical treatment via a differenc...,544


In [11]:
def create_pairs(df):
    """Map every row label of ``df`` to the set of its non-missing values.

    Args:
        df: DataFrame whose index holds the seed ids and whose cells hold
            recommended ids; unused cells are NaN.

    Returns:
        dict of ``{seed: set of int}``. NaN cells are dropped and the
        remaining values are cast to ``int``; duplicates within a row
        collapse because the values are stored in a set.
    """
    return {
        seed: set(row.dropna().astype(int).tolist())
        for seed, row in tqdm(df.iterrows())
    }
    

In [12]:
# Map each seed id (the index of `recs`) to the set of its recommended ids.
relevant_docs = create_pairs(recs)

0it [00:00, ?it/s]

80it [00:00, 2758.53it/s]


In [13]:
# Inspect the seed -> recommended-ids mapping.
relevant_docs

{1566951: {930151, 1579464, 4181495, 5083606, 6338806},
 1363213: {1036371, 1363213, 1445144, 1801581, 2165994, 6225939},
 1308161: {1356576, 4193896, 5007259, 5638157},
 1303018: {224045, 427914, 951967, 5120555, 5354085},
 1591097: {1758339, 2136591, 3867686, 5049067},
 2105416: {1059629, 5271305, 6216299},
 1013949: {1315242, 1864745, 1983902, 3767833, 6677815},
 1262405: {849952, 1162060, 1232539, 1423821, 1523448, 1585265},
 1661079: {1121710, 3506862, 6766194},
 1371474: {2067093, 3545808, 6095689, 6572608},
 1247615: {3297205, 3595379, 3745438, 5077259},
 1138672: {3275185, 3929157, 4154607, 5176593, 6019770, 6377480},
 1341970: {227591, 2235467, 5233949, 6124083, 6228504, 6543124},
 119869: {3261280, 3393319, 3541568, 7341310},
 1341964: {993528,
  1441724,
  1597921,
  1614787,
  1701993,
  1747161,
  1965428,
  2157081,
  2157754,
  3855291,
  5657069,
  6545191},
 1664684: {1476070, 1848116, 6068108, 6838613},
 1558736: {2185481, 3689556, 4127396},
 14466: {897863, 950784, 1

In [14]:
def get_contents_queries(df, lookup):
    """Collect the text of every seed that is present in ``lookup``.

    Args:
        df: mapping whose keys are seed ids (only the keys are used).
        lookup: DataFrame indexed by document id with a ``text`` column.

    Returns:
        dict of ``{seed: text}``. Seeds missing from ``lookup.index`` are
        printed and left out of the result.
    """
    known_ids = lookup.index
    queries = {}
    for seed, _ in df.items():
        if seed in known_ids:
            text = lookup.loc[seed].text
            # A duplicated id yields a Series of texts; keep the first one.
            queries[seed] = text.iloc[0] if isinstance(text, pd.Series) else text
        else:
            print(seed)
    return queries
        

In [15]:
def get_contents_recs(df, lookup):
    """Collect the text of every recommended document found in ``lookup``.

    Args:
        df: mapping of seed id -> iterable of recommended ids (only the
            values are used).
        lookup: DataFrame indexed by document id with a ``text`` column.

    Returns:
        dict of ``{rec_id: text}``. Ids missing from ``lookup.index`` are
        skipped silently.
    """
    available = lookup.index
    texts = {}
    for _, rec_ids in df.items():
        for rec_id in rec_ids:
            if rec_id in available:
                entry = lookup.loc[rec_id].text
                # A duplicated id yields a Series of texts; keep the first one.
                if isinstance(entry, pd.Series):
                    entry = entry.iloc[0]
                texts[rec_id] = entry
    return texts

In [16]:
# Texts of all recommended documents that are present in golden_lookup.
corpus_golden = get_contents_recs(relevant_docs,golden_lookup)

In [17]:
# Texts of the seed documents; seeds absent from golden_lookup are printed and skipped.
queries = get_contents_queries(relevant_docs,golden_lookup)

In [18]:
def get_contents_corpus(n, lookup, random_state=None):
    """Draw ``n`` random rows from ``lookup`` and return them as ``{id: text}``.

    Args:
        n: number of rows to sample (without replacement).
        lookup: DataFrame indexed by document id with a ``text`` column.
            The index may have any name; it is read directly instead of
            being looked up as an ``'id'`` column after ``reset_index``.
        random_state: forwarded to ``DataFrame.sample``. The default
            ``None`` keeps the previous behaviour; pass a value to pin
            the sample.

    Returns:
        dict mapping each sampled index label to its text.

    Raises:
        ValueError: raised by ``DataFrame.sample`` when ``n`` exceeds the
            number of rows in ``lookup``.
    """
    sampled = lookup.sample(n, random_state=random_state)
    return dict(zip(sampled.index, sampled['text']))
        

In [24]:
# Random sample of 50000 rows of `lookup`, as an {id: text} dict.
corpus = get_contents_corpus(50000,lookup)

In [25]:
# Add the golden recommendation texts to the sampled corpus; for an id that
# was already sampled, the golden text replaces the sampled one.
corpus.update(corpus_golden)

In [27]:
# Number of documents in the merged corpus.
len(corpus)

50359

In [16]:
import pickle

# Persist the corpus, the relevance judgements and the queries, one pickle each.
for path, payload in (
    ('zbmath.pickle', corpus),
    ('relevant_docs.pickle', relevant_docs),
    ('queries.pickle', queries),
):
    with open(path, 'wb') as handle:
        pickle.dump(payload, handle)