In [46]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import seaborn as sns
# Seed NumPy's global RNG so the np.random.choice draws used for negative
# sampling in create_pairs are repeatable from a fresh kernel.
np.random.seed(42)

In [47]:
# Column names for the recommendation file: the seed id followed by rec0..rec13.
header = ['seed', *('rec' + str(i) for i in range(14))]

# Raw inputs, all read relative to the notebook directory.
conts = pd.read_csv('../data/documentContents.csv')
recs = pd.read_csv('../data/recommendationPairs.csv', index_col=0, names=header)
train_ids = pd.read_csv('../data/dataTrainIDs.csv')
full = pd.read_csv('../data/out.csv')

In [48]:
# Clean the labelled document contents in a single chain so that no column is
# ever assigned on a filtered slice (which triggers SettingWithCopyWarning).
# NOTE: this rebinds `conts`; re-run the loading cell before re-running this one,
# otherwise the text would be trimmed a second time.
conts = (
    conts
    .rename(columns={'Abstract/Review/Summarry': 'text'})
    # rows without any text cannot be used as anchors or recommendations
    .dropna(subset=['text'])
    # drop the first two and last two characters of every text
    .assign(text=lambda d: d['text'].str[2:-2])
    .assign(text_len=lambda d: d['text'].str.len())
    # keep only documents whose trimmed text is longer than 150 characters
    .loc[lambda d: d['text_len'] > 150]
    .astype({'zbMATH_ID': int})
    .set_index('zbMATH_ID')
)

In [49]:
# Clean the negative-sampling pool in a single chain (no assignment on a
# filtered slice, so no SettingWithCopyWarning).
# Rows with a missing text are dropped first, mirroring the `conts` cleaning;
# previously a NaN text made `len(x)` raise a TypeError.
# NOTE: this rebinds `full`; re-run the loading cell before re-running this one.
full = (
    full
    .rename(columns={'de': 'id'})
    .loc[:, ['id', 'text']]
    .dropna(subset=['text'])
    .assign(text_len=lambda d: d['text'].str.len())
    # same minimum-length threshold as used for `conts`
    .loc[lambda d: d['text_len'] > 150]
    .astype({'id': int})
    .set_index('id')
)

In [50]:
# Every seed that has labelled recommendations, and the pool negatives are drawn from.
all_labeled_ids = np.array(recs.index)
pool_ids = np.array(full.index)

# Training seeds come from the id file; every other labelled seed is a test seed.
train_ids = train_ids['seeID'].unique()
test_ids = np.setdiff1d(all_labeled_ids, train_ids)

In [51]:
# Recommendation rows for the training seeds and for the test seeds.
train_recs, test_recs = recs.loc[train_ids], recs.loc[test_ids]

In [52]:
def create_pairs(df,recs_ids,full_ids):
    """Build a balanced table of (seed, rec, label) pairs with random negatives.

    For every non-missing recommendation in ``df`` one positive row
    (``label == 1``) is emitted, immediately followed by one negative row
    (``label == 0``) whose ``rec`` is drawn from ``full_ids`` by rejection
    sampling with NumPy's global RNG (``np.random.choice``).

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by seed id; each cell holds a recommended id or NaN.
    recs_ids : array-like
        Ids that must never be used as a negative.
    full_ids : array-like
        Pool of candidate ids for negative sampling.

    Returns
    -------
    pandas.DataFrame
        Columns ``seed``, ``rec``, ``label``. Empty (but with these columns)
        when ``df`` contains no recommendations.

    Raises
    ------
    ValueError
        If, for some seed, ``full_ids`` contains no id that is outside
        ``recs_ids`` and outside that seed's own recommendations. Without this
        check the rejection loop would never terminate.
    """
    columns = ['seed', 'rec', 'label']
    pool = np.asarray(full_ids)
    blocked = np.asarray(recs_ids)
    # Set membership is O(1); `x in ndarray` scans the whole array on every draw.
    excluded = set(blocked.tolist())
    # Distinct pool ids that are allowed as negatives before looking at a seed's
    # own positives. Only used for the termination check; draws still come from
    # the full pool, so the random stream matches plain rejection sampling.
    eligible = np.unique(pool[~np.isin(pool, blocked)])

    rows = []
    for seed, row in tqdm(df.iterrows(), total=len(df)):
        positives = row.dropna().astype(int).to_list()
        if not positives:
            continue
        positive_set = set(positives)

        # If there are more eligible ids than positives, at least one valid
        # negative must exist; only tiny pools need the exact (slower) check.
        if eligible.size <= len(positive_set) and not (set(eligible.tolist()) - positive_set):
            raise ValueError(
                'full_ids contains no valid negative candidate for seed %r' % (seed,)
            )

        for rec_id in positives:
            # create positive
            rows.append({'seed': seed, 'rec': rec_id, 'label': 1})
            # sample negative: reject excluded ids and this seed's own positives,
            # otherwise a true recommendation could be labelled 0
            candidate = np.random.choice(pool)
            while candidate in excluded or candidate in positive_set:
                candidate = np.random.choice(pool)
            rows.append({'seed': seed, 'rec': candidate, 'label': 0})
    return pd.DataFrame(rows, columns=columns)
    

In [53]:
# Positive/negative id pairs for each split; negatives are sampled from pool_ids
# and never taken from the labelled seed ids.
train_dataset = create_pairs(df=train_recs, recs_ids=all_labeled_ids, full_ids=pool_ids)
test_dataset = create_pairs(df=test_recs, recs_ids=all_labeled_ids, full_ids=pool_ids)

20it [00:00, 1600.48it/s]


60it [00:00, 2073.05it/s]


In [54]:
# Inspect the generated training pairs: for each seed, every positive row
# (label 1) is followed by its sampled negative row (label 0).
train_dataset

Unnamed: 0,seed,rec,label
0,1566951,4181495,1
1,1566951,5924715,0
2,1566951,930151,1
3,1566951,2507805,0
4,1566951,5083606,1
...,...,...,...
223,1269765,1233903,0
224,1269765,952018,1
225,1269765,2541827,0
226,1269765,1334957,1


In [55]:
def get_contents(df,lookup_positive,lookup_negative):
    """Replace the ids of a pair table with their texts and shuffle the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table with columns ``seed``, ``rec`` and ``label``.
    lookup_positive : pandas.DataFrame
        Indexed by document id with a ``text`` column; used for the anchor
        (seed) text and for the text of positive recommendations.
    lookup_negative : pandas.DataFrame
        Indexed by document id with a ``text`` column; used for the text of
        negative recommendations.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``seed`` with columns ``anchor``, ``rec`` (text) and
        ``label``, shuffled with ``random_state=42``. Pairs whose seed is
        missing from ``lookup_positive``, and positive pairs whose
        recommendation is missing from it, are skipped. If every pair is
        skipped an empty frame with the same index name and columns is
        returned (this used to raise a KeyError in ``set_index``).

    Raises
    ------
    KeyError
        If the ``rec`` id of a negative pair is not in ``lookup_negative``.
    """
    columns = ['seed', 'anchor', 'rec', 'label']
    known_ids = lookup_positive.index

    rows = []
    for _, pair in df.iterrows():
        seed_id = pair['seed']
        if seed_id not in known_ids:
            continue
        rec_id = pair['rec']

        if pair['label'] == 1:
            # positive recommendations: skip the ones that are not present in contents
            if rec_id not in known_ids:
                continue
            rec_text = lookup_positive.loc[rec_id].text
        else:
            # negative ones are resolved against the separate negative lookup
            rec_text = lookup_negative.loc[rec_id].text

        rows.append({
            'seed': seed_id,
            'anchor': lookup_positive.loc[seed_id].text,
            'rec': rec_text,
            'label': pair['label'],
        })

    # Passing the columns explicitly keeps the schema when no row survived.
    result = pd.DataFrame(rows, columns=columns).set_index('seed')
    if result.empty:
        # nothing to shuffle
        return result
    return result.sample(frac=1., random_state=42)
        

In [56]:
# Resolve ids to texts: anchors and positives come from `conts`, negatives from `full`.
final_train_dataset = get_contents(df=train_dataset, lookup_positive=conts, lookup_negative=full)
final_test_dataset = get_contents(df=test_dataset, lookup_positive=conts, lookup_negative=full)

In [58]:
# Seeds held out of the training data for the dev split (set by Ankit).
dev_ids = np.array([
    1745734,
    1031529,
    1275776,
    1269765,
])
# Every remaining seed that survived get_contents stays in training.
train_ids = np.setdiff1d(ar1=final_train_dataset.index.unique(), ar2=dev_ids)

In [62]:
# Select each split's rows by seed id first, then shuffle each split on its own.
dev_part = final_train_dataset.loc[dev_ids]
train_part = final_train_dataset.loc[train_ids]
final_dev_dataset = dev_part.sample(frac=1, random_state=42)
final_train_dataset = train_part.sample(frac=1, random_state=42)

In [64]:
# Persist the three splits next to each other under ../data/final/.
for frame, path in (
    (final_train_dataset, '../data/final/final_train_dataset.csv'),
    (final_dev_dataset, '../data/final/final_dev_dataset.csv'),
    (final_test_dataset, '../data/final/final_test_dataset.csv'),
):
    frame.to_csv(path)