### Create Test Data
After the model is trained, this notebook generates predictions from it on the test set. These predictions can then be used in the Colab notebook "mag_concept_tagger_v1_model_testing.ipynb" to calculate the model's performance metrics.

In [1]:
# %pip install tensorflow==2.4.1
# %pip install transformers
# %pip install pyarrow
# %pip install tensorflow-addons

In [1]:
import tensorflow as tf
import pandas as pd
import pickle
import os
import tensorflow_addons as tfa
from math import ceil

AUTO = tf.data.experimental.AUTOTUNE

In [2]:
# Directory name (relative to the working directory) under which this notebook
# reads the vocab pickles, the saved model and the tokenized test data.
model_iteration = 'iteration_final'

In [3]:
def _load_vocab(vocab_name):
    """Unpickle and return ``./{model_iteration}/vocab/{vocab_name}_vocab.pkl``.

    NOTE(security): ``pickle.load`` can execute arbitrary code while loading;
    only point this at vocab files produced by a trusted pipeline.
    """
    with open(f"./{model_iteration}/vocab/{vocab_name}_vocab.pkl", "rb") as f:
        return pickle.load(f)


def _invert_vocab(vocab):
    """Return a new dict with the keys and values of ``vocab`` swapped."""
    return {value: key for key, value in vocab.items()}


# Each vocab is loaded once and paired with its inverse mapping.
target_vocab = _load_vocab("topics")
target_vocab_inv = _invert_vocab(target_vocab)

doc_vocab = _load_vocab("doc_type")
doc_vocab_inv = _invert_vocab(doc_vocab)

journal_vocab = _load_vocab("journal_name")
journal_vocab_inv = _invert_vocab(journal_vocab)

title_vocab = _load_vocab("paper_title")
title_vocab_inv = _invert_vocab(title_vocab)

In [5]:
len(target_vocab)

82178

##### Short code to create ID mapping

In [11]:
# Fields-of-study table; the following cells read its 'normalized_name' and
# 'field_of_study_id' columns to build a name -> id lookup.
tag_ids = pd.read_parquet("fields_of_study_ids.parquet")

In [12]:
# Extract the two columns as parallel Python lists (same row order in both).
ids = tag_ids['field_of_study_id'].tolist()
names = tag_ids['normalized_name'].tolist()

In [15]:
# Lookup from normalized tag name to its field-of-study id; if a name occurs
# more than once, the last occurrence wins.
name_to_id = dict(zip(names, ids))

In [19]:
# Re-key the inverse target vocab so each of its keys maps to a field-of-study
# id instead of a tag name. Raises KeyError if a vocab name is absent from
# name_to_id.
id_dict = {
    vocab_key: name_to_id[tag_name]
    for vocab_key, tag_name in target_vocab_inv.items()
}

In [27]:
# with open(f"./{model_iteration}/vocab/tag_id_vocab.pkl", "wb") as f:
#     pickle.dump(id_dict, f)

In [28]:
# Read back the pickled tag-id mapping from the vocab directory.
# NOTE(review): this is the same path the commented-out `pickle.dump(id_dict, f)`
# cell writes to, so this looks like a round-trip check — confirm.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open(f"./{model_iteration}/vocab/tag_id_vocab.pkl", "rb") as f:
    test_dict = pickle.load(f)

#### Getting the model

In [4]:
# Dense binary category-encoding layer sized to the target vocab plus one slot.
# NOTE(review): nothing else in the visible notebook uses this layer — confirm
# whether it is still needed.
encoding_layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding(
    max_tokens=len(target_vocab) + 1,
    output_mode="binary",
    sparse=False,
)


In [5]:
len(title_vocab)

177567

In [6]:
len(target_vocab)

82178

In [8]:
# Load the saved Keras model for this iteration (path is relative to the
# working directory).
mag_model = tf.keras.models.load_model(f'./{model_iteration}/models/gamma_28_nH8_nL6_epoch25/')

2021-11-25 23:06:01.068679: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-11-25 23:06:01.077543: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
mag_model.inputs

[<KerasTensor: shape=(None, 64) dtype=int64 (created by layer 'paper_title_ids')>,
 <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'doc_type_id')>,
 <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'journal_id')>]

In [10]:
# Wrap the loaded model so that it emits the top-30 (values, indices) of its
# outputs instead of the raw outputs.
# top_k is applied to `mag_model.outputs` as a whole, which is why
# get_model_predictions indexes `[0]` into the result before slicing.
final_model = tf.keras.Model(
    inputs=mag_model.inputs,
    outputs=tf.math.top_k(mag_model.outputs, k=30),
)

In [11]:
final_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
paper_title_ids (InputLayer)    [(None, 64)]         0                                            
__________________________________________________________________________________________________
title_embedding (Embedding)     (None, 64, 512)      90914816    paper_title_ids[0][0]            
__________________________________________________________________________________________________
tf.__operators__.add (TFOpLambd (None, 64, 512)      0           title_embedding[0][0]            
__________________________________________________________________________________________________
encoder_0/multiheadattention (M (None, 64, 512)      1050624     tf.__operators__.add[0][0]       
                                                                 tf.__operators__.add[0][0]   

In [12]:
def get_all_model_predictions(data_path):
    """Load every ``part*`` parquet file in ``data_path`` and attach model predictions.

    The files are read in sorted name order and concatenated into a single
    DataFrame, which is then passed through ``get_model_predictions`` in
    consecutive slices of ``num_samples`` rows.

    Args:
        data_path: Directory containing the tokenized ``part*`` parquet files.

    Returns:
        The concatenated DataFrame with two added columns, ``predictions`` and
        ``scores``, holding the per-row lists returned by
        ``get_model_predictions``. When the directory holds no ``part*`` files
        an empty DataFrame with just those two columns is returned.
    """
    # Honour the caller-supplied directory instead of a hard-coded path.
    file_names = sorted(x for x in os.listdir(data_path) if x.startswith('part'))

    # Read each part once and concatenate a single time; growing a DataFrame
    # inside the loop re-copies all previously loaded rows on every iteration.
    frames = [pd.read_parquet(os.path.join(data_path, file_name))
              for file_name in file_names]
    full_df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    # Rows sent through the model per call.
    num_samples = 1000
    preds_final = []
    scores_final = []
    for i in range(ceil(full_df.shape[0]/num_samples)):
        print(i)
        # Positional slicing: the concatenated index may contain duplicates.
        small_df = full_df.iloc[i*num_samples:(i+1)*num_samples, :].copy()
        preds, scores = get_model_predictions(small_df)
        preds_final += preds
        scores_final += scores

    # Plain lists are assigned by position, so row order is preserved.
    full_df['predictions'] = preds_final
    full_df['scores'] = scores_final

    return full_df


In [13]:
def get_model_predictions(input_data):
    """Run ``final_model`` on one slice of rows and return its top-20 output.

    Args:
        input_data: DataFrame with ``paper_title_tok``, ``doc_type_tok`` and
            ``journal_tok`` columns.

    Returns:
        ``(preds, scores)``: two nested lists with one inner list per row,
        taken from the first 20 columns of the model's ``indices`` and
        ``values`` outputs respectively.
    """
    # The two side inputs are converted to tensors as-is.
    journal_ids = tf.convert_to_tensor(input_data['journal_tok'].to_list())
    doc_type_ids = tf.convert_to_tensor(input_data['doc_type_tok'].to_list())

    # Titles are padded / truncated at the end to exactly 64 ids, with 0 as filler.
    title_ids = tf.keras.preprocessing.sequence.pad_sequences(
        input_data['paper_title_tok'].to_list(),
        maxlen=64,
        dtype='int64',
        padding='post',
        truncating='post',
        value=0,
    )

    top_k_output = final_model([title_ids, doc_type_ids, journal_ids])

    # `[0]` drops the leading axis of the model output before keeping the
    # first 20 columns of each row.
    preds = top_k_output.indices.numpy()[0][:, :20].tolist()
    scores = top_k_output.values.numpy()[0][:, :20].tolist()

    return preds, scores

In [14]:
test_data = get_all_model_predictions(f"./{model_iteration}/tokenized_data/test/")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97


In [16]:
test_data.to_parquet(f"./{model_iteration}/test_data/data_with_predictions.parquet")

In [15]:
# test_data = pd.read_parquet(f"./{model_iteration}/test_data/data_with_predictions.parquet")
# 'target_test' = number of entries in 'target_tok' that are not -1.
test_data['target_test'] = (
    test_data['target_tok']
    .apply(lambda toks: [tok for tok in toks if tok != -1])
    .apply(len)
)
# Keep only the rows that have at least one such target.
test_data = test_data.loc[test_data['target_test'] > 0].copy()

In [16]:
test_data.shape

(97233, 9)

In [17]:
test_data.sample(5)

Unnamed: 0,paper_id,publication_date,doc_type_tok,journal_tok,target_tok,paper_title_tok,predictions,scores,target_test
23907,3131361924,2021-07-01,[3],[294],"[1715, 101, 2, 1381]","[168800, 166817, 107386, 126558, 151334, 17634...","[2, 1381, 42, 171, 37, 306, 156, 11791, 543, 1...","[0.641745388507843, 0.4675076901912689, 0.4003...",4
1629,2173313759,1971-12-01,[3],[2775],"[2076, 8, 2685, 49, 3770, 388, 239, 2736, 1524...","[161506, 64102, 37508, 126553, 97706, 151326, ...","[8, 49, 239, 95, 696, 2736, 2685, 685, 5727, 3...","[0.7249653339385986, 0.4582909345626831, 0.438...",10
4440,2357299977,2013-03-01,[3],[1039],"[1425, 5, 328, 81, 691, 1835, 478, 586, 251, 8...","[35688, 633, 117665, 162290, 14251, 126553, 13...","[5, 8493, 586, 691, 328, 251, 623, 42, 397, 47...","[0.6601946353912354, 0.6510505080223083, 0.572...",11
8505,2747201174,2017-09-29,[3],[5623],"[61, 1, 38146, 443]","[103617, 14805, 112377, 131258, 152551, 150768...","[1, 6, 31, 2024, 38146, 2181, 61, 76, 85, 1967...","[0.9602290391921997, 0.5080435276031494, 0.466...",4
18728,3134951874,2021-06-01,[3],[2384],"[774, 2, 37, 16104, 14212, 3972, 215, 4050, 79...","[100107, 17828, 126553, 53496, 133601, 166817,...","[79, 1708, 2, 774, 1139, 215, 14212, 44, 29434...","[0.8013430237770081, 0.688534677028656, 0.6055...",10


### Code to get all raw test data into one file

In [3]:
import os
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [2]:
# Gather every 'part*' parquet file of raw test data into one DataFrame.
# Names are sorted so the row order is reproducible (os.listdir order is
# arbitrary), and the parts are concatenated once rather than growing the
# frame inside the loop, which re-copies every earlier row on each iteration.
raw_dir = "./iteration_final/test_data_raw/"
raw_parts = [
    pd.read_parquet(f"{raw_dir}{part_name}")
    for part_name in sorted(os.listdir(raw_dir))
    if part_name.startswith('part')
]
# pd.concat raises on an empty list, so fall back to an empty frame.
test_data_raw = pd.concat(raw_parts) if raw_parts else pd.DataFrame()

In [5]:
test_data_raw.sample(20)

Unnamed: 0,paper_id,doc_type,paper_title,journal_name,publication_date,topics
57,3155850838,Conference,counterfactual reward modification for streaming recommendation with delayed feedback,,2021-07-11,"[computer science, counterfactual thinking, cognitive psychology]"
234,3090874414,Patent,stably fixed rubber frame,,2020-05-19,"[vertical edge, integrally closed, geometry, natural rubber, perpendicular, limiting, edge, physics, liquid crystal display, frame]"
0,3094668390,Journal,applicability of magnetic resonance imaging of the knee in forensic age estimation,american journal of forensic medicine and pathology,2021-06-01,"[stage, magnetic resonance imaging, radiology, radiological weapon, ossification, age estimation, femoral epiphysis, forensic science, medicine, mri image]"
260,3098691962,Journal,crystal lattice properties fully determine short range interaction parameters for alkali and halide ions,journal of chemical physics,2012-08-09,"[lennard jones potential, ion, lattice energy, materials science, lattice, lattice constant, aqueous solution, chemical physics, solvation, crystal structure]"
191,2857720048,Patent,signal power lightning prevention box air switch state detecting system,,2017-11-24,"[electricity, interface, power, signal, state, electrical engineering, logic gate, lightning, computer science, alarm]"
94,2240568383,Journal,ionic transport in high energy density matter,bulletin of the american physical society,2015-11-17,"[boltzmann constant, coulomb, statistical physics, high energy density matter, thermal conductivity, yukawa potential, viscosity, molecular dynamics, physics, logarithm]"
19,1189781847,Thesis,pengaruh faktor faktor budaya organisasi terhadap kinerja karyawan pt wonokoyo jaya corporindo kantor pusat surabaya,,2008-01-01,"[statistical significance, business administration, competence, employee performance, linear regression, operations management, competitive advantage, strategic management, engineering, teamwork, variables]"
39,3118080385,Journal,differences in health seeking behaviors by socioeconomic groups among the pediatric hydrocephalus patient population,interdisciplinary neurosurgery,2021-06-01,"[pediatrics, medicine, socioeconomic status, vomiting, pediatric hydrocephalus, patient population, household income, hydrocephalus, health seeking, descriptive statistics]"
30,1599508969,Journal,insuline bagimli diabetes mellituslu hastalarda hbaic ile serum insulin kolesterol ve trigliserit duzeyleri arasindaki iliski,turkiye klinikleri tip bilimleri dergisi,1994-01-01,[medicine]
113,2770440433,Thesis,comparative study of belt loop gastropexy and fundic gastropexy in dogs,,1994-11-01,"[gastropexy, mathematics, anatomy, null, loop]"


In [12]:
# test_data_raw.to_parquet("test_raw.parquet")