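Results: overall top-k accuracy, and top-1 accuracy by number of subject areas per article, for each weighting scheme.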

<table>
    <thead>
        <tr>
            <th>Weight type</th>
            <th colspan = 3>Overall Accuracy</th>
            <th colspan = 5>Accuracy by subject size</th>
        </tr>
        <tr>
            <th></th>
            <th>Top 1</th>
            <th>Top 3</th>
            <th>Top 5</th>
            <th>Subject size 1</th>
            <th>Subject size 2</th>
            <th>Subject size 3</th>
            <th>Subject size 4</th>
            <th>Subject size 5</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>Dynamic weight</td>
            <td>76.8%</td>
            <td>93.5%</td>
            <td>96.9%</td>
            <td>67.8%</td>
            <td>82.1%</td>
            <td>84.2%</td>
            <td>95.3%</td>
            <td>100%</td>
        </tr>
        <tr>
            <td>100% subject area</td>
            <td>59.7%</td>
            <td>82.4%</td>
            <td>90.4%</td>
            <td>47.7%</td>
            <td>63.4%</td>
            <td>71.9%</td>
            <td>97.4%</td>
            <td>100%</td>
        </tr>
        <tr>
            <td>25% abstract / 75% subject area</td>
            <td>73.5%</td>
            <td>93.7%</td>
            <td>96.7%</td>
            <td>63.4%</td>
            <td>77.%</td>
            <td>84.4%</td>
            <td>97.9%</td>
            <td>100%</td>
        </tr>
        <tr>
            <td>50% abstract / 50% subject area</td>
            <td>79.1%</td>
            <td>93.8%</td>
            <td>96.8%</td>
            <td>67.6%</td>
            <td>85.0%</td>
            <td>91.9%</td>
            <td>100%</td>
            <td>100%</td>
        </tr>
        <tr>
            <td>75% abstract / 25% subject area</td>
            <td>74.8%</td>
            <td>93.3%</td>
            <td>96.9%</td>
            <td>65.2%</td>
            <td>79.1%</td>
            <td>84.4%</td>
            <td>98.5%</td>
            <td>100%</td>
        </tr>
        <tr>
            <td>100% abstract</td>
            <td>69.7%</td>
            <td>87.8%</td>
            <td>93.8%</td>
            <td>64.4%</td>
            <td>71.4%</td>
            <td>73.9%</td>
            <td>89.2%</td>
            <td>100%</td>
        </tr>
    </tbody>
</table>




Import

In [None]:
import pickle
import scipy

from sklearn.feature_extraction.text import CountVectorizer

from keras.utils import to_categorical
from keras.models import load_model

import numpy as np
import os

# Hugging Face Transformers (SciBERT)
from transformers import AutoModel, AutoTokenizer
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from tqdm import tqdm

#panda
import pandas as pd

# Text Processing
import re
import nltk
from nltk import word_tokenize, download
from nltk.corpus import stopwords

# Data Serialization and Deserialization
import ast

from collections import Counter
import tensorflow as tf


# Pretrained SciBERT checkpoint shared by the tokenizer and the encoder;
# both objects are used by get_embeddings to embed abstracts.
SCIBERT_CHECKPOINT = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(SCIBERT_CHECKPOINT)
sciBert_model = AutoModel.from_pretrained(SCIBERT_CHECKPOINT)



In [None]:
# Per-subject-area weight for each abstract model, keyed by the four-letter
# lower-case subject code. dynamic_ranking multiplies each abstract model's
# prediction by its entry here (normalised by the sum of the weights in play).
# NOTE(review): presumably each value is that model's accuracy — confirm.
subjs_model_acc = dict(
    deci=0.8226,
    mult=0.8685,
    eart=0.5340,
    mate=0.4698,
    econ=0.6328,
    envi=0.4267,
    agri=0.6570,
    phar=0.5635,
    neur=0.5363,
    comp=0.8170,
    nurs=1,
    medi=0.7262,
    soci=0.5093,
    immu=0.7276,
    arts=0.8276,
    chem=0.5857,
    busi=0.7099,
    math=0.9808,
    phys=0.6064,
    ceng=0.6037,
    heal=0.9884,
    bioc=0.6193,
    psyc=0.6032,
    ener=0.4237,
    engi=0.4870,
    vete=0.9663,
)

In [None]:
# Weight of the subject-area model inside dynamic_ranking, where it is
# normalised together with the per-subject entries of subjs_model_acc.
# NOTE(review): presumably the subject-area model's accuracy — confirm.
subj_acc = 0.579

Label encoder

In [None]:
# Location of the pickled label encoder; change the file path if needed.
le_filename = 'models/labelencoder.pkl'

# Restore the encoder from disk (le.classes_ is used to map journals to indices).
# NOTE: pickle can execute arbitrary code while loading — only open trusted files.
with open(le_filename, mode='rb') as le_file:
    le = pickle.load(le_file)

Load subject model

In [None]:
# Change the file paths below if needed.
subject_vect_filename = 'models/Subject area model/subj_vectorizer.pkl'

# Restore the fitted vectorizer that prepared_subject applies to subject areas.
# NOTE: pickle can execute arbitrary code while loading — only open trusted files.
with open(subject_vect_filename, mode='rb') as vectorizer_file:
    subj_vectorizer = pickle.load(vectorizer_file)

# Keras model that predicts from the vectorized subject areas.
subject_model = load_model('models/Subject area model/subjArea.h5')

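Security note: the label encoder and vectorizer above are restored with `pickle.load`, which can execute arbitrary code, so only load files from a trusted source. See https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations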


Subject model prediction method

In [None]:
def to_lower(X):
    """Return a new list holding the lower-cased form of every item in ``X``."""
    return [item.lower() for item in X]

def prepared_subject(subjectarea):
    """Turn a list of subject-area codes into the subject model's input array.

    The codes are lower-cased, joined with single spaces into one document,
    passed through ``subj_vectorizer.transform`` and returned as a dense array.
    """
    lowered = to_lower(subjectarea)
    document = ' '.join(lowered)
    features = subj_vectorizer.transform([document])
    return features.toarray()

def subject_predict(subjectarea):
    """Run the subject-area model on one article's subject areas.

    Args:
        subjectarea: list of subject-area codes for a single article.

    Returns:
        The first (and only) row of the model output for that article.
    """
    features = prepared_subject(subjectarea)
    # verbose=0 is the documented way to silence Keras' progress output;
    # None is not a documented value for this argument.
    prediction = subject_model.predict(features, verbose=0)
    return prediction[0]

Load abstract model

In [None]:
# Change the file path if needed
directory_path = 'models/abstract model (scibert)/'

# Regular files found directly inside the model directory.
file_list = [
    entry for entry in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, entry))
]

# One loaded model per file, keyed by the first four characters of the file
# name in lower case (abstract_predict looks models up by this key).
abstract_model = {
    entry[:4].lower(): load_model(directory_path + entry)
    for entry in file_list
}

Abstract model method

In [None]:
def get_embeddings(sentences, batch_size=16, max_length=500):
    """Embed sentences with SciBERT, one batch at a time.

    Each batch is tokenized with truncation and padding to ``max_length``
    tokens, run through ``sciBert_model`` without gradient tracking, and the
    hidden state at position 0 of every sequence is kept.

    Args:
        sentences: sequence of strings to embed.
        batch_size: number of sentences sent through the model per step.
        max_length: token length every sequence is padded/truncated to.

    Returns:
        A NumPy array built from the per-sentence vectors (empty when
        ``sentences`` is empty).
    """
    collected = []
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            # Slicing clamps at the end of the sequence, so the last batch may be shorter.
            chunk = sentences[start:start + batch_size]
            tokens = tokenizer(
                chunk,
                truncation=True,
                return_tensors='pt',
                padding="max_length",
                max_length=max_length,
            )
            first_token_states = sciBert_model(**tokens).last_hidden_state[:, 0]
            collected.extend(first_token_states.cpu().numpy())
    return np.array(collected)


def abstract_predict(subj, abstract):
    """Predict with the abstract model of one subject area.

    Args:
        subj: key into ``abstract_model`` selecting the subject's model.
        abstract: abstract text of a single article.

    Returns:
        The first (and only) row of the selected model's output.

    Raises:
        KeyError: if no model is loaded under ``subj``.
    """
    model = abstract_model[subj]

    # get_embeddings already returns a NumPy array with one row per input
    # sentence, so it can be fed to the model without a list round trip.
    embedded_abstract = get_embeddings([abstract])

    # verbose=0 is the documented way to silence Keras' progress output.
    return model.predict(embedded_abstract, verbose=0)[0]

In [None]:
def dynamic_ranking(abstract, subjectarea):
    """Combine subject-area and abstract predictions with per-model weights.

    The subject-area model is weighted by ``subj_acc`` and each abstract model
    by its ``subjs_model_acc`` entry; all weights are divided by their sum, so
    the weights used for one article add up to 1.

    Args:
        abstract: abstract text of a single article.
        subjectarea: list of subject-area codes (keys of ``subjs_model_acc``
            and ``abstract_model``).

    Returns:
        Array of combined scores, one per class.

    Raises:
        KeyError: if a subject code has no weight or no loaded model.
    """
    # Normalising constant: subject-model weight plus one weight per subject area.
    total_percentage = subj_acc
    for subject in subjectarea:
        total_percentage += subjs_model_acc[subject]

    subj_pred = subject_predict(subjectarea) * (subj_acc / total_percentage)

    total_abstract_pred = np.zeros(len(le.classes_))
    if len(subjectarea) > 0:
        # The embedding depends only on the abstract, so compute it once and
        # reuse it for every subject model instead of re-running SciBERT per
        # subject area.
        embedded_abstract = get_embeddings([abstract])
        for subject in subjectarea:
            abstract_pred = abstract_model[subject].predict(embedded_abstract, verbose=0)[0]
            total_abstract_pred += abstract_pred * subjs_model_acc[subject] / total_percentage

    return total_abstract_pred + subj_pred


def static_ranking(abstract, subjectarea, abstract_weight=0.75, subj_weight=0.25):
    """Blend the averaged abstract predictions with the subject-area prediction.

    Args:
        abstract: abstract text of a single article.
        subjectarea: list of subject-area codes (keys of ``abstract_model``).
        abstract_weight: factor applied to the mean of the per-subject
            abstract predictions.
        subj_weight: factor applied to the subject-area model's prediction.

    Returns:
        Array of combined scores, one per class. With an empty
        ``subjectarea`` the abstract part contributes zeros.

    Raises:
        KeyError: if a subject code has no loaded abstract model.
    """
    subj_pred = subject_predict(subjectarea)

    n_subjects = len(subjectarea)
    mean_abstract_pred = np.zeros(len(le.classes_))
    if n_subjects > 0:
        # The embedding depends only on the abstract, so compute it once and
        # reuse it for every subject model instead of re-running SciBERT per
        # subject area.
        embedded_abstract = get_embeddings([abstract])
        for subject in subjectarea:
            mean_abstract_pred += abstract_model[subject].predict(embedded_abstract, verbose=0)[0]
        mean_abstract_pred = mean_abstract_pred / n_subjects
    # else: no abstract model applies; keep zeros rather than dividing by zero,
    # which would turn every score into NaN.

    return (mean_abstract_pred * abstract_weight) + (subj_pred * subj_weight)


def ranking(abstract, subjectarea):
    """Score every class for one article with the currently selected scheme.

    At present this is static_ranking with abstract_weight=1 and
    subj_weight=0, i.e. the abstract models only. To evaluate another scheme,
    call dynamic_ranking(abstract, subjectarea) here or pass other weights.
    """
    return static_ranking(abstract, subjectarea, 1, 0)

Load data

In [None]:
# Change the file path if needed; the CSV is resolved against the current
# working directory and its first column becomes the index.
csv_path = os.getcwd() + '/data/body_extracted.csv'
data = pd.read_csv(csv_path, index_col=0)
def extract_journal(doi):
    """Extract the journal code from a DOI string.

    The code is the text that follows the first ``j.`` up to (not including)
    the next ``.``.

    Args:
        doi: DOI string; non-string values (e.g. a missing value read as NaN)
            are accepted and treated as "no journal".

    Returns:
        The journal code, or None when ``doi`` is not a string or contains no
        ``j.`` segment.
    """
    # re.search raises TypeError on non-strings, so reject them up front.
    if not isinstance(doi, str):
        return None
    # just get text after 'j.' until another '.'
    match = re.search(r'j\.([^\.]+)', doi)
    return match.group(1) if match else None

# Replace every DOI with the journal code extracted from it. This overwrites
# the 'doi' column in place, so re-running this cell applies extract_journal
# to the already-extracted codes rather than to the original DOIs.
data['doi'] = data['doi'].apply(extract_journal)

# Keep only rows whose journal's share of the data is strictly greater than
# 22/len(data).
# NOTE(review): the comparison below is `>`, so (when no doi is missing) a
# journal needs more than 22 articles to be kept, although the inline comment
# says "at least" 22 — confirm which is intended before changing it, since the
# kept journals must match the classes of the trained label encoder.
threshold = 22/len(data) # the remaining journals need to at least contain 22 articles
counts = data['doi'].value_counts(normalize=True)
data = data.loc[data['doi'].isin(counts[counts > threshold].index), :]

Test accuracy

In [None]:
# Evaluate on a fixed-size random sample (seeded so the sample is repeatable).
test_size = 3000
test_data = data.sample(n=test_size, random_state=24)

# Per test article: 1-based rank of the true journal, and number of subject areas.
prediction_ranking_list = []
subject_size = []

for _, row in tqdm(test_data.iterrows(), total=len(test_data)):
    # 'subjareas' is stored as the text of a Python list; parse and lower-case it.
    subjareas = [area.lower() for area in ast.literal_eval(row['subjareas'])]
    pred = ranking(row['abstract'], subjareas)

    # Class indices ordered from highest to lowest score.
    pred_index = np.argsort(pred)[::-1]

    # Index of the article's true journal among the encoder's classes.
    true_index = np.flatnonzero(le.classes_ == row['doi'])[0]

    # 0-based position of the true class in the ordering; stored 1-based.
    predict_ranking = np.flatnonzero(pred_index == true_index)[0]
    prediction_ranking_list.append(predict_ranking + 1)

    subject_size.append(len(subjareas))

  0%|                                                                               | 2/3000 [00:06<2:28:10,  2.97s/it]



  0%|▏                                                                              | 5/3000 [00:11<1:32:20,  1.85s/it]



100%|████████████████████████████████████████████████████████████████████████████| 3000/3000 [3:50:53<00:00,  4.62s/it]


In [None]:
# Top-1 accuracy: share of test articles whose true journal is ranked first.
top_1_hits = sum(1 for rank in prediction_ranking_list if rank <= 1)
top_1_acc = top_1_hits / len(prediction_ranking_list)
top_1_acc

0.6973333333333334

In [None]:
# Top-3 accuracy: share of test articles whose true journal is ranked in the first three.
top_3_hits = sum(1 for rank in prediction_ranking_list if rank <= 3)
top_3_acc = top_3_hits / len(prediction_ranking_list)
top_3_acc

0.8776666666666667

In [None]:
# Top-5 accuracy: share of test articles whose true journal is ranked in the first five.
top_5_hits = sum(1 for rank in prediction_ranking_list if rank <= 5)
top_5_acc = top_5_hits / len(prediction_ranking_list)
top_5_acc

0.938

In [None]:
# Top-1 accuracy restricted to test articles with exactly 1 subject area.
# correct_list holds the ranks of those articles (both lists are filled in step
# by the evaluation loop, so they can be paired positionally).
correct_list = [rank for rank, size in zip(prediction_ranking_list, subject_size) if size == 1]
acc_with_sub_size_1 = sum(1 for rank in correct_list if rank <= 1) / len(correct_list)
acc_with_sub_size_1


0.6435495898583147

In [None]:
# Top-1 accuracy restricted to test articles with exactly 2 subject areas.
# correct_list holds the ranks of those articles (both lists are filled in step
# by the evaluation loop, so they can be paired positionally).
correct_list = [rank for rank, size in zip(prediction_ranking_list, subject_size) if size == 2]
acc_with_sub_size_2 = sum(1 for rank in correct_list if rank <= 1) / len(correct_list)
acc_with_sub_size_2

0.7138939670932358

In [None]:
# Top-1 accuracy restricted to test articles with exactly 3 subject areas.
# correct_list holds the ranks of those articles (both lists are filled in step
# by the evaluation loop, so they can be paired positionally).
correct_list = [rank for rank, size in zip(prediction_ranking_list, subject_size) if size == 3]
acc_with_sub_size_3 = sum(1 for rank in correct_list if rank <= 1) / len(correct_list)
acc_with_sub_size_3

0.7388888888888889

In [None]:
# Top-1 accuracy restricted to test articles with exactly 4 subject areas.
# correct_list holds the ranks of those articles (both lists are filled in step
# by the evaluation loop, so they can be paired positionally).
correct_list = [rank for rank, size in zip(prediction_ranking_list, subject_size) if size == 4]
acc_with_sub_size_4 = sum(1 for rank in correct_list if rank <= 1) / len(correct_list)
acc_with_sub_size_4

0.8923076923076924

In [None]:
# Top-1 accuracy restricted to test articles with exactly 5 subject areas.
# correct_list holds the ranks of those articles (both lists are filled in step
# by the evaluation loop, so they can be paired positionally).
correct_list = [rank for rank, size in zip(prediction_ranking_list, subject_size) if size == 5]
acc_with_sub_size_5 = sum(1 for rank in correct_list if rank <= 1) / len(correct_list)
acc_with_sub_size_5

1.0