# Loading Tools and Dataset

In [14]:
!pip install tensorflow



In [16]:
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split

from ast import literal_eval
# is used for safely evaluating strings containing Python literals or container displays
# (e.g., lists, dictionaries) to their corresponding Python objects.

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [18]:
# arxiv_data = pd.read_csv("arxiv_data.csv")
arxiv_data = pd.read_csv("arxiv_data_210930-054931.csv")

In [26]:
arxiv_data.head()

Unnamed: 0,terms,titles,abstracts
0,['cs.LG'],Multi-Level Attention Pooling for Graph Neural...,Graph neural networks (GNNs) have been widely ...
1,"['cs.LG', 'cs.AI']",Decision Forests vs. Deep Networks: Conceptual...,Deep networks and decision forests (such as ra...
2,"['cs.LG', 'cs.CR', 'stat.ML']",Power up! Robust Graph Convolutional Network v...,Graph convolutional networks (GCNs) are powerf...
3,"['cs.LG', 'cs.CR']",Releasing Graph Neural Networks with Different...,With the increasing popularity of Graph Neural...
4,['cs.LG'],Recurrence-Aware Long-Term Cognitive Network f...,Machine learning solutions for pattern classif...


In [28]:
# arxiv_data.shape

In [30]:
# arxiv_data.drop(columns = ["terms","abstracts"], inplace = True)

In [32]:
# arxiv_data

## Data Cleaning and Preprocessing

In [36]:
arxiv_data.shape

(56181, 3)

In [38]:
arxiv_data.isnull().sum()

terms        0
titles       0
abstracts    0
dtype: int64

In [40]:
arxiv_data.duplicated().sum()

15054

In [42]:
# o index data
arxiv_data['terms']

0                                          ['cs.LG']
1                                 ['cs.LG', 'cs.AI']
2                      ['cs.LG', 'cs.CR', 'stat.ML']
3                                 ['cs.LG', 'cs.CR']
4                                          ['cs.LG']
                            ...                     
56176                             ['cs.CV', 'cs.IR']
56177    ['cs.LG', 'cs.AI', 'cs.CL', 'I.2.6; I.2.7']
56178                                      ['cs.LG']
56179                ['stat.ML', 'cs.LG', 'math.OC']
56180                  ['cs.LG', 'cs.AI', 'stat.ML']
Name: terms, Length: 56181, dtype: object

In [44]:
# 0 index 
arxiv_data['abstracts']

0        Graph neural networks (GNNs) have been widely ...
1        Deep networks and decision forests (such as ra...
2        Graph convolutional networks (GCNs) are powerf...
3        With the increasing popularity of Graph Neural...
4        Machine learning solutions for pattern classif...
                               ...                        
56176    Despite the growing availability of big data i...
56177    This paper presents a simple end-to-end model ...
56178    The popular Q-learning algorithm is known to o...
56179    Principal components analysis (PCA) is a well-...
56180    SDYNA is a general framework designed to addre...
Name: abstracts, Length: 56181, dtype: object

In [46]:
labels_column = arxiv_data['terms'].apply(literal_eval)
labels = labels_column.explode().unique()
print("labels :",labels)
print("lenght :",len(labels))

labels : ['cs.LG' 'cs.AI' 'cs.CR' ... 'D.1.3; G.4; I.2.8; I.2.11; I.5.3; J.3'
 '68T07, 68T45, 68T10, 68T50, 68U35' 'I.2.0; G.3']
lenght : 1177


In [47]:
arxiv_data[arxiv_data['titles'].duplicated()]

Unnamed: 0,terms,titles,abstracts
71,"['cs.LG', 'cs.AI']",Learnable Hypergraph Laplacian for Hypergraph ...,HyperGraph Convolutional Neural Networks (HGCN...
747,"['cs.LG', 'cs.AI']",Do Transformers Really Perform Bad for Graph R...,The Transformer architecture has become a domi...
852,['cs.LG'],Self-supervised Auxiliary Learning for Graph N...,"In recent years, graph neural networks (GNNs) ..."
950,"['cs.LG', 'cs.CL', 'cs.SE', 'stat.ML']",Structured Neural Summarization,Summarization of long sequences into a concise...
990,"['cs.CV', 'cs.LG', 'eess.IV']",Learning Local Neighboring Structure for Robus...,Mesh is a powerful data structure for 3D shape...
...,...,...,...
56171,"['cs.LG', 'cs.AI']",Reinforcement Learning with Deep Energy-Based ...,We propose a method for learning expressive en...
56172,"['cs.LG', 'cs.AI']",A Laplacian Framework for Option Discovery in ...,Representation learning and option discovery a...
56174,"['cs.LG', 'stat.ML']",Neural Episodic Control,Deep reinforcement learning methods attain sup...
56178,['cs.LG'],Deep Reinforcement Learning with Double Q-lear...,The popular Q-learning algorithm is known to o...


In [48]:
arxiv_data.shape

(56181, 3)

In [49]:
# arxiv_data['terms'].value_counts()
arxiv_data['terms'].value_counts()==1

terms
['cs.CV']                                                         False
['cs.LG', 'stat.ML']                                              False
['cs.LG']                                                         False
['cs.CV', 'cs.LG']                                                False
['cs.LG', 'cs.AI']                                                False
                                                                  ...  
['cs.CV', 'I.4.9; J.3']                                            True
['cs.LG', 'stat.ML', '68-04']                                      True
['cs.LG', 'math.RT', 'stat.ML']                                    True
['cs.LG', 'nlin.CD', 'physics.data-an', 'q-bio.QM', 'stat.ML']     True
['cs.LG', 'cs.AI', 'cs.CL', 'I.2.6; I.2.7']                        True
Name: count, Length: 3402, dtype: bool

In [54]:
sum(arxiv_data['terms'].value_counts()==1)

1846

In [56]:
print(sum(arxiv_data['terms'].value_counts()==1))
print(arxiv_data['terms'].nunique())

1846
3402


In [58]:
# Filtering the rare terms. (it keeps only those rows where the "terms" value occurs more than once in the original DataFrame.)
arxiv_data_filtered = arxiv_data.groupby('terms').filter(lambda x: len(x) > 1)
arxiv_data_filtered.shape

(54335, 3)

In [60]:
arxiv_data_filtered['terms'] = arxiv_data_filtered['terms'].apply(lambda x: literal_eval(x))
arxiv_data_filtered['terms'].values[:3]

array([list(['cs.LG']), list(['cs.LG', 'cs.AI']),
       list(['cs.LG', 'cs.CR', 'stat.ML'])], dtype=object)

## Train and Test Split

In [63]:
# test_split = 0.1
# The stratify parameter ensures that the splitting is done in a way that preserves the same distribution of labels (terms) in both the training and test sets.
train_df, test_df = train_test_split(arxiv_data_filtered,test_size=0.1,stratify=arxiv_data_filtered["terms"].values,)

# Splitting the test set further into validation and new test sets.
val_df = test_df.sample(frac=0.5)
test_df.drop(val_df.index, inplace=True)

# print(f"Number of rows in training set: {len(train_df)}")
# print(f"Number of rows in validation set: {len(val_df)}")
# print(f"Number of rows in test set: {len(test_df)}")

In [65]:
train_df.shape, test_df.shape

((48901, 3), (2717, 3))

In [67]:
val_df.shape, test_df.shape

((2717, 3), (2717, 3))

In [69]:
train_df.shape

(48901, 3)

In [71]:
terms = tf.ragged.constant(train_df['terms'])
lookup = tf.keras.layers.StringLookup(output_mode='multi_hot')
lookup.adapt(terms)
vocab = lookup.get_vocabulary()

In [72]:
# train_df['terms']

In [75]:
# sample_label = train_df['terms'].iloc[0]
# sample_label = train_df['terms'].iloc[1]
sample_label = train_df['terms'].iloc[2]
print(sample_label)
label_binarized = lookup([sample_label])
print(label_binarized)

['cs.LG', 'stat.ML']
tf.Tensor(
[[0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 

In [77]:
max_seqlen = 150
batch_size = 128
padding_token = "<pad>"
auto = tf.data.AUTOTUNE

def make_dataset(dataframe, is_train=True):
    labels = tf.ragged.constant(dataframe["terms"].values)
    # label_binarized is a NumPy array.
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices((dataframe["abstracts"].values, label_binarized))
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
    return dataset.batch(batch_size)

train_dataset = make_dataset(train_df, is_train=True)
validation_dataset = make_dataset(val_df, is_train=False)
test_dataset = make_dataset(test_df, is_train=False)


In [81]:
next(iter(train_dataset))

(<tf.Tensor: shape=(128,), dtype=string, numpy=
 array([b'The objective of active learning (AL) is to train classification models with\nless number of labeled instances by selecting only the most informative\ninstances for labeling. The AL algorithms designed for other data types such as\nimages and text do not perform well on graph-structured data. Although a few\nheuristics-based AL algorithms have been proposed for graphs, a principled\napproach is lacking. In this paper, we propose MetAL, an AL approach that\nselects unlabeled instances that directly improve the future performance of a\nclassification model. For a semi-supervised learning problem, we formulate the\nAL task as a bilevel optimization problem. Based on recent work in\nmeta-learning, we use the meta-gradients to approximate the impact of\nretraining the model with any unlabeled instance on the model performance.\nUsing multiple graph datasets belonging to different domains, we demonstrate\nthat MetAL efficiently outper

In [83]:
def invert_multi_hot(encoded_labels):
    hot_indices = np.argwhere(encoded_labels==1.0)[...,0]
    return np.take(vocab, hot_indices)

text_batch, label_batch = next(iter(train_dataset))
for i, text in enumerate(text_batch[:5]):
    label = label_batch[i].numpy()[None, ...]
    print(f"Abstract: {text}")
    print(f"Label(s): {invert_multi_hot(label[0])}")
    print(" ")

Abstract: b'Computational results demonstrate that posterior sampling for reinforcement\nlearning (PSRL) dramatically outperforms algorithms driven by optimism, such as\nUCRL2. We provide insight into the extent of this performance boost and the\nphenomenon that drives it. We leverage this insight to establish an\n$\\tilde{O}(H\\sqrt{SAT})$ Bayesian expected regret bound for PSRL in\nfinite-horizon episodic Markov decision processes, where $H$ is the horizon,\n$S$ is the number of states, $A$ is the number of actions and $T$ is the time\nelapsed. This improves upon the best previous bound of $\\tilde{O}(H S\n\\sqrt{AT})$ for any reinforcement learning algorithm.'
Label(s): ['cs.LG' 'stat.ML' 'cs.AI']
 
Abstract: b'Facial Expression Recognition (FER) is a classification task that points to\nface variants. Hence, there are certain affinity features between facial\nexpressions, receiving little attention in the FER literature. Convolution\npadding, despite helping capture the edge informa

In [85]:
# This code calculates the size of the vocabulary in the "abstracts" column of the train_df DataFrame.

# Creating vocabulary with uniques words
vocabulary = set()
train_df["abstracts"].str.lower().str.split().apply(vocabulary.update)
vocabulary_size = len(vocabulary)
print(vocabulary_size)

164146


## Text Vectorization

### In the Context of Natural Language Processing

In [89]:
text_vectorizer = layers.TextVectorization(max_tokens=vocabulary_size,ngrams=2,output_mode="tf_idf")
text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

In [91]:
train_dataset = train_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)
validation_dataset = validation_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)
test_dataset = test_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)

<h2>Model Training</h2>

In [97]:
from tensorflow.keras.callbacks import EarlyStopping
model1 = keras.Sequential([
    layers.Dense(512, activation="relu"),
    layers.Dropout(0.5), 
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.5), 
    layers.Dense(lookup.vocabulary_size(), activation='sigmoid')
])

model1.compile(loss="binary_crossentropy", optimizer='adam', metrics=['binary_accuracy'])
es = EarlyStopping(patience=5,restore_best_weights=True)
history = model1.fit(train_dataset,validation_data=validation_dataset,epochs=20,callbacks=[es])

Epoch 1/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m949s[0m 2s/step - binary_accuracy: 0.9612 - loss: 0.0826 - val_binary_accuracy: 0.9983 - val_loss: 0.0059
Epoch 2/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m914s[0m 2s/step - binary_accuracy: 0.9983 - loss: 0.0063 - val_binary_accuracy: 0.9985 - val_loss: 0.0049
Epoch 3/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m892s[0m 2s/step - binary_accuracy: 0.9987 - loss: 0.0046 - val_binary_accuracy: 0.9987 - val_loss: 0.0046
Epoch 4/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1044s[0m 3s/step - binary_accuracy: 0.9990 - loss: 0.0037 - val_binary_accuracy: 0.9987 - val_loss: 0.0046
Epoch 5/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m899s[0m 2s/step - binary_accuracy: 0.9991 - loss: 0.0032 - val_binary_accuracy: 0.9988 - val_loss: 0.0047
Epoch 6/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m893s[0m 2s/step - binary_accuracy: 0.9992 -

## Save Model and Text Vectorization

In [102]:
# Save the model
import pickle
model1.save("models/model.h5")

# Save the configuration of the text vectorizer
saved_text_vectorizer_config = text_vectorizer.get_config()
with open("models/text_vectorizer_config.pkl", "wb") as f:
    pickle.dump(saved_text_vectorizer_config, f)


# Save the vocabulary
with open("models/vocab.pkl", "wb") as f:
    pickle.dump(vocab, f)



### Load Model and Text Vectorization

In [109]:
# from tensorflow import keras
# import pickle

# # Load the model
# loaded_model = keras.models.load_model("models/model.h5")

# from tensorflow.keras.layers import TextVectorization

# # Load the configuration of the text vectorizer
# with open("models/text_vectorizer_config.pkl", "rb") as f:
#     saved_text_vectorizer_config = pickle.load(f)

# # Create a new TextVectorization layer with the saved configuration
# loaded_text_vectorizer = TextVectorization.from_config(saved_text_vectorizer_config)

# # Load the saved weights into the new TextVectorization layer
# with open("models/text_vectorizer_weights.pkl", "rb") as f:
#     weights = pickle.load(f)
#     loaded_text_vectorizer.set_weights(weights)



# Create a new TextVectorization layer with no token limit
loaded_text_vectorizer = TextVectorization(
    max_tokens=None,  # No limit on the vocabulary size
    output_mode="int",
    output_sequence_length=200  # Adjust to your original setup
)





ValueError: Attempted to set a vocabulary larger than the maximum vocab size. Received vocabulary size is 164148; `max_tokens` is 164146.

In [421]:
# Load the vocabulary
with open("models/vocab.pkl", "rb") as f:
    loaded_vocab = pickle.load(f)

Model Evaluation

Model Prediction

In [423]:
def invert_multi_hot(encoded_labels):
    hot_indices = np.argwhere(encoded_labels == 1.0)[..., 0]
    return np.take(loaded_vocab, hot_indices)

In [425]:
def predict_category(abstract, model, vectorizer, label_lookup):
    preprocessed_abstract = vectorizer([abstract])
    predictions = model.predict(preprocessed_abstract)
    predicted_labels = label_lookup(np.round(predictions).astype(int)[0])
    return predicted_labels

In [427]:
new_abstract = "ooooo"
predicted_category = predict_category(new_abstract, loaded_model, loaded_text_vectorizer, invert_multi_hot)
print("Predicted Categories:", predicted_category)

FailedPreconditionError: Exception encountered when calling TextVectorization.call().

[1m{{function_node __wrapped__LookupTableFindV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Table not initialized. [Op:LookupTableFindV2] name: [0m

Arguments received by TextVectorization.call():
  • inputs=["'ooooo'"]

In [401]:
# Example usage
new_abstract = 'Deep networks and decision forests (such as random forests and gradient\nboosted trees) are the leading machine learning methods for structured and\ntabular data, respectively. Many papers have empirically compared large numbers\nof classifiers on one or two different domains (e.g., on 100 different tabular\ndata settings). However, a careful conceptual and empirical comparison of these\ntwo strategies using the most contemporary best practices has yet to be\nperformed. Conceptually, we illustrate that both can be profitably viewed as\n"partition and vote" schemes. Specifically, the representation space that they\nboth learn is a partitioning of feature space into a union of convex polytopes.\nFor inference, each decides on the basis of votes from the activated nodes.\nThis formulation allows for a unified basic understanding of the relationship\nbetween these methods. Empirically, we compare these two strategies on hundreds\nof tabular data settings, as well as several vision and auditory settings. Our\nfocus is on datasets with at most 10,000 samples, which represent a large\nfraction of scientific and biomedical datasets. In general, we found forests to\nexcel at tabular and structured data (vision and audition) with small sample\nsizes, whereas deep nets performed better on structured data with larger sample\nsizes. This suggests that further gains in both scenarios may be realized via\nfurther combining aspects of forests and networks. We will continue revising\nthis technical report in the coming months with updated results.'
predicted_category = predict_category(new_abstract, loaded_model, loaded_text_vectorizer, invert_multi_hot)
print("Predicted Categories:", predicted_category)

FailedPreconditionError: Exception encountered when calling TextVectorization.call().

[1m{{function_node __wrapped__LookupTableFindV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Table not initialized. [Op:LookupTableFindV2] name: [0m

Arguments received by TextVectorization.call():
  • inputs=['\'Deep networks and decision forests (such as random forests and gradient\\nboosted trees) are the leading machine learning methods for structured and\\ntabular data, respectively. Many papers have empirically compared large numbers\\nof classifiers on one or two different domains (e.g., on 100 different tabular\\ndata settings). However, a careful conceptual and empirical comparison of these\\ntwo strategies using the most contemporary best practices has yet to be\\nperformed. Conceptually, we illustrate that both can be profitably viewed as\\n"partition and vote" schemes. Specifically, the representation space that they\\nboth learn is a partitioning of feature space into a union of convex polytopes.\\nFor inference, each decides on the basis of votes from the activated nodes.\\nThis formulation allows for a unified basic understanding of the relationship\\nbetween these methods. Empirically, we compare these two strategies on hundreds\\nof tabular data settings, as well as several vision and auditory settings. Our\\nfocus is on datasets with at most 10,000 samples, which represent a large\\nfraction of scientific and biomedical datasets. In general, we found forests to\\nexcel at tabular and structured data (vision and audition) with small sample\\nsizes, whereas deep nets performed better on structured data with larger sample\\nsizes. This suggests that further gains in both scenarios may be realized via\\nfurther combining aspects of forests and networks. We will continue revising\\nthis technical report in the coming months with updated results.\'']

Sentence Transformar

In [28]:
!pip install -U -q sentence-transformers

In [30]:
!pip install tf-keras



In [32]:
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange





In [34]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [36]:
sentences = arxiv_data['titles']

In [39]:
embeddings = model.encode(sentences)

In [40]:
embeddings

array([[ 0.06643406, -0.04954603,  0.06388083, ...,  0.00106309,
        -0.12156384, -0.06962774],
       [ 0.09212256, -0.07606938,  0.06572868, ..., -0.08565164,
        -0.09266549,  0.00725293],
       [-0.08162683,  0.02428931,  0.0188875 , ...,  0.00806162,
        -0.0512953 , -0.05873996],
       ...,
       [-0.09695337,  0.00057092,  0.07726488, ..., -0.01443806,
        -0.04748214,  0.06130564],
       [ 0.00768873, -0.1012418 ,  0.08909854, ..., -0.08199865,
        -0.05649742,  0.09007055],
       [ 0.06078517, -0.08312798, -0.00907767, ..., -0.03148185,
         0.05713108,  0.05696892]], dtype=float32)

Why select all-MiniLM-L6-v2?
All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. Source

Its small in size 80 MB with good performance.

Print the embeddings

In [41]:
c = 0
#This loop iterates over pairs of sentences and their corresponding embeddings. 
#zip is used to iterate over both lists simultaneously.
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding length:", len(embedding)) # list of floats
    # print("Embedding length:",embedding) 
    print("")
    # Breaks out of the loop after printing information for the first 5 sentences.
    if c >=5:
        break
    c +=1 

Sentence: Multi-Level Attention Pooling for Graph Neural Networks: Unifying Graph Representations with Multiple Localities
Embedding length: 384

Sentence: Decision Forests vs. Deep Networks: Conceptual Similarities and Empirical Differences at Small Sample Sizes
Embedding length: 384

Sentence: Power up! Robust Graph Convolutional Network via Graph Powering
Embedding length: 384

Sentence: Releasing Graph Neural Networks with Differential Privacy Guarantees
Embedding length: 384

Sentence: Recurrence-Aware Long-Term Cognitive Network for Explainable Pattern Classification
Embedding length: 384

Sentence: Lifelong Graph Learning
Embedding length: 384



Save Files

import pickle

with open("models/embeddings.pkl",'wb') as f:
    pickle.dump(embeddings,f)
    
with open("models/sentences.pkl",'wb') as f:
    pickle.dump(sentences,f)
    
with open("models/rec_model.pkl",'wb') as f:
    pickle.dump(model,f)
    

In [43]:
# load save files
embeddings = pickle.load(open('models/embeddings.pkl','rb'))
sentences = pickle.load(open('models/sentences.pkl','rb'))
rec_model = pickle.load(open('models/rec_model.pkl','rb'))

Recommendation for Similar Papers

In [45]:
import pickle
embeddings = pickle.load(open('models/embeddings.pkl','rb'))
sentences = pickle.load(open('models/sentences.pkl','rb'))
rec_model = pickle.load(open('models/rec_model.pkl','rb'))

In [79]:
import torch

def recommendation(input_paper):
    # Calculate cosine similarity scores between the embeddings of input_paper and all papers in the dataset.
    cosine_scores = util.cos_sim(embeddings, rec_model.encode(input_paper))
    
    # Get the indices of the top-k most similar papers based on cosine similarity.
    top_similar_papers = torch.topk(cosine_scores, dim=0, k=5, sorted=True)
                                 
    # Retrieve the titles of the top similar papers.
    papers_list = []
    for i in top_similar_papers.indices:
        papers_list.append(sentences[i.item()])
    
    return papers_list

In [81]:
# input_paper = input("Enter research paper title.....")
# recommend_papers = recommendation(input_paper)

In [83]:
# a= " Attention is all you need"
# myemb = rec_model.encode(a)
# util.cos_sim(embedding, myemb)

In [85]:
# cosine_scores = util.cos_sim(embedding, rec_model.encode(a))

In [87]:
# exampel usage 1: (use this paper as input (Attention is All you Need))
input_paper = input("Enter the title of any paper you like")
recommend_papers = recommendation(input_paper)


print("We recommend to read this paper............")
print("=============================================")
for paper in recommend_papers:
    print(paper)

Enter the title of any paper you like Attention is All you Need


We recommend to read this paper............
Attention that does not Explain Away
Attention that does not Explain Away
Attention that does not Explain Away
Area Attention
Area Attention


In [89]:
# exampel usage 2: (use this paper as input (BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding))
input_paper = input("Enter the title of any paper you like")
recommend_papers = recommendation(input_paper)


print("We recommend to read this paper............")
print("=============================================")
for paper in recommend_papers:
    print(paper)

Enter the title of any paper you like BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding


We recommend to read this paper............
BEiT: BERT Pre-Training of Image Transformers
BEiT: BERT Pre-Training of Image Transformers
VL-BERT: Pre-training of Generic Visual-Linguistic Representations
Sketch-BERT: Learning Sketch Bidirectional Encoder Representation from Transformers by Self-supervised Learning of Sketch Gestalt
Sketch-BERT: Learning Sketch Bidirectional Encoder Representation from Transformers by Self-supervised Learning of Sketch Gestalt


In [91]:
# exampel usage 3: (use this paper as input (Review of deep learning: concepts, CNN architectures, challenges, applications, future directions))
input_paper = input("Enter the title of any paper you like")
recommend_papers = recommendation(input_paper)


print("We recommend to read this paper............")
print("=============================================")
for paper in recommend_papers:
    print(paper)

Enter the title of any paper you like Review of deep learning: concepts, CNN architectures, challenges, applications, future directions


We recommend to read this paper............
A Review of Deep Learning with Special Emphasis on Architectures, Applications and Recent Trends
Review of Deep Learning
Deep Convolutional Neural Networks: A survey of the foundations, selected improvements, and some current applications
A Survey of the Recent Architectures of Deep Convolutional Neural Networks
A Survey of the Recent Architectures of Deep Convolutional Neural Networks


In [95]:
# install tool versions
import sentence_transformers
import tensorflow
import torch
print(torch.__version__)
print(sentence_transformers.__version__)
print(tensorflow.__version__)

2.5.0+cpu
3.2.1
2.17.0


In [1]:
# install tool versions
import sentence_transformers
import tensorflow
import torch
print(torch.__version__)
print(sentence_transformers.__version__)
print(tensorflow.__version__)

  from tqdm.autonotebook import tqdm, trange



2.5.0+cpu
3.2.1
2.17.0
