This notebook extracts textual features (paper embeddings) and metadata features (date, authors) for each paper.

In [None]:
# Shell command (IPython `!` escape): list the NVIDIA GPUs visible to this runtime.
!nvidia-smi -L

In [None]:
!pip install transformers

In [30]:
from transformers import AutoModel, AutoTokenizer

import torch

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
import gzip
import pickle

# Load the textual data and metadata

In [4]:
# The file is read with header=None, so the column names are supplied explicitly.
column_names = ['node_id', 'date', 'title', 'authors', 'journal', 'abstract']
data = pd.read_csv(
    '/content/drive/MyDrive/MLNS/data/node_information.csv',
    header=None,
    names=column_names,
)
data.head()

Unnamed: 0,node_id,date,title,authors,journal,abstract
0,1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1,1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
2,1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
3,1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
4,1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...


## Some Information

In [5]:
# Number of whitespace-separated words in each abstract, in row order.
length_abstract = [
  len(data.loc[row, 'abstract'].split())
  for row in range(len(data))
]

In [6]:
# Report the longest, shortest and average abstract length (in words).
summary_lines = (
  f"Maximum abstract length : {max(length_abstract)}",
  f"Minimum abstract length : {min(length_abstract)}",
  f"Mean abstract length : {sum(length_abstract)/len(length_abstract)}",
)
for summary_line in summary_lines:
  print(summary_line)

Maximum abstract length : 343
Minimum abstract length : 3
Mean abstract length : 101.39711919337414


# Load the Embedding Model

In [7]:
# The tokenizer and the encoder are both loaded from the same pretrained checkpoint.
checkpoint = 'allenai/scibert_scivocab_uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…




# Extract Data

In [None]:
# Move the model to the GPU, then put it in evaluation mode.
# Both calls return the module itself, so they can be chained.
model.cuda().eval()

In [29]:
# Build a mapping node_id -> {"date", "paper_embedding", "authors"} for every row of `data`.
txt2feat = dict()

# The encoder cannot embed more positions than this; longer inputs are truncated
# instead of crashing the forward pass.
max_tokens = model.config.max_position_embeddings

# Inference only: no_grad avoids building an autograd graph for every paper,
# which saves memory and time and makes a later .detach() unnecessary.
with torch.no_grad():
  for i in tqdm(range(len(data))):
    node_id = data.loc[i, 'node_id']
    date = data.loc[i, 'date']
    title = data.loc[i, 'title']
    abstract = data.loc[i, 'abstract']

    # A missing authors field is read by pandas as a non-string (NaN); keep None
    # for that case. Names are stripped so that "A, B" yields ["A", "B"] rather
    # than ["A", " B"], and empty fragments (e.g. from a trailing comma) are dropped.
    raw_authors = data.loc[i, 'authors']
    if isinstance(raw_authors, str):
      authors = [name.strip() for name in raw_authors.split(',') if name.strip()]
    else:
      authors = None

    # The text fed to the encoder is "<title> - <abstract>".
    article_textual_embedding = title + ' - ' + abstract
    tokens = tokenizer.encode(
      article_textual_embedding,
      return_tensors='pt',
      truncation=True,
      max_length=max_tokens,
    ).to(model.device)  # same device as the model, wherever it currently lives

    # One text per call, so index 0 selects that text's pooled output vector.
    paper_embedding = model(tokens)['pooler_output'][0].cpu().numpy()

    metadata = {"date": date, "paper_embedding": paper_embedding, "authors": authors}

    txt2feat[node_id] = metadata

HBox(children=(FloatProgress(value=0.0, max=27770.0), HTML(value='')))




# Saving

In [33]:
def save(object, filename, protocol = 0):
  """Pickle ``object`` and write it, gzip-compressed, to ``filename``.

  Args:
    object: Any picklable Python object.
    filename: Destination path; an existing file is overwritten.
    protocol: Pickle protocol handed to ``pickle.dumps`` (default 0).

  Raises:
    Whatever ``pickle.dumps`` raises for an unpicklable object, or ``OSError``
    if the destination cannot be opened for writing.
  """
  # Serialise before opening the destination: if pickling fails, an existing
  # file at `filename` is left untouched instead of being truncated.
  payload = pickle.dumps(object, protocol)
  # The context manager closes the file even if the write raises.
  with gzip.GzipFile(filename, 'wb') as fh:
    fh.write(payload)

In [34]:
# Write the extracted features to disk as a gzip-compressed pickle.
output_path = '/content/drive/MyDrive/MLNS/data_generated/textmetafeatures.files'
save(txt2feat, output_path)