In [5]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
# from utils import pdfs_to_df, tokenize_df_of_texts
import torch
import json
import os

In [11]:
# Load the JSON record stored at ../data/parsed_cleaned_pdfs/roberta/1.json into `data`.
fname = os.path.join("..", "data", "parsed_cleaned_pdfs", "roberta", "1.json")
# Pin the encoding: without it, open() falls back to the platform's locale
# default, which can mis-decode or reject non-ASCII characters in the file.
with open(fname, encoding="utf-8") as json_file:
    data = json.load(json_file)

# Show which top-level fields the record provides.
print(data.keys())

dict_keys(['tokens', 'tokens_less_sw', 'token_embeddings', 'token_embeddings_less_sw', 'Document', 'Abstract', 'Text', 'Abstract_Original', 'Original_Text', 'Path', 'sha_256', 'language', 'language_probability', 'Authors', 'Title', 'url', 'date'])


In [14]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

# Load the model and tokenizer once and hand the objects to the pipeline.
# Passing the checkpoint name to `pipeline` and then calling `from_pretrained`
# separately would instantiate the same checkpoint twice.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Get predictions: ask the question against the `Text` field of the loaded record.
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)
QA_input = {
    'question': "What is a contact manifold",
    'context': data['Text']
}
res = nlp(QA_input)

Downloading model.safetensors: 100%|████████████████████████████████████████████████████████████████████████| 496M/496M [00:11<00:00, 43.8MB/s]


In [15]:
# Display the result returned by the question-answering pipeline call.
res

{'score': 0.29526928067207336,
 'start': 962,
 'end': 989,
 'answer': 'an odd-dimensional manifold'}

In [12]:
# Same question string that the question-answering cell passes to the pipeline.
# NOTE(review): `question` is not referenced by any other cell visible here — confirm it is still needed.
question = "What is a contact manifold"
# Display the record's `Text` field, i.e. the string used as the QA context.
data['Text']

'lagrangian cobordisms between legendrian knots arise in symplectic field theory and impose an interesting and not well-understood relation on legendrian knots. there are some known “elementary” building blocks for lagrangian cobordisms that are smoothly the attachment of 0- and 1-handles. an important question is whether every pair of nonempty legendrians that are related by connected lagrangian cobordism can be related by ribbon lagrangian cobordism, in particular one that is “decomposable” into composition of these elementary building blocks. we will describe these and other combinatorial building blocks as well as some geometric methods, involving the theory of satellites, to construct lagrangian cobordisms. we will then survey some known results, derived through heegaard floer homology and contact surgery, that may provide pathway to proving the existence of nondecomposable (nonribbon) lagrangian cobordisms. 1. introduction contact manifold is an odd-dimensional manifold together 

In [1]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint.
# NOTE(review): this binds the same name `tokenizer` that the question-answering
# cell assigns to the RoBERTa tokenizer, so which tokenizer later cells see
# depends on cell execution order.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# The notebook's import cell has this import commented out, so on a fresh
# kernel the helper calls below would raise NameError. Import them here so the
# cell survives Restart & Run All.
# NOTE(review): module path taken from the commented-out import line — confirm `utils` is importable from here.
from utils import pdfs_to_df, tokenize_df_of_texts

# Build a DataFrame from the PDFs in the test directory, then add tokens.
directory = './test_pdfs'
df = pdfs_to_df(directory)
df = tokenize_df_of_texts(df)

# Keep only the columns used downstream; everything else is dropped.
keep_cols = ['Document', 'Text', 'Original_Text', 'Path', 'tokens']
drop_cols = [col for col in df.columns if col not in keep_cols]
print(drop_cols)

df = df.drop(columns=drop_cols)

./test_pdfs/2101.00031.pdf
./test_pdfs/2101.01089.pdf
./test_pdfs/2101.00182.pdf
./test_pdfs/2101.00525.pdf
./test_pdfs/2101.01017.pdf
./test_pdfs/2101.00005.pdf
./test_pdfs/2101.00763.pdf
./test_pdfs/2101.01291.pdf
./test_pdfs/2101.00831.pdf
./test_pdfs/2101.01094.pdf
./test_pdfs/2101.00572.pdf
processing text...
making lower-case...
Removing non-text elements (extra whitespaces)...
Removing unnecessary whitespace and special characters...
Removing line breaks...
Removing gibberish...
Removing unicode...
remove single letters or super large words (so big they don't make sense)...
done cleaning.

tokenize the processed text...
['Abstract', 'Abstract_Original', 'sha_256', 'language', 'language_probability', 'Authors', 'Title', 'url', 'date', 'token_embeddings']


In [9]:
# Example data: split the `tokens` column by position.
# NOTE(review): the "labels" here are simply the remaining entries of the same
# `tokens` column, not separate targets, and nothing makes the two slices the
# same length — confirm real labels are substituted before training.
all_tokens = df['tokens'].to_list()
train_texts = all_tokens[:8]   # first eight entries
train_labels = all_tokens[8:]  # every entry after the eighth

In [13]:
# Encode each training text and collect its input ids and attention mask.
input_ids = []
attention_masks = []
for text in train_texts:
    encoded_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        # `pad_to_max_length=True` is deprecated; `padding='max_length'` is
        # the documented replacement and pads every example to `max_length`.
        padding='max_length',
        # Truncate explicitly so longer inputs are cut to `max_length` too;
        # otherwise the per-text tensors could differ in length and fail to
        # concatenate later.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded_text['input_ids'])
    attention_masks.append(encoded_text['attention_mask'])



In [None]:
# Merge the per-text tensors collected by the encoding loop into single tensors
# (torch.cat joins along dimension 0 by default).
input_ids = torch.cat(input_ids)
attention_masks = torch.cat(attention_masks)
# NOTE(review): torch.tensor needs numeric, rectangular data — confirm `train_labels` qualifies.
labels = torch.tensor(train_labels)

# Bundle the three tensors into a dataset and serve it in shuffled batches of 32.
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)
