In [18]:
# TODO: Combine the main-part project and the abstract-part project
# TODO: Find a way to download a lot of data from arXiv
# TODO: Download a lot of data into a single folder, then have a Python script run through all of it

Data retrieval from JSON file

In [19]:
import json
import torch

# Collect the texts of each category from a JSON Lines file (one JSON object
# per line). Records whose "category" is neither "abstract" nor
# "non-abstract" are ignored.
abstract_texts = []
non_abstract_texts = []

# Read as UTF-8 explicitly: the platform default encoding (for example a
# legacy code page on Windows) can fail on, or silently corrupt, non-ASCII
# characters in the texts.
with open("data.json", "r", encoding="utf-8") as json_file:
    for line in json_file:
        # A blank line (typically a trailing newline at the end of the file)
        # is not valid JSON and would make json.loads raise; skip it.
        if not line.strip():
            continue

        # Parse the line as a JSON object and pull out the fields we need
        data = json.loads(line)
        category = data.get("category")
        text = data.get("text")

        # Route the text to the list that matches its category
        if category == "abstract":
            abstract_texts.append(text)
        elif category == "non-abstract":
            non_abstract_texts.append(text)


Data preparation

In [2]:
from transformers import BartTokenizer, BartForConditionalGeneration
from torch.utils.data import Dataset, DataLoader

# Step 1: Dataset Preparation
class MyDataset(Dataset):
    """Paired source/target texts, tokenized for sequence-to-sequence training.

    Args:
        source_texts: Sequence of input strings (encoder side).
        target_texts: Sequence of target strings (decoder side), paired with
            ``source_texts`` by index.
        tokenizer: Optional tokenizer object that is called on each text.
            When omitted, the ``facebook/bart-large`` ``BartTokenizer`` is
            loaded.
        max_source_length: Length every source sequence is padded/truncated
            to (default 512).
        max_target_length: Length every target sequence is padded/truncated
            to (default 128).
    """

    def __init__(self, source_texts, target_texts, tokenizer=None,
                 max_source_length=512, max_target_length=128):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        # Only load the pretrained tokenizer when the caller did not supply one.
        if tokenizer is None:
            tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of complete (source, target) pairs."""
        # The two lists are indexed in lockstep in __getitem__, so only
        # indices present in both are valid; using the shorter length avoids
        # an IndexError when the lists differ in size.
        return min(len(self.source_texts), len(self.target_texts))

    def _encode(self, text, max_length):
        """Tokenize one text to exactly ``max_length`` tokens as tensors."""
        return self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=max_length,
        )

    def __getitem__(self, index):
        """Return the tokenized pair at ``index`` as a dict of 1-D tensors.

        Keys: ``input_ids``, ``attention_mask`` (from the source text) and
        ``decoder_input_ids``, ``decoder_attention_mask`` (from the target
        text).
        """
        source_inputs = self._encode(self.source_texts[index], self.max_source_length)
        target_inputs = self._encode(self.target_texts[index], self.max_target_length)

        # squeeze(0) drops only the leading batch dimension added by
        # return_tensors='pt'; a bare squeeze() would also collapse the
        # sequence dimension if it happened to have size 1.
        return {
            'input_ids': source_inputs['input_ids'].squeeze(0),
            'attention_mask': source_inputs['attention_mask'].squeeze(0),
            'decoder_input_ids': target_inputs['input_ids'].squeeze(0),
            'decoder_attention_mask': target_inputs['attention_mask'].squeeze(0)
        }


# Non-abstract texts are the model inputs and abstract texts are the targets;
# batches of four examples are drawn in shuffled order.
train_dataset = MyDataset(source_texts=non_abstract_texts, target_texts=abstract_texts)
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=4)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Test data preparation

In [3]:
import re
import PyPDF2

# Derive the main text for testing.
# Read the research paper and collect the text of every page. The `with`
# block guarantees the file is closed even if parsing or extraction raises.
with open('file2.pdf', 'rb') as pdf_file:
    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(pdf_reader.pages)

    page_texts = []
    for page in pdf_reader.pages:
        # Fall back to an empty string if a page yields no text, so the join
        # below never sees a non-string value.
        page_text = page.extract_text() or ""
        page_texts.append(page_text)

# Join once instead of growing a string page by page
text = "".join(page_texts)

# Remove the word "References" and everything that follows it.
# NOTE(review): this cuts at the FIRST case-sensitive occurrence of
# "References", which may be a mention in the body rather than the section
# heading — confirm this is acceptable for the papers being processed.
references_pattern = re.compile(r'References(.*)', re.DOTALL)
text = references_pattern.sub('', text)

# Regular expression intended to match equations (any run of characters on a
# line containing "=" followed by at least one more character). It is
# compiled here but not applied in this cell.
equations_pattern = re.compile(r'.*?=.+')

# Drop everything from the start of the text through the first colon that
# follows the last occurrence of "Abstract" (with at least one character in
# between); text without such a match is left unchanged.
updated_text = re.sub(r".*Abstract[^:]+:", "", text, flags=re.DOTALL)

To train

In [8]:
# Step 2: Fine-tune BART on the (source, target) batches from train_dataloader
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        decoder_input_ids = batch['decoder_input_ids'].to(device)
        decoder_attention_mask = batch['decoder_attention_mask'].to(device)

        # Use the target token ids as labels, with padding positions set to
        # -100 so they are excluded from the loss.
        labels = decoder_input_ids.masked_fill(decoder_attention_mask == 0, -100)

        # Passing `labels` lets the model build the right-shifted decoder
        # inputs itself and compute the cross-entropy loss. Feeding the
        # unshifted targets as decoder inputs AND scoring the logits against
        # those same positions would train the model to copy its input
        # rather than predict the next token.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print("Epoch:", epoch, " Average Loss:", average_loss)

# Step 3: Save the trained model
model.save_pretrained("trained_bart_model")

TypeError: string indices must be integers