In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"  # You can use other BERT variants like 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # For binary classification

In [2]:
import data_file

15
(15, 384)
15
Vectorstore made


In [None]:
data_list=data_file.data1
data_list

In [None]:
data_X=[]
data_Y=[]
for element in data_list:
    data_X.append(element['content'])
    data_Y.append(element['label'])
data_X

In [27]:
def sliding_window_tokenize(text, max_length=512, overlap=256):
    tokens = tokenizer.encode(text)
    windows = []
    for i in range(0, len(tokens), max_length - overlap):
        window = tokens[i:i + max_length]
        windows.append(window)
    return windows
tokenized_cont={}
for i in range(len(data_X)):
    windows = sliding_window_tokenize(data_X[i], max_length=512, overlap=256)
    tokenized_cont[i]=windows

In [66]:
sum=0
for i in range(len(tokenized_cont)):
    sum+=len(tokenized_cont[i])
sum

456

In [None]:
from torch.utils.data import Dataset
import torch

# Define the Dataset class
class BinaryClassificationDataset(Dataset):
    def __init__(self, tokenized_windows, labels):
        """
        Args:
        - tokenized_windows: Pre-tokenized windows of text (List[List[int]]).
        - labels: Corresponding labels for each window (List[int]).
        """
        self.tokenized_windows = tokenized_windows
        self.labels = labels

    def __len__(self):
        return len(self.tokenized_windows)

    def __getitem__(self, idx):
        """
        Retrieve a single data sample.
        Args:
        - idx: Index of the data sample.
        Returns:
        - A dictionary with 'input_ids', 'attention_mask', and 'labels'.
        """
        input_ids = self.tokenized_windows[idx]
        label = self.labels[idx]

        # Generate attention mask (1 for real tokens, 0 for padding)
        attention_mask = [1] * len(input_ids) + [0] * (512 - len(input_ids))

        # Pad input_ids to max_length (512)
        padded_input_ids = input_ids + [0] * (512 - len(input_ids))

        return {
            'input_ids': torch.tensor(padded_input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Prepare tokenized windows and labels
all_windows = []
all_labels = []

# Dynamically populate windows and replicate labels
for idx, windows in tokenized_cont.items():
    all_windows.extend(windows)  # Add all windows for the sample
    all_labels.extend([data_Y[idx]] * len(windows))  # Replicate the label for each window

# Verify size consistency
assert len(all_windows) == len(all_labels), "Mismatch between windows and labels!"

# Initialize the dataset
train_dataset = BinaryClassificationDataset(all_windows, all_labels)

# Check dataset size and a sample
print(f"Dataset size: {len(train_dataset)}")
print(f"Sample data: {train_dataset[0]}")




In [68]:
type(train_dataset)

__main__.BinaryClassificationDataset

In [69]:
len(train_dataset)

456

In [70]:
from transformers import Trainer, TrainingArguments
import accelerate
import torch
accelerate.__version__

'1.0.1'

In [None]:
training_args = TrainingArguments(
    output_dir='./results',            # output directory for model predictions and checkpoints
    num_train_epochs=1,                # number of training epochs
    per_device_train_batch_size=1,     # batch size for training
    per_device_eval_batch_size=1,      # batch size for evaluation
    warmup_steps=500,                  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                 # strength of weight decay
    logging_dir='./logs',              # directory for storing logs
    logging_steps=1,
)

# Setup Trainer
trainer = Trainer(
    model=model, 
    args=training_args,
    train_dataset=train_dataset,         # Pass the dataset
)

# Fine-tune the model
trainer.train()

In [None]:
# Assuming you have a validation dataset, pass it here
trainer.evaluate(train_dataset)  # Pass your validation dataset (similar to train_dataset)


In [None]:
# Save the model and tokenizer
model.save_pretrained('./binary_classification_model')
tokenizer.save_pretrained('./binary_classification_model')

In [24]:
import torch

def prepare_windows_for_model(windows, tokenizer, max_length=512):
    """
    Converts tokenized windows into model-compatible tensors.
    
    Args:
    - windows: List of tokenized windows.
    - tokenizer: Pretrained tokenizer object.
    - max_length: int, maximum sequence length for the model.
    
    Returns:
    - input_tensors: List of dictionaries with 'input_ids' and 'attention_mask'.
    """
    input_tensors = []
    for window in windows:
        # Decode tokens back into text for the tokenizer
        text_window = tokenizer.decode(window, skip_special_tokens=True)
        
        # Tokenize and encode for the model
        encoding = tokenizer(
            text_window,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        input_tensors.append({
            'input_ids': encoding['input_ids'].squeeze(0),  # Remove extra batch dimension
            'attention_mask': encoding['attention_mask'].squeeze(0)
        })
    return input_tensors


In [77]:
from pypdf import PdfReader
import os

In [56]:
# Load the fine-tuned model
input_text=""
with open('./Papers/P099.pdf', "rb") as f:
    reader=PdfReader(f)
    for pages in reader.pages:
        input_text+=pages.extract_text()

model = BertForSequenceClassification.from_pretrained('./binary_classification_model')
tokenizer = BertTokenizer.from_pretrained('./binary_classification_model')
# Example inference
prompt=input_text
tokenized_text=prepare_windows_for_model(sliding_window_tokenize(prompt),tokenizer)
output = model(input_ids=tokenized_text[0]['input_ids'].unsqueeze(0),attention_mask=tokenized_text[0]['attention_mask'].unsqueeze(0))

# Convert logits to probabilities (if needed)
logits = output.logits
probabilities = torch.nn.Softmax(dim=1)(logits)
predicted_class = torch.argmax(probabilities, dim=1)

print(predicted_class)


Token indices sequence length is longer than the specified maximum sequence length for this model (3057 > 512). Running this sequence through the model will result in indexing errors


tensor([1])


In [8]:
import os
from pypdf import PdfReader
import pandas as pd

In [18]:
def load_content(destination):
    rows = []
    for file_name in os.listdir(destination):
        file_path = os.path.join(destination, file_name)
        with open(file_path, "rb") as f:
            reader = PdfReader(f)
            content=""
            for page in reader.pages:
                content+=page.extract_text()            
            rows.append({"file_name": file_name, "file_content": content})
            print(file_name) 
    print(len(rows))  
    return rows

In [None]:
rows=load_content('./Papers')

In [None]:
rows

In [12]:
model = BertForSequenceClassification.from_pretrained('./binary_classification_model')
tokenizer = BertTokenizer.from_pretrained('./binary_classification_model')

In [20]:
import numpy as np

In [34]:
rows[34]['file_name'][1:4]

'124'

In [42]:
outputs=np.zeros((135,2))
for i in range(len(rows)):
    tokenized_text=prepare_windows_for_model(sliding_window_tokenize(rows[i]['file_content']),tokenizer)
    output = model(input_ids=tokenized_text[0]['input_ids'].unsqueeze(0),attention_mask=tokenized_text[0]['attention_mask'].unsqueeze(0))
    logits = output.logits
    probabilities = torch.nn.Softmax(dim=1)(logits)
    predicted_class = torch.argmax(probabilities, dim=1)
    file_number=rows[i]['file_name'][1:4]
    outputs[int(file_number)-1,0]=predicted_class
    outputs[int(file_number)-1,1]=int(file_number)-1

In [None]:
labels=outputs[:,0].astype(int)
labels

In [46]:
labels_df=pd.DataFrame(labels)
labels_df.to_csv('output.csv',index=False,header=False)