<a href="https://colab.research.google.com/github/offthewallace/CSE584/blob/main/classcifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the paired-text dataset; the columns used below are 'xi', 'xj' and 'llm'.
df = pd.read_csv('IMDB.csv')

# Turn the generator name in `llm` into an integer class id.
# LabelEncoder numbers classes in sorted order of their names, so inspect
# `label_encoder.classes_` to see which name maps to which id.
label_encoder = LabelEncoder().fit(df['llm'])
df['llm_label'] = label_encoder.transform(df['llm'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_xi, test_xi, train_xj, test_xj, train_labels, test_labels = train_test_split(
    df['xi'].to_numpy(),
    df['xj'].to_numpy(),
    df['llm_label'].to_numpy(),
    test_size=0.2,
    random_state=42,
)

# Convert inputs to lists of strings.
# pandas reads empty CSV cells as NaN (a float), not None, so an `is not None`
# test never fires and the cell would be stringified to the literal text 'nan'.
# pd.isna() catches both None and NaN.
def _to_text_list(values):
    """Return `values` as a list of str, mapping missing entries (None/NaN) to ''."""
    return ['' if pd.isna(value) else str(value) for value in values]

train_xi = _to_text_list(train_xi)
train_xj = _to_text_list(train_xj)
test_xi = _to_text_list(test_xi)
test_xj = _to_text_list(test_xj)



In [None]:
from transformers import LongformerTokenizer, LongformerForSequenceClassification

# Load Longformer tokenizer and model

import torch

# Release cached, currently unused GPU memory before loading the model.
torch.cuda.empty_cache()

# Tokenizer and classifier are loaded from the same pretrained checkpoint
# (the model supports up to 4096 tokens); the classifier gets a 2-way head.
LONGFORMER_CHECKPOINT = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(LONGFORMER_CHECKPOINT)
model = LongformerForSequenceClassification.from_pretrained(
    LONGFORMER_CHECKPOINT,
    num_labels=2,
)

# Tokenize paired inputs, truncating each pair to at most `max_length` tokens
def tokenize_pairs_longformer(tokenizer, xi_list, xj_list, max_length=1024):
    """Tokenize aligned text pairs into padded PyTorch tensors.

    Args:
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(first_texts, second_texts, **options)``.
        xi_list: First text of each pair.
        xj_list: Second text of each pair, aligned with ``xi_list``.
        max_length: Upper bound on the length of each encoded pair; longer
            pairs are truncated. Defaults to 1024, the value that was
            previously hard-coded.

    Returns:
        Whatever ``tokenizer`` returns for the given pairs, requested with
        truncation, padding and ``return_tensors='pt'``.
    """
    return tokenizer(
        xi_list,
        xj_list,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )

# Encode the training and test pairs with the same tokenizer settings.
train_encodings, test_encodings = (
    tokenize_pairs_longformer(tokenizer, first_texts, second_texts)
    for first_texts, second_texts in (
        (train_xi, train_xj),
        (test_xi, test_xj),
    )
)

# You can use the same training and evaluation process

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Shell escape: print the GPU, driver/CUDA versions and memory usage of this runtime.
!nvidia-smi

Sun Oct  6 17:11:05 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              44W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

# Create a PyTorch dataset for the paired input
class TextPairDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to a
            sequence or tensor indexable by example position.
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a UserWarning recommending
        ``clone().detach()``, so tensors are copied that way; anything else
        (lists, NumPy scalars, Python numbers) goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including ``'labels'``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

# Pair each tokenized split with its labels.
train_dataset = TextPairDataset(encodings=train_encodings, labels=train_labels)
test_dataset = TextPairDataset(encodings=test_encodings, labels=test_labels)

# Batch both splits with the same batch size; only the training data is shuffled.
BATCH_SIZE = 16
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
# Display the last training label to spot-check the encoded label values.
train_labels[-1]

0

In [None]:
# Move model to GPU if available
from transformers import LongformerTokenizer, LongformerForSequenceClassification, AdamW
from tqdm import tqdm
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Optimizer over all model parameters.
# NOTE(review): transformers' AdamW is deprecated upstream in favour of
# torch.optim.AdamW — consider switching (and check the weight_decay default).
optimizer = AdamW(params=model.parameters(), lr=5e-5)

# Training function
def train(model, train_loader, optimizer, device=None):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` whose
            output exposes a scalar ``.loss``.
        train_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to. Defaults to the device
            of the model's first parameter (CPU if it has none), so the
            function no longer depends on a notebook-level ``device`` variable.

    Returns:
        The average of the batch losses, or ``0.0`` if the loader yielded no
        batches (previously a ZeroDivisionError).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    total_loss = 0.0
    num_batches = 0  # counted while iterating, so loaders without len() also work
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches if num_batches else 0.0

# Evaluation function
def evaluate(model, test_loader, device=None):
    """Compute classification accuracy of ``model`` over ``test_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits`` with classes on the last axis.
        test_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to. Defaults to the device
            of the model's first parameter (CPU if it has none), so the
            function no longer depends on a notebook-level ``device`` variable.

    Returns:
        Fraction of examples whose arg-max prediction equals the label, or
        ``0.0`` if the loader yielded no examples (previously a
        ZeroDivisionError). The model is left in eval mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    total, correct = 0, 0
    with torch.no_grad():  # no gradients are needed for scoring
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)
            total += labels.size(0)
            correct += (predictions == labels).sum().item()
    return correct / total if total else 0.0

# Train for a few epochs
# Each iteration runs one pass of train() over train_loader, then scores the
# model on test_loader with evaluate() so progress can be read per epoch.
epochs = 3
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer)
    accuracy = evaluate(model, test_loader)
    # `epoch` is 0-based; it is printed 1-based.
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Test Accuracy: {accuracy:.4f}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Initializing global attention on CLS token...
Input ids are automatically padded to be a multiple of `config.attention_window`: 512
100%|██████████| 100/100 [02:35<00:00,  1.56s/it]


Epoch 1, Train Loss: 0.0928, Test Accuracy: 0.9850


100%|██████████| 100/100 [02:34<00:00,  1.54s/it]


Epoch 2, Train Loss: 0.0202, Test Accuracy: 0.9975


100%|██████████| 100/100 [02:34<00:00,  1.54s/it]


Epoch 3, Train Loss: 0.0126, Test Accuracy: 0.9875
