In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # Restrict to GPU 3

In [2]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [1]:
import pandas as pd

# Load the dataset
file_path = "train.txt"  # Replace with your actual file path
data = pd.read_csv(file_path, delimiter="\t", header=None, names=[
    'claim_label', 'topic_sentence', 'claim_candidate_sentence', 'article_id', 'stance_label'
])

# Check the data type of the first value in the 'stance_label' column
print("Data type of stance_label (first instance):", type(data['article_id'].iloc[0]))

Data type of stance_label (first instance): <class 'str'>


In [1]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = 'all_claims.txt'

# Attempt to load the file into a DataFrame with proper handling
try:
    data = pd.read_csv(
        file_path,
        delimiter="\t",  # Assuming tab-separated values
        header=None,  # No predefined headers
        names=['claim_label', 'topic_sentence', 'claim_candidate_sentence', 'article_id', 'stance_label'],
        skipinitialspace=True  # Handle any extra spaces
    )
except Exception as e:
    print(f"Error loading file: {e}")
    exit()

# Define a function to clean and convert stance_label
def clean_stance_label(label):
    """
    Convert stance_label -1, 0, 1 to 0, 1, 2 respectively. Return None for invalid values.
    """
    try:
        value = int(label)
        if value in {-1, 0, 1}:  # Valid stance_label values
            return value + 1  # Shift -1, 0, 1 to 0, 1, 2
    except ValueError:
        pass
    return None  # Invalid or non-convertible value

# Define a function to clean and convert claim_label
def clean_claim_label(label):
    """
    Convert claim_label 'C' to 1 and 'O' to 0. Return None for invalid values.
    """
    if label == 'C':
        return 1
    elif label == 'O':
        return 0
    return None  # Invalid or non-convertible value

# Define a function to compute joint labels
def compute_joint_label(claim_label, stance_label):
    """
    Map claim_label and stance_label to a single joint label:
    - Non-Claim -> 0
    - Support -> 1
    - Contest -> 2
    - No Relation -> 3
    """
    if claim_label == 0:  # Non-Claim
        return 0
    elif claim_label == 1:  # Claim
        if stance_label == 1:  # Support
            return 1
        elif stance_label == 2:  # Contest
            return 2
        elif stance_label == 0:  # No Relation
            return 3
    return None  # Invalid combination

# Apply validation and save the results
try:
    # Clean and convert stance_label
    data['stance_label'] = data['stance_label'].apply(clean_stance_label)

    # Clean and convert claim_label
    data['claim_label'] = data['claim_label'].apply(clean_claim_label)

    # Compute joint labels
    data['joint_label'] = data.apply(
        lambda row: compute_joint_label(row['claim_label'], row['stance_label']), axis=1
    )

    # Identify valid rows with type checks
    valid_rows = (
        data['stance_label'].notna() &  # Valid stance_label
        data['claim_label'].notna() &  # Valid claim_label
        data['joint_label'].notna() &  # Valid joint_label
        data['topic_sentence'].apply(lambda x: isinstance(x, str) and x.strip() != "") &  # Valid topic_sentence
        data['claim_candidate_sentence'].apply(lambda x: isinstance(x, str) and x.strip() != "") &  # Valid claim_candidate_sentence
        data['article_id'].apply(lambda x: isinstance(x, str) and bool(pd.Series([x]).str.match(r'^\d+_\d+$').iloc[0]))  # Valid article_id
    )

    # Filter valid and invalid rows
    cleaned_data = data[valid_rows]
    discarded_data = data[~valid_rows]  # Rows that failed validation

    # Save cleaned and discarded datasets to files
    cleaned_data.to_csv('cleaned_data.csv', index=False)
    discarded_data.to_csv('discarded_data.csv', index=False)

    # Print summaries
    print("Unprocessed Dataset:")
    print(data.head())
    print("\nDimensions of Unprocessed Dataset:")
    print(f"Rows: {data.shape[0]}, Columns: {data.shape[1]}")

    print("\nCleaned Dataset:")
    print(cleaned_data.head())
    print("\nDimensions of Cleaned Dataset:")
    print(f"Rows: {cleaned_data.shape[0]}, Columns: {cleaned_data.shape[1]}")

    print("\nDiscarded Dataset:")
    print(discarded_data.head())
    print("\nDimensions of Discarded Dataset:")
    print(f"Rows: {discarded_data.shape[0]}, Columns: {discarded_data.shape[1]}")

    print("\nFiles Saved:")
    print("- Cleaned data with joint labels: cleaned_data_with_joint_labels.csv")
    print("- Discarded data: discarded_data.csv")

except Exception as e:
    print(f"Error during data cleaning: {e}")

Unprocessed Dataset:
   claim_label                          topic_sentence  \
0            0  Should we abandon the one-child policy   
1            0  Should we abandon the one-child policy   
2            0  Should we abandon the one-child policy   
3            0  Should we abandon the one-child policy   
4            0  Should we abandon the one-child policy   

                            claim_candidate_sentence article_id  stance_label  \
0                                      Sex selection        1_1             1   
1  Sex selection is the attempt to control the se...        1_1             1   
2  It can be accomplished in several ways, both p...        1_1             1   
3  It has been marketed under the title family ba...        1_1             1   
4  According to the United Nations Population Fun...        1_1             1   

   joint_label  
0            0  
1            0  
2            0  
3            0  
4            0  

Dimensions of Unprocessed Dataset:
Rows:

In [2]:
import torch
import torch.nn as nn

class PatchEmbedding(nn.Module):
    def __init__(self, seq_len, patch_size, emb_size, vocab_size):
        super(PatchEmbedding, self).__init__()
        self.seq_len = seq_len
        self.patch_size = patch_size
        self.num_patches = seq_len // patch_size
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.projection = nn.Linear(emb_size * patch_size, emb_size)

    def forward(self, x):
        x = self.embedding(x)  # Shape: [batch_size, seq_len, emb_size]
        x = x.unfold(1, self.patch_size, self.patch_size)  # Shape: [batch_size, num_patches, patch_size, emb_size]
        x = x.flatten(2)  # Flatten patches: [batch_size, num_patches, emb_size * patch_size]
        x = self.projection(x)  # Project to emb_size: [batch_size, num_patches, emb_size]
        return x


class PositionalEncoding(nn.Module):
    def __init__(self, emb_size, max_len=500):
        super(PositionalEncoding, self).__init__()
        self.pos_embedding = nn.Parameter(torch.randn(max_len, emb_size))

    def forward(self, x):
        return x + self.pos_embedding[:x.size(1), :].unsqueeze(0)

In [3]:
import torch.nn as nn
class TransformerEncoder(nn.Module):
    def __init__(self, emb_size, num_heads, ff_dim, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.attention = nn.MultiheadAttention(emb_size, num_heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(emb_size)
        self.ff = nn.Sequential(
            nn.Linear(emb_size, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, emb_size)
        )
        self.norm2 = nn.LayerNorm(emb_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        attn_out, _ = self.attention(x, x, x)  # Self-attention
        x = x + attn_out  # Residual connection
        x = self.norm1(x)
        ff_out = self.ff(x)
        x = x + self.dropout(ff_out)  # Residual connection
        x = self.norm2(x)
        return x
    

class TextVisionTransformer(nn.Module):
    def __init__(self, seq_len, patch_size, emb_size, vocab_size, depth, num_heads, ff_dim, max_len=500, num_classes=4):
        super(TextVisionTransformer, self).__init__()

        # Patch Embedding for each input type
        self.topic_patch_embedding = PatchEmbedding(seq_len, patch_size, emb_size, vocab_size)
        self.claim_patch_embedding = PatchEmbedding(seq_len, patch_size, emb_size, vocab_size)
        self.article_patch_embedding = PatchEmbedding(seq_len, patch_size, emb_size, vocab_size)

        # Learnable CLS token
        self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size))

        # Transformer Encoders
        self.encoders = nn.ModuleList(
            [TransformerEncoder(emb_size, num_heads, ff_dim) for _ in range(depth)]
        )

        # Joint Classifier for Multi-Label Classification
        self.joint_classifier = nn.Linear(emb_size, num_classes)  # Combined classes: Non-Claim, Support, Contest, No Relation

    def forward(self, topic_input_ids, claim_input_ids, article_id_tokens):
        """
        Forward pass through the TextVisionTransformer.
        Inputs:
        - topic_input_ids: tensor of shape [batch_size, seq_len]
        - claim_input_ids: tensor of shape [batch_size, seq_len]
        - article_id_tokens: tensor of shape [batch_size, seq_len]
        Outputs:
        - joint_logits: tensor of shape [batch_size, num_classes]
        """
        batch_size = topic_input_ids.size(0)

        # Step 1: Patch Embedding for each input
        topic_embeddings = self.topic_patch_embedding(topic_input_ids)  # Shape: [batch_size, num_patches, emb_size]
        claim_embeddings = self.claim_patch_embedding(claim_input_ids)  # Shape: [batch_size, num_patches, emb_size]
        article_embeddings = self.article_patch_embedding(article_id_tokens)  # Shape: [batch_size, num_patches, emb_size]

        # Step 2: Concatenate all embeddings
        combined_embeddings = torch.cat((topic_embeddings, claim_embeddings, article_embeddings), dim=1)

        # Step 3: Add CLS Token
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # Shape: [batch_size, 1, emb_size]
        combined_embeddings = torch.cat((cls_tokens, combined_embeddings), dim=1)  # Shape: [batch_size, total_patches + 1, emb_size]

        # Step 4: Transformer Encoder Layers
        for encoder in self.encoders:
            combined_embeddings = encoder(combined_embeddings)  # Shape remains [batch_size, total_patches + 1, emb_size]

        # Step 5: Extract CLS Token
        cls_token_final = combined_embeddings[:, 0, :]  # Extract the CLS token: [batch_size, emb_size]

        # Step 6: Joint Classifier Output
        joint_logits = self.joint_classifier(cls_token_final)  # Shape: [batch_size, num_classes]

        return joint_logits


In [4]:
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch
import torch.nn as nn

# Model Parameters
seq_len = 128         # Input sequence length
patch_size = 8        # Number of tokens per patch
emb_size = 128        # Size of each embedding vector
vocab_size = 30522    # Vocabulary size for embedding (e.g., BERT tokenizer size)
depth = 6             # Number of transformer layers
num_heads = 8         # Number of attention heads in each layer
ff_dim = 512          # Hidden layer size in feed-forward networks
num_classes = 4       # Number of joint classes (Non-Claim, Support, Contest, No Relation)

# Device Setup
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device
model = TextVisionTransformer(
    seq_len=seq_len,          # Sequence length
    patch_size=patch_size,    # Patch size
    emb_size=emb_size,        # Embedding size
    vocab_size=vocab_size,    # Vocabulary size
    depth=depth,              # Number of transformer layers
    num_heads=num_heads,      # Number of attention heads
    ff_dim=ff_dim,            # Feedforward network hidden dimension
    num_classes=num_classes   # Number of output classes
).to(device)                  # Move the model to the appropriate device (e.g., GPU)

# Initialize optimizer and scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6)

# Loss Function
loss_fn = nn.CrossEntropyLoss()  # For the joint label

# Dummy Input Data
batch_size = 64

# Simulate three separate input tensors
topic_input_ids = torch.randint(0, vocab_size, (batch_size, seq_len)).to(device)  # Simulated topic input IDs
claim_input_ids = torch.randint(0, vocab_size, (batch_size, seq_len)).to(device)  # Simulated claim input IDs
article_id_tokens = torch.randint(0, vocab_size, (batch_size, seq_len)).to(device)  # Simulated article ID tokens

# Simulated Joint Labels
joint_labels = torch.randint(0, num_classes, (batch_size,)).to(device)  # Simulated joint labels

# Forward Pass
joint_logits = model(topic_input_ids, claim_input_ids, article_id_tokens)  # Output joint logits

# Compute Loss
loss = loss_fn(joint_logits, joint_labels)

# Debug Outputs
print("Final Joint Logits Shape:", joint_logits.shape)  # Expected: [batch_size, num_classes]
print("Loss Value:", loss.item())  # Print the computed loss

Final Joint Logits Shape: torch.Size([64, 4])
Loss Value: 1.3976267576217651


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch

# Load dataset
data = pd.read_csv("cleaned_data.csv")  # Load the dataset with joint labels

# Split dataset into train and test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=100)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

class CESCData(Dataset):
    def __init__(self, data, seq_len, patch_size, vocab_size, device):
        self.data = data
        self.seq_len = seq_len
        self.patch_size = patch_size
        self.vocab_size = vocab_size
        self.device = device  # Added device argument

    def text_to_numeric(self, text):
        """Simple tokenizer to convert text to numeric values (basic character-level mapping)."""
        return [ord(char) % self.vocab_size for char in text]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Process topic_sentence
        topic_tokens = self.text_to_numeric(row['topic_sentence'])

        # Process claim_candidate_sentence
        claim_tokens = self.text_to_numeric(row['claim_candidate_sentence'])

        # Padding topic_sentence
        topic_padded_length = ((self.seq_len + self.patch_size - 1) // self.patch_size) * self.patch_size
        topic_tokens = topic_tokens[:self.seq_len] + [0] * (topic_padded_length - len(topic_tokens))

        # Padding claim_candidate_sentence
        claim_padded_length = ((self.seq_len + self.patch_size - 1) // self.patch_size) * self.patch_size
        claim_tokens = claim_tokens[:self.seq_len] + [0] * (claim_padded_length - len(claim_tokens))

        # Process joint label
        joint_label = torch.tensor(row['joint_label'], dtype=torch.long).to(self.device)

        # Process article_id (optional: tokenize it)
        article_id_tokens = self.text_to_numeric(row['article_id'])
        article_padded_length = ((self.seq_len + self.patch_size - 1) // self.patch_size) * self.patch_size
        article_id_tokens = article_id_tokens[:self.seq_len] + [0] * (article_padded_length - len(article_id_tokens))

        return {
            'topic_input_ids': torch.tensor(topic_tokens, dtype=torch.long).to(self.device),
            'claim_input_ids': torch.tensor(claim_tokens, dtype=torch.long).to(self.device),
            'article_id_tokens': torch.tensor(article_id_tokens, dtype=torch.long).to(self.device),
            'joint_label': joint_label  # Use the joint label
        }

# Define PyTorch Dataset
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Pad sequences for topic, claim, and article ID input IDs
    topic_input_ids = pad_sequence([item['topic_input_ids'] for item in batch], batch_first=True, padding_value=0)
    claim_input_ids = pad_sequence([item['claim_input_ids'] for item in batch], batch_first=True, padding_value=0)
    article_id_tokens = pad_sequence([item['article_id_tokens'] for item in batch], batch_first=True, padding_value=0)
    
    # Stack joint labels
    joint_labels = torch.tensor([item['joint_label'] for item in batch], dtype=torch.long)

    return {
        'topic_input_ids': topic_input_ids,
        'claim_input_ids': claim_input_ids,
        'article_id_tokens': article_id_tokens,  # Return tokenized article IDs
        'joint_label': joint_labels  # Return joint labels
    }

# Create DataLoaders
train_dataset = CESCData(train_data, seq_len=seq_len, patch_size=patch_size, vocab_size=vocab_size, device=device)
test_dataset = CESCData(test_data, seq_len=seq_len, patch_size=patch_size, vocab_size=vocab_size, device=device)

train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

# Load dataset
data = pd.read_csv("cleaned_data.csv")  # Ensure 'topic_sentence', 'claim_candidate_sentence', 'article_id', and 'joint_label' are present

# Convert columns to string to avoid float-related issues
data["topic_sentence"] = data["topic_sentence"].astype(str)
data["claim_candidate_sentence"] = data["claim_candidate_sentence"].astype(str)
data["article_id"] = data["article_id"].astype(str)

# Combine text features for TF-IDF transformation
data["combined_text"] = data["topic_sentence"] + " " + data["claim_candidate_sentence"]

# Extract text and target labels
X_text = data["combined_text"]  # Combined text column
y = data["joint_label"]         # Target column

# Convert text to numerical features using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)  # Use up to 5000 features for efficiency
X_tfidf = tfidf.fit_transform(X_text)

# Check original class distribution
print("Original class distribution:")
print(Counter(y))

# Define adjusted sampling strategy for SMOTE
# Smaller sample sizes for oversampling to reduce dataset size
sampling_strategy = {
    0: 64772,  # Keep Non-Claim size unchanged
    2: 20000,  # Increase Support to 20,000
    3: 20000   # Increase No Relation to 20,000
}

# Apply SMOTE with adjusted sampling strategy
smote = SMOTE(random_state=42, sampling_strategy=sampling_strategy)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

# Check new class distribution
print("New class distribution after SMOTE:")
print(Counter(y_resampled))

# Combine resampled data into a DataFrame
resampled_data = pd.DataFrame(X_resampled.toarray(), columns=tfidf.get_feature_names_out())
resampled_data["joint_label"] = y_resampled

# Assign article_id and sentences back for interpretability
resampled_data["article_id"] = data["article_id"][:len(y_resampled)].reset_index(drop=True)  # Retain article IDs
resampled_data["topic_sentence"] = data["topic_sentence"][:len(y_resampled)].reset_index(drop=True)
resampled_data["claim_candidate_sentence"] = data["claim_candidate_sentence"][:len(y_resampled)].reset_index(drop=True)

# Split the resampled data into train and test sets
train_data, test_data = train_test_split(resampled_data, test_size=0.2, random_state=100)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Print final class distribution
print("Final class distribution in training data:")
print(train_data["joint_label"].value_counts())

# Create DataLoaders
train_dataset = CESCData(train_data, seq_len=seq_len, patch_size=patch_size, vocab_size=vocab_size, device=device)
test_dataset = CESCData(test_data, seq_len=seq_len, patch_size=patch_size, vocab_size=vocab_size, device=device)

train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn
)

Original class distribution:
Counter({0: 64772, 2: 2613, 3: 2277})
New class distribution after SMOTE:
Counter({0: 64772, 2: 20000, 3: 20000})
Final class distribution in training data:
joint_label
0    51772
2    16025
3    16020
Name: count, dtype: int64


In [6]:
import torch
from sklearn.metrics import classification_report, confusion_matrix

# Before training starts
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def train_model(model, train_loader, optimizer, scheduler, loss_fn, epochs=3):
    model.to(device)
    model.train()

    for epoch in range(epochs):
        total_loss = 0

        for batch_idx, batch in enumerate(train_loader):
            # Extract inputs and labels from the batch
            topic_input_ids = batch['topic_input_ids'].to(device)
            claim_input_ids = batch['claim_input_ids'].to(device)
            article_id_tokens = batch['article_id_tokens'].to(device)
            joint_labels = batch['joint_label'].to(device)  # Directly use the joint label

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            joint_logits = model(topic_input_ids, claim_input_ids, article_id_tokens)  # Single joint output

            # Compute loss
            loss = loss_fn(joint_logits, joint_labels)

            # Backward pass
            loss.backward()
            optimizer.step()

            # Accumulate total loss
            total_loss += loss.item()

        # Step the scheduler
        scheduler.step()

        # Print epoch loss and learning rate
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}, LR: {current_lr:.6f}")

Using device: cuda:0


In [7]:
model.to(device)
#print(f"Model is on device: {next(model.parameters()).device}")

train_model(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    loss_fn=nn.CrossEntropyLoss(),  # Single loss function for joint labels
    epochs=3
)

NameError: name 'train_loader' is not defined

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score

def evaluate_model(model, test_loader):
    """
    Evaluates the model on the test dataset using the joint label.
    """
    model.to(device)
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            # Extract inputs and labels from the batch
            topic_input_ids = batch['topic_input_ids'].to(device)
            claim_input_ids = batch['claim_input_ids'].to(device)
            article_id_tokens = batch['article_id_tokens'].to(device)
            joint_labels = batch['joint_label'].to(device)  # Directly use the joint label

            # Forward pass
            joint_logits = model(topic_input_ids, claim_input_ids, article_id_tokens)

            # Predictions
            preds = torch.argmax(joint_logits, dim=1).cpu().numpy()  # Get predicted class indices
            labels = joint_labels.cpu().numpy()  # Get ground truth labels

            # Append predictions and labels for evaluation
            all_preds.extend(preds)
            all_labels.extend(labels)

    # Confusion Matrix
    conf_matrix = confusion_matrix(all_labels, all_preds)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(
        all_labels,
        all_preds,
        target_names=["Non-Claim", "Support", "Contest", "No Relation"]
    ))

In [80]:
evaluate_model(
    model=model,                # Use the same model trained in train_model
    test_loader=test_loader     # Pass the test DataLoader
)

Model is on device: cuda:0

Claim Confusion Matrix:
[[12956     0]
 [  977     0]]

Stance Confusion Matrix:
[[    0   454     0]
 [    0 12956     0]
 [    0   523     0]]
Claim Classification Report:
              precision    recall  f1-score   support

   Non-Claim       0.93      1.00      0.96     12956
       Claim       0.00      0.00      0.00       977

    accuracy                           0.93     13933
   macro avg       0.46      0.50      0.48     13933
weighted avg       0.86      0.93      0.90     13933

Stance Classification Report:
              precision    recall  f1-score   support

     Support       0.00      0.00      0.00       454
     Contest       0.93      1.00      0.96     12956
 No Relation       0.00      0.00      0.00       523

    accuracy                           0.93     13933
   macro avg       0.31      0.33      0.32     13933
weighted avg       0.86      0.93      0.90     13933



In [10]:
import torch

print("Is CUDA available?:", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")
print("Current device:", torch.cuda.current_device() if torch.cuda.is_available() else "No GPU")

Is CUDA available?: True
Device name: NVIDIA RTX A5000
Current device: 0


In [5]:
from sklearn.metrics import classification_report
from transformers import (
    LongformerForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from torch.utils.data import Dataset

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

# Optimized Dataset class for Longformer
class LongformerDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.topic_sentences = data["topic_sentence"].tolist()
        self.claim_candidate_sentences = data["claim_candidate_sentence"].tolist()
        self.labels = data["joint_label"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        topic_sentence = str(self.topic_sentences[idx])
        claim_candidate_sentence = str(self.claim_candidate_sentences[idx])
        label = int(self.labels[idx])  # Ensure label is an integer, not Tensor

        # Tokenize input sequences
        inputs = self.tokenizer(
            topic_sentence,
            claim_candidate_sentence,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        inputs["labels"] = label  # Add label directly to the dictionary
        return inputs

# Prepare datasets
max_length = 512  # Adjust based on your data and memory
train_dataset = LongformerDataset(train_data, tokenizer, max_length)
test_dataset = LongformerDataset(test_data, tokenizer, max_length)

# Metrics function
def compute_metrics(pred):
    preds = pred.predictions.argmax(-1)
    labels = pred.label_ids
    report = classification_report(
        labels, preds, zero_division=0, output_dict=True
    )
    return {
        "accuracy": report["accuracy"],
        "precision": report["macro avg"]["precision"],
        "recall": report["macro avg"]["recall"],
        "f1": report["macro avg"]["f1-score"],
    }

# Load Longformer model
model = LongformerForSequenceClassification.from_pretrained(
    "allenai/longformer-base-4096", num_labels=4
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=12,  # Increased batch size
    per_device_eval_batch_size=12,
    gradient_accumulation_steps=1,  # Simplify training
    num_train_epochs=5,  # Start with 1 epoch
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=1000,  # Log less frequently
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True,  # Mixed precision for faster training
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train and evaluate
trainer.train()
results = trainer.evaluate()

# Print evaluation results
print("Evaluation results:", results)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss


: 