In [None]:
# Expose Google Drive inside the Colab runtime; the training CSV read by the
# next cell lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, XLNetModel
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/train.csv')

# One binary indicator column per topic; their order fixes the order of the
# model's output logits.
LABEL_COLUMNS = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]

# Combine the Title and Abstract for the model input.
# String + NaN is NaN in pandas, so a row with a missing TITLE or ABSTRACT
# would become a float that the tokenizer cannot encode. Treat missing pieces
# as empty strings and trim the stray separator space that leaves behind.
df['text'] = (df['TITLE'].fillna('') + ' ' + df['ABSTRACT'].fillna('')).str.strip()

# Define input features and labels
X = df['text'].values
y = df[LABEL_COLUMNS].values

# Initialize the tokenizer for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for multi-label training.

    Args:
        texts: Indexable, sized collection of raw input strings.
        labels: Indexable, sized collection of label rows aligned with
            ``texts``; each row is converted to a float tensor.
        tokenizer: Hugging Face-style tokenizer. It is invoked through its
            ``__call__`` interface and must return a mapping holding
            ``'input_ids'`` and ``'attention_mask'`` tensors with a leading
            batch dimension of 1.
        max_len: Fixed sequence length every sample is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with the leading batch dimension removed) and ``'label'``
            (float tensor built from the label row).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: ``encode_plus`` is deprecated in favour
        # of ``__call__``, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' yields a (1, max_len) batch; drop the batch
            # axis so the DataLoader can stack samples itself.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # Float targets, as required by BCEWithLogitsLoss.
            'label': torch.tensor(label, dtype=torch.float),
        }

# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a tokenizing dataset
train_dataset = TextDataset(texts=X_train, labels=y_train, tokenizer=tokenizer)
test_dataset = TextDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)

# Small batches to fit memory; only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

# Define the model combining BERT and XLNet
class BertXLNetClassifier(nn.Module):
    """Multi-label head on top of a frozen BERT encoder and a trainable XLNet encoder.

    One pooled vector is taken from each encoder, the two are concatenated,
    passed through dropout and projected to ``num_labels`` raw logits (no
    sigmoid is applied here).

    Args:
        bert_model: Encoder exposing ``config.hidden_size`` and returning an
            object with ``last_hidden_state``. It is frozen by this class:
            its parameters get ``requires_grad=False`` and it is kept in
            eval mode.
        xlnet_model: Second encoder with the same interface; it stays trainable.
        num_labels: Number of output logits.
        dropout: Dropout probability applied to the concatenated features.
    """

    def __init__(self, bert_model, xlnet_model, num_labels, dropout=0.3):
        super(BertXLNetClassifier, self).__init__()
        self.bert = bert_model
        self.xlnet = xlnet_model
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.bert.config.hidden_size + self.xlnet.config.hidden_size, num_labels)

        # BERT is only ever run under no_grad in forward(), so make the freeze
        # explicit: its parameters never receive gradients, and it must not
        # sit in training mode where its internal dropout would randomly
        # perturb the frozen features.
        self.bert.requires_grad_(False)
        self.bert.eval()

    def train(self, mode=True):
        """Switch training mode for everything except the frozen BERT encoder."""
        super(BertXLNetClassifier, self).train(mode)
        # nn.Module.train() recurses into children; undo that for BERT so the
        # frozen encoder stays deterministic.
        self.bert.eval()
        return self

    def forward(self, input_ids, attention_mask, xlnet_input_ids=None, xlnet_attention_mask=None):
        """Compute classification logits.

        Args:
            input_ids: Token ids for BERT, indexed as (batch, seq).
            attention_mask: Attention mask matching ``input_ids``.
            xlnet_input_ids: Optional token ids produced by an XLNet
                tokenizer. When omitted, ``input_ids`` are reused for XLNet
                (the original behaviour).
            xlnet_attention_mask: Optional mask matching ``xlnet_input_ids``.

        Returns:
            Tensor of raw logits with one row per sample and ``num_labels`` columns.
        """
        # Get BERT outputs
        with torch.no_grad():  # Freeze BERT to save memory
            bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_features = bert_outputs.last_hidden_state[:, 0, :]  # first ([CLS]) position

        # Get XLNet outputs
        if xlnet_input_ids is None:
            # NOTE(review): this legacy path feeds BERT-tokenizer ids to
            # XLNet, whose pretrained checkpoints use a different vocabulary,
            # so the ids do not address the tokens XLNet's embeddings were
            # trained on. Pass ``xlnet_input_ids`` from an XLNet tokenizer to
            # avoid this. Kept unchanged for backward compatibility.
            xlnet_outputs = self.xlnet(input_ids=input_ids, attention_mask=attention_mask)
            xlnet_features = xlnet_outputs.last_hidden_state[:, 0, :]
        else:
            xlnet_outputs = self.xlnet(input_ids=xlnet_input_ids, attention_mask=xlnet_attention_mask)
            # Assumes the ids come from an XLNet tokenizer with its default
            # left padding, which places the <cls> summary token at the last
            # position - confirm if a different padding side is configured.
            xlnet_features = xlnet_outputs.last_hidden_state[:, -1, :]

        # Concatenate BERT and XLNet hidden states
        combined_output = torch.cat((bert_features, xlnet_features), dim=1)

        # Apply dropout and fully connected layer for classification
        combined_output = self.dropout(combined_output)
        logits = self.fc(combined_output)

        return logits

# Seed torch so the classifier-head initialisation, dropout masks and
# DataLoader shuffling are repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

# Initialize the BERT and XLNet models
bert_model = BertModel.from_pretrained('bert-base-uncased')
xlnet_model = XLNetModel.from_pretrained('xlnet-base-cased')

# Initialize the combined model
model = BertXLNetClassifier(bert_model, xlnet_model, num_labels=y_train.shape[1])

# Move the model to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy for multi-label classification
# Only hand trainable parameters to the optimizer; any parameter marked
# requires_grad=False would never receive a gradient anyway.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=2e-5)

# Learning rate scheduler: cut the LR by 10x after 2 epochs without a lower
# validation loss. The `verbose` argument is deprecated in recent PyTorch
# releases, so it is not passed; read optimizer.param_groups[0]['lr'] to
# inspect the current rate.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

# Training the model
num_epochs = 15
accumulation_steps = 2  # Gradient accumulation to reduce memory load


def evaluate(model, data_loader, criterion, device, threshold=0.5):
    """Score ``model`` on every batch of ``data_loader`` without tracking gradients.

    Puts the model in eval mode, so callers that continue training must call
    ``model.train()`` again afterwards.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning logits.
        data_loader: Yields dicts with ``'input_ids'``, ``'attention_mask'``
            and ``'label'`` tensors.
        criterion: Loss applied to ``(logits, labels)``.
        device: Device the batch tensors are moved to.
        threshold: A sigmoid probability strictly above this counts as a
            positive prediction.

    Returns:
        dict with ``'loss'`` (mean per-batch loss), ``'accuracy'``
        (``accuracy_score`` on the binarised predictions), ``'f1'``
        (macro-averaged F1), and the arrays ``'probs'``, ``'targets'`` and
        ``'preds_binary'``.
    """
    model.eval()
    loss_sum = 0.0
    batch_probs = []
    batch_targets = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss_sum += criterion(outputs, labels).item()

            batch_probs.append(torch.sigmoid(outputs).cpu().numpy())
            batch_targets.append(labels.cpu().numpy())

    probs = np.concatenate(batch_probs)
    targets = np.concatenate(batch_targets)

    # Apply thresholding to get binary predictions
    preds_binary = (probs > threshold).astype(int)

    return {
        'loss': loss_sum / len(data_loader),
        'accuracy': accuracy_score(targets, preds_binary),
        'f1': f1_score(targets, preds_binary, average='macro'),
        'probs': probs,
        'targets': targets,
        'preds_binary': preds_binary,
    }


num_batches = len(train_loader)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()  # Reset gradients
    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass with gradient accumulation. The last group of an
        # epoch can be shorter than accumulation_steps; scale by the real
        # group size so its gradient is still an average over its batches.
        group_start = (i // accumulation_steps) * accumulation_steps
        group_size = min(accumulation_steps, num_batches - group_start)
        (loss / group_size).backward()

        # Update after every full group, and also after the final batch:
        # otherwise the gradients of a trailing partial group would be thrown
        # away by the zero_grad() at the start of the next epoch.
        if (i + 1) % accumulation_steps == 0 or (i + 1) == num_batches:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()  # Unscaled per-batch loss

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

    # Validation phase and update learning rate scheduler
    val_metrics = evaluate(model, test_loader, criterion, device)
    val_loss = val_metrics['loss']
    accuracy = val_metrics['accuracy']
    f1 = val_metrics['f1']

    # Step the learning rate scheduler based on validation loss and report
    # any reduction it triggers.
    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(val_loss)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after != lr_before:
        print(f"Learning rate reduced from {lr_before} to {lr_after}")

    print(f"Validation Loss: {val_loss}, Accuracy: {accuracy}, F1 Score: {f1}")

# Final evaluation after training
final_metrics = evaluate(model, test_loader, criterion, device)

# Sigmoid probabilities, ground-truth labels and thresholded predictions
final_preds = final_metrics['probs']
final_true_labels = final_metrics['targets']
final_preds_binary = final_metrics['preds_binary']

# Accuracy and macro F1 score
final_accuracy = final_metrics['accuracy']
final_f1 = final_metrics['f1']

print(f"Final Accuracy: {final_accuracy}")
print(f"Final F1 Score: {final_f1}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Epoch 1/15, Loss: 0.3191930570766628
Validation Loss: 0.2413872786914441, Accuracy: 0.5902264600715137, F1 Score: 0.5207431176444367
Epoch 2/15, Loss: 0.22094245497900833
Validation Loss: 0.21264185718200507, Accuracy: 0.6419547079856972, F1 Score: 0.5389569868488099
Epoch 3/15, Loss: 0.18720454433213768
Validation Loss: 0.20763376425994212, Accuracy: 0.6541120381406437, F1 Score: 0.5378975075979202
Epoch 4/15, Loss: 0.16106058613843754
Validation Loss: 0.21417026662271285, Accuracy: 0.6467222884386175, F1 Score: 0.6339981692673887
Epoch 5/15, Loss: 0.1349803470785386
Validation Loss: 0.2130292961692629, Accuracy: 0.6584028605482718, F1 Score: 0.6591814320707707


KeyboardInterrupt: 