In [None]:
from google.colab import drive

# Make Google Drive visible inside the Colab VM; the training CSV is read
# from a path under this mount point later in the notebook.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix

# Location of the training CSV on the mounted Drive and the six target columns.
DATA_PATH = '/content/drive/MyDrive/train.csv'
LABEL_COLUMNS = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Combine the Title and Abstract for the model input.
# A missing TITLE or ABSTRACT is read as NaN, and `str + NaN` is NaN, which
# would hand a float to the tokenizer. Fill missing fields with '' so every
# row yields a real string; rows with both fields present are unchanged.
df['text'] = df['TITLE'].fillna('') + ' ' + df['ABSTRACT'].fillna('')

# Define input features and labels
X = df['text'].values          # 1-D array of strings, one per paper
y = df[LABEL_COLUMNS].values   # 2-D array, one column per label in LABEL_COLUMNS

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of strings (one per example).
        labels: Indexable collection of label rows aligned with ``texts``;
            each row is converted to a float tensor.
        tokenizer: Callable tokenizer. It is invoked as ``tokenizer(text, ...)``
            and must return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` flattened to
            1-D, and ``'label'`` as a ``torch.float`` tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            # so the DataLoader can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Float targets, as required by BCEWithLogitsLoss.
            'label': torch.tensor(label, dtype=torch.float)
        }

# Hold out 20% of the examples for evaluation; the fixed random_state keeps
# the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap each partition so examples are tokenized lazily when indexed.
train_dataset = TextDataset(X_train, y_train, tokenizer)
test_dataset = TextDataset(X_test, y_test, tokenizer)

# Batch the data: training batches are reshuffled each epoch, evaluation
# batches keep their original order.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define the model with GRU and DistilBERT
class DistilBertGRUClassifier(nn.Module):
    """Multi-label classifier: frozen encoder -> GRU -> linear head.

    The encoder's per-token hidden states are fed through a GRU; the final
    hidden state of the last GRU layer is passed to a linear layer that emits
    one logit per label (no sigmoid is applied here).

    Args:
        distilbert_model: Encoder module. Must expose ``config.hidden_size``
            and, when called with ``input_ids`` and ``attention_mask``, return
            an object with a ``last_hidden_state`` tensor.
        num_labels: Number of output logits.
        gru_hidden_size: Hidden size of the GRU.
        num_gru_layers: Number of stacked GRU layers.
        dropout: Dropout probability. Applied between GRU layers when there is
            more than one layer, and to the GRU summary before the linear head.
    """

    def __init__(self, distilbert_model, num_labels, gru_hidden_size=128, num_gru_layers=1, dropout=0.1):
        super(DistilBertGRUClassifier, self).__init__()
        self.distilbert = distilbert_model
        self.gru = nn.GRU(
            self.distilbert.config.hidden_size,
            gru_hidden_size,
            num_gru_layers,
            batch_first=True,
            # nn.GRU only applies dropout *between* layers; with a single layer
            # a non-zero value does nothing and triggers a UserWarning.
            dropout=dropout if num_gru_layers > 1 else 0.0,
        )
        # Explicit dropout on the sequence summary so the `dropout` argument
        # is effective for the default single-layer configuration as well.
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(gru_hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``(batch, num_labels)``.

        Args:
            input_ids: Token ids, ``(batch, seq_len)``.
            attention_mask: 1 for real tokens and 0 for padding, same shape as
                ``input_ids``. Padding is assumed to be on the right (the
                real tokens form a prefix of each row).
        """
        with torch.no_grad():  # Freeze DistilBERT to save memory and computation
            distilbert_outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        token_states = distilbert_outputs.last_hidden_state

        # Pack by true length so the GRU stops at the last real token. Without
        # this, the final hidden state is computed after stepping through every
        # padding position and the attention mask is effectively ignored.
        # Lengths must live on the CPU; clamp guards against an all-zero mask row.
        lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            token_states, lengths, batch_first=True, enforce_sorted=False
        )
        # With enforce_sorted=False the returned hidden state is restored to
        # the original batch order.
        _, hn = self.gru(packed)

        logits = self.classifier(self.dropout(hn[-1]))
        return logits

# Seed PyTorch before the model is built so the GRU/linear initialisation and
# the DataLoader shuffle order are repeatable across "Restart & Run All"
# (GPU kernels may still introduce small run-to-run differences).
SEED = 42
torch.manual_seed(SEED)

# Initialize the DistilBERT model and the GRU classifier
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model = DistilBertGRUClassifier(distilbert_model, num_labels=y_train.shape[1])

# Move the model to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy for multi-label classification
# NOTE(review): 1e-5 is a typical rate for fine-tuning a transformer, but here
# the encoder is run under no_grad and only the GRU and linear head receive
# gradients; a larger rate may be appropriate -- confirm experimentally.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Training the model
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    # model.train() also switches the encoder to training mode, which enables
    # its dropout. The encoder is used as a frozen feature extractor, so keep
    # it in eval mode to get the same deterministic features as at test time.
    model.distilbert.eval()
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass: raw logits against float multi-hot targets
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean per-batch loss over the epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

# Evaluation on the held-out split
model.eval()
preds = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask)
        # Sigmoid turns each logit into an independent per-label probability.
        preds.append(torch.sigmoid(outputs).cpu().numpy())
        # Labels are only needed on the host for scikit-learn, so skip the
        # round trip through the device.
        true_labels.append(batch['label'].numpy())

# Convert predictions and labels to NumPy arrays
preds = np.concatenate(preds)
true_labels = np.concatenate(true_labels)

# Apply thresholding to get binary predictions
threshold = 0.5
preds_binary = (preds > threshold).astype(int)

# Calculate accuracy, F1 score, and confusion matrix.
# For multi-label targets accuracy_score is subset accuracy: a sample counts
# as correct only when every label matches.
accuracy = accuracy_score(true_labels, preds_binary)
# Per-label F1 is undefined when a label is never predicted; zero_division=0
# scores such labels as 0 explicitly instead of relying on the warn-and-zero
# default, so the reported macro average is unchanged.
f1 = f1_score(true_labels, preds_binary, average='macro', zero_division=0)
# One 2x2 matrix per label, laid out as [[TN, FP], [FN, TP]].
conf_matrix = multilabel_confusion_matrix(true_labels, preds_binary)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



Epoch 1/5, Loss: 0.5559963416485559
Epoch 2/5, Loss: 0.429603929292588
Epoch 3/5, Loss: 0.38931987126668294
Epoch 4/5, Loss: 0.3580374460560935
Epoch 5/5, Loss: 0.327390307528632
Accuracy: 0.48533969010727057
F1 Score: 0.3937927815668704
Confusion Matrix:
[[[2004  499]
  [ 360 1332]]

 [[2939   30]
  [ 510  716]]

 [[2935  110]
  [ 469  681]]

 [[3074   52]
  [ 959  110]]

 [[4073    0]
  [ 122    0]]

 [[4150    0]
  [  45    0]]]
