In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/train.csv')

# Combine the Title and Abstract for the model input
df['text'] = df['TITLE'] + ' ' + df['ABSTRACT']

# Define input features and labels
X = df['text'].values
y = df[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset definition
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):  # Reduced max_len to 128 for faster processing
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.float)
        }

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = TextDataset(X_train, y_train, tokenizer)
test_dataset = TextDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)  # Smaller batch size and increased num_workers
test_loader = DataLoader(test_dataset, batch_size=16)

# Define the BERT + GRU model
class BertGRUClassifier(nn.Module):
    def __init__(self, bert_model, num_labels, gru_hidden_size=64, num_gru_layers=1, dropout=0.1):  # Reduced GRU hidden size
        super(BertGRUClassifier, self).__init__()
        self.bert = bert_model
        self.gru = nn.GRU(self.bert.config.hidden_size, gru_hidden_size, num_gru_layers, batch_first=True, dropout=dropout)
        self.classifier = nn.Linear(gru_hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        # Freeze most BERT layers to save computation time
        with torch.no_grad():
            bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        gru_output, hn = self.gru(bert_outputs.last_hidden_state)
        output = self.classifier(hn[-1])
        return output

# Initialize the BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Freeze all BERT layers except the last two to save time
for name, param in bert_model.named_parameters():
    if 'encoder.layer.10' in name or 'encoder.layer.11' in name:  # Unfreezing only the last two layers
        param.requires_grad = True
    else:
        param.requires_grad = False

# Define model and move it to GPU
model = BertGRUClassifier(bert_model, num_labels=y_train.shape[1])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Training loop with gradient accumulation
num_epochs = 40  # Reduce epochs for faster training
accumulation_steps = 2

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass with gradient accumulation
        loss = loss / accumulation_steps
        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item() * accumulation_steps

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}")

# Evaluation
model.eval()
preds = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        preds.append(torch.sigmoid(outputs).cpu().numpy())
        true_labels.append(labels.cpu().numpy())

# Convert predictions and labels to NumPy arrays
preds = np.concatenate(preds)
true_labels = np.concatenate(true_labels)

# Apply thresholding to get binary predictions
threshold = 0.5
preds_binary = (preds > threshold).astype(int)

# Calculate accuracy, F1 score, and confusion matrix
accuracy = accuracy_score(true_labels, preds_binary)
f1 = f1_score(true_labels, preds_binary, average='macro')
conf_matrix = multilabel_confusion_matrix(true_labels, preds_binary)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")




Epoch 1/40, Loss: 0.5518022617689192
Epoch 2/40, Loss: 0.4218648048499519
Epoch 3/40, Loss: 0.38537543080214437
Epoch 4/40, Loss: 0.35564233683710217
Epoch 5/40, Loss: 0.3290620720221726
Epoch 6/40, Loss: 0.30832223930849817
Epoch 7/40, Loss: 0.29315517726287715
Epoch 8/40, Loss: 0.2819722024200074
Epoch 9/40, Loss: 0.27372206237568414
Epoch 10/40, Loss: 0.26700546272148734
Epoch 11/40, Loss: 0.26150359074142115
Epoch 12/40, Loss: 0.2565766701650574
Epoch 13/40, Loss: 0.25323635548552065
Epoch 14/40, Loss: 0.24917552655685277
Epoch 15/40, Loss: 0.24549561347617094
Epoch 16/40, Loss: 0.24207547511568062
Epoch 17/40, Loss: 0.23947929259961395
Epoch 18/40, Loss: 0.2366889777626164
Epoch 19/40, Loss: 0.23366469358608083
Epoch 20/40, Loss: 0.23115238717786463
Epoch 21/40, Loss: 0.22911526813208205
Epoch 22/40, Loss: 0.22687337019110543
Epoch 23/40, Loss: 0.22443583194304922
Epoch 24/40, Loss: 0.22214103410210465
Epoch 25/40, Loss: 0.22039062776260995
Epoch 26/40, Loss: 0.2181441641285717
Ep

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/train.csv')

# Combine the Title and Abstract for the model input
df['text'] = df['TITLE'] + ' ' + df['ABSTRACT']

# Define input features and labels
X = df['text'].values
y = df[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset definition
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):  # Reduced max_len to 128 for faster processing
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.float)
        }

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = TextDataset(X_train, y_train, tokenizer)
test_dataset = TextDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)  # Smaller batch size and increased num_workers
test_loader = DataLoader(test_dataset, batch_size=16)

# Define the BERT + GRU model
class BertGRUClassifier(nn.Module):
    def __init__(self, bert_model, num_labels, gru_hidden_size=64, num_gru_layers=1, dropout=0.1):  # Reduced GRU hidden size
        super(BertGRUClassifier, self).__init__()
        self.bert = bert_model
        self.gru = nn.GRU(self.bert.config.hidden_size, gru_hidden_size, num_gru_layers, batch_first=True, dropout=dropout)
        self.classifier = nn.Linear(gru_hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        # Freeze most BERT layers to save computation time
        with torch.no_grad():
            bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        gru_output, hn = self.gru(bert_outputs.last_hidden_state)
        output = self.classifier(hn[-1])
        return output

# Initialize the BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Freeze all BERT layers except the last two to save time
for name, param in bert_model.named_parameters():
    if 'encoder.layer.10' in name or 'encoder.layer.11' in name:  # Unfreezing only the last two layers
        param.requires_grad = True
    else:
        param.requires_grad = False

# Define model and move it to GPU
model = BertGRUClassifier(bert_model, num_labels=y_train.shape[1])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Training loop with gradient accumulation
num_epochs = 15  # Reduce epochs for faster training
accumulation_steps = 2

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass with gradient accumulation
        loss = loss / accumulation_steps
        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item() * accumulation_steps

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}")

# Evaluation
model.eval()
preds = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        preds.append(torch.sigmoid(outputs).cpu().numpy())
        true_labels.append(labels.cpu().numpy())

# Convert predictions and labels to NumPy arrays
preds = np.concatenate(preds)
true_labels = np.concatenate(true_labels)

# Apply thresholding to get binary predictions
threshold = 0.5
preds_binary = (preds > threshold).astype(int)

# Calculate accuracy, F1 score, and confusion matrix
accuracy = accuracy_score(true_labels, preds_binary)
f1 = f1_score(true_labels, preds_binary, average='macro')
conf_matrix = multilabel_confusion_matrix(true_labels, preds_binary)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Epoch 1/15, Loss: 0.53913203259669
Epoch 2/15, Loss: 0.4201090506875481
Epoch 3/15, Loss: 0.38343200388695653
Epoch 4/15, Loss: 0.35614287310605736
Epoch 5/15, Loss: 0.33054407958022974
Epoch 6/15, Loss: 0.30932288371857514
Epoch 7/15, Loss: 0.2923002461658647
Epoch 8/15, Loss: 0.2805159787573737
Epoch 9/15, Loss: 0.2716975040641481
Epoch 10/15, Loss: 0.2654085701936989
Epoch 11/15, Loss: 0.2598178525614216
Epoch 12/15, Loss: 0.25612566831767614
Epoch 13/15, Loss: 0.2527897324345587
Epoch 14/15, Loss: 0.24867451173902352
Epoch 15/15, Loss: 0.24544801945142
Accuracy: 0.5942789034564958
F1 Score: 0.4961359185308005
Confusion Matrix:
[[[1973  530]
  [ 243 1449]]

 [[2837  132]
  [ 198 1028]]

 [[2942  103]
  [ 406  744]]

 [[2909  217]
  [ 543  526]]

 [[4073    0]
  [ 122    0]]

 [[4150    0]
  [  45    0]]]


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/train.csv')

# Combine the Title and Abstract for the model input
df['text'] = df['TITLE'] + ' ' + df['ABSTRACT']

# Define input features and labels
X = df['text'].values
y = df[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset definition
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):  # Reduced max_len to 128 for faster processing
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.float)
        }

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = TextDataset(X_train, y_train, tokenizer)
test_dataset = TextDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)  # Smaller batch size and increased num_workers
test_loader = DataLoader(test_dataset, batch_size=16)

# Define the BERT + GRU model
class BertGRUClassifier(nn.Module):
    def __init__(self, bert_model, num_labels, gru_hidden_size=64, num_gru_layers=1, dropout=0.1):  # Reduced GRU hidden size
        super(BertGRUClassifier, self).__init__()
        self.bert = bert_model
        self.gru = nn.GRU(self.bert.config.hidden_size, gru_hidden_size, num_gru_layers, batch_first=True, dropout=dropout)
        self.classifier = nn.Linear(gru_hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        # Freeze most BERT layers to save computation time
        with torch.no_grad():
            bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        gru_output, hn = self.gru(bert_outputs.last_hidden_state)
        output = self.classifier(hn[-1])
        return output

# Initialize the BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Freeze all BERT layers except the last two to save time
for name, param in bert_model.named_parameters():
    if 'encoder.layer.10' in name or 'encoder.layer.11' in name:  # Unfreezing only the last two layers
        param.requires_grad = True
    else:
        param.requires_grad = False

# Define model and move it to GPU
model = BertGRUClassifier(bert_model, num_labels=y_train.shape[1])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Training loop with gradient accumulation
num_epochs = 10  # Reduce epochs for faster training
accumulation_steps = 2

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass with gradient accumulation
        loss = loss / accumulation_steps
        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item() * accumulation_steps

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}")

# Evaluation
model.eval()
preds = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        preds.append(torch.sigmoid(outputs).cpu().numpy())
        true_labels.append(labels.cpu().numpy())

# Convert predictions and labels to NumPy arrays
preds = np.concatenate(preds)
true_labels = np.concatenate(true_labels)

# Apply thresholding to get binary predictions
threshold = 0.5
preds_binary = (preds > threshold).astype(int)

# Calculate accuracy, F1 score, and confusion matrix
accuracy = accuracy_score(true_labels, preds_binary)
f1 = f1_score(true_labels, preds_binary, average='macro')
conf_matrix = multilabel_confusion_matrix(true_labels, preds_binary)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")




Epoch 1/10, Loss: 0.5255998007107963
Epoch 2/10, Loss: 0.42500924802712875
Epoch 3/10, Loss: 0.39161457824525203
Epoch 4/10, Loss: 0.36569645341744983
Epoch 5/10, Loss: 0.343089246630555
Epoch 6/10, Loss: 0.3221726145939786
Epoch 7/10, Loss: 0.30476989110557096
Epoch 8/10, Loss: 0.29043429749027905
Epoch 9/10, Loss: 0.2792635674102745
Epoch 10/10, Loss: 0.2709100400412844
Accuracy: 0.5644815256257449
F1 Score: 0.4478766171830557
Confusion Matrix:
[[[1958  545]
  [ 273 1419]]

 [[2872   97]
  [ 254  972]]

 [[2920  125]
  [ 387  763]]

 [[3051   75]
  [ 855  214]]

 [[4073    0]
  [ 122    0]]

 [[4150    0]
  [  45    0]]]
