In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import functional as F
from tqdm import tqdm
from transformers import AutoTokenizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
class GRUModel(nn.Module):
    """Token-id sequence classifier: embedding -> single-layer GRU -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding (the GRU input size).
        hidden_units: Size of the GRU hidden state (the classifier input size).
        num_classes: Number of logits produced per sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_units, num_classes):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_units, batch_first=True)
        self.fc = nn.Linear(hidden_units, num_classes)

    def forward(self, x, attention_mask=None):
        """Return class logits of shape (batch, num_classes).

        Args:
            x: Integer token ids, batch-first (batch, seq_len).
            attention_mask: Optional (batch, seq_len) mask with 1 for real
                tokens and 0 for padding. When given, each sequence is
                classified from the GRU output at its last real token rather
                than at the final (possibly padded) position. Assumes
                right-padded input -- TODO confirm against the tokenizer's
                padding side. When omitted, the final time step is used.
        """
        embedded = self.embedding(x)
        output, _ = self.gru(embedded)

        if attention_mask is None:
            last_step = output[:, -1, :]
        else:
            lengths = attention_mask.to(device=output.device, dtype=torch.long).sum(dim=1)
            # An all-padding row has length 0; fall back to position 0 instead
            # of wrapping around to the last position via index -1.
            last_index = (lengths - 1).clamp(min=0)
            batch_index = torch.arange(output.size(0), device=output.device)
            last_step = output[batch_index, last_index]

        return self.fc(last_step)

from tqdm import tqdm

def train(model, device, train_loader, optimizer, criterion):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids)`` and returning per-class logits.
        device: Device the batch tensors are moved to.
        train_loader: Iterable of dict batches with ``'input_ids'`` and ``'label'``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        ``(mean_loss, accuracy)``, both averaged per sample over the samples
        actually iterated. Returns ``(0.0, 0.0)`` if the loader yields nothing.
    """
    model.train()

    # A mean-reduced loss has to be re-weighted by the batch size before being
    # averaged per sample; a sum-reduced loss is already a batch total.
    loss_is_batch_total = getattr(criterion, 'reduction', 'mean') == 'sum'

    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for batch in tqdm(train_loader, desc='Training'):
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        batch_loss = loss.item()
        total_loss += batch_loss if loss_is_batch_total else batch_loss * batch_size
        total_correct += (outputs.argmax(dim=1) == labels).sum().item()
        total_samples += batch_size

    if total_samples == 0:
        return 0.0, 0.0

    return total_loss / total_samples, total_correct / total_samples


def evaluate(model, device, data_loader, criterion):
    """Score ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids)`` and returning per-class logits.
        device: Device the batch tensors are moved to.
        data_loader: Iterable of dict batches with ``'input_ids'`` and ``'label'``.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        ``(mean_loss, weighted_f1)``. The loss is averaged per sample over the
        samples actually iterated; the F1 score is computed over all
        predictions with ``average='weighted'``. Returns ``(0.0, 0.0)`` if the
        loader yields nothing.
    """
    model.eval()

    # A mean-reduced loss has to be re-weighted by the batch size before being
    # averaged per sample; a sum-reduced loss is already a batch total.
    loss_is_batch_total = getattr(criterion, 'reduction', 'mean') == 'sum'

    total_loss = 0.0
    total_samples = 0
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluation'):
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids)
            loss = criterion(outputs, labels)

            batch_size = labels.size(0)
            batch_loss = loss.item()
            total_loss += batch_loss if loss_is_batch_total else batch_loss * batch_size
            total_samples += batch_size

            predicted = outputs.argmax(dim=1)
            all_labels.extend(labels.cpu().tolist())
            all_predictions.extend(predicted.cpu().tolist())

    if total_samples == 0:
        return 0.0, 0.0

    eval_loss = total_loss / total_samples
    eval_f1 = f1_score(all_labels, all_predictions, average='weighted')

    return eval_loss, eval_f1

In [None]:
 # Load 'data_cleaned.csv' (path relative to the working directory) into a DataFrame.
 # NOTE(review): this cell's code starts with a stray leading space; IPython appears to
 # tolerate it, but a plain .py export would raise IndentationError -- confirm and remove.
 df = pd.read_csv('data_cleaned.csv')

In [None]:
# Remove the raw/untranslated columns plus 'Unnamed: 0' (presumably a saved
# index column -- verify), then give the cleaned translated columns the short
# 'Text' / 'Title' names.
df = (
    df.drop(
        columns=[
            'Unnamed: 0', 'Title', 'Text', 'Text_first200',
            'Text_first200_translated', 'Title_Translated',
        ]
    )
    .rename(
        columns={
            "Text_first200_translated_cleaned": "Text",
            "Title_Translated_cleaned": "Title",
        }
    )
)

In [None]:
class NewsDataset(Dataset):
  """Map-style dataset that tokenizes a (Title, Text) pair per DataFrame row.

  Args:
      data: DataFrame with 'Title', 'Text' and 'Category' columns.
      tokenizer: Callable tokenizer; invoked with the title and text as a
          sentence pair and expected to return 'input_ids' and
          'attention_mask' tensors with a leading batch dimension of 1.
      max_length: Length every encoded pair is truncated/padded to.
  """

  def __init__(self, data, tokenizer, max_length):
      self.data = data
      self.tokenizer = tokenizer
      self.max_length = max_length

  def __len__(self):
      """Number of rows in the underlying DataFrame."""
      return len(self.data)

  def __getitem__(self, index):
      """Return a dict with 'input_ids', 'attention_mask' and 'label' for row ``index``."""
      # Fetch the row once instead of once per column.
      row = self.data.iloc[index]

      # Calling the tokenizer directly replaces the deprecated encode_plus.
      encoding = self.tokenizer(
          row['Title'],
          row['Text'],
          add_special_tokens=True,
          max_length=self.max_length,
          truncation=True,
          padding='max_length',
          return_tensors='pt'
      )

      # Drop only the leading batch axis; a bare squeeze() would also collapse
      # the sequence axis when max_length == 1.
      input_ids = encoding['input_ids'].squeeze(0)
      attention_mask = encoding['attention_mask'].squeeze(0)

      return {
          'input_ids': input_ids,
          'attention_mask': attention_mask,
          'label': row['Category']
      }

In [None]:
data = df.copy()

# Encode the category labels as integer class ids
label_encoder = LabelEncoder()
data['Category'] = label_encoder.fit_transform(data['Category'])

# Hold out 15% of the rows for validation (seeded split)
train_data, val_data = train_test_split(data, test_size=0.15, random_state=42)

# Create the tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch so weight initialisation and DataLoader shuffling are
# repeatable, matching the seeded train/validation split above.
torch.manual_seed(42)

# Define hyperparameters
max_length = 100  # Maximum sequence length for input
vocab_size = tokenizer.vocab_size  # Vocabulary size
embedding_dim = 100  # Embedding dimension
hidden_units = 64
num_classes = len(label_encoder.classes_)  # Number of classes

# Create instances of NewsDataset for training and validation sets
train_dataset = NewsDataset(train_data, tokenizer, max_length)
val_dataset = NewsDataset(val_data, tokenizer, max_length)

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Instantiate the GRU-based model
model = GRUModel(vocab_size, embedding_dim, hidden_units, num_classes)
model.to(device)

# Define the loss criterion and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 30
best_val_loss = float('inf')
best_model_weights = None

for epoch in range(num_epochs):
    # train() returns (loss, accuracy); evaluate() returns (loss, weighted F1)
    train_loss, train_acc = train(model, device, train_loader, optimizer, criterion)
    val_loss, val_f1 = evaluate(model, device, val_loader, criterion)

    # Print epoch results
    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Training Loss: {train_loss:.4f} | Training Accuracy: {train_acc:.4f}')
    print(f'Validation Loss: {val_loss:.4f} | Validation F1 Score: {val_f1:.4f}')
    print('-' * 50)

    # Check if validation loss improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # state_dict() hands back references to the live parameters, which
        # later epochs keep updating; clone them to freeze this snapshot.
        best_model_weights = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

# Save the best model (skipped if no epoch ever improved on the initial loss)
if best_model_weights is not None:
    torch.save(best_model_weights, 'gru_model.pt')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Training: 100%|██████████| 90/90 [00:14<00:00,  6.23it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 16.60it/s]


Epoch 1/30
Training Loss: 0.0516 | Training F1 Score: 0.2875
Validation Loss: 0.0495 | Validation F1 Score: 0.3220
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:13<00:00,  6.54it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 16.41it/s]


Epoch 2/30
Training Loss: 0.0435 | Training F1 Score: 0.4629
Validation Loss: 0.0358 | Validation F1 Score: 0.5235
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:13<00:00,  6.73it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 16.08it/s]


Epoch 3/30
Training Loss: 0.0296 | Training F1 Score: 0.6714
Validation Loss: 0.0244 | Validation F1 Score: 0.6865
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:13<00:00,  6.73it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 16.64it/s]


Epoch 4/30
Training Loss: 0.0195 | Training F1 Score: 0.7852
Validation Loss: 0.0198 | Validation F1 Score: 0.7480
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  6.99it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.02it/s]


Epoch 5/30
Training Loss: 0.0179 | Training F1 Score: 0.8138
Validation Loss: 0.0209 | Validation F1 Score: 0.7427
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  6.95it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 16.64it/s]


Epoch 6/30
Training Loss: 0.0132 | Training F1 Score: 0.8569
Validation Loss: 0.0191 | Validation F1 Score: 0.7656
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:13<00:00,  6.88it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 16.66it/s]


Epoch 7/30
Training Loss: 0.0107 | Training F1 Score: 0.8834
Validation Loss: 0.0190 | Validation F1 Score: 0.7824
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  6.98it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.29it/s]


Epoch 8/30
Training Loss: 0.0089 | Training F1 Score: 0.9133
Validation Loss: 0.0190 | Validation F1 Score: 0.7888
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  6.98it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.07it/s]


Epoch 9/30
Training Loss: 0.0071 | Training F1 Score: 0.9370
Validation Loss: 0.0192 | Validation F1 Score: 0.8006
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.04it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.25it/s]


Epoch 10/30
Training Loss: 0.0056 | Training F1 Score: 0.9548
Validation Loss: 0.0192 | Validation F1 Score: 0.8094
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.02it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.31it/s]


Epoch 11/30
Training Loss: 0.0036 | Training F1 Score: 0.9753
Validation Loss: 0.0187 | Validation F1 Score: 0.8280
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.13it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.11it/s]


Epoch 12/30
Training Loss: 0.0027 | Training F1 Score: 0.9822
Validation Loss: 0.0215 | Validation F1 Score: 0.8045
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.12it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 16.71it/s]


Epoch 13/30
Training Loss: 0.0022 | Training F1 Score: 0.9847
Validation Loss: 0.0201 | Validation F1 Score: 0.8398
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  6.99it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 16.74it/s]


Epoch 14/30
Training Loss: 0.0018 | Training F1 Score: 0.9871
Validation Loss: 0.0211 | Validation F1 Score: 0.8362
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.05it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.13it/s]


Epoch 15/30
Training Loss: 0.0015 | Training F1 Score: 0.9896
Validation Loss: 0.0206 | Validation F1 Score: 0.8365
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.08it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.24it/s]


Epoch 16/30
Training Loss: 0.0016 | Training F1 Score: 0.9903
Validation Loss: 0.0197 | Validation F1 Score: 0.8349
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.23it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 13.37it/s]


Epoch 17/30
Training Loss: 0.0010 | Training F1 Score: 0.9937
Validation Loss: 0.0207 | Validation F1 Score: 0.8414
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.37it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 10.69it/s]


Epoch 18/30
Training Loss: 0.0009 | Training F1 Score: 0.9934
Validation Loss: 0.0212 | Validation F1 Score: 0.8453
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:11<00:00,  7.69it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 10.78it/s]


Epoch 19/30
Training Loss: 0.0008 | Training F1 Score: 0.9948
Validation Loss: 0.0221 | Validation F1 Score: 0.8297
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.49it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 11.04it/s]


Epoch 20/30
Training Loss: 0.0007 | Training F1 Score: 0.9944
Validation Loss: 0.0218 | Validation F1 Score: 0.8461
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.43it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 11.04it/s]


Epoch 21/30
Training Loss: 0.0006 | Training F1 Score: 0.9955
Validation Loss: 0.0231 | Validation F1 Score: 0.8370
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.34it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 14.47it/s]


Epoch 22/30
Training Loss: 0.0005 | Training F1 Score: 0.9951
Validation Loss: 0.0234 | Validation F1 Score: 0.8393
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.17it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 16.86it/s]


Epoch 23/30
Training Loss: 0.0004 | Training F1 Score: 0.9955
Validation Loss: 0.0236 | Validation F1 Score: 0.8384
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.10it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.39it/s]


Epoch 24/30
Training Loss: 0.0004 | Training F1 Score: 0.9962
Validation Loss: 0.0232 | Validation F1 Score: 0.8388
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.16it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.40it/s]


Epoch 25/30
Training Loss: 0.0005 | Training F1 Score: 0.9948
Validation Loss: 0.0236 | Validation F1 Score: 0.8353
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.06it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.07it/s]


Epoch 26/30
Training Loss: 0.0006 | Training F1 Score: 0.9941
Validation Loss: 0.0241 | Validation F1 Score: 0.8423
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.08it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.15it/s]


Epoch 27/30
Training Loss: 0.0007 | Training F1 Score: 0.9930
Validation Loss: 0.0243 | Validation F1 Score: 0.8263
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.05it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.17it/s]


Epoch 28/30
Training Loss: 0.0007 | Training F1 Score: 0.9920
Validation Loss: 0.0277 | Validation F1 Score: 0.8354
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.10it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.57it/s]


Epoch 29/30
Training Loss: 0.0028 | Training F1 Score: 0.9770
Validation Loss: 0.0225 | Validation F1 Score: 0.8231
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:12<00:00,  7.12it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.21it/s]

Epoch 30/30
Training Loss: 0.0007 | Training F1 Score: 0.9920
Validation Loss: 0.0220 | Validation F1 Score: 0.8540
--------------------------------------------------



