In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import functional as F
from tqdm import tqdm
from transformers import AutoTokenizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.autograd import Variable 
from tqdm import tqdm

In [3]:
class NewsDataset(Dataset):
  def __init__(self, data, tokenizer, max_length):
      self.data = data
      self.tokenizer = tokenizer
      self.max_length = max_length

  def __len__(self):
      return len(self.data)

  def __getitem__(self, index):
      title = self.data.iloc[index]['Title']
      text = self.data.iloc[index]['Text']
      label = self.data.iloc[index]['Category']

      encoding = self.tokenizer.encode_plus(
          title,
          text,
          add_special_tokens=True,
          max_length=self.max_length,
          truncation=True,
          padding='max_length',
          return_tensors='pt'
      )

      input_ids = encoding['input_ids'].squeeze()
      attention_mask = encoding['attention_mask'].squeeze()

      return {
          'input_ids': input_ids,
          'attention_mask': attention_mask,
          'label': label
      }


In [4]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_units, num_classes):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_units, batch_first=True)
        self.fc = nn.Linear(hidden_units, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        output = self.fc(output[:, -1, :])  # Use the last hidden state
        return output


def train(model, device, train_loader, optimizer, criterion):
    model.train()
    train_loss = 0
    train_acc = 0

    for batch in tqdm(train_loader, desc='Training'):
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = torch.max(outputs, dim=1)
        train_acc += (predicted == labels).sum().item()

    train_loss /= len(train_loader.dataset)
    train_acc /= len(train_loader.dataset)

    return train_loss, train_acc


def evaluate(model, device, data_loader, criterion):
    model.eval()
    eval_loss = 0
    eval_f1 = 0

    with torch.no_grad():
        all_labels = []
        all_predictions = []

        for batch in tqdm(data_loader, desc='Evaluation'):
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)  # Modify this line

            outputs = model(input_ids)
            loss = criterion(outputs, labels)

            eval_loss += loss.item()
            _, predicted = torch.max(outputs, dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

        eval_loss /= len(data_loader.dataset)
        eval_f1 = f1_score(all_labels, all_predictions, average='weighted')

    return eval_loss, eval_f1
     

df = pd.read_csv('/content/data_cleaned.csv')
     

df =df.drop(['Unnamed: 0', 'Title','Text', 'Text_first200',
       'Text_first200_translated', 'Title_Translated'], axis=1)
df = df.rename(columns={"Text_first200_translated_cleaned": "Text", "Title_Translated_cleaned": "Title"})

In [5]:
data = df.copy()

# Encode the labels
label_encoder = LabelEncoder()
data['Category'] = label_encoder.fit_transform(data['Category'])

train_data, val_data = train_test_split(data, test_size=0.15, random_state=42)

# Create the tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define hyperparameters
max_length = 100  # Maximum sequence length for input
vocab_size = tokenizer.vocab_size  # Vocabulary size
embedding_dim = 100  # Embedding dimension
hidden_units = 64
num_classes = len(label_encoder.classes_)  # Number of classes

# Create instances of NewsDataset for training and validation sets
train_dataset = NewsDataset(train_data, tokenizer, max_length)
val_dataset = NewsDataset(val_data, tokenizer, max_length)

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


# Instantiate the LSTM-based model
model = LSTMModel(vocab_size, embedding_dim, hidden_units, num_classes)
model.to(device)

# Define the loss criterion and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 40
best_val_loss = float('inf')
best_model_weights = None

for epoch in range(num_epochs):
    # Training
    train_loss, train_f1 = train(model, device, train_loader, optimizer, criterion)
    val_loss, val_f1 = evaluate(model, device, val_loader, criterion)

    # Print epoch results
    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Training Loss: {train_loss:.4f} | Training F1 Score: {train_f1:.4f}')
    print(f'Validation Loss: {val_loss:.4f} | Validation F1 Score: {val_f1:.4f}')
    print('-' * 50)

    # Check if validation loss improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_weights = model.state_dict()

# Save the best model
torch.save(best_model_weights, 'lstm_model.pt')


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Training: 100%|██████████| 90/90 [00:11<00:00,  7.59it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 16.53it/s]


Epoch 1/40
Training Loss: 0.0518 | Training F1 Score: 0.2934
Validation Loss: 0.0487 | Validation F1 Score: 0.3348
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:11<00:00,  7.96it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.54it/s]


Epoch 2/40
Training Loss: 0.0413 | Training F1 Score: 0.4939
Validation Loss: 0.0305 | Validation F1 Score: 0.5873
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.35it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.38it/s]


Epoch 3/40
Training Loss: 0.0291 | Training F1 Score: 0.6759
Validation Loss: 0.0323 | Validation F1 Score: 0.6013
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.83it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 11.76it/s]


Epoch 4/40
Training Loss: 0.0231 | Training F1 Score: 0.7414
Validation Loss: 0.0250 | Validation F1 Score: 0.7057
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:09<00:00,  9.78it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 14.22it/s]


Epoch 5/40
Training Loss: 0.0183 | Training F1 Score: 0.8051
Validation Loss: 0.0285 | Validation F1 Score: 0.6712
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.37it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.84it/s]


Epoch 6/40
Training Loss: 0.0167 | Training F1 Score: 0.8200
Validation Loss: 0.0242 | Validation F1 Score: 0.7101
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.31it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 18.05it/s]


Epoch 7/40
Training Loss: 0.0119 | Training F1 Score: 0.8743
Validation Loss: 0.0219 | Validation F1 Score: 0.7645
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.29it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.78it/s]


Epoch 8/40
Training Loss: 0.0091 | Training F1 Score: 0.9144
Validation Loss: 0.0219 | Validation F1 Score: 0.7748
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:09<00:00,  9.17it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 11.66it/s]


Epoch 9/40
Training Loss: 0.0080 | Training F1 Score: 0.9304
Validation Loss: 0.0235 | Validation F1 Score: 0.7765
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:09<00:00,  9.27it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.32it/s]


Epoch 10/40
Training Loss: 0.0080 | Training F1 Score: 0.9363
Validation Loss: 0.0238 | Validation F1 Score: 0.7672
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.33it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 18.15it/s]


Epoch 11/40
Training Loss: 0.0050 | Training F1 Score: 0.9641
Validation Loss: 0.0233 | Validation F1 Score: 0.7995
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.43it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.78it/s]


Epoch 12/40
Training Loss: 0.0037 | Training F1 Score: 0.9711
Validation Loss: 0.0232 | Validation F1 Score: 0.8020
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.53it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 14.06it/s]


Epoch 13/40
Training Loss: 0.0060 | Training F1 Score: 0.9436
Validation Loss: 0.0264 | Validation F1 Score: 0.7741
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:09<00:00,  9.64it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 12.28it/s]


Epoch 14/40
Training Loss: 0.0034 | Training F1 Score: 0.9725
Validation Loss: 0.0261 | Validation F1 Score: 0.7870
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.69it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.88it/s]


Epoch 15/40
Training Loss: 0.0028 | Training F1 Score: 0.9791
Validation Loss: 0.0268 | Validation F1 Score: 0.7872
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.41it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.60it/s]


Epoch 16/40
Training Loss: 0.0020 | Training F1 Score: 0.9822
Validation Loss: 0.0272 | Validation F1 Score: 0.8034
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.45it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.83it/s]


Epoch 17/40
Training Loss: 0.0018 | Training F1 Score: 0.9850
Validation Loss: 0.0275 | Validation F1 Score: 0.7984
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:09<00:00,  9.41it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 11.51it/s]


Epoch 18/40
Training Loss: 0.0017 | Training F1 Score: 0.9864
Validation Loss: 0.0290 | Validation F1 Score: 0.7941
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:09<00:00,  9.23it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.93it/s]


Epoch 19/40
Training Loss: 0.0018 | Training F1 Score: 0.9861
Validation Loss: 0.0284 | Validation F1 Score: 0.7927
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.44it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 18.13it/s]


Epoch 20/40
Training Loss: 0.0014 | Training F1 Score: 0.9857
Validation Loss: 0.0290 | Validation F1 Score: 0.8024
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.40it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.13it/s]


Epoch 21/40
Training Loss: 0.0015 | Training F1 Score: 0.9882
Validation Loss: 0.0297 | Validation F1 Score: 0.8079
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  9.00it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 11.58it/s]


Epoch 22/40
Training Loss: 0.0013 | Training F1 Score: 0.9875
Validation Loss: 0.0298 | Validation F1 Score: 0.8076
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:09<00:00,  9.87it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.88it/s]


Epoch 23/40
Training Loss: 0.0012 | Training F1 Score: 0.9885
Validation Loss: 0.0324 | Validation F1 Score: 0.8095
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.34it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 18.09it/s]


Epoch 24/40
Training Loss: 0.0014 | Training F1 Score: 0.9861
Validation Loss: 0.0311 | Validation F1 Score: 0.7999
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.44it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 18.11it/s]


Epoch 25/40
Training Loss: 0.0016 | Training F1 Score: 0.9854
Validation Loss: 0.0294 | Validation F1 Score: 0.8050
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.55it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 15.01it/s]


Epoch 26/40
Training Loss: 0.0009 | Training F1 Score: 0.9906
Validation Loss: 0.0313 | Validation F1 Score: 0.8024
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:09<00:00,  9.79it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 12.97it/s]


Epoch 27/40
Training Loss: 0.0010 | Training F1 Score: 0.9896
Validation Loss: 0.0318 | Validation F1 Score: 0.7954
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.73it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 18.20it/s]


Epoch 28/40
Training Loss: 0.0013 | Training F1 Score: 0.9861
Validation Loss: 0.0378 | Validation F1 Score: 0.7928
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.36it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.57it/s]


Epoch 29/40
Training Loss: 0.0009 | Training F1 Score: 0.9892
Validation Loss: 0.0307 | Validation F1 Score: 0.8073
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.37it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.96it/s]


Epoch 30/40
Training Loss: 0.0019 | Training F1 Score: 0.9798
Validation Loss: 0.0325 | Validation F1 Score: 0.7916
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:09<00:00,  9.25it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 10.94it/s]


Epoch 31/40
Training Loss: 0.0045 | Training F1 Score: 0.9603
Validation Loss: 0.0303 | Validation F1 Score: 0.7802
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.98it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.58it/s]


Epoch 32/40
Training Loss: 0.0025 | Training F1 Score: 0.9777
Validation Loss: 0.0270 | Validation F1 Score: 0.8063
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.34it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.89it/s]


Epoch 33/40
Training Loss: 0.0017 | Training F1 Score: 0.9843
Validation Loss: 0.0285 | Validation F1 Score: 0.7959
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.44it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.78it/s]


Epoch 34/40
Training Loss: 0.0012 | Training F1 Score: 0.9878
Validation Loss: 0.0298 | Validation F1 Score: 0.7957
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.49it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 14.14it/s]


Epoch 35/40
Training Loss: 0.0009 | Training F1 Score: 0.9903
Validation Loss: 0.0289 | Validation F1 Score: 0.8090
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:09<00:00,  9.69it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 12.56it/s]


Epoch 36/40
Training Loss: 0.0008 | Training F1 Score: 0.9910
Validation Loss: 0.0296 | Validation F1 Score: 0.8121
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.74it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.87it/s]


Epoch 37/40
Training Loss: 0.0006 | Training F1 Score: 0.9930
Validation Loss: 0.0301 | Validation F1 Score: 0.8125
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.44it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.94it/s]


Epoch 38/40
Training Loss: 0.0006 | Training F1 Score: 0.9923
Validation Loss: 0.0306 | Validation F1 Score: 0.8140
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:10<00:00,  8.47it/s]
Evaluation: 100%|██████████| 16/16 [00:00<00:00, 17.40it/s]


Epoch 39/40
Training Loss: 0.0005 | Training F1 Score: 0.9923
Validation Loss: 0.0308 | Validation F1 Score: 0.8113
--------------------------------------------------


Training: 100%|██████████| 90/90 [00:09<00:00,  9.34it/s]
Evaluation: 100%|██████████| 16/16 [00:01<00:00, 11.78it/s]

Epoch 40/40
Training Loss: 0.0005 | Training F1 Score: 0.9934
Validation Loss: 0.0303 | Validation F1 Score: 0.8218
--------------------------------------------------



