In [None]:
import torch
print(torch.version.cuda)


if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")
print(f"Using device: {mps_device}")

In [22]:
import pandas as pd
news_data = pd.read_csv("news.csv")
news_data = news_data.drop(['Date'], axis=1)
# Combine Top1-Top25 into a single string
news_data['combined_text'] = news_data.iloc[:, 1:].apply(lambda x: " ".join(x.dropna()), axis=1)
columns_2 = ['Top1','Top2','Top3', 'Top4', 'Top5','Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23','Top24', 'Top25']
news_data = news_data.drop(columns_2, axis=1)
news_data = news_data.replace('b\"|b\'|\\\\|\\\"', '', regex=True)

news_data.head()

Unnamed: 0,Label,combined_text
0,0,Georgia 'downs two Russian warplanes' as count...
1,1,Why wont America and Nato help us? If they won...
2,0,Remember that adorable 9-year-old who sang at ...
3,0,U.S. refuses Israel weapons to attack Iran: r...
4,1,All the experts admit that we should legalise ...


In [34]:
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer

class TextDataset(Dataset):
    def __init__(self, dataframe, max_length=512, tokenizer_name='bert-base-uncased'):
        self.texts = dataframe['combined_text'].values
        self.targets = dataframe['Label'].values
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        target = self.targets[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        
        return {
            'input_ids': torch.as_tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.as_tensor(attention_mask, dtype=torch.long),
            'targets': torch.as_tensor(target, dtype=torch.long),
            'text': text
        }

In [None]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(news_data, train_size=0.9, shuffle=True, random_state=1702)
print('{:>5,} Training samples'.format(len(train_data)))
print('{:>5,} Validation samples'.format(len(test_data)))
# Create custom Datasets
train_dataset = TextDataset(train_data)
test_dataset = TextDataset(test_data)

from torch.utils.data import DataLoader
BATCH_SIZE = 16

torch.manual_seed(1702)
train_loader = DataLoader(train_dataset, 
                          batch_size=BATCH_SIZE,
                          shuffle=True)
test_loader = DataLoader(test_dataset,
                         batch_size=len(test_dataset))
next(iter(train_loader))

In [None]:
#from torch import cuda
#torch.cuda.empty_cache()
device = 'mps'

from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

model.to(device)

In [40]:
EPOCHS = 5

optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8)
import time
import datetime
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

training_stats = []
epoch_loss_train = []
total_t0 = time.time()

# TRAINING
for epoch in range(1, EPOCHS + 1):
    model.train()
    t0 = time.time()
    print("")
    print("================ Epoch {:} / {:} ================".format(epoch, EPOCHS))
    train_all_predictions = []
    train_all_true_labels = []
    for step, data in enumerate(train_loader):
        if step % 2 == 0 and not step == 0:
            elapsed = int(round(time.time() - t0))
            elapsed = str(datetime.timedelta(seconds=elapsed))
            print(
                "  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.".format(
                    step, len(train_loader), elapsed
                )
            )

        targets = data["targets"].to(device)
        mask = data["attention_mask"].to(device)
        ids = data["input_ids"].to(device)

        model.zero_grad()

        loss, logits = model(
            ids, token_type_ids=None, attention_mask=mask, labels=targets
        ).to_tuple()
        epoch_loss_train.append(loss.item())

        cpu_logits = logits.cpu().detach().numpy()
        train_all_predictions.extend(np.argmax(cpu_logits, axis=1).flatten())
        train_all_true_labels.extend(targets.cpu().numpy())

        loss.backward()
        optimizer.step()
    train_accuracy = accuracy_score(train_all_true_labels, train_all_predictions)
    train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(
        train_all_true_labels, train_all_predictions, average="binary"
    )
    print("")
    print('---TRAIN METRICS---')
    print(f"Loss: {np.mean(epoch_loss_train):.4f}")
    print(f"Accuracy: {train_accuracy:.4f}")
    print(f"Precision: {train_precision:.4f}")
    print(f"Recall: {train_recall:.4f}")
    print(f"F1-Score: {train_f1:.4f}")
    print("")
    
    # VALIDATION
    # print("Running validation ...")
    # print("")
    # model.eval()
    # epoch_loss_test = []
    # test_all_predictions = []
    # test_all_true_labels = []
    # for data in test_loader:
    #     targets = data["targets"].to(device)
    #     mask = data["attention_mask"].to(device)
    #     ids = data["input_ids"].to(device)
    #     
    #     with torch.no_grad():
    #         loss, logits = model(ids, token_type_ids=None, attention_mask=mask, labels=targets).to_tuple()
    #         
    #     epoch_loss_test.append(loss.item())
    #     cpu_logits = logits.cpu().detach().numpy()
    #     test_all_predictions.extend(np.argmax(cpu_logits, axis=1).flatten())
    #     test_all_true_labels.extend(targets.cpu().numpy())
    # test_accuracy = accuracy_score(test_all_true_labels, test_all_predictions)
    # test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    #     test_all_true_labels, test_all_predictions, average="binary"
    # )
    # print("")
    # print('---TEST METRICS---')
    # print(f"Loss: {np.mean(epoch_loss_test):.4f}")
    # print(f"Accuracy: {test_accuracy:.4f}")
    # print(f"Precision: {test_precision:.4f}")
    # print(f"Recall: {test_recall:.4f}")
    # print(f"F1-Score: {test_f1:.4f}")
    # 
    # training_stats.append(
    #         {
    #         'epoch': epoch,
    #         'Training Loss': np.mean(epoch_loss_train),
    #         'Training Accuracy': train_accuracy,
    #         'Training Precision': train_precision,
    #         'Training Recall': train_recall,
    #         'Training F1': train_f1,
    #         'Validation Loss': np.mean(epoch_loss_test),
    #         'Validation Accuracy': test_accuracy,
    #         'Validation Precision': test_precision,
    #         'Validation Recall': test_recall,
    #         'Validation F1': test_f1
    #     }
    # )


  Batch     2  of    112.    Elapsed: 0:00:37.
  Batch     4  of    112.    Elapsed: 0:01:20.
  Batch     6  of    112.    Elapsed: 0:01:58.
  Batch     8  of    112.    Elapsed: 0:02:33.
  Batch    10  of    112.    Elapsed: 0:03:07.
  Batch    12  of    112.    Elapsed: 0:03:38.
  Batch    14  of    112.    Elapsed: 0:04:08.
  Batch    16  of    112.    Elapsed: 0:04:40.
  Batch    18  of    112.    Elapsed: 0:05:10.
  Batch    20  of    112.    Elapsed: 0:05:38.
  Batch    22  of    112.    Elapsed: 0:06:07.
  Batch    24  of    112.    Elapsed: 0:06:36.
  Batch    26  of    112.    Elapsed: 0:07:06.
  Batch    28  of    112.    Elapsed: 0:07:42.
  Batch    30  of    112.    Elapsed: 0:08:16.
  Batch    32  of    112.    Elapsed: 0:08:47.
  Batch    34  of    112.    Elapsed: 0:09:14.
  Batch    36  of    112.    Elapsed: 0:09:41.
  Batch    38  of    112.    Elapsed: 0:10:08.
  Batch    40  of    112.    Elapsed: 0:10:34.
  Batch    42  of    112.    Elapsed: 0:11:03.
  Batch    4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



---TRAIN METRICS---
Loss: nan
Accuracy: 0.4721
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000


  Batch     2  of    112.    Elapsed: 0:00:30.



KeyboardInterrupt

