In [1]:
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Sanity check: report whether CUDA is usable and, if so, which device is at index 0.
has_gpu = torch.cuda.is_available()
print(has_gpu)
print(torch.cuda.get_device_name(0) if has_gpu else "No GPU")


True
NVIDIA GeForce GTX 1650


In [4]:
# NOTE(security): read_pickle unpickles arbitrary objects and can execute code on load;
# only point this at files produced by this project's own preprocessing step.
df = pd.read_pickle('../../data/processed/balanced_dataset.pkl')

# Encode the string labels as integer class ids (FAKE -> 0, REAL -> 1).
# Series.map returns NaN for every value missing from the mapping, which would
# silently turn the column into floats and leak NaN into the stratified split,
# so validate before overwriting the column.
label_ids = df['label'].map({'FAKE':0, 'REAL':1})
if label_ids.isna().any():
    unexpected = df.loc[label_ids.isna(), 'label'].unique().tolist()
    raise ValueError(f"Unmapped label values in dataset: {unexpected}")
df['label'] = label_ids.astype('int64')

# 80/20 split, stratified on the label so both splits keep the same class ratio;
# random_state pins the shuffle so the split is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],df['label'], test_size=0.2, stratify=df['label'], random_state=42
)

In [5]:
# Tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Upper bound on tokens per example; longer texts are truncated to this length.
max_length = 256


def _encode(texts):
    """Tokenize an iterable of texts with truncation and padding enabled."""
    return tokenizer(list(texts), truncation=True, padding=True, max_length=max_length)


train_encodings = _encode(train_texts)
val_encodings = _encode(val_texts)


In [6]:
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs per-example encodings with their labels.

    Args:
        encodings: Mapping whose values are indexable by example position
            (each ``encodings[key][idx]`` is converted with ``torch.tensor``).
        labels: Sequence of labels, indexable by example position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including a ``"labels"`` entry."""
        item = {}
        for name, column in self.encodings.items():
            item[name] = torch.tensor(column[idx])
        # Added last so it appears after (and would override) any encoding key named "labels".
        item["labels"] = torch.tensor(self.labels[idx])
        return item
        
# Wrap each split's encodings and labels; labels are materialized as plain lists
# so they are indexed by position rather than by the pandas index.
train_dataset, val_dataset = (
    NewsDataset(split_encodings, list(split_labels))
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)


In [7]:
# Environment diagnostic: print the installed accelerate version.
import accelerate
print(accelerate.__version__)


1.10.0


In [8]:
# Environment diagnostic: show which Python interpreter this kernel runs on.
# NOTE(review): the printed path contains the local user name; clear this output before sharing.
import sys
print(sys.executable)

c:\Users\kknah\AppData\Local\Programs\Python\Python310\python.exe


In [9]:
import torch

# Query CUDA availability once and report both the flag and the resulting device name.
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
print(f"Device: {'cuda' if cuda_ok else 'cpu'}")

CUDA available: True
Device: cuda


In [None]:
# print(f"Model device: {next(model.parameters()).device}")

In [10]:
# Pretrained BERT encoder with a newly initialized 2-class classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2)

# TrainingArguments is already imported in the notebook's first cell.
training_args = TrainingArguments(
    output_dir="../../results",
    # Evaluate and checkpoint on the same schedule, as load_best_model_at_end
    # needs the two strategies to match.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # keep small
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    # "accuracy" is the key returned by compute_metrics.
    metric_for_best_model="accuracy",
    # Mixed precision for faster training; only enabled when CUDA is present,
    # because TrainingArguments rejects fp16 on a CPU-only device.
    fp16=torch.cuda.is_available(),
)

print(training_args.device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda:0


In [11]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, argmax'ed over
            axis 1) and ``label_ids`` (true class ids).

    Returns:
        Dict with keys ``"accuracy"``, ``"f1"``, ``"precision"``, ``"recall"``.

    Note:
        With ``average='weighted'`` the recall is the support-weighted mean of
        per-class recalls, which is numerically equal to accuracy.
    """
    preds = np.argmax(p.predictions, axis=1)
    # zero_division=0 makes the "class never predicted" case an explicit 0
    # instead of emitting an UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [12]:
# Collect the Trainer inputs in one place, then build the trainer from them.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5271,0.5442,0.6385,0.584329,0.789316,0.6385
2,0.5482,0.540865,0.6385,0.584502,0.788446,0.6385
3,0.5657,0.542782,0.63825,0.583956,0.78923,0.63825


TrainOutput(global_step=6000, training_loss=0.5491172148386637, metrics={'train_runtime': 22769.742, 'train_samples_per_second': 2.108, 'train_steps_per_second': 0.264, 'total_flos': 6314665328640000.0, 'train_loss': 0.5491172148386637, 'epoch': 3.0})

In [13]:
# Run a final evaluation pass on the trainer's eval dataset and print the metric dict.
results = trainer.evaluate()
print(results)


{'eval_loss': 0.5441998243331909, 'eval_accuracy': 0.6385, 'eval_f1': 0.5843293904992533, 'eval_precision': 0.7893155858588391, 'eval_recall': 0.6385, 'eval_runtime': 550.3889, 'eval_samples_per_second': 7.268, 'eval_steps_per_second': 0.908, 'epoch': 3.0}


In [14]:
# Persist the fine-tuned model and its tokenizer side by side in the same directory.
model_dir = "../../model/bert_merged"
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)


('../../model/bert_merged\\tokenizer_config.json',
 '../../model/bert_merged\\special_tokens_map.json',
 '../../model/bert_merged\\vocab.txt',
 '../../model/bert_merged\\added_tokens.json')