<a href="https://colab.research.google.com/github/naokityokoyama/HDC/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install unidecode num2words evaluate

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertForSequenceClassification
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

import re
import zipfile
from unidecode import unidecode
import string
from num2words import num2words

from tqdm import tqdm
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate

In [None]:
# Unpack the dataset archive stored on Google Drive into the Colab workspace.
path_origin = '/content/drive/MyDrive/uff/fake.zip'
path_destino = '/content/'
with zipfile.ZipFile(path_origin, mode="r") as archive:
    archive.extractall(path=path_destino)

In [None]:
# Build the labelled dataset: rows from fakes.csv get target 1, rows from
# true.csv get target 0. Only the `text` column of each file is kept.
df_fake = pd.read_csv('/content/fakes.csv')[['text']].assign(target=1)
df_true = pd.read_csv('/content/true.csv')[['text']].assign(target=0)

# Stack both classes and renumber the rows from 0.
df = pd.concat([df_fake, df_true], ignore_index=True)

In [None]:
# Total number of rows wanted in the balanced sample.
sample_size = 10000

# Build a balanced sample: the same number of rows is drawn from every class.
# Rows are drawn WITHOUT replacement whenever the class is large enough.
# `resample` defaults to replace=True (bootstrap), which duplicates rows; the
# duplicates can then fall on both sides of a later train/test split and
# inflate the reported metrics. A class smaller than the per-class quota falls
# back to sampling with replacement so the sample stays balanced.
per_class = sample_size // df["target"].nunique()
df = pd.concat(
    [
        resample(
            group,
            replace=len(group) < per_class,
            n_samples=per_class,
            random_state=42,
        )
        for _, group in df.groupby("target")
    ]
).reset_index(drop=True)

In [None]:
#clean

def n2w(texto: str) -> str:
  """Spell out every run of digits in ``texto`` in Portuguese.

  Each digit run is converted exactly where it was matched. A global
  ``str.replace`` per number would also rewrite that number wherever it
  appears inside a longer one (e.g. the "1" inside "12"), corrupting it.

  Args:
    texto: Input text.

  Returns:
    The text with every digit run replaced by ``num2words(..., lang='pt')``;
    text without digits is returned unchanged.
  """
  return re.sub(
      r"\d+",
      lambda match: num2words(int(match.group()), lang='pt'),
      texto,
  )

In [None]:
# NOTE: the whole pipeline is deliberately applied twice; the original author
# flagged this as a workaround ("bug": it has to run 2x).
for _pass in tqdm(range(2)):
  texts = df['text'].str.lower()
  # strip every character matched by the punctuation character class
  texts = texts.str.replace(f"[{string.punctuation}]", "", regex=True)
  # collapse runs of whitespace into single spaces and trim both ends
  texts = texts.apply(lambda raw: ' '.join(raw.split()))
  # double quotes and backslashes are removed explicitly (inside the class
  # above, the backslash only escapes the `]` that follows it)
  texts = texts.str.replace('"', '').str.replace('\\', '')
  # spell out numbers, then transliterate the result with unidecode
  texts = texts.apply(n2w)
  df['text'] = texts.apply(unidecode)

In [None]:
# Model inputs (cleaned text) and labels (0/1 target).
X, y = df['text'], df['target']

In [None]:
# Hold out 20% of the rows as the test split.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Split the remainder again: 25% of the remaining 80% becomes validation,
# giving 60/20/20 train/validation/test overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=1
)

In [None]:
# Wrap each split in a Dataset with `text` and `labels` columns.
train = Dataset.from_dict({'text': X_train, 'labels': y_train})
val = Dataset.from_dict({'text': X_val, 'labels': y_val})
test = Dataset.from_dict({'text': X_test, 'labels': y_test})

# One DatasetDict holding all three splits.
dataset_dict = DatasetDict({'train': train, 'validation': val, 'test': test})

# Number of distinct target values, passed to the model as `num_labels`.
num_labels = len(set(np.array(y)))

In [None]:
# Hub identifier of the pre-trained checkpoint.
# NOTE(review): this is an uncased English BERT checkpoint, while the cleaning
# step spells numbers out in Portuguese -- confirm the checkpoint choice.
model_path = "google-bert/bert-base-uncased"

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_path
)

In [None]:
# Load the checkpoint with a sequence-classification head sized to num_labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=num_labels,
)

In [None]:
# Freeze every base-model parameter except those whose name contains
# "pooler", which stay trainable. Parameters outside `model.base_model` are
# not touched by this loop.
for param_name, weight in model.base_model.named_parameters():
    weight.requires_grad = "pooler" in param_name

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Tokenize every split of the DatasetDict, in batches.
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

In [None]:
# Collator that pads the examples of a batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Load the two metrics used by compute_metrics.
accuracy, auc_score = (
    evaluate.load(metric_id) for metric_id in ("accuracy", "roc_auc")
)

def compute_metrics(eval_pred):
    """Compute accuracy and ROC AUC from raw model outputs.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is
            arg-maxed over axis 1 and indexed as ``[:, 1]``, so it is
            presumably an array of per-class logits of shape
            (n_samples, n_classes) with class 1 as the positive class --
            confirm against the caller.

    Returns:
        A dict with keys ``"Accuracy"`` and ``"AUC"``, each rounded to three
        decimal places.
    """
    # get predictions
    predictions, labels = eval_pred

    # Numerically stable softmax. Subtracting the row-wise maximum does not
    # change the result mathematically, but it keeps np.exp from overflowing
    # to inf on large logits (inf / inf would give NaN probabilities).
    shifted = predictions - np.max(predictions, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / exp_scores.sum(-1, keepdims=True)

    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    auc = np.round(
        auc_score.compute(
            prediction_scores=positive_class_probs,
            references=labels,
        )['roc_auc'],
        3,
    )

    # predict most probable class and compute accuracy
    predicted_classes = np.argmax(predictions, axis=1)
    acc = np.round(
        accuracy.compute(
            predictions=predicted_classes,
            references=labels,
        )['accuracy'],
        3,
    )

    return {"Accuracy": acc, "AUC": auc}

In [None]:
# Hyperparameters: learning rate, per-device batch size, number of epochs.
lr, batch_size, num_epochs = 2e-4, 32, 2

training_args = TrainingArguments(
    output_dir="bert-phishing-classifier_teacher",
    # optimisation settings
    learning_rate=lr,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # log, evaluate and checkpoint once per epoch
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",  # disable wandb
)

In [None]:
# Assemble the Trainer. The "test" split is the one passed as the evaluation
# dataset during training.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Run the trained model on the "validation" split.
predictions = trainer.predict(tokenized_data["validation"])

# Pull the raw outputs and the reference labels out of the prediction result,
# then score them with the same metric function used during training.
logits, labels = predictions.predictions, predictions.label_ids
metrics = compute_metrics((logits, labels))
print(metrics)

