<a href="https://colab.research.google.com/github/naokityokoyama/HDC/blob/main/HDC_RecordEncoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch-hd binhd unidecode num2words torchmetrics -q

In [None]:
import zipfile
from unidecode import unidecode
import string
from num2words import num2words
import re
import numpy as np
import pandas as pd
from typing import Union, Literal
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torchhd
from torchhd import embeddings

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from binhd.embeddings import ScatterCode
from binhd.datasets import BaseDataset
from binhd.classifiers import BinHD, NeuralHD
from torchmetrics import Accuracy, AUROC


import warnings
warnings.filterwarnings("ignore")

In [None]:
# Unpack the dataset archive (stored on the mounted Google Drive) into the
# Colab working directory.
path_origin = '/content/drive/MyDrive/uff/fake.zip'
path_destino = '/content/'
with zipfile.ZipFile(path_origin, mode="r") as archive:
    archive.extractall(path=path_destino)

In [None]:
# Build the labelled dataset: only the 'text' column of each CSV is kept,
# fake articles are labelled 1 and true articles 0.
df_fake = pd.read_csv('/content/fakes.csv')[['text']].assign(target=1)
df_true = pd.read_csv('/content/true.csv')[['text']].assign(target=0)

# Stack both frames and renumber the rows 0..n-1.
df = pd.concat([df_fake, df_true], ignore_index=True)

In [None]:
# Total size of the sample (all classes together)
sample_size = 10000

# Build a balanced sample: the same number of rows is drawn from each class.
# NOTE(review): sklearn's `resample` samples with replacement unless
# `replace=False` is passed, so rows may repeat - confirm this is intended.
rows_per_class = sample_size // df["target"].nunique()
df = (
    df.groupby("target", group_keys=False)
    .apply(lambda group: resample(group, n_samples=rows_per_class, random_state=42))
    .reset_index(drop=True)
)

In [None]:
# Preview the first three rows of the current dataframe.
df.head(3)

In [None]:
#clean

def n2w(texto: str) -> str:
    """Spell out every number in ``texto`` as Portuguese words.

    Each maximal run of digits is converted with ``num2words(..., lang='pt')``
    and substituted exactly where it was found.

    The substitution is done in a single regex pass so that a short number
    can never rewrite part of a longer one. A per-number ``str.replace``
    would turn "1 10" into "um um0", because replacing "1" also hits the
    leading digit of "10".

    Args:
        texto: Input text, possibly containing digit runs.

    Returns:
        The text with all digit runs replaced by their spelled-out form.
    """
    return re.sub(
        r"\d+",
        lambda match: num2words(match.group(0), lang='pt'),
        texto,
    )

In [None]:
# The whole cleaning pipeline is deliberately applied twice; the original
# author flagged this as a workaround ("bug: has to run 2x").
punctuation_class = f"[{string.punctuation}]"
for _pass in tqdm(range(2)):
    cleaned = df['text'].str.lower()
    # Drop ASCII punctuation.
    cleaned = cleaned.str.replace(punctuation_class, "", regex=True)
    # Collapse any run of whitespace into a single space and trim the ends.
    cleaned = cleaned.apply(lambda raw: ' '.join(raw.split()))
    # Remove double quotes and backslashes.
    cleaned = cleaned.str.replace('"', '').str.replace('\\', '')
    # Spell numbers out, then transliterate to plain ASCII.
    cleaned = cleaned.apply(n2w).apply(unidecode)
    df['text'] = cleaned

In [None]:
# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using {} device".format(device))

# Every text is encoded as exactly MAX_INPUT_SIZE ids; PADDING_IDX fills the
# tail of shorter texts.
MAX_INPUT_SIZE, PADDING_IDX = 128, 0

# Code points that delimit the supported alphabet.
ASCII_A, ASCII_Z, ASCII_SPACE = ord("a"), ord("z"), ord(" ")

# a through z plus space and padding
NUM_TOKENS = (ASCII_Z - ASCII_A) + 3
print (ASCII_A, '--', ASCII_Z, '--', ASCII_SPACE, '--', NUM_TOKENS)

def char2int(char: str) -> int:
    """Return the integer identifier of a single character.

    The identifier is the character's code point offset from "a", so
    "a" -> 0 ... "z" -> ASCII_Z - ASCII_A. The space character is the one
    exception: it is remapped to the slot right after "z". No range check
    is done, so any other character simply yields its raw offset from "a".
    """
    code_point = ord(char)
    return (
        ASCII_Z - ASCII_A + 1
        if code_point == ASCII_SPACE
        else code_point - ASCII_A
    )


def transform(x: str) -> torch.Tensor:
    """Encode a text as a fixed-length tensor of character ids.

    The text is cut to at most MAX_INPUT_SIZE characters and lower-cased.
    Each character is mapped with ``char2int`` and shifted by +1, which
    leaves the value 0 free (the value PADDING_IDX is set to in this
    notebook). The result is right-padded with PADDING_IDX.

    Args:
        x: Text to encode.

    Returns:
        A 1-D ``torch.long`` tensor with exactly MAX_INPUT_SIZE entries.
    """
    # lower() can return more characters than it was given (a few Unicode
    # characters lower-case to two code points), so cut again afterwards;
    # otherwise the output could exceed MAX_INPUT_SIZE and rows would no
    # longer stack into a rectangular array.
    lowered = x[:MAX_INPUT_SIZE].lower()[:MAX_INPUT_SIZE]
    char_ids = [char2int(char) + 1 for char in lowered]

    # A non-positive multiplier yields an empty list, so full-length inputs
    # get no padding.
    padding = [PADDING_IDX] * (MAX_INPUT_SIZE - len(char_ids))

    return torch.tensor(char_ids + padding, dtype=torch.long)

In [None]:
# Build the feature matrix X (one encoded row per text) and the label list y.
print ('size dataset',df.shape[0] )
texts = df['text']
# Rows are looked up by index label 0..n-1.
lst = [np.array(transform(texts[row])) for row in range(df.shape[0])]

X = np.array(lst)
y = list(df['target'])

Record Encoder

In [None]:
# Use the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using {} device".format(device))

# Hypervector dimensionality and number of levels handed to the encoder.
dimension, num_levels = 1000, 100

# Features per sample = number of columns of X; classes = distinct labels.
num_feature = X.shape[1]
num_classe = len(set(np.array(y)))

# Value range handed to the encoder: the minimum ignores zeros (the padding
# value), the maximum is taken over all entries.
non_zero_entries = X[X != 0]
min_val = non_zero_entries.min()
max_val = X.max()
print(min_val, max_val)

class RecordEncoder(nn.Module):
    """Encode a feature vector into one hypervector per sample.

    Each feature value is turned into a hypervector by ``ScatterCode``,
    bound to the random hypervector of its position, and the bound
    hypervectors are combined with ``torchhd.multiset``.
    """

    def __init__(self, out_features, size, levels, low, high):
        """Create the position and value embeddings.

        Args:
            out_features: Second argument of ``embeddings.Random`` and of
                ``ScatterCode`` (presumably the hypervector dimension).
            size: First argument of ``embeddings.Random`` (presumably the
                number of feature positions).
            levels: First argument of ``ScatterCode``.
            low: Forwarded to ``ScatterCode`` as ``low``.
            high: Forwarded to ``ScatterCode`` as ``high``.
        """
        super().__init__()
        # Construction order is kept (position first, then value) so the
        # random state is consumed exactly as before.
        self.position = embeddings.Random(
            size, out_features, vsa="BSC", dtype=torch.uint8
        )
        self.value = ScatterCode(levels, out_features, low=low, high=high)

    def forward(self, x):
        """Return the bundled hypervector(s) for the feature tensor ``x``."""
        value_hv = self.value(x)
        bound_hv = torchhd.bind(self.position.weight, value_hv)
        return torchhd.multiset(bound_hv)

In [None]:
# Instantiate the encoder and move it to the selected device.
record_encode = RecordEncoder(
    out_features=dimension,
    size=num_feature,
    levels=num_levels,
    low=min_val,
    high=max_val,
).to(device)

In [None]:
# Encode the whole dataset, split it once, and fit a single classifier.
#
# The split and the model must live outside the batch loop. Creating them
# per batch leaves only the last (partial) batch's model and test split
# alive, and every model sees only about 22 rows.
with torch.no_grad():
    samples = torch.tensor(X).to(device)
    labels = torch.tensor(y).squeeze().to(device)

batch_size = 32
num_samples = samples.shape[0]

# Encoding is done in mini-batches only to bound peak memory.
encoded_batches = []
with torch.no_grad():
    for i in tqdm(range(0, num_samples, batch_size)):
        batch = samples[i:i+batch_size]
        encoded_batches.append(record_encode(batch))

# One hypervector per sample, in the same order as `labels`.
X_hv = torch.cat(encoded_batches, dim=0)

# Single shuffled train/test split over the full encoded dataset.
X_train, X_test, y_train, y_test = train_test_split(X_hv, labels, test_size=0.3, random_state = 0)

model = BinHD(dimension, num_classe)

with torch.no_grad():
    model.fit(X_train, y_train)



In [None]:
# Predict on the held-out split and report plain accuracy.
predictions = model.predict(X_test)
# accuracy_score's documented signature is (y_true, y_pred); the value is
# the same either way, but the call now matches the documentation.
acc = accuracy_score(y_test, predictions)
print("BinHD - Adapt: Accuracy = ", acc)

In [None]:
# torchmetrics metric objects for the binary task.
accuracy = Accuracy(task="binary", num_classes=2)
auroc = AUROC(task="binary")

In [None]:
# Compute AUROC and accuracy with the torchmetrics objects.
# `acc_value` holds the AUROC (name kept for compatibility).
acc_value = auroc(predictions, y_test)
# Store the result under its own name: rebinding `accuracy` would replace
# the metric object with a tensor and break any re-run of this cell.
accuracy_value = accuracy(predictions, y_test)
print("BinHD - Adapt: AUC = ", acc_value, 'ACCURACY', accuracy_value)