In [1]:
from transformers import AutoTokenizer
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn
import os
import numpy as np
from transformers import BertForSequenceClassification, AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from transformers import AutoModel
from transformers import BertTokenizer, TFBertModel

# Load the pretrained "bert-base-uncased" checkpoint through AutoModel.
# from_pretrained fetches the weights from the Hugging Face Hub on first use,
# so this cell needs network access (or a populated local cache).
# NOTE(review): `model` is not referenced by any other cell visible here — confirm it is still needed.
model = AutoModel.from_pretrained("google-bert/bert-base-uncased")

In [11]:
# Sanity check of the tokenizer: encode a short string and inspect the ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Mixed case and a double space on purpose-looking input; the recorded output
# below has only three content ids for it.
text = "HOLA  a"
# encode() returns a plain Python list of vocabulary ids (not a tensor).
# NOTE: despite its name, `token` holds the whole id sequence, not a single token.
token = tokenizer.encode(text)
# The recorded output starts with 101 and ends with 102 — presumably the
# [CLS]/[SEP] special tokens that encode() adds by default for this vocabulary.
print(token)

[101, 7570, 2721, 1037, 102]


In [17]:
class IMDBDataset(Dataset):
    """Map-style dataset of IMDB reviews, tokenized lazily on item access.

    The CSV is read once in ``__init__``. It must contain a ``review`` column
    (raw text) and a ``sentiment`` column. Rows whose sentiment is exactly the
    string ``'positive'`` get label 1; every other value gets label 0.

    Args:
        csv: Path or file-like object accepted by ``pandas.read_csv``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., truncation=True,
            padding="max_length", return_tensors="pt")`` and returning a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors that carry a
            leading batch dimension of size 1.
        max_length: Length every review is truncated/padded to. Defaults to
            512, the value that was previously hard-coded.
    """

    def __init__(self, csv, tokenizer, max_length=512):
        df = pd.read_csv(csv)
        df['sentiment'] = np.where(df['sentiment'] == 'positive', 1, 0)  # Binary classification
        self.x = df["review"].values
        self.y = df["sentiment"].values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews (one label per review)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Tokenize review ``idx`` and return ``(input_ids, attention_mask, label)``.

        ``input_ids`` and ``attention_mask`` are returned with the tokenizer's
        leading batch dimension removed; ``label`` is a 0-dim ``torch.long``
        tensor holding 0 or 1.
        """
        sentence = self.x[idx]
        # Padding every sample to the same fixed length lets the default
        # DataLoader collate function stack samples into a batch.
        tokens = self.tokenizer(
            sentence,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        input_ids = tokens["input_ids"].squeeze(0)  # Remove batch dim
        attention_mask = tokens["attention_mask"].squeeze(0)

        label = torch.tensor(self.y[idx], dtype=torch.long)  # CrossEntropyLoss expects long labels

        return input_ids, attention_mask, label

In [19]:
def create_train_test_datasets(filepath, tokenizer, split_ratio=0.8, batch_size=10):
    """Build train and test DataLoaders from a single IMDB CSV file.

    The file is wrapped in an ``IMDBDataset`` and split at random into a
    training part and a test part.

    Args:
        filepath: Path of the CSV file, forwarded to ``IMDBDataset``.
        tokenizer: Tokenizer forwarded to ``IMDBDataset``.
        split_ratio: Fraction of the samples assigned to the training split;
            the remainder becomes the test split.
        batch_size: Batch size used by both DataLoaders.

    Returns:
        A ``(train_dataloader, test_dataloader)`` tuple. The training loader
        reshuffles its samples each epoch; the test loader keeps a fixed order.
    """
    # Was `imbd_dataset(...)`, a name that is not defined anywhere; the dataset
    # class in this notebook is `IMDBDataset`.
    dataset = IMDBDataset(filepath, tokenizer)
    train_size = int(split_ratio * len(dataset))
    # Derive the test size from the remainder so the two sizes always sum to
    # len(dataset), as random_split requires.
    test_size = len(dataset) - train_size
    train_data, test_data = random_split(dataset, [train_size, test_size])

    # Was `training_data`, an undefined name; the split above binds `train_data`.
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    return train_dataloader, test_dataloader
