In [1]:
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader, random_split, Dataset
from torch import nn
import pandas as pd
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class imdb50reviws_dataset(Dataset):
    def __init__(self, csv, tokenizer):
        df=pd.read_csv(csv)
        df=df.replace("positive",1)
        df=df.replace("negative",0)
        self.x=df["review"]
        self.y=df["sentiment"]
        self.tokenizer=tokenizer
        
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        sentence=self.x[idx]
        tokens= self.tokenizer.encode(sentence, max_length=200, pad_to_max_length=True)
        if len(tokens)<200:
            for i in range(200-len(tokens)):
                tokens.append(0)
        #y_logs=torch.tensor(self.y[idx], dtype=torch.long)
        #y_logs=nn.functional.one_hot(torch.tensor(self.y[idx]),2)
        
        #return torch.tensor(tokens), torch.tensor(y_logs, dtype=torch.long)
        return torch.tensor(tokens), self.y[idx]

In [3]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

# Define model
class Transformer_model(nn.Module):
    def __init__(self, vocab_size, embed_size=200):
        super(Transformer_model, self).__init__()
        self.embed= nn.Embedding(vocab_size, embed_size)
        self.encoder_layer=nn.TransformerEncoder(nn.TransformerEncoderLayer(embed_size, 10, batch_first=True, dropout=0.3), 4)
        
        self.linear = nn.Sequential(
            nn.ReLU(),
            nn.Linear(embed_size*200, 2)
        )
        

    def forward(self, x):
        emb = self.embed(x) 
        out_encod=self.encoder_layer(emb) 
        out_encod = out_encod.reshape(out_encod.shape[0],out_encod.shape[1]*out_encod.shape[2])

        # logits = self.linear(out_encod[:,-1,:])
        logits = self.linear(out_encod)
        return logits


Using cpu device


  return torch._C._cuda_getDeviceCount() > 0


In [4]:
model = Transformer_model(2000,100)

print(model)

Transformer_model(
  (embed): Embedding(2000, 100)
  (encoder_layer): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=2048, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=2048, out_features=100, bias=True)
        (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (linear): Sequential(
    (0): ReLU()
    (1): Linear(in_features=20000, out_features=2, bias=True)
  )
)


In [5]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    
    loop = tqdm(dataloader, desc="Training", leave=True)
    
    for batch, (X, y) in enumerate(loop):
        X, y = X.to(device), y.to(device)
        
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Update tqdm description
        loop.set_postfix(loss=loss.item())

        
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    
    loop = tqdm(dataloader, desc="Testing", leave=True)

    with torch.no_grad():
        for X, y in loop:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

            # Update tqdm description
            loop.set_postfix(accuracy=100 * correct / size)

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = Transformer_model(vocab_size=tokenizer.vocab_size, embed_size=200).to(device)

In [7]:
batch_size =64
epochs = 3
lr= 0.0005


def main(model, batch_size, epochs, lr):    
    dataset = imdb50reviws_dataset("./IMDB Dataset.csv", tokenizer)
    
    train_dataset, test_dataset = random_split(dataset, [0.8, 0.2])
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    
    
    
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr)
    
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_loader, model, loss_fn, optimizer)
        test(test_loader, model, loss_fn)

        
    print("Done!")
    
    
main(model, batch_size, epochs, lr)

  df=df.replace("negative",0)


Epoch 1
-------------------------------


Training:   0%|          | 0/625 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Training:   2%|▏         | 10/625 [00:22<22:48,  2.22s/it, loss=1.49]


KeyboardInterrupt: 

In [30]:
sent = "this movie is amazing"

tokens = tokenizer.encode(sent)
if len(tokens)<200:
            for i in range(200-len(tokens)):
                tokens.append(0)
print(len(tokens))
test_sentence = torch.tensor(tokens).reshape(1,len(tokens)).to(device)
model(test_sentence)

200


tensor([[-1.6252,  1.8253]], device='cuda:0', grad_fn=<AddmmBackward0>)