In [None]:
import torch
import spacy
from torch.utils.data import Dataset , DataLoader
import numpy as np
import torch.nn as nn
import pandas as pd
import torchtext
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split



In [None]:
# Load the spaCy pipeline whose tokenizer is used by the vocabulary class below.
spacy_en = spacy.load("en_core_web_sm")
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Fetch the preprocessed IMDB CSV from Google Drive by file id
# (the recorded output shows it is saved as /content/imdb_processed.csv).
!gdown 1g0Wll4Q7zp5JrrfM3Tnox1RCMfeTJ_Nq

Downloading...
From: https://drive.google.com/uc?id=1g0Wll4Q7zp5JrrfM3Tnox1RCMfeTJ_Nq
To: /content/imdb_processed.csv
100% 45.1M/45.1M [00:00<00:00, 84.2MB/s]


In [None]:
# Read the downloaded CSV; the preview shows a 'processed' text column and a 'label' column.
dataset = pd.read_csv('/content/imdb_processed.csv')
dataset.head(10)

Unnamed: 0,processed,label
0,One reviewer mentioned watching Oz episode hoo...,1
1,A wonderful little production . The filming te...,1
2,I thought wonderful way spend time hot summer ...,1
3,Basically family little boy Jake think zombie ...,0
4,Petter Mattei Love Time Money visually stunnin...,1
5,"Probably - time favorite movie , story selfles...",1
6,I sure would like see resurrection dated Seahu...,1
7,"This show amazing , fresh innovative idea firs...",0
8,Encouraged positive comment film I looking for...,0
9,If like original gut wrenching laughter like m...,1


In [None]:
# Quick look at the first three rows of the loaded frame.
dataset.head(3)

Unnamed: 0,processed,label
0,One reviewer mentioned watching Oz episode hoo...,1
1,A wonderful little production . The filming te...,1
2,I thought wonderful way spend time hot summer ...,1


In [None]:
class vocabulary:
    """Word-level vocabulary mapping lower-cased tokens to integer ids and back.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``; all other ids are assigned by ``biuld_vocab``.

    Args:
        freq_threshold: a word receives an id at the moment its running count
            reaches this value while the vocabulary is being built.
        maxlength: maximum number of tokens kept per text; ``tokenizer``
            truncates anything longer.
    """

    def __init__(self, freq_threshold, maxlength):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold
        self.maxlength = maxlength

    def __len__(self):
        """Return the number of ids in the vocabulary, special tokens included."""
        return len(self.itos)

    def tokenizer(self, text):
        """Split ``text`` with the spaCy tokenizer, lower-case it and truncate to ``maxlength``."""
        tokens = [token.text.lower() for token in spacy_en.tokenizer(text)]
        return tokens[:self.maxlength]

    def biuld_vocab(self, sentence_list):
        """Add every word seen at least ``freq_threshold`` times in ``sentence_list``.

        Ids are handed out in the order in which words reach the threshold.
        The next free id is derived from the current table size instead of a
        hard-coded 4, and words that already have an id are skipped, so
        calling this method more than once extends the vocabulary rather than
        overwriting existing entries.
        """
        frequencies = {}
        for sentence in sentence_list:
            for word in self.tokenizer(sentence):
                count = frequencies.get(word, 0) + 1
                frequencies[word] = count
                if count == self.freq_threshold and word not in self.stoi:
                    idx = len(self.itos)
                    self.stoi[word] = idx
                    self.itos[idx] = word

    # Correctly spelled alias; the original (misspelled) name is kept for existing callers.
    build_vocab = biuld_vocab

    def numericalize(self, text):
        """Convert ``text`` to a list of ids, using the ``<UNK>`` id for unknown tokens."""
        unk_id = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_id) for token in self.tokenizer(text)]


In [None]:
# Keep 80% of the rows for training; the fixed random_state makes the shuffled split repeatable.
train , test = train_test_split(dataset , train_size = 0.8 , random_state=123 , shuffle = True)

In [None]:
# Plain Python lists of texts and labels for the training split.
sentence_train = train['processed'].tolist()
label_train = train['label'].tolist()

# Same for the held-out split.
sentence_test = test['processed'].tolist()
label_test = test['label'].tolist()

In [None]:
# A word enters the vocabulary once it has been seen freq_threshold times;
# each text is truncated to maxlength tokens by the vocabulary's tokenizer.
freq_threshold = 30
maxlength = 512
vocab = vocabulary(freq_threshold = freq_threshold , maxlength = maxlength)
# The vocabulary is built from the training sentences only.
vocab.biuld_vocab(sentence_train)

In [None]:
# Vocabulary size, counting the four special tokens.
len(vocab)

12236

# Define dataset

In [None]:
class mydataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` tensor pairs.

    Each text is converted with ``vocab.numericalize`` and wrapped in the
    ``<SOS>`` / ``<EOS>`` ids taken from ``vocab.stoi``.
    """

    def __init__(self, sentence_list, labels_list, vocab ):
        super().__init__()
        self.sentences = sentence_list
        self.labels = labels_list
        self.vocab = vocab

    def __len__(self):
        """Number of sentences held by the dataset."""
        return len(self.sentences)

    def __getitem__(self, index):
        """Return the id tensor of sentence ``index`` and its label as a long tensor."""
        sentence = self.sentences[index]
        target = self.labels[index]
        token_ids = [
            self.vocab.stoi["<SOS>"],
            *self.vocab.numericalize(sentence),
            self.vocab.stoi["<EOS>"],
        ]
        return torch.tensor(token_ids), torch.tensor(target, dtype=torch.long)




In [None]:
class mycollate:
    """Collate callable that pads variable-length id tensors into one batch.

    Args:
        pad_idx: value used to right-pad shorter sequences.
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """Return ``(padded_text, labels, lengths)`` for a list of ``(ids, label)`` items.

        ``padded_text`` is batch-first, ``labels`` is a long tensor and
        ``lengths`` is a Python list holding the unpadded length of each item.
        """
        sequences, targets, lengths = [], [], []
        for item in batch:
            targets.append(item[1])
            sequences.append(item[0])
            lengths.append(len(item[0]))
        padded = pad_sequence(sequences, batch_first=True, padding_value=self.pad_idx)
        return padded, torch.tensor(targets, dtype=torch.long), lengths

In [None]:
# Padding id shared by the collate function and the models' embedding layer.
pad_idx = vocab.stoi['<PAD>']
trainset = mydataset(sentence_train,label_train,vocab)
testset = mydataset(sentence_test,label_test,vocab)
# Only the training loader shuffles; both pad each batch to its longest sequence via mycollate.
trainloader = DataLoader(trainset , batch_size = 64 , shuffle = True ,collate_fn=mycollate(pad_idx=pad_idx))
testloader = DataLoader(testset , batch_size = 64 , shuffle = False ,collate_fn=mycollate(pad_idx=pad_idx))

In [None]:
# Sanity check: pull a single batch and inspect the padded ids, the labels and the unpadded lengths.
for text, label, length in trainloader:
  print(text.shape)
  print(label.shape)
  print(length)
  print(text)
  break

torch.Size([64, 514])
torch.Size([64])
[43, 94, 63, 136, 133, 196, 73, 92, 173, 127, 188, 92, 109, 88, 77, 78, 84, 106, 305, 82, 131, 514, 254, 450, 89, 86, 106, 514, 245, 39, 202, 186, 123, 139, 43, 97, 181, 161, 76, 85, 85, 174, 132, 147, 514, 31, 431, 105, 95, 353, 301, 286, 98, 99, 134, 53, 65, 124, 124, 514, 166, 100, 162, 89]
tensor([[   1,   22,    8,  ...,    0,    0,    0],
        [   1,   55,  993,  ...,    0,    0,    0],
        [   1,   22,    8,  ...,    0,    0,    0],
        ...,
        [   1,  193, 7869,  ...,    0,    0,    0],
        [   1,  202,  285,  ...,    0,    0,    0],
        [   1,    6,   37,  ...,    0,    0,    0]])


**You can use either an LSTM or a GRU for the model.**


# Define model

In [None]:
class lstm(nn.Module):
    """LSTM sentence classifier: embedding -> (bi)LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embeding_dim: size of each embedding vector.
        output_dim: number of output logits.
        n_layer: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        hidden_dim: hidden size per direction.
        dropout: dropout probability used on the embeddings, between LSTM
            layers and on the final hidden state.
        pad_idx: id of the padding token (its embedding receives no gradient).
    """

    def __init__(self, vocab_size, embeding_dim, output_dim, n_layer, bidirectional, hidden_dim, dropout, pad_idx):
        super().__init__()
        self.embeding = nn.Embedding(vocab_size, embeding_dim, padding_idx=pad_idx)
        # batch_first=True because batches arrive as [batch, seq_len].
        self.lstm = nn.LSTM(embeding_dim, hidden_dim, n_layer, bidirectional=bidirectional,
                            dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, lenght):
        """Return logits of shape [batch, output_dim].

        Args:
            input: padded token ids, shape [batch, seq_len].
            lenght: unpadded length of every sequence (list of ints or tensor).
        """
        embeding = self.dropout(self.embeding(input))
        # pack_padded_sequence needs the lengths as a CPU int64 tensor (or a list).
        lengths_cpu = torch.as_tensor(lenght, dtype=torch.int64).cpu()
        packed_embed = nn.utils.rnn.pack_padded_sequence(
            embeding, lengths_cpu, batch_first=True, enforce_sorted=False
        )
        # Only the final hidden state is used, so the packed per-step outputs
        # are not unpacked (the previous unpack result was never read).
        _, (hidden, _) = self.lstm(packed_embed)
        if self.lstm.bidirectional:
            # Last layer's two directions; concatenation order kept as before
            # so previously trained weights stay compatible.
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
        else:
            hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

In [None]:
class gru(nn.Module):
    """GRU sentence classifier: embedding -> (bi)GRU -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embeding_dim: size of each embedding vector.
        output_dim: number of output logits.
        n_layer: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        hidden_dim: hidden size per direction.
        dropout: dropout probability used on the embeddings, between GRU
            layers and on the final hidden state.
        pad_idx: id of the padding token (its embedding receives no gradient).
    """

    def __init__(self, vocab_size, embeding_dim, output_dim, n_layer, bidirectional, hidden_dim, dropout, pad_idx):
        super().__init__()
        self.embeding = nn.Embedding(vocab_size, embeding_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU(embeding_dim, hidden_dim, n_layer, bidirectional=bidirectional,
                          dropout=dropout, batch_first=True)
        self.linear = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, length):
        """Return logits of shape [batch, output_dim].

        Args:
            input: padded token ids, shape [batch, seq_len].
            length: unpadded length of every sequence (list of ints or tensor).
        """
        # embeding: [batch, seq_len, embeding_dim]
        embeding = self.dropout(self.embeding(input))
        # pack_padded_sequence needs the lengths as a CPU int64 tensor (or a list).
        lengths_cpu = torch.as_tensor(length, dtype=torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embeding, lengths_cpu, batch_first=True, enforce_sorted=False
        )
        # Only the final hidden state feeds the classifier, so the packed
        # per-step outputs are discarded instead of being unpacked.
        _, hidden = self.rnn(packed)
        if self.rnn.bidirectional:
            # Last layer's two directions; concatenation order kept as before
            # so previously trained weights stay compatible.
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
        else:
            hidden = self.dropout(hidden[-1])
        return self.linear(hidden)

In [None]:
# NOTE(review): `model` is first assigned in a later cell (the hyperparameter / instantiation cell),
# so on a fresh kernel this cell raises NameError unless that cell has been run first.
model

gru(
  (embeding): Embedding(12236, 300, padding_idx=0)
  (rnn): GRU(300, 300, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (linear): Linear(in_features=600, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
# NOTE(review): unfinished stub. The name `n` on the last line is not defined anywhere in the
# visible notebook, so instantiating this class raises NameError, and there is no forward().
# Presumably the line was meant to build an `nn.Transformer...` module - TODO confirm or remove.
class transformer(nn.Module):
  def __init__(self,vocab_size,embeding_dim,pad_idx):
      super(transformer , self).__init__()
      # Token embedding table; the padding row is excluded from gradient updates.
      self.embeding = nn.Embedding(vocab_size , embeding_dim , padding_idx=pad_idx)
      self.transformer = n

In [None]:
# Model hyperparameters.
vocab_size = len(vocab)
embeding_dim = 300
hidden_dim = 300
output_dim = 2  # two logits, one per sentiment class
n_layer = 2
bidirectional = True
dropout = 0.5

# The LSTM variant takes the same arguments; uncomment it to swap architectures.
# model = lstm(vocab_size , embeding_dim , output_dim ,n_layer , bidirectional , hidden_dim , dropout , pad_idx)
model = gru(vocab_size , embeding_dim , output_dim ,n_layer , bidirectional , hidden_dim , dropout , pad_idx).to(device)

In [None]:
def count_parameters(model):
    """Return the total element count of all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,379,202 trainable parameters


# Metrics and helper functions

In [None]:
def initialize_weights(m):
    """Initialise one module in place; intended for ``model.apply(initialize_weights)``.

    * ``nn.Linear``: Xavier-normal weights, zero bias (when a bias exists).
    * ``nn.LSTM`` / ``nn.GRU``: orthogonal weight matrices, zero biases.

    Other module types are left untouched. ``nn.GRU`` is handled in addition
    to ``nn.LSTM`` so the recurrent layer of the GRU model is initialised too.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        if m.bias is not None:  # Linear(bias=False) has no bias tensor
            nn.init.zeros_(m.bias)
    elif isinstance(m, (nn.LSTM, nn.GRU)):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.orthogonal_(param)

# nn.Module.apply calls the initialiser on every submodule and then on the model itself.
model.apply(initialize_weights)

gru(
  (embeding): Embedding(12236, 300, padding_idx=0)
  (rnn): GRU(300, 300, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (linear): Linear(in_features=600, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
import torch.optim as optim
# Learning rate for Adam.
lr = 5e-4

optimizer = optim.Adam(model.parameters(), lr=lr)
# Cross-entropy over the model's raw logits.
criterion = nn.CrossEntropyLoss()
criterion.to(device)

CrossEntropyLoss()

In [None]:
class AverageMeter(object):
    """Track the latest value and the running weighted mean of a series.

    Attributes written by ``update``: ``val`` (last value), ``sum``
    (weighted total), ``count`` (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the newest observation, weighted by ``n``."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

In [None]:
from torch.utils.tensorboard import SummaryWriter
import numpy as np

# TensorBoard

In [None]:
try:
    from google import colab
    COLAB_ENV = True
except (ImportError, ModuleNotFoundError):
    COLAB_ENV = False

# Load the TensorBoard notebook extension
if COLAB_ENV:
    %load_ext tensorboard
    %tensorboard --logdir runs  --host localhost --port 8088
else:
    print("To use tensorboard, please use this notebook in a Google Colab environment.")

# Train, test and checkpoint

In [None]:
!pip install torchmetrics==0.9.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import logging
# Lower the root logger's threshold to INFO so info-level records are emitted.
logging.getLogger().setLevel(logging.INFO)
# Named logger used by the training and evaluation code.
logger = logging.getLogger('NLP-Sentiment')

In [None]:
# Smoke-test the logging configuration at several severity levels.
logger.info('This is an info message')
logging.warning('This is a warning message')
logging.error('This is an error message')
logging.critical('This is a critical message')

INFO:NLP-Sentiment:This is an info message
ERROR:root:This is an error message
CRITICAL:root:This is a critical message


In [None]:
import sys
import torchmetrics
import logging
# Same named logger as configured earlier (getLogger returns the existing instance).
logger = logging.getLogger('NLP-Sentiment')
# TensorBoard event writer; with no arguments it presumably logs under ./runs, which is
# the --logdir the TensorBoard cell points at - TODO confirm.
writer = SummaryWriter()

In [None]:
#############################################################
####################### def train ############################
#############################################################
from tqdm import tqdm
def train(epoch):
    """Run one training epoch over ``trainloader`` and log loss and accuracy.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``trainloader``, ``writer`` and ``logger``.

    Args:
        epoch: epoch index, used as the TensorBoard step and in the log line.
    """
    model.train()
    loss_total = AverageMeter()
    # Plain accuracy: the previous `topk = 5` is not a torchmetrics keyword
    # (the parameter is `top_k`), and top-5 is meaningless with two classes.
    # Written for the torchmetrics release pinned in this notebook (0.9.1).
    accuracy = torchmetrics.Accuracy().to(device)
    for inputs, targets, lengths in tqdm(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, lengths)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        # Store a Python float weighted by batch size; accumulating the loss
        # tensor itself would keep every batch's autograd graph alive.
        loss_total.update(loss.item(), inputs.size(0))
        accuracy(outputs.detach().softmax(dim=-1), targets)
    acc = accuracy.compute().item()
    writer.add_scalar('Loss/train', loss_total.avg, epoch)
    writer.add_scalar('Acc/train', acc, epoch)
    logger.info(f'Train: Epoch:{epoch} Loss:{loss_total.avg:.4} Accuracy:{acc:.4}')
    
#############################################################
####################### def test ############################
#############################################################

def test(epoch , checkpoint):
    """Evaluate on ``testloader``, log the metrics and hand the accuracy to ``checkpoint``.

    Uses the notebook-level ``model``, ``criterion``, ``testloader``,
    ``writer`` and ``logger``.

    Args:
        epoch: epoch index, used as the TensorBoard step and in the log line.
        checkpoint: object whose ``save(acc=..., filename=..., net=..., epoch=...)``
            method decides whether to persist the model.
    """
    model.eval()
    loss_total = AverageMeter()
    # Plain accuracy: the previous `topk = 5` is not a torchmetrics keyword
    # (the parameter is `top_k`), and top-5 is meaningless with two classes.
    # Written for the torchmetrics release pinned in this notebook (0.9.1).
    accuracy = torchmetrics.Accuracy().to(device)
    with torch.no_grad():
        for inputs, targets, lengths in tqdm(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs, lengths)
            loss = criterion(outputs, targets)
            # Python float weighted by batch size, so the mean is per sample.
            loss_total.update(loss.item(), inputs.size(0))
            accuracy(outputs.softmax(dim=-1), targets)
    acc = accuracy.compute().item()
    writer.add_scalar('Loss/test', loss_total.avg, epoch)
    writer.add_scalar('Acc/test', acc, epoch)
    logger.info(f'Test:  Epoch:{epoch} Loss:{loss_total.avg:.4} Accuracy:{acc:.4}')
    # Call save on the instance that was passed in (the old code called it on
    # the class and referenced the undefined name `aaccuracy`).
    checkpoint.save(acc=acc, filename='check', net=model, epoch=epoch)
    print()

import os
best_acc = 0.0
class Checkpoint(object):
    """Save the model whenever accuracy improves, and restore it later.

    Files are written to the ``chekpoint`` folder (created on construction).
    """

    def __init__(self):
        self.best_acc = 0.
        self.folder = 'chekpoint'  # directory that receives the .pth files
        os.makedirs(self.folder, exist_ok=True)

    def save(self, acc, filename, net, epoch=-1, ):
        """Write ``<folder>/<filename>.pth`` if ``acc`` beats the best accuracy seen so far.

        The comparison uses this instance's ``best_acc`` (not a module-level
        constant), so only genuine improvements overwrite the file. The saved
        dict has the keys ``net``, ``acc``, ``epoch`` and ``optimizer``; the
        optimizer state comes from the notebook-level ``optimizer``.
        """
        if acc > self.best_acc:
            logger.info('Saving checkpoint...')
            state = {
                'net': net.state_dict(),
                'acc': acc,
                'epoch': epoch,
                'optimizer': optimizer.state_dict(),
            }
            path = os.path.join(os.path.abspath(self.folder), filename + '.pth')
            torch.save(state, path)
            self.best_acc = acc

    def load(self, net, PATH, map_location=None):
        """Restore ``net`` and a fresh Adam optimizer from the file at ``PATH``.

        Reads the keys written by ``save`` (``net`` / ``optimizer``) and falls
        back to ``model_state_dict`` / ``optimizer_state_dict`` for files that
        use those names.

        Args:
            net: module whose parameters are overwritten in place.
            PATH: checkpoint file to read.
            map_location: forwarded to ``torch.load`` (e.g. ``'cpu'``).

        Returns:
            ``(model, optimizer, loss, acc, epoch)``; ``loss`` is ``None`` when
            the file does not store one.
        """
        model = net
        # The lr given here is a placeholder; load_state_dict restores the saved one.
        optimizer = optim.Adam(net.parameters(), lr=0.1)
        checkpt = torch.load(PATH, map_location=map_location)
        net_key = 'net' if 'net' in checkpt else 'model_state_dict'
        opt_key = 'optimizer' if 'optimizer' in checkpt else 'optimizer_state_dict'
        model.load_state_dict(checkpt[net_key])
        optimizer.load_state_dict(checkpt[opt_key])
        epoch = checkpt['epoch']
        loss = checkpt.get('loss')
        acc = checkpt['acc']
        return model, optimizer, loss, acc, epoch

# Training

In [None]:
# Scratch check of nn.GRU output shapes with the default (sequence-first) layout.
# The tensor is deliberately NOT named `input`: rebinding that builtin at notebook level
# makes the later `input()` call fail with "'Tensor' object is not callable".
rnn = nn.GRU(1200, 1200, 2)
demo_input = torch.randn(5, 1024, 1200)#patches
h0 = torch.randn(2, 1024, 1200)
output, hn = rnn(demo_input, h0)
print(output.shape)

torch.Size([5, 1024, 1200])


In [None]:
# Train for epochs start..end-1, evaluating (and possibly checkpointing) after every epoch.
start , end = 0,14
checkpoint = Checkpoint()
for epoch in range(start , end):
  train(epoch)
  test(epoch,checkpoint)
# Close the TensorBoard writer once training is finished.
writer.close()

  0%|          | 0/625 [00:00<?, ?it/s]


 input:  torch.Size([64, 514])

 embeding:  torch.Size([64, 514, 300])

 output:  torch.Size([64, 2])

 hidden:  torch.Size([64, 600])


  0%|          | 0/625 [02:09<?, ?it/s]


KeyboardInterrupt: ignored

# Eval

In [None]:
def test_sentence(mysentence):
  """Classify one raw sentence with the notebook-level ``model`` and print the verdict.

  The sentence is numericalized with ``vocab``, wrapped in <SOS>/<EOS> and
  run as a batch of one. Class index 1 is reported as positive, anything
  else as negative.
  """
  token_ids = [vocab.stoi["<SOS>"]]
  token_ids += vocab.numericalize(mysentence)
  token_ids.append(vocab.stoi["<EOS>"])
  # unsqueeze(0) adds the batch dimension: [len] -> [1, len]
  batch = torch.tensor(token_ids).unsqueeze(0).to(device)
  model.eval()
  # Inference only, so no autograd graph is needed.
  with torch.no_grad():
    logits = model(batch , torch.tensor(batch.shape[1]).unsqueeze(0))
  predicted = torch.argmax(logits)
  if predicted.item() == 1:
    print('this sentence is positive')
  else:
    print('this sentence is negative')

In [None]:
# Read one sentence from the user and classify it.
# Requires that the builtin `input` has not been rebound to another object earlier in the session.
get_sentence = input()
test_sentence(get_sentence)

ok
torch.Size([1, 3])
this sentence is negative
