In [None]:
!pip install pyarrow

In [1]:
from ClassifierTransformer import Classifier
import torch
import pandas as pd
import numpy as np
import tiktoken
import time

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# tiktoken's 'r50k_base' encoding; its n_vocab is used as the model's
# vocabulary size and its encode() turns each text into a list of token ids.
tokenizer = tiktoken.get_encoding('r50k_base')

In [3]:
#  hyperparameters

vocab_size = tokenizer.n_vocab   # number of token ids the model must embed
block_size = 32                  # tokens per example after padding/truncation
learning_rate = 1e-3             # AdamW step size
steps = 10000                    # optimizer updates in the training loop
# Evaluate ten times per run. max(1, ...) keeps `step % eval_step` valid when
# steps < 10, where steps // 10 would be 0 and raise ZeroDivisionError.
eval_step = max(1, steps // 10)

# Seed torch's RNG so batch sampling (and any random init done through torch)
# is repeatable across Restart & Run All.
seed = 1337
torch.manual_seed(seed)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
# Load the labelled dataset; only the `text` and `sentiment` columns are used
# by the cells below.
# NOTE(review): the `Unnamed: 0` column in the preview looks like a row index
# that was written out by an earlier to_csv — confirm, and consider
# `index_col=0` if so.
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [5]:
# Independent copies of the model input (raw text) and target (sentiment
# label) columns, so later reassignment never touches `df`.
x, y = df['text'].copy(), df['sentiment'].copy()

In [6]:
# Encode every text into its list of token ids (one list per row, in order).
tokens = [tokenizer.encode(text) for text in x]

In [7]:
# Sentiment label -> integer class id, numbered in the order listed here.
target_map = {
    label: class_id
    for class_id, label in enumerate(('negative', 'neutral', 'positive'))
}

In [8]:
# Map labels to class ids starting from the raw column rather than from `y`
# itself: `y = y.map(target_map)` is not idempotent — running the cell a
# second time maps the integers again and turns every label into NaN.
y = df['sentiment'].map(target_map)

# A label missing from target_map becomes NaN; fail here instead of later
# when the batch tensor is built.
if y.isna().any():
    unknown = sorted(df.loc[y.isna(), 'sentiment'].astype(str).unique())
    raise ValueError(f'sentiment labels missing from target_map: {unknown}')

In [9]:
# Sequential (unshuffled) split: the first 90% of rows are used for training
# and the remaining rows for validation.
n = int(len(tokens) * 0.9)

train_x, val_x = tokens[:n], tokens[n:]
train_y, val_y = list(y[:n]), list(y[n:])

In [10]:
# Sanity check: the validation inputs and labels must have the same length.
len(val_x),  len(val_y)

(2748, 2748)

In [11]:
def data_loader(split='train', block_size=32, batch_size=None):
    """Sample a random batch of fixed-length token sequences with labels.

    Args:
        split: 'train' samples from (train_x, train_y); any other value
            samples from (val_x, val_y).
        block_size: number of tokens per example. Shorter examples are
            left-padded with the space token; longer ones keep only their
            last `block_size` tokens.
        batch_size: number of examples to draw, with replacement. Defaults to
            `block_size`, which is the batch size this function always used
            before the parameter existed.

    Returns:
        Tuple (x, y) moved to `device`: x has shape (batch_size, block_size)
        and holds token ids, y has shape (batch_size,) and holds the labels.
    """
    if batch_size is None:
        batch_size = block_size

    seqs, labels = (train_x, train_y) if split == 'train' else (val_x, val_y)
    picks = torch.randint(len(seqs), (batch_size,)).tolist()

    # A single space encodes to one token. Repeating that id gives exactly
    # the required amount of padding without assuming that a run of N spaces
    # encodes to N tokens, and without re-encoding a string per example.
    pad_id = tokenizer.encode(' ')[0]

    batch_x = []
    for i in picks:
        s = seqs[i][-block_size:]          # no-op when already short enough
        batch_x.append([pad_id] * (block_size - len(s)) + s)
    batch_y = [labels[i] for i in picks]

    x_ = torch.tensor(batch_x).to(device)
    y_ = torch.tensor(batch_y).to(device)

    return x_, y_


In [12]:
@torch.no_grad()
def get_loss(m, eval_iters=100):
    """Estimate the mean train and validation loss over random batches.

    Gradients are disabled and the model is put in eval mode for the
    duration of the call; it is switched back to train mode afterwards, even
    if a batch raises.

    Args:
        m: model called as `m(x, y)` and returning `(logits, loss)`.
        eval_iters: number of batches drawn from each split.

    Returns:
        Tuple (train_loss, val_loss) of 0-dim tensors holding the mean loss.
    """
    # `@torch.no_grad()` is called with parentheses because the bare
    # `@torch.no_grad` form is not accepted by older PyTorch releases.
    losses = {'train': [], 'val': []}
    m.eval()
    try:
        for _ in range(eval_iters):
            # Alternate train/val so both splits are sampled evenly.
            for split in ('train', 'val'):
                x, y = data_loader(split)
                _, loss = m(x, y)
                # Store plain floats rather than device tensors.
                losses[split].append(loss.item())
    finally:
        m.train()
    train_loss = torch.tensor(losses['train']).mean()
    val_loss = torch.tensor(losses['val']).mean()
    return train_loss, val_loss

In [13]:
# Number of output classes follows target_map instead of a hard-coded 3, so
# the model head cannot drift out of sync with the label mapping.
c = Classifier(vocab_size, len(target_map))
c.to(device)
optimizer = torch.optim.AdamW(c.parameters(), lr=learning_rate)

In [14]:
st = time.time()

for step in range(steps):
    # Batch variables are named xb/yb so they do not overwrite the
    # notebook-level `x` / `y` Series that the earlier cells rely on.
    xb, yb = data_loader('train')
    logits, loss = c(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if step % eval_step == 0:
        train_loss, val_loss = get_loss(c)
        print(f'Step {step}:  Train Loss: {train_loss.item():.4f},  Val Loss: {val_loss.item():.4f}')
et = time.time()

# NOTE(review): in the recorded run the validation loss is lowest at step
# 5000 and rises afterwards while the train loss keeps falling; consider
# early stopping or keeping the best checkpoint.

# divmod on whole seconds yields integer minutes (the old `//` on a float
# printed e.g. "3.0 mins").
mins, secs = divmod(int(et - st), 60)

print()
print(f'Time Elapsed: {mins} mins {secs} secs')

Step 0:  Train Loss: 1.1178,  Val Loss: 1.1180
Step 1000:  Train Loss: 1.0614,  Val Loss: 1.0542
Step 2000:  Train Loss: 0.9004,  Val Loss: 0.9262
Step 3000:  Train Loss: 0.7683,  Val Loss: 0.8762
Step 4000:  Train Loss: 0.7364,  Val Loss: 0.8484
Step 5000:  Train Loss: 0.6643,  Val Loss: 0.8309
Step 6000:  Train Loss: 0.5914,  Val Loss: 0.8697
Step 7000:  Train Loss: 0.5306,  Val Loss: 0.8788
Step 8000:  Train Loss: 0.5077,  Val Loss: 0.8648
Step 9000:  Train Loss: 0.4797,  Val Loss: 0.9062

Time Elasped: 3.0 mins 53 secs


In [23]:
# Build a single (1, block_size) input the same way training batches are
# built: keep at most the last `block_size` tokens, then left-pad with the
# space token. Without the truncation a prompt longer than block_size would
# be passed to the model at its full length.
idx = tokenizer.encode('It is amazing, what a fabulous day')[-block_size:]
# encode(' ') is a one-token list; repeating it gives exactly the required
# padding without assuming a run of N spaces encodes to N tokens.
idx = tokenizer.encode(' ') * (block_size - len(idx)) + idx
idx = torch.tensor(idx).to(device).view(1, -1)

In [24]:
# Expect a batch of one sequence of block_size tokens.
idx.shape

torch.Size([1, 32])

In [25]:
# get_loss() leaves the model in train mode, so switch to eval before
# inference; this makes any dropout/normalisation layers inside Classifier
# (its definition is not visible in this notebook) behave deterministically.
c.eval()
with torch.no_grad():
    # Softmax over dim 1 turns the prediction into per-class probabilities.
    op = torch.nn.functional.softmax(c.predict(idx), dim=1)
op

tensor([[0.0081, 0.0346, 0.9574]], device='cuda:0')

In [26]:
# Index of the most probable class for the single input; per target_map,
# 0 = negative, 1 = neutral, 2 = positive.
torch.argmax(op[0])

tensor(2, device='cuda:0')