In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [11]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [14]:
df_train = pd.read_csv("../data/cleaned_data.csv", index_col=0)

In [29]:
train = df_train.sample(frac=0.001, random_state=42)

In [30]:
train.shape

(181, 2)

In [32]:
train['labels'].unique()

array([1, 0])

In [33]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

In [34]:
class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [35]:
train_size = 0.8
train_data=train.sample(frac=train_size,random_state=200)
test_data=train.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(train.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = SentimentData(train_data, tokenizer, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer, MAX_LEN)

FULL Dataset: (181, 2)
TRAIN Dataset: (145, 2)
TEST Dataset: (36, 2)


In [36]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [37]:
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [38]:
model = RobertaClass()
model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), e

In [39]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [40]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [41]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [42]:
EPOCHS = 1
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training Loss per 5000 steps: 1.6458680629730225
Training Accuracy per 5000 steps: 0.0


19it [00:46,  2.43s/it]

The Total Accuracy for Epoch 0: 35.172413793103445
Training Loss Epoch: 1.497786986200433
Training Accuracy Epoch: 35.172413793103445





In [43]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu

In [44]:
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

1it [00:00,  2.69it/s]

Validation Loss per 100 steps: 1.1876401901245117
Validation Accuracy per 100 steps: 25.0


9it [00:03,  2.85it/s]

Validation Loss Epoch: 1.1642781363593206
Validation Accuracy Epoch: 41.666666666666664
Accuracy on test data = 41.67%





In [45]:
output_model_file = 'pytorch_roberta_sentiment.pt'
output_vocab_file = './models/'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed


In [1]:
import tweetnlp

In [2]:
model = tweetnlp.Sentiment()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
model.sentiment("Jacob Collier is a Grammy-awarded English artist from London.") 

{'label': 'positive'}

In [4]:
text1 = "color ill give one	"
text2 = "its gonna rain . too bad i gotta go sentosa for the stupid workshop"

prob1 = model.sentiment(text1, return_probability=True)
prob2 = model.sentiment(text2, return_probability=True)

# prob["probability"]["positive"], prob["probability"]["negative"]
# result = 1 if prob["probability"]["positive"] > prob["probability"]["negative"] else 0
# result

In [5]:
prob1, prob2

({'label': 'neutral',
  'probability': {'negative': 0.07074535638093948,
   'neutral': 0.7350003123283386,
   'positive': 0.19425424933433533}},
 {'label': 'negative',
  'probability': {'negative': 0.9440370798110962,
   'neutral': 0.05025005340576172,
   'positive': 0.005712881684303284}})

In [8]:
train_set_pos_1 = pd.read_csv('../data/pos_data_cleaned_1.csv', index_col=0)
train_set_pos_2 = pd.read_csv('../data/pos_data_cleaned_2.csv', index_col=0)
train_set_pos_3 = pd.read_csv('../data/pos_data_cleaned_3.csv', index_col=0)
train_set_pos_4 = pd.read_csv('../data/pos_data_cleaned_4.csv', index_col=0)

train_set_neg_1 = pd.read_csv('../data/neg_data_cleaned_1.csv', index_col=0)
train_set_neg_2 = pd.read_csv('../data/neg_data_cleaned_2.csv', index_col=0)
train_set_neg_3 = pd.read_csv('../data/neg_data_cleaned_3.csv', index_col=0)
train_set_neg_4 = pd.read_csv('../data/neg_data_cleaned_4.csv', index_col=0)

In [14]:
train_set_1 = pd.concat([train_set_pos_1, train_set_neg_1])
train_set_2 = pd.concat([train_set_pos_2, train_set_neg_2])
train_set_3 = pd.concat([train_set_pos_3, train_set_neg_3])
train_set_4 = pd.concat([train_set_pos_4, train_set_neg_4])

In [101]:
test_set = train_set_4.sample(10000, random_state=42)

In [102]:
test_set.shape, train_set_1.shape

((10000, 2), (181320, 2))

In [103]:
count = 0
leng = 0
for i in range(len(test_set)):

    if i%100 == 0:
        print(i)

    if type(test_set.text.values[i]) != str:
        continue
    prob = model.sentiment(test_set.text.values[i], return_probability=True)

    if prob["label"]=="neutral":
        continue
    result = 1 if prob["label"] == "positive" else 0
    isAccurate = result == test_set.labels.values[i]
    leng += 1
    if isAccurate:
        count += 1

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900


In [104]:
count, leng, count/leng

(3664, 4823, 0.7596931370516277)

In [102]:
prob = model.sentiment("b r e someone save boredom pff	", return_probability=True)
result = 1 if prob["probability"]["positive"] > prob["probability"]["negative"] else 0
isAccurate = result == 0
isAccurate

True

In [84]:
test_set[950:1000]

Unnamed: 0,text,labels
51257,coin fort ill forgive,1
45048,pleaseee please follow thanks,0
3904,follow instagram sohelakaur ill followback tweet,1
96914,wish could get know oomf,0
51568,tuesday numbers bad worse 11 number hours fed...,0
8524,sucks sorry smoke guys know better,1
17974,happens forever always,1
97201,nigguh ran going theres ghost,0
23682,makasih sayaangkuh get well soon sister sick ...,0
60003,wish youd catching fire mocking jay would fre...,1
