In [None]:
!pip install transformers

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from transformers import AutoModelForPreTraining

In [None]:
# Tokenizer for the cased Portuguese checkpoint; lower-casing is switched off.
tokenizer = AutoTokenizer.from_pretrained(
    'neuralmind/bert-base-portuguese-cased',
    do_lower_case=False,
)

In [None]:
from typing import List, Optional, Tuple

In [None]:
class LIABertClassifier(nn.Module):
    """Classification head on top of the encoder of an existing BERT model.

    The encoder (``model.bert``) and configuration (``model.config``) are
    taken from the wrapped model; a single linear layer maps the hidden state
    at sequence position 0 to ``num_labels`` scores.

    Args:
        model: Object exposing a ``bert`` sub-module and a ``config`` with a
            ``hidden_size`` attribute.
        num_labels: Number of output classes.
    """

    def __init__(self, model, num_labels):
        super().__init__()
        self.bert = model.bert
        self.config = model.config
        self.num_labels = num_labels
        # Linear projection from the encoder width to one score per class.
        self.cls = nn.Linear(self.config.hidden_size, num_labels)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Return unnormalised class scores for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            token_type_ids: Segment ids forwarded to the encoder.

        Returns:
            A single tensor of scores whose last dimension is ``num_labels``
            (not a tuple).
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # First encoder output, sliced at sequence position 0
        # (presumably the [CLS] token of the last hidden state - confirm).
        first_token_state = outputs[0][:, 0, :]
        return self.cls(first_token_state)

In [None]:
# Load the pre-trained checkpoint, then wrap its encoder with a 3-class head.
model_base = AutoModelForPreTraining.from_pretrained(
    'neuralmind/bert-base-portuguese-cased'
)
model = LIABertClassifier(num_labels=3, model=model_base)

In [None]:
import pandas as pd

In [None]:
# Column names are supplied explicitly because the file is read with header=None.
cols = [
    "Datetime",
    "Text",
    "Likes",
    "Retweets",
    "Feeling",
]
data = pd.read_csv(
    r"C:\Users\allan\Downloads\drive-download-20230505T001753Z-001\final.csv",
    names=cols,
    header=None,
    index_col=False,
    encoding="latin1",
    engine="python",
)

In [None]:

# Keep only the Text / Feeling columns, discard the row labelled 0
# (presumably the CSV's own header line - confirm) and renumber the index.
data = (
    data
    .drop(["Datetime", "Likes", "Retweets"], axis=1)
    .drop(0)
    .reset_index(drop=True)
)

In [None]:
import numpy as np
import re
from bs4 import BeautifulSoup

In [None]:
def clean_tweet(tweet):
    """Strip markup, @handles, URLs and non-letter symbols from a tweet.

    Steps, in order: extract the text with BeautifulSoup (lxml parser), blank
    out @handles, blank out http(s) URLs, replace every character that is not
    a letter or one of ``. ! ? '`` with a space, then collapse runs of spaces.

    Accented Latin-1 letters (U+00C0-U+00FF except the multiplication and
    division signs) count as letters, so Portuguese words such as "não" or
    "coração" are kept intact instead of being split.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # \w also covers "_", which is legal in handles.
    tweet = re.sub(r"@\w+", ' ', tweet)
    # Consume the whole URL up to the next whitespace so no fragment is left.
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    tweet = re.sub(
        r"[^a-zA-Z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF.!?']", ' ', tweet
    )
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
# Apply clean_tweet to every entry of the Text column.
data_clean = list(map(clean_tweet, data.Text))

In [None]:
# Encode sentiment as integer class ids 0..2. Fractional codes (e.g. 0.5)
# cannot be used: CrossEntropyLoss needs integer class indices, and casting
# 0.5 to an integer truncates it to 0, merging two classes.
LABEL_TO_ID = {'Neg': 0, 'Neu': 1, 'Pos': 2}
# .get(lbl, lbl) leaves already-encoded values alone, so the cell can be re-run.
data['Feeling'] = data['Feeling'].map(lambda lbl: LABEL_TO_ID.get(lbl, lbl))
unknown_labels = set(data['Feeling'].unique()) - set(LABEL_TO_ID.values())
if unknown_labels:
    raise ValueError(f"Unexpected values in 'Feeling' column: {unknown_labels}")
data_labels = data['Feeling'].values


In [None]:
import torch

In [None]:
# Draw up to 1000 distinct row positions with a fixed seed. Sampling without
# replacement prevents the same tweet from appearing in both the training and
# the validation split; the seed makes the subset reproducible.
shuffle = np.random.default_rng(42).choice(
    len(data['Text']),
    size=min(1000, len(data['Text'])),
    replace=False,
)

In [None]:
ytrain_global = np.array(data['Feeling'].tolist())[shuffle]
xtrain_global = np.array(data['Text'])[shuffle]

In [None]:
xtrain_global[1],ytrain_global[1]

In [None]:
!pip install scikit-learn

In [None]:
import sklearn.model_selection as model_selection

In [None]:
# 70 / 30 train / validation split with a fixed seed.
xtrain, xval, ytrain, yval = model_selection.train_test_split(
    xtrain_global,
    ytrain_global,
    test_size=0.30,
    shuffle=True,
    random_state=42,
)

In [None]:
# Tokenize both splits as PyTorch tensors, padded and truncated to at most 512 tokens.
train_encodings = tokenizer(
    xtrain.tolist(),
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
val_encodings = tokenizer(
    xval.tolist(),
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with their labels, one sample per index.

    ``encodings`` is a mapping from field name to an indexable batch of
    values; ``labels`` is an indexable collection whose elements support
    ``.astype`` (e.g. a NumPy array).
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is driven by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``.

        ``features`` maps every encoding field to its ``idx``-th entry and
        ``label`` is a float32 tensor.
        """
        features = {}
        for field, values in self.encodings.items():
            features[field] = values[idx]
        target = torch.tensor(self.labels[idx].astype('float32'))
        return (features, target)

In [None]:
# Wrap each split's encodings and labels in a Dataset.
ds_train = MyDataset(encodings=train_encodings, labels=ytrain)
ds_val = MyDataset(encodings=val_encodings, labels=yval)

In [None]:
from torch.utils.data import DataLoader

In [None]:
batch_size = 8

In [None]:
dl_train = DataLoader(ds_train,shuffle=True,batch_size=batch_size)
dl_eval  = DataLoader(ds_val,batch_size=batch_size)

In [None]:
x,y = next(iter(dl_train))

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
batch = {k: v.to(device) for k, v in x.items()}

In [None]:
model.to(device)

In [None]:
out = model(**batch)

In [None]:
from torch.optim import AdamW
# All encoder weights are passed to the optimizer, so use a learning rate in
# the range commonly used for BERT fine-tuning (about 2e-5 to 5e-5); a rate of
# 5e-3 is roughly two orders of magnitude above it.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
# Total optimizer steps = batches per epoch x number of epochs.
num_epochs = 100
num_training_steps = len(dl_train) * num_epochs

In [None]:
from transformers import get_scheduler

In [None]:
# Linear schedule without warm-up, spanning every training step.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))
model.train()

In [None]:
loss_fct = nn.CrossEntropyLoss()

In [None]:
# Fine-tuning loop: one pass over dl_train per epoch, printing the mean batch
# loss and writing a checkpoint (./model<epoch>.pth) after every epoch.
count = 0
for epoch in range(num_epochs):
    count += 1
    # Re-assert training mode each epoch, so re-running this cell after an
    # evaluation cell has called model.eval() still trains in train mode.
    model.train()
    lepochs = []  # per-batch loss values of the current epoch
    for batch, y in dl_train:
        batch = {k: v.to(device) for k, v in batch.items()}
        y = y.to(device)
        outputs = model(**batch)
        # CrossEntropyLoss expects integer class indices as targets.
        loss = loss_fct(outputs, y.to(torch.long))
        lepochs.append(loss.cpu().item())
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        # Advance the bar created for num_training_steps steps.
        progress_bar.update(1)
    print(np.mean(lepochs))
    torch.save(model.state_dict(), f'./model{count}.pth')

In [None]:
model.eval()

In [None]:
ytrue = []
ypred = []
for batch,y in dl_eval:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    predictions = torch.argmax(outputs, dim=-1)
    ytrue += y.tolist()
    ypred += predictions.cpu().tolist()

In [None]:
torch.save(model.state_dict(),'/content/drive/MyDrive/model.pth')

In [None]:
model.load_state_dict(backup)

In [None]:
from sklearn import metrics

In [None]:
metrics.confusion_matrix(ytrue,ypred)

In [None]:
print(metrics.classification_report(ytrue,ypred))