In [1]:
!pip install transformers



In [2]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from transformers import AutoModelForPreTraining

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)

In [4]:
from typing import List, Optional, Tuple

In [5]:
class LIABertClassifier(nn.Module):
    def __init__(self,model,num_labels):
        super(LIABertClassifier,self).__init__()
        self.bert = model.bert
        self.config = model.config
        self.num_labels = num_labels
        self.cls = nn.Linear(self.config.hidden_size,num_labels)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        ) ->Tuple[torch.Tensor]:

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        sequence_output = outputs[0][:,0,:]
        prediction = self.cls(sequence_output)
        return prediction

In [6]:
model_base= AutoModelForPreTraining.from_pretrained('neuralmind/bert-base-portuguese-cased')
model = LIABertClassifier(model=model_base,num_labels=3)

In [7]:
import pandas as pd

In [8]:
cols = ["Datetime","Text","Likes","Retweets","Feeling"]
data = pd.read_csv(
    r"C:\Users\allan\Downloads\drive-download-20230505T001753Z-001\final.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="utf-8",
    index_col = False
)

In [9]:
data

Unnamed: 0,Datetime,Text,Likes,Retweets,Feeling
0,Datetime,Text,Likes,Retweets,Feeling
1,2022-11-02 23:02:08 UTC+0000,"A esquerda pediu o Fora Collor, o Fora FHC, o ...",32871,7123,Pos
2,2022-11-22 14:38:02 UTC+0000,pov: você é a melhor adaptação já feita de um ...,10798,2196,Pos
3,2022-11-02 23:58:42 UTC+0000,"odeio gente grudenta, mas se o grudento for el...",8265,2084,Pos
4,2022-11-21 03:45:18 UTC+0000,rbd me leva de volta a melhor parte da minha v...,2022,396,Pos
...,...,...,...,...,...
2135,2022-11-02 23:13:14 UTC+0000,Dica da noite\n\nÁRIES: esqueça o passado\nTOU...,3421,198,Neu
2136,2022-11-10 14:33:52 UTC+0000,A vida é sobre fazer sua parte. Você não contr...,4892,2367,Neu
2137,2023-02-13 22:16:33 UTC+0000,Bella Ramsey falou sobre a ajuda que teve de P...,4095,398,Neu
2138,2022-11-02 23:03:46 UTC+0000,Há alguma chance de vocês perdoar Bolsonaro?,3662,347,Neu


In [10]:

data.drop(["Datetime","Likes","Retweets"],
          axis=1,
          inplace=True)
data = data.drop(0)
data = data.reset_index(drop=True)

In [11]:
!pip install ekphrasis



In [12]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.dicts.emoticons import emoticons

In [13]:
text_processor = TextPreProcessor(
    
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    # tokenizer=tokenizer.tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [14]:
import re

pattern = r"<.*?>" #pattern used by ekphrasis to mark social network lingo

#function that only tokenizes what is not from the ekphrasis pattern
#this nedded to be done so we could use the portuguese tokenizer and keep
#all the ekphrasis tags
def ekphrasis_tokenize(text):
    
    ptext = text_processor.pre_process_doc(text)
    
    pattern_matches = re.findall(pattern, ptext)
    
    tokens = []
    prev_end = 0
    for match in pattern_matches:
        start, end = re.search(re.escape(match), ptext).span()
        tokens.extend(tokenizer.tokenize(ptext[prev_end:start]))
        tokens.append(match)
        prev_end = end
    tokens.extend(tokenizer.tokenize(ptext[prev_end:]))
    
    return tokens

In [15]:
data_clean = data.copy()
data_clean.Text = [ekphrasis_tokenize(tweet) for tweet in data.Text]

In [16]:
data_labels = data_clean.Feeling.values
data_labels[data_labels == 'Pos'] = 1
data_labels[data_labels == 'Neu'] = 0.5
data_labels[data_labels == 'Neg'] = 0


In [17]:
data_clean

Unnamed: 0,Text,Feeling
0,"[A, esquerda, pediu, o, Fora, Collor, ,, o, Fo...",1
1,"[po, ##v, :, você, é, a, melhor, adaptação, já...",1
2,"[o, ##de, ##io, gente, gru, ##dent, ##a, ,, ma...",1
3,"[r, ##b, ##d, me, leva, de, volta, a, melhor, ...",1
4,"[Feliz, aniversário, para, a, talentos, ##íssi...",1
...,...,...
2134,"[Di, ##ca, da, noite, Á, ##RI, ##ES, :, esque,...",0.5
2135,"[A, vida, é, sobre, fazer, sua, parte, ., Você...",0.5
2136,"[Bella, Ram, ##sey, falou, sobre, a, ajuda, qu...",0.5
2137,"[Há, alguma, chance, de, você, ##s, perdo, ##a...",0.5


In [18]:
import numpy as np

In [19]:
shuffle=np.random.randint(0,len(data_clean['Text']),1000)

In [20]:
ytrain_global = np.array(data_clean['Feeling'].tolist())[shuffle]
xtrain_global = np.array(data_clean['Text'])[shuffle]

In [23]:
xtrain_global[12],ytrain_global[1]

(['Inglaterra',
  'não',
  'é',
  'ra',
  '##cista',
  '<allcaps>',
  'K',
  '<elongated>',
  '</allcaps>',
  'só',
  'invadiram',
  'metade',
  'do',
  'globo',
  'promovendo',
  'geno',
  '##c',
  '##ídio',
  ',',
  'apar',
  '##the',
  '##id',
  ',',
  'etno',
  '##c',
  '##ídio',
  'mas',
  'não',
  'são',
  'ra',
  '##cista',
  '##s',
  'não',
  ',',
  'confia',
  'Não',
  'são',
  'homo',
  '##f',
  '##ób',
  '##icos',
  ',',
  'Alan',
  'Turing',
  'foi',
  'na',
  'verdade',
  'um',
  'ho',
  '##log',
  '##rama',
  'e',
  'não',
  'um',
  'ser',
  'humano',
  'vítima',
  'de',
  'cas',
  '##tração',
  'química',
  'por',
  'ser',
  'gay'],
 0.0)

In [None]:
!pip install scikit-learn

In [None]:
import sklearn.model_selection as model_selection

In [None]:
xtrain, xval, ytrain, yval = model_selection.train_test_split(xtrain_global, ytrain_global, test_size=0.30, random_state=42,shuffle=True)

In [None]:
train_encodings = tokenizer(xtrain.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
val_encodings = tokenizer(xval.tolist(), truncation=True, padding=True,max_length=512, return_tensors='pt')

In [None]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        label = torch.tensor(self.labels[idx].astype('float32'))
        return (item,label)

    def __len__(self):
        return len(self.labels)

In [None]:
ds_train = MyDataset(train_encodings,ytrain)
ds_val   = MyDataset(val_encodings,yval)

In [None]:
from torch.utils.data import DataLoader

In [None]:
batch_size = 8

In [None]:
dl_train = DataLoader(ds_train,shuffle=True,batch_size=batch_size)
dl_eval  = DataLoader(ds_val,batch_size=batch_size)

In [None]:
x,y = next(iter(dl_train))

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
batch = {k: v.to(device) for k, v in x.items()}

In [None]:
model.to(device)

In [None]:
out = model(**batch)

In [None]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-3)

In [None]:
num_epochs = 100
num_training_steps = num_epochs * len(dl_train)

In [None]:
from transformers import get_scheduler

In [None]:
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [None]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))
model.train()

In [None]:
loss_fct = nn.CrossEntropyLoss()

In [None]:
count = 0
for epoch in range(num_epochs):
    count+=1
    lepochs = []
    for batch,y in dl_train:
        batch = {k: v.to(device) for k, v in batch.items()}
        y     = y.to(device)
        outputs = model(**batch)
        loss = loss_fct(outputs,y.to(torch.long))
        lepochs.append(loss.cpu().item())
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
    print(np.mean(lepochs))
    torch.save(model.state_dict(),f'./model{count}.pth')

In [None]:
model.eval()

In [None]:
ytrue = []
ypred = []
for batch,y in dl_eval:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    predictions = torch.argmax(outputs, dim=-1)
    ytrue += y.tolist()
    ypred += predictions.cpu().tolist()

In [None]:
torch.save(model.state_dict(),'/content/drive/MyDrive/model.pth')

In [None]:
model.load_state_dict(backup)

In [None]:
from sklearn import metrics

In [None]:
metrics.confusion_matrix(ytrue,ypred)

In [None]:
print(metrics.classification_report(ytrue,ypred))