# Teste com o dataset classificado pelo BERT
Foi utilizada a Pipeline Zero-shot classification do BERTimbau para a filtragem do dataset. Estão sendo utilizados apenas os registros cuja hastag coincidiu com a classificação feita pelo BERTimbau.

Inicialmente, foi feito o treinamento com os parâmetros padrão. Porém, devido ao mau desempenho, foi feito o ajuste dos parâmetros. No entanto, o desempenho alcançado ainda não foi satisfatório.

Será feito novamente o teste assim que for lançada a nova versão do dataset.

## Métrica
Queremos priorizar um modelo com maior **precision**, prefiro ter um menor número de falsos positivos do que de falsos negativos. Prefiro mais afunilamento que abrangência.

Um maior recall seria interessante para uma análise do tipo "sobre o que falam os tweets tristes?" ou "o que mais deixam os usuários do twitter irritados?", que aí eu pegaria o maior número possível de tweets tristes e os analisaria, mesmo que tivessem falsos positivos.

In [1]:
import pandas as pd
import numpy as np
import spacy
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from spacy.cli.download import download

In [2]:
def criar_embeddings(df_tweets):
    try:
        nlp = spacy.load('pt_core_news_lg')
    except (IOError, OSError):
        download('pt_core_news_lg')
        nlp = spacy.load('pt_core_news_lg')
    # desativamos todos os outros pipes que vem com o modelo nlp porque não preicsaremos deles
    with nlp.disable_pipes():
        # transformamos cada texto em um vetor e colocamos em uma array
        print('Fazendo os word embeddings')
        vetores = np.array([nlp(texto).vector for texto in df_tweets.Texto])

    return vetores


def ler_modelo(path: str):
    return pickle.load(open(path, 'rb'))


def salvar_modelo(path: str, modelo):
    return pickle.dump(modelo, open(path, 'wb'))    


def fazer_amostragem(train_dataset: pd.DataFrame):
    """

    :param train_dataset:
    :return:
    """
    sentimentos = train_dataset['Sentimento'].unique()
    df = pd.DataFrame([])

    for sentimento in sentimentos:
        df_filtrado = train_dataset.loc[train_dataset['Sentimento'] == sentimento][:600]
        df = pd.concat([df, df_filtrado])

    return df

In [3]:
path_datasets = '../resources/datasets'

# importação dos dados
df_treinamento = pd.read_csv(
    f'{path_datasets}/tweets_ekman_5.csv'
).dropna()

x_train, x_test, y_train, y_test = train_test_split(
    df_treinamento,
    df_treinamento['Sentimento'],
    test_size=0.2,
    random_state=42
)

In [4]:
embeddings = criar_embeddings(x_train)
embeddings_teste = criar_embeddings(x_test)

Fazendo os word embeddings
Fazendo os word embeddings


In [5]:
svc = LinearSVC(C=100, random_state=0, dual=True, max_iter=10000)
svc.fit(embeddings, y_train)

lgr = LogisticRegression(random_state=0, max_iter=10000)
lgr.fit(embeddings, y_train)

forest = RandomForestClassifier(random_state=0, n_jobs=-1)
forest.fit(embeddings, y_train)



In [6]:
# previsões
previsoes_svc = svc.predict(embeddings_teste)
previsoes_lgr = lgr.predict(embeddings_teste)
previsoes_forest = forest.predict(embeddings_teste)

In [7]:
# testar performance
print('Relatório Linear SVC')
print(classification_report(y_test, previsoes_svc))

Relatório Linear SVC
              precision    recall  f1-score   support

       feliz       0.86      0.66      0.75       369
        medo       0.33      0.19      0.24       113
        nojo       0.36      0.84      0.50       207
       raiva       0.14      0.02      0.03        55
      triste       0.47      0.26      0.33       200

    accuracy                           0.52       944
   macro avg       0.43      0.39      0.37       944
weighted avg       0.56      0.52      0.50       944



In [8]:
print('Relatório Logistic Regression')
print(classification_report(y_test, previsoes_lgr))

Relatório Logistic Regression
              precision    recall  f1-score   support

       feliz       0.80      0.87      0.83       369
        medo       0.33      0.33      0.33       113
        nojo       0.60      0.60      0.60       207
       raiva       0.38      0.22      0.28        55
      triste       0.48      0.46      0.47       200

    accuracy                           0.62       944
   macro avg       0.52      0.49      0.50       944
weighted avg       0.61      0.62      0.61       944



In [9]:
print('Relatório Random Forest')
print(classification_report(y_test, previsoes_forest))

Relatório Random Forest
              precision    recall  f1-score   support

       feliz       0.67      0.97      0.79       369
        medo       0.45      0.12      0.18       113
        nojo       0.60      0.57      0.58       207
       raiva       1.00      0.02      0.04        55
      triste       0.51      0.47      0.48       200

    accuracy                           0.62       944
   macro avg       0.64      0.43      0.42       944
weighted avg       0.61      0.62      0.56       944



In [10]:
pontuacoes_logreg = cross_val_score(lgr, embeddings_teste, y_test, cv=5, n_jobs=-1, scoring='f1_weighted')
pontuacoes_svc = cross_val_score(svc, embeddings_teste, y_test, cv=5, n_jobs=-1, scoring='f1_weighted')
pontuacoes_forest = cross_val_score(forest, embeddings_teste, y_test, cv=5, n_jobs=-1, scoring='f1_weighted');

In [11]:
def exibir_pontuacoes(pontuacoes):
    soma_ponuacoes = 0

    for valor in pontuacoes:
        soma_ponuacoes += valor
        
    media = soma_ponuacoes / len(pontuacoes)
    
    print(f'Lista de pontuações: {pontuacoes}\nMédia: {media}' )
    
exibir_pontuacoes(pontuacoes_logreg)
exibir_pontuacoes(pontuacoes_svc)
exibir_pontuacoes(pontuacoes_forest)

Lista de pontuações: [0.5619677  0.52017714 0.57082305 0.47190076 0.57651147]
Média: 0.5402760233070711
Lista de pontuações: [0.48326454 0.5032465  0.49739425 0.42204001 0.51785759]
Média: 0.4847605797000519
Lista de pontuações: [0.50901075 0.46313397 0.46500009 0.45354082 0.45829196]
Média: 0.46979551644351947


## Ajuste de parâmetros

In [4]:
from sklearn.model_selection import GridSearchCV

In [5]:
x_train, x_test, y_train, y_test = train_test_split(
    df_treinamento,
    df_treinamento['Sentimento'],
    test_size=0.2,
    random_state=42
)

In [5]:
embeddings_treinamento = criar_embeddings(x_train)
embeddings_teste = criar_embeddings(x_test)

Fazendo os word embeddings
Fazendo os word embeddings


In [15]:
logreg = LogisticRegression(random_state=0, n_jobs=-1, max_iter=1000)

parametros = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'sag', 'newton_cholesky', 'saga', 'libfgs'],
    'multi_class': ['auto', 'over', 'multinomial']
}

logreg_grid = GridSearchCV(logreg, parametros, cv=5, n_jobs=-1)
logreg_grid.fit(embeddings_treinamento, y_train)
previsoes_lgr_best = logreg_grid.predict(embeddings_teste)

1110 fits failed out of a total of 1350.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covi

              precision    recall  f1-score   support

       feliz       0.79      0.88      0.83       369
        medo       0.37      0.33      0.35       113
        nojo       0.60      0.63      0.62       207
       raiva       0.31      0.09      0.14        55
      triste       0.47      0.48      0.48       200

    accuracy                           0.63       944
   macro avg       0.51      0.48      0.48       944
weighted avg       0.60      0.63      0.61       944

{'C': 0.1, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'saga'}


In [None]:
# print(classification_report(y_test, previsoes_lgr_best))
print(logreg.cv_results_)
print(logreg_grid.best_params_)

In [16]:
svc = LinearSVC(C=100, random_state=0, dual=True, max_iter=10000)
parametros_svc = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'multi_class': ['crammer_singer', 'ovr']
}

svc_grid = GridSearchCV(svc, parametros_svc, cv=5, n_jobs=-1)
svc_grid.fit(embeddings_treinamento, y_train)
previsoes_svc_best = svc_grid.predict(embeddings_teste)

print(classification_report(y_test, previsoes_svc_best))
print(svc_grid.best_params_)

90 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
33 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 638, in _vali

              precision    recall  f1-score   support

       feliz       0.73      0.90      0.81       369
        medo       0.42      0.29      0.34       113
        nojo       0.58      0.66      0.62       207
       raiva       0.33      0.05      0.09        55
      triste       0.53      0.44      0.48       200

    accuracy                           0.63       944
   macro avg       0.52      0.47      0.47       944
weighted avg       0.59      0.63      0.60       944

{'C': 0.01, 'multi_class': 'crammer_singer', 'penalty': 'l1'}


In [17]:
svc = LinearSVC(C=100, random_state=0, dual=True, max_iter=10000)
parametros_svc = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.00001, 0.0001, 0.001],
    'multi_class': ['crammer_singer', 'ovr']
}

svc_grid = GridSearchCV(svc, parametros_svc, cv=5, n_jobs=-1)
svc_grid.fit(embeddings_treinamento, y_train)
previsoes_svc_best = svc_grid.predict(embeddings_teste)

print(classification_report(y_test, previsoes_svc_best))
print(svc_grid.best_params_)

45 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 638, in _valida

              precision    recall  f1-score   support

       feliz       0.70      0.91      0.79       369
        medo       0.38      0.14      0.21       113
        nojo       0.56      0.65      0.61       207
       raiva       0.00      0.00      0.00        55
      triste       0.45      0.42      0.44       200

    accuracy                           0.60       944
   macro avg       0.42      0.42      0.41       944
weighted avg       0.54      0.60      0.56       944

{'C': 0.001, 'multi_class': 'ovr', 'penalty': 'l2'}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Alguns autores, como Duarte et al. 2019 e o próprio trabalho de Mayara, conseguiram um desempenho maior que o SVC, com o uso do classificador Naive Bayes. Acho que vale a pena tentar também

In [7]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(embeddings_treinamento, y_train)

previsoes_bnb = bnb.predict(embeddings_teste)

In [8]:
# testar performance
print('Relatório BernoulliNB')
print(classification_report(y_test, previsoes_bnb))

Relatório BernoulliNB
              precision    recall  f1-score   support

       feliz       0.72      0.56      0.63       369
        medo       0.26      0.27      0.27       113
        nojo       0.40      0.44      0.42       207
       raiva       0.07      0.11      0.08        55
      triste       0.39      0.42      0.41       200

    accuracy                           0.44       944
   macro avg       0.37      0.36      0.36       944
weighted avg       0.49      0.44      0.46       944



In [10]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(embeddings_treinamento, y_train)

previsoes_gnb = gnb.predict(embeddings_teste)

In [11]:
# testar performance
print('Relatório GaussianNB')
print(classification_report(y_test, previsoes_gnb))

Relatório GaussianNB
              precision    recall  f1-score   support

       feliz       0.67      0.47      0.55       369
        medo       0.28      0.19      0.23       113
        nojo       0.36      0.43      0.39       207
       raiva       0.05      0.07      0.06        55
      triste       0.35      0.48      0.41       200

    accuracy                           0.41       944
   macro avg       0.34      0.33      0.33       944
weighted avg       0.45      0.41      0.42       944



Logistic regression com pesos balanceados para as classes

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

logreg = LogisticRegression(random_state=0, n_jobs=-1, max_iter=1000, class_weight='balanced')

parametros = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'sag', 'newton_cholesky', 'saga', 'libfgs'],
    'multi_class': ['auto', 'over', 'multinomial']
}

logreg_grid = GridSearchCV(logreg, parametros, cv=5, n_jobs=-1)
logreg_grid.fit(embeddings_treinamento, y_train)
previsoes_lgr_best = logreg_grid.predict(embeddings_teste)

print(classification_report(y_test, previsoes_lgr_best))
print(logreg_grid.best_params_)

1110 fits failed out of a total of 1350.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covi

              precision    recall  f1-score   support

       feliz       0.84      0.76      0.80       369
        medo       0.37      0.50      0.43       113
        nojo       0.63      0.62      0.62       207
       raiva       0.20      0.38      0.26        55
      triste       0.54      0.40      0.46       200

    accuracy                           0.60       944
   macro avg       0.52      0.53      0.51       944
weighted avg       0.64      0.60      0.61       944

{'C': 0.01, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'sag'}


A **precision** para a classe feliz, mostra que a classe é corretamente classificada 84% das vezes, enquanto que outras classes são inorretamente classificadas como feliz, 76% das vezes, de acordo com o **recall**.

Queremos um modelo que tenha maior **precision** e menor **recall**.

In [12]:
svc = LinearSVC(class_weight='balanced', random_state=0, dual=True, max_iter=10000)
parametros_svc = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'multi_class': ['crammer_singer', 'ovr']
}

svc_grid = GridSearchCV(svc, parametros_svc, cv=5, n_jobs=-1)
svc_grid.fit(embeddings_treinamento, y_train)
previsoes_svc_best = svc_grid.predict(embeddings_teste)

print(classification_report(y_test, previsoes_svc_best))
print(svc_grid.best_params_)

90 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 638, in _vali

              precision    recall  f1-score   support

       feliz       0.81      0.87      0.84       369
        medo       0.39      0.35      0.37       113
        nojo       0.58      0.68      0.62       207
       raiva       0.27      0.24      0.25        55
      triste       0.50      0.40      0.44       200

    accuracy                           0.63       944
   macro avg       0.51      0.50      0.50       944
weighted avg       0.61      0.63      0.62       944

{'C': 0.001, 'multi_class': 'ovr', 'penalty': 'l2'}


## Testando XGBoost

In [1]:
import pandas as pd
import numpy as np
import spacy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [2]:
def criar_embeddings(df_tweets):
    try:
        nlp = spacy.load('pt_core_news_lg')
    except (IOError, OSError):
        download('pt_core_news_lg')
        nlp = spacy.load('pt_core_news_lg')
    # desativamos todos os outros pipes que vem com o modelo nlp porque não preicsaremos deles
    with nlp.disable_pipes():
        # transformamos cada texto em um vetor e colocamos em uma array
        print('Fazendo os word embeddings')
        vetores = np.array([nlp(texto).vector for texto in df_tweets.Texto])

    return vetores

In [3]:
path_datasets = '../resources/datasets'

# importação dos dados
df_treinamento = pd.read_csv(
    f'{path_datasets}/tweets_ekman_5.csv'
).dropna()

In [4]:
x_trainval, x_test, y_trainval, y_test = train_test_split(
    df_treinamento,
    df_treinamento['Sentimento'],
    test_size=0.2,
    random_state=42
)

x_train, x_valid, y_train, y_valid = train_test_split(
    x_trainval,
    y_trainval,
    random_state=42
)

In [5]:
embeddings_treinamento = criar_embeddings(x_train)
embeddings_teste = criar_embeddings(x_test)

Fazendo os word embeddings
Fazendo os word embeddings


In [6]:
le_train = LabelEncoder()
le_test = LabelEncoder()

y_train = le_train.fit_transform(y_train)
y_test = le_test.fit_transform(y_test)

In [7]:
xgb = XGBClassifier(random_state=0)
xgb.fit(embeddings_treinamento, y_train)

In [8]:
previsoes_xgb = xgb.predict(embeddings_teste)

In [9]:
print(classification_report(y_test, previsoes_xgb))

              precision    recall  f1-score   support

           0       0.75      0.93      0.83       369
           1       0.37      0.28      0.32       113
           2       0.52      0.52      0.52       207
           3       0.17      0.04      0.06        55
           4       0.52      0.49      0.50       200

    accuracy                           0.62       944
   macro avg       0.47      0.45      0.45       944
weighted avg       0.57      0.62      0.59       944



In [12]:
x_test['Sentimento'].value_counts()

Sentimento
feliz     369
nojo      207
triste    200
medo      113
raiva      55
Name: count, dtype: int64

## Classificação dos tweets da pandemia

In [8]:
import pandas as pd

df_covid = pd.read_csv(f'{path_datasets}/tweets_pandemia.csv')
df_covid.head()

Unnamed: 0,Texto,Data
0,me covid-19 vezes pra beijar sua boca,2020-04-01T23:41:43.000Z
1,Imagina a cara de c* que a família tradicional...,2020-04-01T23:59:46.000Z
2,"O que separa as pessoas? \r\n\r\nCOVID-19, mal...",2020-04-01T23:40:25.000Z
3,Primeiro caso COVID-19 confirmado em Ribeirão ...,2020-04-01T23:38:04.000Z
4,ACABOU COVID 19 NA CHINA \r\n\r\nPRESIDENTE CH...,2020-04-01T22:55:39.000Z


In [11]:
import re 

textos = df_covid['Texto']
novos_textos = []

for texto in textos:
    texto = str(texto)
    texto = texto.lower()
    texto = texto.replace('"', '')
    texto = texto.replace(',', ';')
    texto = re.sub(r'\s+', ' ', texto)
    texto = re.sub(r'@\w*', '', texto)
    texto = re.sub(r'#\w*', '', texto)
    texto = re.sub(r'http\S+', '', texto)
    texto = texto.replace('medo', '')
    novos_textos.append(texto)


df_covid['Texto'] = novos_textos

In [12]:
embeddings_pandemia = criar_embeddings(df_covid)

Fazendo os word embeddings


In [13]:
previsoes_pandemia = logreg_grid.predict(embeddings_pandemia)

In [14]:
df_covid_previsto = df_covid.copy()
df_covid_previsto['Previsões'] = previsoes_pandemia

df_covid_previsto.to_csv(f'{path_datasets}/previsoes_pandemia.csv', index=False)

In [17]:
df_covid_previsto['Previsões'].value_counts()

Previsões
feliz     46116
nojo      11480
medo       4614
raiva      3267
triste     2553
Name: count, dtype: int64

In [16]:
import pickle

path_modelos = '../resources/modelos'
path_logreg = f'{path_modelos}/logreg_emocoes_precisao.pkl'

pickle.dump(logreg, open(path_logreg, 'wb'))