# Teste com o dataset classificado pelo BERT
Foi utilizada a Pipeline Zero-shot classification do BERTimbau para a filtragem do dataset. Estão sendo utilizados apenas os registros cuja hastag coincidiu com a classificação feita pelo BERTimbau.

Inicialmente, foi feito o treinamento com os parâmetros padrão. Porém, devido ao mau desempenho, foi feito o ajuste dos parâmetros. No entanto, o desempenho alcançado ainda não foi satisfatório.

Será feito novamente o teste assim que for lançada a nova versão do dataset.

In [1]:
import pandas as pd
import numpy as np
import spacy
import os
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from spacy.cli.download import download

In [2]:
def criar_embeddings(df_tweets):
    try:
        nlp = spacy.load('pt_core_news_lg')
    except (IOError, OSError):
        download('pt_core_news_lg')
        nlp = spacy.load('pt_core_news_lg')
    # desativamos todos os outros pipes que vem com o modelo nlp porque não preicsaremos deles
    with nlp.disable_pipes():
        # transformamos cada texto em um vetor e colocamos em uma array
        print('Fazendo os word embeddings')
        vetores = np.array([nlp(texto).vector for texto in df_tweets.Texto])

    return vetores


def ler_modelo(path: str):
    return pickle.load(open(path, 'rb'))


def salvar_modelo(path: str, modelo):
    return pickle.dump(modelo, open(path, 'wb'))    


def fazer_amostragem(train_dataset: pd.DataFrame):
    """

    :param train_dataset:
    :return:
    """
    sentimentos = train_dataset['Sentimento'].unique()
    df = pd.DataFrame([])

    for sentimento in sentimentos:
        df_filtrado = train_dataset.loc[train_dataset['Sentimento'] == sentimento][:600]
        df = pd.concat([df, df_filtrado])

    return df

In [3]:
path_datasets = '../resources/datasets'

# importação dos dados
df_treinamento = pd.read_csv(
    f'{path_datasets}/tweets_ekman_4.csv'
).dropna()

x_train, x_test, y_train, y_test = train_test_split(
    df_treinamento,
    df_treinamento['Sentimento'],
    test_size=0.2,
    random_state=42
)

In [4]:
# processamento dos dados para word embeddings
path_embeddings_treinamento = '../resources/modelos/embeddings_treinamento_novo.pkl'
path_embeddings_teste = '../resources/modelos/embeddings_teste_novo.pkl'

embeddings = criar_embeddings(x_train)
embeddings_teste = criar_embeddings(x_test)

Fazendo os word embeddings
Fazendo os word embeddings


In [5]:
svc = LinearSVC(C=100, random_state=0, dual=True, max_iter=10000)
svc.fit(embeddings, y_train)

lgr = LogisticRegression(random_state=0, max_iter=10000)
lgr.fit(embeddings, y_train)

forest = RandomForestClassifier(random_state=0, n_jobs=-1)
forest.fit(embeddings, y_train)



In [6]:
# previsões
previsoes_svc = svc.predict(embeddings_teste)
previsoes_lgr = lgr.predict(embeddings_teste)
previsoes_forest = forest.predict(embeddings_teste)

In [7]:
# testar performance
print('Relatório Linear SVC')
print(classification_report(y_test, previsoes_svc))

Relatório Linear SVC
              precision    recall  f1-score   support

       feliz       0.43      0.53      0.48        36
        medo       0.45      0.37      0.41        59
        nojo       0.39      0.45      0.42        49
       raiva       0.24      0.26      0.25        42
      triste       0.29      0.24      0.26        50

    accuracy                           0.36       236
   macro avg       0.36      0.37      0.36       236
weighted avg       0.36      0.36      0.36       236



In [8]:
print('Relatório Logistic Regression')
print(classification_report(y_test, previsoes_lgr))

Relatório Logistic Regression
              precision    recall  f1-score   support

       feliz       0.44      0.58      0.50        36
        medo       0.50      0.37      0.43        59
        nojo       0.39      0.41      0.40        49
       raiva       0.29      0.33      0.31        42
      triste       0.30      0.26      0.28        50

    accuracy                           0.38       236
   macro avg       0.38      0.39      0.38       236
weighted avg       0.39      0.38      0.38       236



In [9]:
print('Relatório Random Forest')
print(classification_report(y_test, previsoes_forest))

Relatório Random Forest
              precision    recall  f1-score   support

       feliz       0.42      0.42      0.42        36
        medo       0.39      0.37      0.38        59
        nojo       0.42      0.33      0.37        49
       raiva       0.28      0.40      0.33        42
      triste       0.40      0.36      0.38        50

    accuracy                           0.37       236
   macro avg       0.38      0.38      0.38       236
weighted avg       0.38      0.37      0.37       236



In [10]:
pontuacoes_logreg = cross_val_score(lgr, embeddings_teste, y_test, cv=5, n_jobs=-1, scoring='f1_weighted')
pontuacoes_svc = cross_val_score(svc, embeddings_teste, y_test, cv=5, n_jobs=-1, scoring='f1_weighted')
pontuacoes_forest = cross_val_score(forest, embeddings_teste, y_test, cv=5, n_jobs=-1, scoring='f1_weighted');

In [11]:
def exibir_pontuacoes(pontuacoes):
    soma_ponuacoes = 0

    for valor in pontuacoes:
        soma_ponuacoes += valor
        
    media = soma_ponuacoes / len(pontuacoes)
    
    print(f'Lista de pontuações: {pontuacoes}\nMédia: {media}' )
    
exibir_pontuacoes(pontuacoes_logreg)
exibir_pontuacoes(pontuacoes_svc)
exibir_pontuacoes(pontuacoes_forest)

Lista de pontuações: [0.31915558 0.25867018 0.24349993 0.25442329 0.38898504]
Média: 0.2929468030796896
Lista de pontuações: [0.29355984 0.21362157 0.28701104 0.27227848 0.37040619]
Média: 0.28737542249323134
Lista de pontuações: [0.24541005 0.24346771 0.16140125 0.30952735 0.34595684]
Média: 0.2611526396877588


## Ajuste de parâmetros

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
x_trainval, x_test, y_trainval, y_test = train_test_split(
    df_treinamento,
    df_treinamento['Sentimento'],
    test_size=0.2,
    random_state=42
)

x_train, x_valid, y_train, y_valid = train_test_split(
    x_trainval,
    y_trainval,
    random_state=42
)

In [14]:
embeddings_treinamento = criar_embeddings(x_train)
embeddings_teste = criar_embeddings(x_test)

Fazendo os word embeddings
Fazendo os word embeddings


In [20]:
logreg = LogisticRegression(random_state=0, n_jobs=-1, max_iter=1000)

parametros = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'sag', 'newton_cholesky', 'saga', 'libfgs'],
    'multi_class': ['auto', 'over', 'multinomial']
}

logreg_grid = GridSearchCV(logreg, parametros, cv=5, n_jobs=-1)
logreg_grid.fit(embeddings_treinamento, y_train)
previsoes_lgr_best = logreg_grid.predict(embeddings_teste)

print(classification_report(y_test, previsoes_lgr_best))
print(logreg_grid.best_params_)

1110 fits failed out of a total of 1350.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covi

              precision    recall  f1-score   support

       feliz       0.48      0.56      0.51        36
        medo       0.45      0.31      0.36        59
        nojo       0.42      0.41      0.41        49
       raiva       0.32      0.43      0.37        42
      triste       0.34      0.34      0.34        50

    accuracy                           0.39       236
   macro avg       0.40      0.41      0.40       236
weighted avg       0.40      0.39      0.39       236

{'C': 0.01, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'sag'}


In [26]:
svc = LinearSVC(C=100, random_state=0, dual=True, max_iter=10000)
parametros_svc = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'multi_class': ['crammer_singer', 'ovr']
}

svc_grid = GridSearchCV(svc, parametros_svc, cv=5, n_jobs=-1)
svc_grid.fit(embeddings_treinamento, y_train)
previsoes_svc_best = svc_grid.predict(embeddings_teste)

print(classification_report(y_test, previsoes_svc_best))
print(svc_grid.best_params_)

              precision    recall  f1-score   support

       feliz       0.48      0.58      0.53        36
        medo       0.39      0.22      0.28        59
        nojo       0.37      0.39      0.38        49
       raiva       0.33      0.40      0.37        42
      triste       0.32      0.36      0.34        50

    accuracy                           0.37       236
   macro avg       0.38      0.39      0.38       236
weighted avg       0.37      0.37      0.37       236

{'C': 0.001, 'multi_class': 'ovr', 'penalty': 'l2'}


90 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 638, in _vali

In [27]:
svc = LinearSVC(C=100, random_state=0, dual=True, max_iter=10000)
parametros_svc = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.00001, 0.0001, 0.001],
    'multi_class': ['crammer_singer', 'ovr']
}

svc_grid = GridSearchCV(svc, parametros_svc, cv=5, n_jobs=-1)
svc_grid.fit(embeddings_treinamento, y_train)
previsoes_svc_best = svc_grid.predict(embeddings_teste)

print(classification_report(y_test, previsoes_svc_best))
print(svc_grid.best_params_)

              precision    recall  f1-score   support

       feliz       0.48      0.58      0.53        36
        medo       0.39      0.22      0.28        59
        nojo       0.37      0.39      0.38        49
       raiva       0.33      0.40      0.37        42
      triste       0.32      0.36      0.34        50

    accuracy                           0.37       236
   macro avg       0.38      0.39      0.38       236
weighted avg       0.37      0.37      0.37       236

{'C': 0.001, 'multi_class': 'ovr', 'penalty': 'l2'}


45 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\sefaz\Documents\Projetos Python\Analise-de-sentimentos-pandemia-covid19\tweets_venv\Lib\site-packages\sklearn\base.py", line 638, in _valid