In [1]:
# Bibliotecas basicas
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from gensim.models import doc2vec
from xgboost import XGBClassifier

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer

# Make the project root the working directory so that the `py_scripts` package and the
# data files resolve. The move is guarded: an unconditional os.chdir("../") climbs one
# more level every time the cell is re-run, which then breaks the imports below.
# Assumes `py_scripts` sits one level above the notebook's folder — TODO confirm.
if not os.path.isdir("py_scripts"):
    os.chdir("../")

# Show the resulting directory (a bare os.getcwd() in the middle of a cell displays nothing)
print(os.getcwd())

# Project helper scripts
from py_scripts.test_models_plot_roc_auc_curve import test_models_plot_roc_auc_curve
from py_scripts.preprocessing import preprocessing
from py_scripts.read_corpus import read_corpus

In [2]:
# `df` holds the tweets to be classified for the submission (no `sentiment` column);
# `df_train` holds the labelled training tweets.
# NOTE(review): both paths are relative to the working directory set in the first cell.
df = pd.read_csv('../mario_cesa_780_projeto_2/data/Subm3Classes.csv') # Read the data from CSV
df_train = pd.read_csv('../mario_cesa_780_projeto_2/data/Train3Classes.csv')

In [3]:
# Inspect the columns, dtypes and non-null counts of the submission dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          5000 non-null   int64 
 1   tweet_text  5000 non-null   object
 2   tweet_date  5000 non-null   object
 3   query_used  5000 non-null   object
dtypes: int64(1), object(3)
memory usage: 156.4+ KB


In [4]:
# Inspect the columns, dtypes and non-null counts of the training dataframe
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95000 entries, 0 to 94999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          95000 non-null  int64 
 1   tweet_text  95000 non-null  object
 2   tweet_date  95000 non-null  object
 3   sentiment   95000 non-null  int64 
 4   query_used  95000 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.6+ MB


In [5]:
# Row counts per sentiment label: the three classes are balanced
df_train.groupby('sentiment').count()

Unnamed: 0_level_0,id,tweet_text,tweet_date,query_used
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,31696,31696,31696,31696
1,31678,31678,31678,31678
2,31626,31626,31626,31626


In [6]:
# Look at one raw tweet before any cleaning
df_train.tweet_text[2]

'Para Theresa May, seu plano para o Brexit é a única opção https://t.co/epl39YD9bj'

In [7]:
# Create the stemmer and lemmatizer that are handed to `preprocessing` below.
# NOTE(review): the sample tweets are Portuguese, while PorterStemmer and WordNetLemmatizer
# are English-oriented — consider a Portuguese stemmer (e.g. nltk's RSLPStemmer); confirm intent.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [8]:
# Run the project `preprocessing` helper on every training tweet; judging by the sample
# output below, it yields a list of cleaned tokens per tweet.
def _clean_tweet(text):
    """Apply `preprocessing` to one tweet with the shared lemmatizer and stemmer."""
    return preprocessing(text, lemmatizer, stemmer)

df_train["filtered_words"] = df_train['tweet_text'].apply(_clean_tweet)

In [9]:
# Check the filtered tokens produced for the same sample tweet
df_train.filtered_words[2]

['theresa', 'may', 'plano', 'brexit', 'unica', 'opcao']

In [10]:
# Rebuild one space-separated string per tweet (the text vectorizers expect raw strings)
df_train['join_words'] = df_train['filtered_words'].apply(' '.join)

In [11]:
# Check the re-joined strings
df_train['join_words']

0                   rio eleg maior bancada polici historia
1                         fiquei tao trist vi preco camera
2                     theresa may plano brexit unica opcao
3              caralho quero proteg danielli pote tadinhaa
4                                       sicaetano viva cao
                               ...                        
94995    cuba defensor direito humano unem contra chefe...
94996    oportunidad venha fazer part equip vaga aberta...
94997    syoo sei significa to feliz demai amo aqui pra...
94998                         louistsexh n conheco posta d
94999                                                  deu
Name: join_words, Length: 95000, dtype: object

In [12]:
# Down-sample the training data: the full set caused memory problems, so only a fixed
# fraction of each sentiment class is kept (seeded, so the sample is repeatable).
percent = 0.2
class_samples = []
for label in (0, 1, 2):
    class_rows = df_train[df_train.sentiment == label]
    class_samples.append(
        class_rows.sample(n=int(class_rows.shape[0] * percent), random_state=42)
    )
df0, df1, df2 = class_samples

df_train_sample = pd.concat(class_samples)

In [13]:
# Give the sample a fresh 0..n-1 index, keeping the original row labels in an `index` column.
# Guarded and non-in-place so re-running the cell does not pile up `index`/`level_0` columns.
if 'index' not in df_train_sample.columns:
    df_train_sample = df_train_sample.reset_index()

In [14]:
# Separate the features and the target
X = df_train_sample['join_words']
X2 = df_train_sample['filtered_words'] # Doc2Vec needs a list of tokens per document, so the un-joined column is used
y = df_train_sample['sentiment']

In [15]:
# Train/test split (70/30, seeded).
# The joined strings (X), the token lists (X2) and the labels are split in ONE call so the
# BoW/TF-IDF partition and the Doc2Vec partition contain the same rows by construction,
# instead of relying on two separate calls happening to share the same seed.
X_train, X_test, X_train2, X_test2, y_train, y_test = train_test_split(
    X,
    X2,
    y,
    test_size=0.3,
    random_state=42,
)

# Label names used by the Doc2Vec cells; they are the same labels as the split above
y_train2, y_test2 = y_train, y_test


In [16]:
# Create the Bag-of-Words and TF-IDF vectorizers
counter = CountVectorizer()
tfidf = TfidfVectorizer(use_idf = True)

In [70]:
# Fit the BoW and TF-IDF vocabularies on the training split only, then project the test split.
# The matrices are kept in SciPy sparse form instead of being densified:
#   * a dense float64 matrix of the recorded training shape (13299 x 24093) needs roughly
#     2.5 GB per matrix, which is the likely source of the memory problems noted earlier;
#   * `.todense()` returns a numpy.matrix, which recent scikit-learn versions reject.
# LogisticRegression, RandomForestClassifier and XGBClassifier all accept sparse input.
# NOTE(review): XGBoost may treat unstored zeros of a sparse matrix as missing values, so
# its metrics can differ slightly from a dense run — confirm if exact parity matters.
X_train_bow = counter.fit_transform(X_train)
X_test_bow = counter.transform(X_test)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [53]:
# (documents, vocabulary size) of the TF-IDF training matrix
X_train_tfidf.shape

(13299, 24093)

In [18]:
# Build the Doc2Vec features.
# `read_corpus` is a project helper whose return type is not visible here. Its result is
# materialised with list() because Doc2Vec walks the corpus several times (build_vocab,
# then every training epoch); a one-shot generator would be exhausted after the first pass.
train_corpus = list(read_corpus(X_train2))
test_corpus = list(read_corpus(X_test2, tokens_only=True))

# Train the document-embedding model on the training split only
model_doc2vec = doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=20)
model_doc2vec.build_vocab(train_corpus)
model_doc2vec.train(
    train_corpus,
    total_examples=model_doc2vec.corpus_count,
    epochs=model_doc2vec.epochs,
)

# Infer one fixed-size vector per document from its token list
X_train_d2v = np.array([model_doc2vec.infer_vector(tokens) for tokens in X_train2])
X_test_d2v = np.array([model_doc2vec.infer_vector(tokens) for tokens in X_test2])

In [19]:
# Logistic regression on the Bag-of-Words features
model_logistic = LogisticRegression(max_iter=5000).fit(X_train_bow, y_train)
y_log_predict = model_logistic.predict(X_test_bow)

In [20]:
# Per-class precision/recall/F1 on the test split: logistic regression + BoW
print(classification_report(y_test, y_log_predict))

              precision    recall  f1-score   support

           0       0.70      0.73      0.72      1869
           1       0.69      0.68      0.68      1928
           2       0.94      0.92      0.93      1903

    accuracy                           0.77      5700
   macro avg       0.78      0.78      0.78      5700
weighted avg       0.78      0.77      0.78      5700



In [21]:
# Logistic regression on the TF-IDF features
model_logistic2 = LogisticRegression(max_iter=5000).fit(X_train_tfidf, y_train)
y_log_predict2 = model_logistic2.predict(X_test_tfidf)



In [22]:
# Per-class precision/recall/F1 on the test split: logistic regression + TF-IDF
print(classification_report(y_test, y_log_predict2))

              precision    recall  f1-score   support

           0       0.75      0.69      0.72      1869
           1       0.69      0.73      0.71      1928
           2       0.91      0.94      0.93      1903

    accuracy                           0.79      5700
   macro avg       0.79      0.79      0.78      5700
weighted avg       0.79      0.79      0.78      5700



In [23]:
# Logistic regression on the Doc2Vec document vectors
model_logistic3 = LogisticRegression(max_iter=5000).fit(X_train_d2v, y_train2)
y_log_predict3 = model_logistic3.predict(X_test_d2v)

In [24]:
# Per-class precision/recall/F1 on the test split: logistic regression + Doc2Vec
print(classification_report(y_test2, y_log_predict3))

              precision    recall  f1-score   support

           0       0.62      0.60      0.61      1869
           1       0.59      0.62      0.60      1928
           2       0.79      0.77      0.78      1903

    accuracy                           0.66      5700
   macro avg       0.66      0.66      0.66      5700
weighted avg       0.66      0.66      0.66      5700



In [25]:
# Random forest on the Bag-of-Words features.
# The forest is seeded (same seed as the sampling and split cells) so the reported
# metrics can be reproduced; an unseeded forest gives different numbers on every run.
model_random = RandomForestClassifier(random_state=42)
model_random.fit(X_train_bow, y_train)
y_random_predict = model_random.predict(X_test_bow)

In [26]:
# Per-class precision/recall/F1 on the test split: random forest + BoW
print(classification_report(y_test, y_random_predict))

              precision    recall  f1-score   support

           0       0.67      0.71      0.69      1869
           1       0.67      0.56      0.61      1928
           2       0.83      0.91      0.87      1903

    accuracy                           0.73      5700
   macro avg       0.72      0.73      0.72      5700
weighted avg       0.72      0.73      0.72      5700



In [27]:
# Random forest on the TF-IDF features.
# The forest is seeded (same seed as the sampling and split cells) so the reported
# metrics can be reproduced; an unseeded forest gives different numbers on every run.
model_random2 = RandomForestClassifier(random_state=42)
model_random2.fit(X_train_tfidf, y_train)
y_random_predict2 = model_random2.predict(X_test_tfidf)



In [28]:
# Per-class precision/recall/F1 on the test split: random forest + TF-IDF
print(classification_report(y_test, y_random_predict2))

              precision    recall  f1-score   support

           0       0.70      0.70      0.70      1869
           1       0.69      0.60      0.64      1928
           2       0.83      0.94      0.88      1903

    accuracy                           0.75      5700
   macro avg       0.74      0.75      0.74      5700
weighted avg       0.74      0.75      0.74      5700



In [29]:
# Random forest on the Doc2Vec document vectors.
# Uses y_train2, the labels produced together with X_train2 (the Doc2Vec split), to match
# the logistic-regression Doc2Vec cell and the y_test2 used in the report below.
# The forest is seeded so the reported metrics can be reproduced.
model_random3 = RandomForestClassifier(random_state=42)
model_random3.fit(X_train_d2v, y_train2)
y_random_predict3 = model_random3.predict(X_test_d2v)

In [30]:
# Per-class precision/recall/F1 on the test split: random forest + Doc2Vec
print(classification_report(y_test2, y_random_predict3))

              precision    recall  f1-score   support

           0       0.62      0.66      0.64      1869
           1       0.60      0.59      0.59      1928
           2       0.85      0.80      0.82      1903

    accuracy                           0.68      5700
   macro avg       0.69      0.68      0.69      5700
weighted avg       0.69      0.68      0.69      5700



In [31]:
# XGBoost (default hyper-parameters) on the Bag-of-Words features
model_xgboost = XGBClassifier().fit(X_train_bow, y_train)
y_xg_predict = model_xgboost.predict(X_test_bow)





In [32]:
# Per-class precision/recall/F1 on the test split: XGBoost + BoW
print(classification_report(y_test, y_xg_predict))

              precision    recall  f1-score   support

           0       0.74      0.61      0.67      1869
           1       0.61      0.76      0.67      1928
           2       0.92      0.84      0.88      1903

    accuracy                           0.74      5700
   macro avg       0.76      0.74      0.74      5700
weighted avg       0.76      0.74      0.74      5700



In [33]:
# XGBoost (default hyper-parameters) on the TF-IDF features
model_xgboost2 = XGBClassifier().fit(X_train_tfidf, y_train)
y_xg_predict2 = model_xgboost2.predict(X_test_tfidf)





In [34]:
# Per-class precision/recall/F1 on the test split: XGBoost + TF-IDF
print(classification_report(y_test, y_xg_predict2))

              precision    recall  f1-score   support

           0       0.73      0.64      0.68      1869
           1       0.62      0.74      0.67      1928
           2       0.94      0.85      0.89      1903

    accuracy                           0.74      5700
   macro avg       0.76      0.74      0.75      5700
weighted avg       0.76      0.74      0.75      5700



In [35]:
# XGBoost (default hyper-parameters) on the Doc2Vec document vectors.
# Uses y_train2, the labels produced together with X_train2 (the Doc2Vec split), to match
# the logistic-regression Doc2Vec cell and the y_test2 used in the report below.
model_xgboost3 = XGBClassifier()
model_xgboost3.fit(X_train_d2v, y_train2)
y_xg_predict3 = model_xgboost3.predict(X_test_d2v)





In [36]:
# Per-class precision/recall/F1 on the test split: XGBoost + Doc2Vec
print(classification_report(y_test2, y_xg_predict3))

              precision    recall  f1-score   support

           0       0.62      0.65      0.64      1869
           1       0.59      0.60      0.60      1928
           2       0.87      0.82      0.85      1903

    accuracy                           0.69      5700
   macro avg       0.70      0.69      0.69      5700
weighted avg       0.70      0.69      0.69      5700



In [39]:
# Summary of the experiment.
# The table is regenerated from the TF-IDF logistic-regression predictions instead of
# being pasted in by hand, so it cannot drift from the real metrics after a re-run.
# Also fixes the misspelling "regrassão" in the message.
print('O melhor desempenho obtido foi em regressão logística utilizando TFIDF, como demonstrado abaixo:\n')
print(classification_report(y_test, y_log_predict2))
print('Não foram usados otimizadores como RandomSearch por demorarem muito a execução e não mostrarem ganho satisfatório')




O melhor desempenho obtido foi em regrassão logística utilizando TFIDF, como demonstrado abaixo:
 
    
    precision    recall  f1-score   support

           0       0.75      0.69      0.72      1869
           1       0.69      0.73      0.71      1928
           2       0.91      0.94      0.93      1903

    accuracy                           0.79      5700
   macro avg       0.79      0.79      0.78      5700
weighted avg       0.79      0.79      0.78      5700




In [40]:
# Clean the submission tweets with the same `preprocessing` helper, lemmatizer and stemmer
# that were used for the training tweets
df["filtered_words"] = df['tweet_text'].map(
    lambda tweet: preprocessing(tweet, lemmatizer, stemmer)
)

In [41]:
# Check the filtered tokens of one submission tweet
df.filtered_words[2]

['analogica',
 'correio',
 'espera',
 'd',
 'so',
 'falta',
 'receb',
 'dua',
 'lent',
 'comecar',
 'revelar',
 'casa']

In [42]:
# Rebuild one space-separated string per submission tweet
df['join_words'] = df['filtered_words'].apply(' '.join)

In [43]:
# Check the re-joined submission strings
df['join_words']

0       apartamento vila mariana praca monteiro santo ...
1       fallenc brasilgameshow quero x scout dizem dou...
2       analogica correio espera d so falta receb dua ...
3       festa poss president stf toffoli canta legiao ...
4       thethiagor jubsilva gscisa grupomulheri flavia...
                              ...                        
4995    nao nada demai apena verdad oh pronto obrigada...
4996     veja fato fake entrevista anthoni garotinho rjtv
4997                nattvieiira queria ver sai causa luca
4998    assassin s creed origin enfrentando irmao ze r...
4999    gnt conversava paramo acho mt engracado cara k...
Name: join_words, Length: 5000, dtype: object

In [44]:
# Select the tweets to be classified
X_sub = df['join_words']

In [72]:
# Project the submission tweets with the already-fitted TF-IDF vectorizer (transform only, no re-fit).
# Despite the "train" in its name, this variable holds the submission features.
X_sub_train_tfidf = tfidf.transform(X_sub)

In [74]:
# Classify the submission tweets with the best model (logistic regression on TF-IDF)
y_log_predict_sub = model_logistic2.predict(X_sub_train_tfidf)

In [75]:
# Predicted sentiment labels for the submission tweets
y_log_predict_sub

array([2, 0, 1, ..., 0, 2, 0], dtype=int64)

In [76]:
# Store the predictions in the submission dataframe
df['sentiment'] = y_log_predict_sub

In [77]:
# Check the new `sentiment` column
df['sentiment']

0       2
1       0
2       1
3       2
4       1
       ..
4995    1
4996    2
4997    0
4998    2
4999    0
Name: sentiment, Length: 5000, dtype: int64

In [78]:
# Save the classified submission dataframe as CSV.
# index=False: the frame already carries its own `id` column, so the positional index would
# only add a spurious unnamed first column to the file.
# NOTE(review): the helper columns `filtered_words` and `join_words` are written as well —
# confirm the expected submission format.
df.to_csv('Mario_Cesa_780_projeto_2_submissao.csv', index=False)