In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

import time

from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

#### Busca dataset

In [2]:
dataset  = pd.read_csv('treated_dataset.csv')

In [3]:
len(dataset)

497247

In [4]:
dataset.sample(5)

Unnamed: 0,tweet_text,sentiment
134149,chegamoss :) music qu dar replay lt,1
348418,quer companh :(,0
230409,bom diaa :),1
297434,hj pass mama rua chei gat gnt ate par tir mei ...,0
429932,mai car ti meuuu ohhhh :(,0


### Separa os dados

In [5]:
# dados para o cross validate
x = dataset.tweet_text
y = dataset.sentiment

# dados para treino e teste
treino, teste, classe_treino, classe_teste = train_test_split(
    dataset,
    dataset.sentiment,
    test_size= 0.2,
    random_state = 0)

### Aplicando o TF-IDF e o 2-grams

In [6]:
tfidf = TfidfVectorizer(lowercase = False , 
                        ngram_range = (1,2),
                        #analyzer = 'word',
                        #token_pattern = "([\w']+)",
                        max_features = 250)

# to string
treino = treino.tweet_text.apply(lambda tweet: np.str_(tweet))
teste = teste.tweet_text.apply(lambda tweet: np.str_(tweet))

x = x.apply(lambda tweet: np.str_(tweet))
y = y.apply(lambda tweet: np.str_(tweet))

tfidf.fit(x)

treino_vectorized = tfidf.transform(treino)
teste_vectorized = tfidf.transform(teste)

x_vectorized = tfidf.transform(x)

In [7]:
dense = pd.DataFrame(
    treino_vectorized.todense(),
    columns = tfidf.get_feature_names()
)
dense.head(5)

Unnamed: 0,abrac,acab,ach,acontec,acord,acredit,ador,agor,agr,ah,...,vez,vi,vid,vide,vir,viv,volt,vontad,vot,vou
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.380766
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293013
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
print(tfidf.get_feature_names())

['abrac', 'acab', 'ach', 'acontec', 'acord', 'acredit', 'ador', 'agor', 'agr', 'ah', 'ai', 'aind', 'ajud', 'alg', 'algu', 'algum', 'am', 'amanh', 'amig', 'amo', 'and', 'anj', 'ano', 'ant', 'aqu', 'assim', 'assist', 'aul', 'beb', 'bem', 'boa', 'bom', 'bom dia', 'bonit', 'brasil', 'cabel', 'cad', 'car', 'cas', 'cert', 'cham', 'cheg', 'chor', 'cmg', 'cois', 'com', 'comec', 'comig', 'compr', 'conhec', 'consegu', 'consig', 'cont', 'continu', 'convers', 'coraca', 'curt', 'da', 'dar', 'deix', 'demal', 'desculp', 'dess', 'deu', 'dev', 'dia', 'dificil', 'diss', 'diz', 'dm', 'doi', 'dor', 'dorm', 'eh', 'enta', 'entend', 'entr', 'es', 'escol', 'esper', 'esquec', 'est', 'estud', 'fac', 'fal', 'falt', 'favor', 'faz', 'feliz', 'fic', 'final', 'fiq', 'fiz', 'fod', 'fof', 'fot', 'ganh', 'gent', 'gost', 'gt', 'hj', 'hoj', 'hor', 'ia', 'import', 'infeliz', 'ir', 'ja', 'jog', 'la', 'legal', 'lembr', 'lev', 'lind', 'livr', 'log', 'lt', 'mae', 'mal', 'man', 'mand', 'mat', 'med', 'mei', 'melhor', 'menin', '

### Funções para auxiliar na classificação

In [9]:
def crossvalidate(modelo):
    cv = StratifiedKFold( n_splits = 5 , shuffle = True)

    results = cross_validate(modelo , 
                             x_vectorized.toarray(),
                             y, 
                             cv = cv , 
                             return_train_score = True)
    media = results['test_score'].mean()
    desvio_padrao = results['test_score'].std()
    print("Accuracy mean CV: %f%%" % (media * 100))
    print("Accuracy CV[%f%%, %f%%]" % 
          ((media - 2 *desvio_padrao) * 100, 
           (media + 2 * desvio_padrao) * 100))
    
def treino_teste(modelo):
    start = time.time()
    modelo.fit(treino_vectorized,classe_treino)
    end = time.time()
    fit_time = end - start
    print('[Fit time : %f segundos]' % (fit_time))

    predict = modelo.predict(teste_vectorized)
    acc_score = accuracy_score(classe_teste, predict)
    print("[Acc score %f%%]\n\n" % (acc_score*100))
    
    print("Confusion Matrix")
    print(confusion_matrix(classe_teste, predict))
    print('\n\n')
    print(classification_report(classe_teste, predict))

## Regressão Logística

In [10]:
modelo = LogisticRegression(solver = 'lbfgs' , C = 10 ,max_iter = 250)
crossvalidate(modelo)

Accuracy mean CV: 70.004042%
Accuracy CV[69.840419%, 70.167666%]


In [11]:
treino_teste(modelo)

[Fit time : 2.240968 segundos]
[Acc score 69.861237%]


Confusion Matrix
[[31304 18439]
 [11534 38173]]



              precision    recall  f1-score   support

           0       0.73      0.63      0.68     49743
           1       0.67      0.77      0.72     49707

    accuracy                           0.70     99450
   macro avg       0.70      0.70      0.70     99450
weighted avg       0.70      0.70      0.70     99450



#### Mostra os maiores pesos ( sentimento positivo)

In [12]:
weight = pd.DataFrame(
    # peso de cada termo
    modelo.coef_[0].T,
    index = tfidf.get_feature_names()
) 
# 0 = coluna do Dataframe
weight.nlargest(10,0)

Unnamed: 0,0
parab,3.183566
val,2.793439
feliz,2.766026
boa,2.379787
quis,2.357406
bom,2.256572
obrig,2.200787
ador,2.013864
legal,1.707412
segu,1.657933


#### Mostra os menores pesos ( sentimento negativo)

In [13]:
# 0 = coluna do Dataframe
weight.nsmallest(10,0)

Unnamed: 0,0
trist,-5.263269
pox,-4.435032
infeliz,-3.680914
saudad,-3.653627
sdd,-3.558819
quer,-3.039984
nen,-2.609216
ruim,-2.327017
gt,-2.295753
sint,-2.161114


## SVM

In [14]:
modelo = LinearSVC(random_state=0, tol=1e-5)
crossvalidate(modelo)

Accuracy mean CV: 69.947732%
Accuracy CV[69.552231%, 70.343234%]


In [15]:
treino_teste(modelo)

[Fit time : 5.920915 segundos]
[Acc score 69.802916%]


Confusion Matrix
[[31137 18606]
 [11425 38282]]



              precision    recall  f1-score   support

           0       0.73      0.63      0.67     49743
           1       0.67      0.77      0.72     49707

    accuracy                           0.70     99450
   macro avg       0.70      0.70      0.70     99450
weighted avg       0.70      0.70      0.70     99450



#### Mostra os maiores pesos ( sentimento positivo)

In [16]:
weight = pd.DataFrame(
    # peso de cada termo
    modelo.coef_[0].T,
    index = tfidf.get_feature_names()
) 
# 0 = coluna do Dataframe
weight.nlargest(10,0)

Unnamed: 0,0
parab,1.017633
feliz,1.005936
val,0.966555
quis,0.857515
boa,0.844133
bom,0.832062
obrig,0.759308
ador,0.736319
legal,0.66265
segu,0.63224


#### Mostra os menores pesos ( sentimento negativo)

In [17]:
# 0 = coluna do Dataframe
weight.nsmallest(10,0)

Unnamed: 0,0
trist,-1.771805
pox,-1.479291
infeliz,-1.346822
saudad,-1.188833
sdd,-1.174141
quer,-1.144307
nen,-0.973127
gt,-0.927424
ruim,-0.919588
sint,-0.867684


## Decision Tree Classifier

In [18]:
modelo = DecisionTreeClassifier(random_state=0,max_depth = 10)
crossvalidate(modelo)

Accuracy mean CV: 62.486652%
Accuracy CV[62.336566%, 62.636737%]


In [19]:
treino_teste(modelo)

[Fit time : 2.832539 segundos]
[Acc score 62.548014%]


Confusion Matrix
[[17146 32597]
 [ 4649 45058]]



              precision    recall  f1-score   support

           0       0.79      0.34      0.48     49743
           1       0.58      0.91      0.71     49707

    accuracy                           0.63     99450
   macro avg       0.68      0.63      0.59     99450
weighted avg       0.68      0.63      0.59     99450



## Random Forest Classifier

In [20]:
modelo = RandomForestClassifier(n_estimators=100, max_depth=10,random_state=0)
crossvalidate(modelo)

Accuracy mean CV: 65.871891%
Accuracy CV[65.757479%, 65.986303%]


In [21]:
treino_teste(modelo)

[Fit time : 16.566944 segundos]
[Acc score 65.832076%]


Confusion Matrix
[[22647 27096]
 [ 6884 42823]]



              precision    recall  f1-score   support

           0       0.77      0.46      0.57     49743
           1       0.61      0.86      0.72     49707

    accuracy                           0.66     99450
   macro avg       0.69      0.66      0.64     99450
weighted avg       0.69      0.66      0.64     99450

