In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

import time

from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

#### Busca dataset

In [2]:
dataset  = pd.read_csv('treated_dataset.csv')

In [3]:
len(dataset)

497247

In [4]:
dataset.sample(5)

Unnamed: 0,tweet_text,sentiment
490646,vou tent post hoj nen jur to tod enrol faculda...,0
449026,luiz deix soft :(,0
124070,jurooooo bue mental abert que qd mand pi and p...,1
410828,irma agr vai volt viag vou dorm cm filh :( tao...,0
332387,ti :(,0


### Separa os dados

In [5]:
# dados para o cross validate
x = dataset.tweet_text
y = dataset.sentiment

# dados para treino e teste
treino, teste, classe_treino, classe_teste = train_test_split(
    dataset,
    dataset.sentiment,
    test_size= 0.2,
    random_state = 0)

### Aplicando o TF-IDF e o 2-grams

In [6]:
tfidf = TfidfVectorizer(lowercase = False , 
                        ngram_range = (1,2),
                        analyzer = 'word',
                        token_pattern = "([\w']+|:\(|:-\(|:-\)|:\)|:\/)",
                        max_features = 250)

# to string
treino = treino.tweet_text.apply(lambda tweet: np.str_(tweet))
teste = teste.tweet_text.apply(lambda tweet: np.str_(tweet))

x = x.apply(lambda tweet: np.str_(tweet))
y = y.apply(lambda tweet: np.str_(tweet))

tfidf.fit(x)

treino_vectorized = tfidf.transform(treino)
teste_vectorized = tfidf.transform(teste)

x_vectorized = tfidf.transform(x)

In [7]:
dense = pd.DataFrame(
    treino_vectorized.todense(),
    columns = tfidf.get_feature_names()
)
dense.head(5)

Unnamed: 0,:(,:),:-(,:-),abrac,acab,ach,acontec,acord,acredit,...,vez,vi,vid,vide,vir,viv,volt,vontad,vot,vou
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.091386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.379149
2,0.0,0.194459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.124087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.290749
4,0.0,0.252699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
print(tfidf.get_feature_names())

[':(', ':)', ':-(', ':-)', 'abrac', 'acab', 'ach', 'acontec', 'acord', 'acredit', 'ador', 'agor', 'agr', 'ah', 'ai', 'aind', 'ajud', 'alg', 'algu', 'algum', 'am', 'amanh', 'amig', 'amo', 'and', 'anj', 'ano', 'ant', 'aqu', 'assim', 'assist', 'aul', 'beb', 'bem', 'boa', 'bom', 'bom dia', 'bonit', 'brasil', 'cabel', 'cad', 'car', 'cas', 'cert', 'cham', 'cheg', 'chor', 'cmg', 'cois', 'com', 'comec', 'comig', 'compr', 'conhec', 'consegu', 'consig', 'cont', 'continu', 'convers', 'coraca', 'curt', 'da', 'dar', 'deix', 'demal', 'desculp', 'dess', 'deu', 'dev', 'dia', 'diss', 'diz', 'dm', 'doi', 'dor', 'dorm', 'eh', 'enta', 'entend', 'entr', 'escol', 'esper', 'esquec', 'est', 'estud', 'fac', 'fal', 'falt', 'favor', 'faz', 'feliz', 'fic', 'final', 'fiq', 'fiz', 'fod', 'fof', 'fot', 'ganh', 'gent', 'gost', 'gt', 'gt :(', 'hj', 'hoj', 'hor', 'ia', 'import', 'infeliz', 'ir', 'ja', 'jog', 'la', 'legal', 'lembr', 'lev', 'lind', 'livr', 'log', 'lt', 'mae', 'mal', 'man', 'mand', 'mat', 'med', 'mei', 'm

### Funções para auxiliar na classificação

In [9]:
def crossvalidate(modelo):
    cv = StratifiedKFold( n_splits = 5 , shuffle = True)

    results = cross_validate(modelo , 
                             x_vectorized.toarray(),
                             y, 
                             cv = cv , 
                             return_train_score = True)
    media = results['test_score'].mean()
    desvio_padrao = results['test_score'].std()
    print("Accuracy mean CV: %f%%" % (media * 100))
    print("Accuracy CV[%f%%, %f%%]" % 
          ((media - 2 *desvio_padrao) * 100, 
           (media + 2 * desvio_padrao) * 100))
    
def treino_teste(modelo):
    start = time.time()
    modelo.fit(treino_vectorized,classe_treino)
    end = time.time()
    fit_time = end - start
    print('[Fit time : %f segundos]' % (fit_time))

    predict = modelo.predict(teste_vectorized)
    acc_score = accuracy_score(classe_teste, predict)
    print("[Acc score %f%%]\n\n" % (acc_score*100))
    
    print("Confusion Matrix")
    print(confusion_matrix(classe_teste, predict))
    print('\n\n')
    print(classification_report(classe_teste, predict))

## Regressão Logística

In [10]:
modelo = LogisticRegression(solver = 'lbfgs' , C = 10 ,max_iter = 250)
crossvalidate(modelo)

Accuracy mean CV: 99.810959%
Accuracy CV[99.782776%, 99.839142%]


In [11]:
treino_teste(modelo)

[Fit time : 3.991187 segundos]
[Acc score 99.816993%]


Confusion Matrix
[[49602   141]
 [   41 49666]]



              precision    recall  f1-score   support

           0       1.00      1.00      1.00     49743
           1       1.00      1.00      1.00     49707

    accuracy                           1.00     99450
   macro avg       1.00      1.00      1.00     99450
weighted avg       1.00      1.00      1.00     99450



#### Mostra os maiores pesos ( sentimento positivo)

In [12]:
weight = pd.DataFrame(
    # peso de cada termo
    modelo.coef_[0].T,
    index = tfidf.get_feature_names()
) 
# 0 = coluna do Dataframe
weight.nlargest(10,0)

Unnamed: 0,0
:),54.249605
:-),8.046323
parab,1.473287
bom,0.793572
val,0.534693
boa,0.530428
obrig,0.506088
quis,0.465115
feliz,0.443555
and,0.359272


#### Mostra os menores pesos ( sentimento negativo)

In [13]:
# 0 = coluna do Dataframe
weight.nsmallest(10,0)

Unnamed: 0,0
:(,-68.491277
:-(,-18.05586
trist,-4.490042
obrig :),-3.445602
amo,-3.331282
pox,-3.286808
saudad,-3.278211
quer,-2.877087
sint,-2.840461
sdd,-2.810356


## SVM

In [15]:
modelo = LinearSVC(random_state=0, tol=1e-5)
crossvalidate(modelo)

Accuracy mean CV: 99.808948%
Accuracy CV[99.788243%, 99.829654%]


In [18]:
treino_teste(modelo)

[Fit time : 1.836426 segundos]
[Acc score 99.815988%]


Confusion Matrix
[[49602   141]
 [   42 49665]]



              precision    recall  f1-score   support

           0       1.00      1.00      1.00     49743
           1       1.00      1.00      1.00     49707

    accuracy                           1.00     99450
   macro avg       1.00      1.00      1.00     99450
weighted avg       1.00      1.00      1.00     99450



#### Mostra os maiores pesos ( sentimento positivo)

In [19]:
weight = pd.DataFrame(
    # peso de cada termo
    modelo.coef_[0].T,
    index = tfidf.get_feature_names()
) 
# 0 = coluna do Dataframe
weight.nlargest(10,0)

Unnamed: 0,0
:),11.899504
:-),1.549974
parab,0.103544
bom,-0.00811
boa,-0.055013
quis,-0.055478
feliz,-0.060434
val,-0.063477
obrig,-0.070316
bom dia,-0.086478


#### Mostra os menores pesos ( sentimento negativo)

In [20]:
# 0 = coluna do Dataframe
weight.nsmallest(10,0)

Unnamed: 0,0
:(,-14.451294
:-(,-4.416612
trist,-0.937906
obrig :),-0.903237
amo,-0.749568
infeliz,-0.724233
pox,-0.681379
saudad,-0.661827
anj,-0.654989
sint,-0.645004


## Decision Tree Classifier

In [21]:
modelo = DecisionTreeClassifier(random_state=0,max_depth = 10)
crossvalidate(modelo)

Accuracy mean CV: 99.809350%
Accuracy CV[99.792449%, 99.826251%]


In [22]:
treino_teste(modelo)

[Fit time : 3.140998 segundos]
[Acc score 99.815988%]


Confusion Matrix
[[49609   134]
 [   49 49658]]



              precision    recall  f1-score   support

           0       1.00      1.00      1.00     49743
           1       1.00      1.00      1.00     49707

    accuracy                           1.00     99450
   macro avg       1.00      1.00      1.00     99450
weighted avg       1.00      1.00      1.00     99450



## Random Forest Classifier

In [23]:
modelo = RandomForestClassifier(n_estimators=100, max_depth=10,random_state=0)
crossvalidate(modelo)

Accuracy mean CV: 99.802714%
Accuracy CV[99.785773%, 99.819654%]


In [24]:
treino_teste(modelo)

[Fit time : 38.894793 segundos]
[Acc score 99.810960%]


Confusion Matrix
[[49601   142]
 [   46 49661]]



              precision    recall  f1-score   support

           0       1.00      1.00      1.00     49743
           1       1.00      1.00      1.00     49707

    accuracy                           1.00     99450
   macro avg       1.00      1.00      1.00     99450
weighted avg       1.00      1.00      1.00     99450

