In [38]:
import nltk
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import cross_val_predict

In [120]:
tweets = pd.read_csv("tweets.csv", encoding='utf-8')
tweets.shape

(25911, 4)

In [122]:
tweets.dropna(inplace=True)
tweets.shape

(25905, 4)

In [287]:
def remove_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('portuguese'))
    words = [i for i in text.split() if not i in stopwords]
    return (" ".join(words))

def remove_links(text):
    return re.sub(r"http\S+", "", text)

def remove_mentions(text):
    return re.sub(r"@\w+", "", text)

def remove_retweets(text):
    return re.sub(r"rt\s", "", text)

def remove_special_chars(text):
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r"$\d+\W+|\b\d+\b|\W+\d+$", "", text)
    text_with_no_special_chars = re.sub("\s+", " ", text)
    return text_with_no_special_chars

def stemming(text):
    stemmer = nltk.stem.RSLPStemmer()
    words = []
    for word in text.split():
        words.append(stemmer.stem(word))
    return (" ".join(words))

def standardize_text(text):
    text = text.lower()
    text = remove_links(text)
    text = remove_mentions(text)
    text = remove_retweets(text)
    text = remove_stopwords(text)
    text = remove_special_chars(text)
    #text = stemming(text)
    return text
def sentiment_to_name(n):
    if n == 0:
        n = 'Negativo'
    elif n == 1:
        n = 'Positivo'
    return n

def name_to_sentiment(n):
    if n == 'Negativo':
        n = "0"
    elif n == 'Positivo':
        n = "1"
    return n

In [124]:
tweets.text = tweets.text.apply(standardize_text)
tweets.sentiment = tweets.sentiment.apply(sentiment_to_name)
tweets.head()

Unnamed: 0.1,Unnamed: 0,id,text,sentiment
0,7896,7895,hoje vídeo abrindo emp,Positivo
1,7967,7965,passar tarde estudar olha q bonito,Positivo
2,3611,3610,tava gaja torneio ai filha puta ainda tentei a...,Positivo
3,9612,1139,chateadissima eliminação red canids diria sent...,Negativo
4,15859,7386,tô pensando fazer colinha professora é esperta,Negativo


In [125]:
tweets_text = tweets["text"].values
classes = tweets['sentiment'].values

In [126]:
len(tweets_text)

25905

In [208]:
vectorizer = CountVectorizer(ngram_range = (1, 2))
freq_tweets = vectorizer.fit_transform(tweets_text)

modelo = MultinomialNB()
modelo.fit(freq_tweets, classes)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [209]:
testes = ["bom",
          "ruim",
          "Odeio a politica brasileira",
          "final de período tá complicado",
          "Amizade",
          "link ruim"]

freq_testes = vectorizer.transform(testes)
modelo.predict(freq_testes)

array(['Positivo', 'Negativo', 'Negativo', 'Negativo', 'Negativo',
       'Negativo'], dtype='<U8')

In [210]:
resultados = cross_val_predict(modelo, freq_tweets, classes, cv = 10)
resultados

array(['Positivo', 'Positivo', 'Negativo', ..., 'Negativo', 'Positivo',
       'Negativo'], dtype='<U8')

In [131]:
metrics.accuracy_score(classes, resultados)

0.7695811619378499

In [132]:
sentimentos = ["Positivo", "Negativo"]
print(metrics.classification_report(classes, resultados, sentimentos))

             precision    recall  f1-score   support

   Positivo       0.69      0.57      0.62      8703
   Negativo       0.80      0.87      0.83     17202

avg / total       0.76      0.77      0.76     25905



In [134]:
freq = vectorizer.transform(tweets.text)
predict_tweets = modelo.predict(freq)

In [136]:
tweets['predicted'] = predict_tweets
tweets.head()

Unnamed: 0.1,Unnamed: 0,id,text,sentiment,predicted
0,7896,7895,hoje vídeo abrindo emp,Positivo,Positivo
1,7967,7965,passar tarde estudar olha q bonito,Positivo,Positivo
2,3611,3610,tava gaja torneio ai filha puta ainda tentei a...,Positivo,Positivo
3,9612,1139,chateadissima eliminação red canids diria sent...,Negativo,Negativo
4,15859,7386,tô pensando fazer colinha professora é esperta,Negativo,Negativo


In [138]:
not_match = tweets[tweets["sentiment"] != tweets["predicted"]]
not_match.shape

(1402, 5)

In [140]:
match = tweets[tweets["sentiment"] == tweets["predicted"]]
match.shape

(24503, 5)

In [250]:
marielle_tweets = pd.read_csv("marielle_tweets.csv", encoding='utf-8').dropna(subset=['text'])
marielle_tweets.head()

Unnamed: 0,date,text,sentiment
0,2018-07-30 23:40:15,"RT @joaoassi2: #mariellepresente, não só quand...",1.0
1,2018-07-30 23:31:31,"RT @joaoassi2: #mariellepresente, não só quand...",1.0
2,2018-07-30 23:30:52,"#mariellepresente, não só quando convém! https...",1.0
3,2018-07-30 23:14:22,RT @abdalafarah: Hoje Marielle Franco completa...,1.0
4,2018-07-30 23:12:12,RT @alinecpiva: 29 congressistas estadunidense...,


In [251]:
marielle_tweets.text = marielle_tweets.text.apply(standardize_text)

In [255]:
marielle_tweets

Unnamed: 0,date,text,sentiment
0,2018-07-30 23:40:15,mariellepresente convém,1
1,2018-07-30 23:31:31,mariellepresente convém,1
2,2018-07-30 23:30:52,mariellepresente convém,1
3,2018-07-30 23:14:22,hoje marielle franco completaria anos idade v...,1
4,2018-07-30 23:12:12,congressistas estadunidenses incluindo enviar...,
5,2018-07-30 23:12:04,dias passaram matou marielle mandou matar mar...,1
6,2018-07-30 22:28:50,dias meses dias dias marielle franco matou ma...,1
7,2018-07-30 22:09:14,cadê bocas gamela ficavam gritando mariellepre...,0
8,2018-07-30 20:00:48,hoje marielle completaria anos idade pergunta...,1
9,2018-07-30 19:45:30,dias meses dias dias marielle franco matou ma...,1


In [271]:
marielle_tweets_freq = vectorizer.transform(marielle_tweets.text)

marielle_predict = modelo.predict(marielle_tweets_freq)

In [272]:
marielle_tweets['predicted'] = marielle_predict

In [288]:
marielle_t_class = marielle_tweets.copy()
marielle_t_class.dropna(inplace=True)
marielle_t_class.predicted = marielle_t_class.predicted.apply(name_to_sentiment)

In [294]:
marielle_class_freq = vectorizer.transform(marielle_t_class.text)


In [292]:
not_match = marielle_t_class[marielle_t_class["sentiment"] != marielle_t_class["predicted"]]
not_match.shape

(66, 4)

In [293]:
match = marielle_t_class[marielle_t_class["sentiment"] == marielle_t_class["predicted"]]
match.shape

(24, 4)

In [295]:
resultados = cross_val_predict(modelo, marielle_class_freq, marielle_t_class.sentiment, cv = 10)
metrics.accuracy_score(marielle_t_class.sentiment, resultados)



0.8222222222222222

In [299]:
sentimentos = ["1", "0"]
print(metrics.classification_report(marielle_t_class.sentiment, resultados, sentimentos))

             precision    recall  f1-score   support

          1       0.80      1.00      0.89        59
          0       0.92      0.48      0.63        25

avg / total       0.83      0.85      0.81        84

