## Análise de Sentimento - Tweets de compania aérea

In [2]:
## Importando as dependências do python
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from sklearn import svm

In [3]:
## Funções para realizaçaõ da limpeza dos tweets E
def stemmer(df):
    # https://www.datacamp.com/community/tutorials/stemming-lemmatization-python?utm_source=adwords_ppc&utm_campaignid=1455363063&utm_adgroupid=65083631748&utm_device=c&utm_keyword=&utm_matchtype=b&utm_network=g&utm_adpostion=&utm_creative=332602034358&utm_targetid=aud-299261629574:dsa-429603003980&utm_loc_interest_ms=&utm_loc_physical_ms=1001773&gclid=Cj0KCQjws-OEBhCkARIsAPhOkIaz_Gl4LR3zdQUBErnFXQNyFuad-t0PO-0q2KsTqKRgqSNQilO19TcaAgcmEALw_wcB
    # http://www.nltk.org/howto/portuguese_en.html
    allstopwords = stopwords.words('english')
    allstopwords.remove('not')
    stemmer = RSLPStemmer()
    ntxt = []
    for i in df.split():
        if not i in set(allstopwords):
            ntxt += [stemmer.stem(i)]
    return " ".join(ntxt)

def remove_symbols(df):
    # removes links
    # https://stackoverflow.com/questions/6718633/python-regular-expression-again-match-url
    df = re.sub(r"[^@#\w\s]", "", df)
    #doesnt remove @, #
    df = re.sub(r"((http | https)\: \/\/)?[a-zA-Z0-9\.\/\?\: @\-_=  # ]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
                "", df)

    return df

def limpa_tudo(df):
    df = remove_symbols(df)
    df = stemmer(df)
    df = df.lower()
    return df

Preparando a base de dados para execução das análises

In [4]:
# Lendo o database
dataset = pd.read_csv('Tweets.csv')
dataset = pd.DataFrame(dataset)
tweets = dataset[['airline_sentiment','text']]
yds = dataset['airline_sentiment']

y = []
for i in range(0,4000):
    if yds[i] == "positive":
        y.append(1)
    elif yds[i] == "neutral":
        y.append(0)
    else:
        y.append(-1)
y = np.array(y)

x = []
for i in range(0, 4000):
    tweet = tweets['text'][i]
    tweet = limpa_tudo(tweet)
    x.append(tweet)

# Dividindo a base entre base de treinamento e teste
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=25)

Análise exploratória do database

Vetoriando o dataset em Bag of Words e em TFIDF

In [8]:
countVec = CountVectorizer(lowercase=True).fit(x_train)
tfidfVec = TfidfVectorizer(use_idf=True, smooth_idf=True).fit(x_train)

x_train_vec_bow = countVec.transform(x_train).toarray()
x_train_vec_bow = np.array(x_train_vec_bow)
x_test_vec_bow = countVec.transform(x_test).toarray()

x_train_vec_tfidf = tfidfVec.transform(x_train)
x_test_vec_tfidf = tfidfVec.transform(x_test)

Criando resultados com o modelos Naive Bayes

In [10]:
nb = MultinomialNB(alpha=alpha)

nb.fit(x_train_vec_tfidf, y_train)
y_pred_tfidf = nb.predict(x_test_vec_tfidf)

nb.fit(x_train_vec_bow, y_train)
y_pred_bow = nb.predict(x_test_vec_bow)

score = accuracy_score(y_test, y_pred_tfidf)
score_bow = accuracy_score(y_test, y_pred_bow)

print("Score NB with alpha {0} and TfIdF is {1}".format(alpha, score))
print("Score NB with alpha {0} and BOW is {1}".format(alpha, score_bow))

Score NB with alpha 1 and TfIdF is 0.6775
Score NB with alpha 1 and BOW is 0.725


In [11]:
alphas = [0.01, 0.1, 0.5, 1, 1.5, 10]

for alpha in alphas:
    nb = MultinomialNB(alpha=alpha)

    nb.fit(x_train_vec_tfidf, y_train)
    y_pred_tfidf = nb.predict(x_test_vec_tfidf)

    nb.fit(x_train_vec_bow, y_train)
    y_pred_bow = nb.predict(x_test_vec_bow)

    score = accuracy_score(y_test, y_pred_tfidf)
    score_bow = accuracy_score(y_test, y_pred_bow)

    print("Score NB with alpha {0} and TfIdF is {1}".format(alpha, score))
    print("Score NB with alpha {0} and BOW is {1}".format(alpha, score_bow))

Score NB with alpha 0.01 and TfIdF is 0.7225
Score NB with alpha 0.01 and BOW is 0.73125
Score NB with alpha 0.1 and TfIdF is 0.72625
Score NB with alpha 0.1 and BOW is 0.74
Score NB with alpha 0.5 and TfIdF is 0.69625
Score NB with alpha 0.5 and BOW is 0.75
Score NB with alpha 1 and TfIdF is 0.6775
Score NB with alpha 1 and BOW is 0.725
Score NB with alpha 1.5 and TfIdF is 0.66625
Score NB with alpha 1.5 and BOW is 0.71375
Score NB with alpha 10 and TfIdF is 0.66
Score NB with alpha 10 and BOW is 0.66375


Criando resultados com o modelo de regressão logística

In [16]:
clf = LogisticRegression(random_state=0,solver='lbfgs')
clf.fit(x_train_vec_tfidf, y_train)
y_pred_logistics = clf.predict(x_test_vec_tfidf)

clf.fit(x_train_vec_bow, y_train)
y_pred_logistics_bow = clf.predict(x_test_vec_bow)

score_tfidf = accuracy_score(y_test, y_pred_logistics)
score_bow = accuracy_score(y_test, y_pred_logistics_bow)

print("Score with TfIdf is {0}".format(score_tfidf))
print("Score with BOW is {0}".format(score_bow))

Score with TfIdf is 0.76625
Score with BOW is 0.75625


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Criando resultados com SVM

In [26]:
svc = svm.SVC()
svc.fit(x_train_vec_bow, y_train)
y_pred_svc_bow = svc.predict(x_test_vec_bow)

svc.fit(x_train_vec_tfidf, y_train)
y_pred_svc_tfidf = svc.predict(x_test_vec_tfidf)

score_tfidf = accuracy_score(y_test, y_pred_svc_bow)
score_bow = accuracy_score(y_test, y_pred_svc_tfidf)

print("Score with TfIdf is {0}".format(score_tfidf))
print("Score with BOW is {0}".format(score_bow))

Score with TfIdf is 0.76
Score with BOW is 0.7525


Análise dos resultados

In [17]:
#logistic regression vs naivebayes vs svC

#naive bayes da overfitting checar erro de treino parametro alpha
#logistic regreersion parametro l1