### Notebook Goals
* O objetivo deste notebook é obter mensagens da rede social Twitter, para analisar o sentimento geral sobre um produto na lingua portuguesa (pt).
* Neste caso será considerado o produto 'livros'.
* Para esse cálculo será utilizado inteligência artifical com NLP.

#### NLP
* NLP (Natural Linguage Processing) explora habilidades computacionais para entender e classificar textos.

##### Data Prep

In [1]:
import tweepy
import pandas as pd

In [2]:
#Import text file keys 
txtKeys = open('ktw.txt', 'r').read().splitlines()
api_key = txtKeys[0]
api_key_secret = txtKeys[1]
api_token = txtKeys[2]
api_token_secret = txtKeys[3]

In [3]:
#Twitter API authentication 
auth = tweepy.OAuthHandler(consumer_key=api_key, consumer_secret=api_key_secret)
auth.set_access_token(api_token, api_token_secret)
api = tweepy.API(auth)

In [4]:
#API params
search_query = 'livro' + ' -filter:retweets'
tweet_amount = 10
tweets = tweepy.Cursor(api.search, q=search_query, lang='pt').items(tweet_amount)

In [5]:
#Tweets list
cols = set()
allowed_types = [str, int]
tw_data = []

for tw in tweets:
    tw_dict = dict(vars(tw))
    tw_keys = tw_dict.keys()
    filter_data = {}
    for k in tw_keys:
        try:
            k_type = type(tw_dict[k])
        except:
            k_type = None
        if k_type != None:
            if k_type in allowed_types:
                cols.add(k)
                filter_data[k] = tw_dict[k]
    tw_data.append(filter_data)

header_cols = list(cols)
#tw_data

In [6]:
#Tweets Dataframe
df = pd.DataFrame(tw_data, columns=header_cols)
#df.head()
df.columns

Index(['in_reply_to_status_id', 'lang', 'retweet_count', 'favorite_count',
       'in_reply_to_status_id_str', 'id_str', 'in_reply_to_user_id_str', 'id',
       'in_reply_to_user_id', 'in_reply_to_screen_name', 'source',
       'source_url', 'text'],
      dtype='object')

In [7]:
#Cleaning Data
df.drop(['in_reply_to_status_id_str', 'in_reply_to_screen_name',
         'in_reply_to_user_id', 'id_str', 'in_reply_to_status_id',
         'favorite_count', 'in_reply_to_user_id_str', 'lang', 'source',
         'source_url']
        , inplace=True
        , axis=1)

In [8]:
df = df[['id', 'text', 'retweet_count']]

In [9]:
#Sample by retweets count
df.sort_values('retweet_count', ascending=False).head()

Unnamed: 0,id,text,retweet_count
0,1391135563932131329,comprei um livro lindo,0
1,1391135557221236742,3 surtos diferentes com o mesmo livro,0
2,1391135543862439938,@zoyalinabot Eu faço quando o livro é pra vest...,0
3,1391135542633504768,Meu livro chegouuuuu,0
4,1391135540221710336,@zoyalinabot Uma vibe de quem ler livro e faz ...,0


##### NLP

In [10]:
import re #Regulary Expretions
import nltk #Natural Linguage Toolkit
from nltk import tokenize
from nltk import word_tokenize
#from nltk import sent_tokenize
from nltk.corpus import stopwords

#from nltk.tokenize import sent_tokenize, word_tokenize

In [11]:
#Regulary Expretions
def remove_re(expressions):
    tweets_cl = []
    for exp in expressions:
        f_expre = re.sub('@\S+', '', exp)
        s_expre = re.sub('https\S+', '', f_expre)
        final_expre = s_expre.lower().replace('.', '').replace(',', '').replace('-','').replace('\n', '').replace(')','').replace('(','').replace('#', '').replace('!', '').replace('?', '').replace(';', '').replace('[', '').replace(']', '') 
        tweets_cl.append(final_expre)
    return(tweets_cl)

df_list = list(df.text)
tweets_cl = remove_re(df_list)
#tweets_cl

In [12]:
#Tokenize
word_tokens = []
for tk in tweets_cl:
    #print(word_tokenize(tk, language='portuguese'))
    word_tokens.append(word_tokenize(tk, language='portuguese'))

In [13]:
#Stopwords
stopwords = set(nltk.corpus.stopwords.words('portuguese'))
#stopwords
token_filter = []
for wt in word_tokens:
    for wtk in wt:
        if wtk not in stopwords:
            token_filter.append(wtk)
        
    

In [14]:
#Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()