# Coleta de dados do Twitter: Full-archive search

Documentação da API: https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all 

Endpoint URL: https://api.twitter.com/2/tweets/search/all


In [None]:
from dotenv import load_dotenv
load_dotenv()
import os
import pandas as pd
import requests
import json
import time


# Bearer token for the Twitter API, read from the environment
# (load_dotenv() has been called above).
auth_token = os.environ.get('AUTH_TOKEN')
if not auth_token:
    # Fail with a clear message instead of a TypeError on 'Bearer ' + None.
    raise RuntimeError('AUTH_TOKEN is not set in the environment')

header = {'Authorization': 'Bearer ' + auth_token}

# All query-string values are kept already percent-encoded because the URL
# is assembled by plain string concatenation below.
query = '%23cpidacovid%20-rt'  # query with the -rt filter, so retweets are not returned
start_time = '2020-02-29T00%3A00%3A00Z'
end_time = '2021-07-12T00%3A00%3A00Z'

max_results = '500'
next_token = ''

url_prefix = ('https://api.twitter.com/2/tweets/search/all?query=' + query
              + '&start_time=' + start_time
              + '&end_time=' + end_time
              + '&max_results=' + max_results)
url_suffix = '&expansions=author_id&tweet.fields=created_at'

# One DataFrame per result page; concatenated once at the end, because
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# every call.
pages = []
while True:
    # The first request carries no next_token; later requests pass the token
    # returned by the previous page.
    url = url_prefix + ('&next_token=' + next_token if next_token else '') + url_suffix
    response = requests.get(url, headers=header)
    time.sleep(1)  # pause between requests (presumably for the API rate limit)
    listOfTweets = json.loads(response.content)
    print('New Request on', url)

    if 'data' not in listOfTweets:
        # No tweets in the payload (error response or empty result): stop,
        # but keep whatever was collected so far.
        print('Missing request')
        break
    pages.append(pd.DataFrame(listOfTweets['data']))

    next_token = listOfTweets.get('meta', {}).get('next_token', '')
    if not next_token:
        # Last page reached.
        break

twitterData = pd.concat(pages, ignore_index=True) if pages else pd.DataFrame()
print('Done! Total of', len(twitterData), 'tweets collected.')

In [None]:
# Write the collected tweets to CSV, omitting the DataFrame index column.
#twitterData.to_csv('tweets.csv')
twitterData.to_csv('./datasets/cpi.csv',index=False)

In [None]:
import pandas as pd

# Reload a stored dataset; from here on `twitterData` refers to this CSV.
twitterData = pd.read_csv(
    './datasets/old/allpandemic.csv',
    lineterminator='\n',
    low_memory=False,
)

# Separate frame holding only the timestamp and the tweet text.
reawTweets = {column: twitterData[column] for column in ('created_at', 'text')}
tweets = pd.DataFrame(reawTweets)

In [None]:
# Display the created_at/text frame.
tweets

# Pré-processamento dos dados textuais extraídos

In [None]:
import nltk
from nltk import tokenize
import numpy as np 
from string import punctuation
import unidecode
stemmer = nltk.RSLPStemmer()  # not applied below; kept available for optional stemming

# Remove links, user mentions and hashtags from the lower-cased tweet text.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently remove nothing.
twitterData['processed_text'] = (
    twitterData.text.fillna('').str.lower()
    .str.replace(r'(http\S+)', '', regex=True)
    .str.replace(r'@[\w]*', '', regex=True)
    .str.replace(r'#[\w]*', '', regex=True)
)

# Strip accents from every tweet.
textWords = [unidecode.unidecode(text) for text in twitterData.processed_text]

# Stop-word list: Portuguese stop-words, punctuation characters and a few
# hand-picked terms.
stopWords = nltk.corpus.stopwords.words("portuguese")

# Tokenizer that separates punctuation from words.
punctSeparator = tokenize.WordPunctTokenizer()
punctuationList = list(punctuation)

personalList = ['pra', 'predictions']

stopWords = stopWords + punctuationList + personalList

# The tweets were unaccented above, so the unaccented form of every
# stop-word is included as well; a set gives constant-time membership tests.
stopWordSet = set(stopWords) | {unidecode.unidecode(word) for word in stopWords}

# Drop the stop-words from every tweet.
trasnformedText = [
    ' '.join(token for token in punctSeparator.tokenize(text) if token not in stopWordSet)
    for text in textWords
]
twitterData['processed_text'] = trasnformedText

# Replace everything except ASCII letters and '#' with a space, then keep only
# words longer than three characters.
# NOTE(review): the original chain also had `.replace(r"k\k", " ")`, a
# Series.replace on the exact value "k\k" that could never match; it looks like
# it was meant to remove "kkk" laughter tokens - confirm before adding a filter.
twitterData['processed_text'] = (
    twitterData.processed_text
    .str.replace(r"[^a-zA-Z#]", " ", regex=True)
    .apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
)

# Visualização dos dados textuais pré-processados

In [None]:
# Display the full frame, including the new processed_text column.
twitterData

In [None]:
# Pre-processed text of the row labelled 10 (compare with the raw text below).
twitterData.processed_text[10]

In [None]:
# Raw text of the same row, for comparison.
twitterData.text[10]

In [None]:
import nltk
from nltk import tokenize

# Concatenate every pre-processed tweet into one string.
textWords = ' '.join(twitterData.processed_text)

# Split on whitespace and count how often each token occurs.
tokenizing = tokenize.WhitespaceTokenizer()
tokenizedWords = tokenizing.tokenize(textWords)
frequency = nltk.FreqDist(tokenizedWords)

df_frequency = pd.DataFrame(
    {
        "Word": list(frequency.keys()),
        "Frequency": list(frequency.values()),
    }
)

# Ten most frequent tokens.
df_frequency.nlargest(10, "Frequency")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def pareto(text, text_column, count):
    """Plot a bar chart of the most frequent tokens in a text column.

    Args:
        text: Object indexable by ``text_column`` that yields strings
            (called below with a DataFrame).
        text_column: Key of the column holding the text.
        count: Number of top tokens to plot.

    Uses the module-level ``tokenizing`` tokenizer and shows the figure;
    nothing is returned.
    """
    corpus = ' '.join(text[text_column])
    token_counts = nltk.FreqDist(tokenizing.tokenize(corpus))
    top_tokens = pd.DataFrame(
        {
            "Word": list(token_counts.keys()),
            "Fequency": list(token_counts.values()),
        }
    ).nlargest(count, "Fequency")

    plt.figure(figsize=(12, 8))
    ax = sns.barplot(data=top_tokens, x="Word", y="Fequency", color='gray')
    ax.set(ylabel="Count")
    plt.show()

In [None]:
# Bar chart of the 10 most frequent tokens in the pre-processed tweets.
pareto(twitterData, "processed_text", 10)

In [None]:
from wordcloud import WordCloud
%matplotlib inline

# One string containing every pre-processed tweet.
textWords = ' '.join(twitterData.processed_text)

# Word cloud built from that string.
wordCloud = WordCloud(
    width=800,
    height=600,
    max_font_size=110,
    collocations=False,
)
wordCloud.generate(textWords)

import matplotlib.pyplot as plt

# Render the cloud without axes.
plt.figure(figsize=(15, 10))
plt.imshow(wordCloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# TF-IDF / LDA

In [None]:
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim
from tqdm import tqdm
def preprocess(text):
    """Return the tokens gensim.utils.simple_preprocess produces for `text`, as a list."""
    return [token for token in gensim.utils.simple_preprocess(text)]

# Token list for every pre-processed tweet.
processed_docs = twitterData.processed_text.map(preprocess)

# Dictionary built from the token lists, then one bag-of-words vector per tweet.
dictionary = gensim.corpora.Dictionary(processed_docs)
bow_corpus = list(map(dictionary.doc2bow, processed_docs))


from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus and apply it to the same corpus.
tfidf = models.TfidfModel(corpus=bow_corpus)

corpus_tfidf = tfidf[bow_corpus]

# Funções para ajustar os hiperparâmetros do modelo

<ul>
    <li><b>coherence:</b> o valor de coerência mede, dentro de cada tópico, a coerência semântica das palavras, utilizando a métrica de cossenos para determinar sua similaridade. Varia de 0 a 1.</li>
    <li><b>num_topics:</b> número de tópicos</li>
    <li><b>alpha:</b> densidade de tópicos no documento</li>
    <li><b>beta:</b> densidade de palavras no tópico</li>
</ul> 


In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """Train one LDA model per topic count and score each with c_v coherence.

    Args:
        dictionary: gensim Dictionary passed as ``id2word``.
        corpus: Corpus the models are trained on.
        texts: Tokenized documents used by the coherence model.
        limit: Exclusive upper bound of the topic-count range.
        start: First topic count to try.
        step: Increment between topic counts.

    Returns:
        Tuple ``(model_list, coherence_values)`` with one entry per topic
        count in ``range(start, limit, step)``.
    """
    # Local import: another cell runs `import tqdm`, which rebinds the global
    # name `tqdm` to the module and would make `tqdm(...)` fail here.
    from tqdm import tqdm as progress_bar

    coherence_values = []
    model_list = []
    for num_topics in progress_bar(range(start, limit, step)):
        model = LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values


# Range of topic counts to evaluate; defined once so that the sweep and the
# x axis of the plot cannot drift apart.
limit = 30; start = 1; step = 1
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus_tfidf, texts=processed_docs, start=start, limit=limit, step=step)

# Plot the coherence score against the number of topics.
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Tópicos")
plt.ylabel("Score de Coerência")
# The label must be inside a list: a bare string is iterated character by
# character, so the legend would show only "V".
plt.legend(["Valores de Coerência"], loc='best')
plt.show()

# Print the coherence values to make the inflection point easier to spot.
for m, cv in zip(x, coherence_values):
    print("A quantidade de tópicos =", m, " tem um valor de coerência de ", round(cv, 4))

In [None]:
import numpy as np
import tqdm

def compute_coherence_values(corpus, dictionary, k, a, b):
    """Train an LDA model on `corpus` and return its c_v coherence score.

    Args:
        corpus: Corpus to train on (previously ignored in favour of the
            global ``corpus_tfidf``).
        dictionary: gensim Dictionary passed as ``id2word``.
        k: Number of topics.
        a: Value passed as ``alpha``.
        b: Value passed as ``eta``.

    Returns:
        The coherence value computed against the module-level
        ``processed_docs`` texts.
    """
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()


grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets: the first 75% of the documents and the full corpus.
# max_docs is cast to int because ClippedCorpus slices with it.
num_of_docs = len(corpus_tfidf)
corpus_sets = [
    gensim.utils.ClippedCorpus(corpus_tfidf, int(num_of_docs * 0.75)),
    corpus_tfidf,
]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run: one model per (corpus, topics, alpha, beta).
total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)
with tqdm.tqdm(total=total_runs) as pbar:
    # iterate through validation corpora
    for title, corpus_set in zip(corpus_title, corpus_sets):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # coherence score for this parameter combination, trained
                    # on the validation set the row is labelled with
                    cv = compute_coherence_values(corpus=corpus_set, dictionary=dictionary, k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(title)
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)

                    pbar.update(1)
pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

# Aplicando LDA com os hiperparâmetros sugeridos

In [None]:
# Final LDA model trained on the TF-IDF corpus with the chosen hyperparameters.
lda_model_tfidf = LdaMulticore(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=12,
    passes=100,
    chunksize=1000,
    workers=4,
    per_word_topics=True,
    alpha=0.9,
    eta=0.3,
)

In [None]:
# Show the topics of the final model with 20 words each.
lda_model_tfidf.print_topics( num_words = 20)

In [None]:
# c_v coherence of the final model, computed against the tokenized tweets.
coherence_model_lda = CoherenceModel(
    model=lda_model_tfidf,
    texts=processed_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nScore de Coerência: ', coherence_lda)

In [None]:
# Print the value returned by log_perplexity for the final model on the TF-IDF corpus.
print('\nPerplexidade: ', lda_model_tfidf.log_perplexity(corpus_tfidf))

In [None]:
import matplotlib.colors as mcolors

# Colour values of matplotlib's XKCD colour table; entry i is used for topic i.
cols = list(mcolors.XKCD_COLORS.values())

# A single WordCloud object is reused for every topic. Its colour function
# reads the loop variable `i` at drawing time, so each topic gets cols[i].
cloud = WordCloud(
    background_color='white',
    width=2500,
    height=1800,
    max_words=20,
    colormap='tab10',
    color_func=lambda *args, **kwargs: cols[i],
    prefer_horizontal=1.0,
)
topics = lda_model_tfidf.show_topics(formatted=False)

# One row of four panels: only the first four returned topics are drawn.
fig, axes = plt.subplots(1, 4, figsize=(15, 15), sharex=True, sharey=True)
for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    # topics[i] is (topic_id, [(word, weight), ...]); the pairs become a dict.
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=600)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# Interactive pyLDAvis view of the final model, rendered inline in the notebook.
# NOTE(review): newer pyLDAvis releases may expose this module under a
# different name than `pyLDAvis.gensim` - confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
vis

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """Train 4-topic LDA models and score each with c_v coherence.

    One model is trained per value in ``range(start, limit, step)``, but the
    topic count is fixed at 4 for all of them.

    Args:
        dictionary: gensim Dictionary passed as ``id2word``.
        corpus: Corpus the models are trained on.
        texts: Tokenized documents used by the coherence model.
        limit: Exclusive upper bound of the iteration range.
        start: First value of the iteration range.
        step: Increment of the iteration range.

    Returns:
        Tuple ``(model_list, coherence_values)`` with one entry per iteration.
    """
    # Local import: an earlier cell runs `import tqdm`, leaving the global
    # name `tqdm` bound to the module, so calling `tqdm(...)` would raise
    # "'module' object is not callable".
    from tqdm import tqdm as progress_bar

    coherence_values = []
    model_list = []
    for num_topics in progress_bar(range(start, limit, step)):
        # NOTE(review): num_topics from the loop is not used; the topic count
        # is hard-coded to 4, presumably the value chosen from the tuning
        # above - confirm this is intentional.
        model = LdaMulticore(corpus, id2word=dictionary, num_topics=4)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values



print()
# Run the function with the chosen parameters
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus_tfidf, texts=processed_docs, start=1, limit=4, step=1)

In [None]:
# Pick the model: the first one trained in the previous cell.
optimal_model = model_list[0]
optimal_model.print_topics( num_words = 20)


In [None]:
# Find the dominant topic of each tweet
def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table with the dominant topic of every document.

    Args:
        ldamodel: Model supporting ``ldamodel[corpus]`` (yielding, per
            document, a sequence of ``(topic_id, proportion)`` pairs) and
            ``ldamodel.show_topic(topic_id)`` (yielding ``(word, weight)``
            pairs).
        corpus: Corpus passed to ``ldamodel[...]``.
        texts: Documents appended as the last column of the result.

    Returns:
        DataFrame with columns ``Principal_Topico``, ``Perc_Contributicao``
        and ``Palavras_Chave``, followed by the texts as an unnamed column
        (label ``0``). Documents with no topic assignment produce no row.
    """
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for row in ldamodel[corpus]:
        if not row:
            continue
        # Topic with the highest proportion; the first one wins on ties.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join(word for word, _ in wp)
        rows.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        rows, columns=['Principal_Topico', 'Perc_Contributicao', 'Palavras_Chave']
    )
    # Append the texts as the last column (aligned on the index).
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df
# Compute the dominant topic of every tweet with the selected model
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus_tfidf, texts=processed_docs)
# Flat copy with the original index as a column and renamed headers
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Index_DF_Original', 'Principal_Topico', 'Perc_Contrib_Topico', 'Palavras_Chave', 'Tweets']
# For each topic keep the single tweet with the highest contribution (head(1))
sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Principal_Topico')
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contributicao'], ascending=[0]).head(1)], 
                                            axis=0)
# Reset the index so rows are numbered 0..n-1 in group order
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
# Rename the columns
sent_topics_sorteddf_mallet.columns = ['topic', "topic_perc_contrib", "terms", "tweet"]
# Display the table
sent_topics_sorteddf_mallet


## Quantidade de tweets por tópico

In [None]:
# Number of tweets per topic (indexed by topic number)
topic_counts = df_topic_sents_keywords['Principal_Topico'].value_counts()
# Share of tweets per topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)
# Topic number and its keywords, indexed by topic number so that the
# column-wise concat below aligns on the topic itself. The positional index
# only matched the topic numbers when every topic had at least one tweet.
topic_num_keywords = sent_topics_sorteddf_mallet[['topic', 'terms']].set_index('topic', drop=False)
# Put the columns side by side
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)
# Rename the columns
df_dominant_topics.columns = ['topics', 'terms', 'num_tweets', 'perc_tweets']
# Display the table
df_dominant_topics


In [None]:

fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))

# Rows without a topic or a tweet count cannot be drawn.
chart_data = df_dominant_topics.dropna(subset=['topics', 'num_tweets'])
topics = chart_data.topics
print(topics)
keywords = chart_data.terms
# Wedge sizes are the tweet counts per topic. The chart previously used the
# topic ids as sizes, which made the proportions meaningless and gave topic 0
# an empty wedge.
tweet_counts = chart_data.num_tweets


def func(pct, allvals):
    """Format a wedge label: the percentage and the absolute count it stands for."""
    absolute = int(np.round(pct/100.*np.sum(allvals)))
    return "{:.1f}%\n({:d} tweets)".format(pct, absolute)


wedges, texts, autotexts = ax.pie(tweet_counts, radius=2, autopct=lambda pct: func(pct, tweet_counts),
                                  textprops=dict(color="w"))

# The topic number now goes in the legend, next to the topic's terms.
legend_labels = ["Topic {:d}: {}".format(int(topic), terms)
                 for topic, terms in zip(topics, keywords)]
ax.legend(wedges, legend_labels,
          title="Terms",
          loc="center left",
          bbox_to_anchor=(1.5, -0.5, 0.5, 1))

plt.setp(autotexts, size=12, weight="bold")


plt.show()


In [None]:
import matplotlib.colors as mcolors

# Colour values of matplotlib's XKCD colour table; entry i is used for topic i.
cols = list(mcolors.XKCD_COLORS.values())

# A single WordCloud object is reused for every topic. Its colour function
# reads the loop variable `i` at drawing time, so each topic gets cols[i].
cloud = WordCloud( background_color='white',
                  width=2500,
                  height=1800,
                  max_words=20,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)
topics = optimal_model.show_topics(formatted=False)
fig, axes = plt.subplots(2, 4, figsize=(15,15), sharex=True, sharey=True)
for i, ax in enumerate(axes.flatten()):
    ax.axis('off')
    if i >= len(topics):
        # The grid has more panels than the model has topics: leave the
        # panel blank instead of raising IndexError on topics[i].
        continue
    # topics[i] is (topic_id, [(word, weight), ...]); the pairs become a dict.
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=600)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()