# Data crawling on Twitter: Full-archive search 

Documentation: https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all 

Endpoint URL: https://api.twitter.com/2/tweets/search/all


In [None]:
from dotenv import load_dotenv
import pandas as pd
import requests
import json
import time
import os

query='((vacina%20vacinacao)%20OR%20(vacina%20OR%20vacinacao))%20-rt'
start_time='2020-02-29T00%3A00%3A00Z'
end_time='2021-07-12T00%3A00%3A00Z'

load_dotenv()
auth_token = os.environ.get('AUTH_TOKEN')
header = {'Authorization': 'Bearer ' + auth_token}

max_results='500'
next_token=''

url='https://api.twitter.com/2/tweets/search/all?query='+query+'&start_time='+start_time+'&end_time='+end_time+'&max_results='+max_results+'&expansions=author_id&tweet.fields=created_at'
response = requests.get(url,headers=header)
time.sleep(1)
listOfTweets = json.loads(response.content)
print('New Request on',url)

tweets = pd.DataFrame(listOfTweets['data'])   

if 'next_token' in listOfTweets['meta']:    
    next_token = listOfTweets['meta']['next_token']   
    
    while 'next_token' in listOfTweets['meta']:        
        url='https://api.twitter.com/2/tweets/search/all?query='+query+'&start_time='+start_time+'&end_time='+end_time+'&max_results='+max_results+'&next_token='+next_token+'&expansions=author_id&tweet.fields=created_at'
        response = requests.get(url,headers=header)  
        time.sleep(1)
        listOfTweets = json.loads(response.content)         
       
        print('New Request on',url)
        
        if 'data' in listOfTweets:
            tweets = tweets.append(pd.DataFrame(listOfTweets['data']),ignore_index=True)

            if  'meta' in listOfTweets:         
                if 'next_token' in listOfTweets['meta']:
                    next_token =  listOfTweets['meta']['next_token']
                else:
                    print('Done! Total of ', len(tweets), 'tweets collected.')                
                    break
            else:
                break
        else:
            print('Missing request')
            break
else:
    tweets = pd.DataFrame(listOfTweets['data'])
    print('Done! Total of', len(tweets), 'tweets collected.')

In [None]:
twitterData.to_csv('./tweets.csv',index=False)

# Text pre-processing

In [1]:
import pandas as pd
#provaxxers
provaxxers = pd.read_csv('./datasets/provaxxersTweets.csv', low_memory=False)

#antivaxxers
antivaxxers = pd.read_csv('./datasets/antivaxxersTweets.csv', low_memory=False)

In [2]:
df = provaxxers

In [3]:
tweets = {'id': df.id, 'created_at':df.created_at, 'text':df.text}
tweets = pd.DataFrame(tweets) 

In [5]:
start_date ='2020-02-29T00:00:00.000Z'
end_date = '2021-05-04T00:00:00.000Z'

mask = (tweets['created_at'] > start_date) & (tweets['created_at'] <= end_date)
tweets = tweets.loc[mask]

tweets = tweets.sort_values(['created_at']).reset_index().drop(columns=["index"])

In [6]:
tweets

Unnamed: 0,id,created_at,text
0,1233589655133659136,2020-02-29T03:07:47.000Z,Primeiro caso de morte por sarampo. Uma crianç...
1,1235059227166527489,2020-03-04T04:27:20.000Z,#dengue #sarampo #coronavirüs e agora?\n.\nLei...
2,1236081675332960258,2020-03-07T00:10:11.000Z,Listen to the most recent episode of my podcas...
3,1238185487426256896,2020-03-12T19:29:59.000Z,O medo do Corona Vírus não é só pela contamina...
4,1238426737958158337,2020-03-13T11:28:37.000Z,#prontofalei \n\nCompartilhe!!! Vamos divulgar...
...,...,...,...
252066,1389368142288363530,2021-05-03T23:55:52.000Z,"@jdoriajr Doria por favor, coloca como prefere..."
252067,1389368381439107082,2021-05-03T23:56:50.000Z,@carlosriconbr @juninhomengao10 @UOLNoticias @...
252068,1389368577132793856,2021-05-03T23:57:36.000Z,"Salvador acaba de receber 26,9mil doses de vac..."
252069,1389368588700639240,2021-05-03T23:57:39.000Z,"Mano, que inveja me deu agora assistindo o Jor..."


In [7]:
import nltk
from nltk import tokenize

import numpy as np 
from string import punctuation
import unidecode
stemmer = nltk.RSLPStemmer()

# Removendo hashtags, menções a usuários, numeros, termos curtos e links

def proccess_text(text):
    
    twitterData = pd.DataFrame(text) 
    twitterData['processed_text'] = twitterData.text.str.replace(r'(http\S+)', '') \
                                                    .str.replace(r'@[\w]*', '') \
                                                    .str.replace(r'#[\w]*','') 

    textWords = ' '.join([text for text in twitterData.processed_text])

    # Removendo acentuação
    textWords = [unidecode.unidecode(text) for text in twitterData.processed_text ]

    # Criando lista com palavras e caracteres (stopwords) a serem removidos do texto
    stopWords = nltk.corpus.stopwords.words("portuguese")

    # Separando a pontuação das palavras
    punctSeparator = tokenize.WordPunctTokenizer()
    punctuationList = list()
    for punct in punctuation:
        punctuationList.append(punct)
        
    stopWords =   punctuationList + stopWords


    # Iterando o texto removendo as stopwords
    
    trasnformedText = list()
    
    for text in textWords:
        newText = list()   
        text = text.lower()
        textWords = punctSeparator.tokenize(text)
        for words in textWords:
             if words not in stopWords:
                #newText.append(stemmer.stem(words))
                newText.append(words)
        trasnformedText.append(' '.join(newText))
    twitterData.processed_text = trasnformedText

   
    twitterData.processed_text = twitterData.processed_text.str.replace(r"[^a-zA-Z#]", " ") 
                                                        
    
    return twitterData.processed_text



In [8]:
tweets = {'created_at': tweets.created_at, 'text': tweets.text,'id':tweets.id}
rawTweets = pd.DataFrame(tweets)

rawTweets['processed_text'] = proccess_text(rawTweets.text)

processedTweets = rawTweets.drop(columns=["text"])

trasnformedText = list()
for phrase in processedTweets.processed_text:
    newPhrase = list()   
    newPhrase.append(' '.join(phrase.split()))
    for words in newPhrase:
        trasnformedText.append(''.join(newPhrase))
processedTweets.processed_text = trasnformedText

index=[x for x in processedTweets.index if processedTweets.processed_text[x].count(' ') < 3]
processedTweets = processedTweets.drop(index)

removeEmpty  = processedTweets.processed_text != ' '
processedTweets = processedTweets[removeEmpty]

processedTweets.reset_index(inplace=True)

tweets = {'created_at': processedTweets.created_at, 'text': processedTweets.processed_text, 'id':processedTweets.id}
docs = pd.DataFrame(tweets)

docs = docs.sort_values(['created_at']).reset_index()

docs = docs.drop(columns=["index"])

#docs.to_csv('./datasets/kdmile/provaxxers.csv',index=False)

  twitterData['processed_text'] = twitterData.text.str.replace(r'(http\S+)', '') \
  twitterData.processed_text = twitterData.processed_text.str.replace(r"[^a-zA-Z#]", " ")


In [9]:
docs

Unnamed: 0,created_at,text,id
0,2020-02-29T03:07:47.000Z,primeiro caso morte sarampo crianca anos histo...,1233589655133659136
1,2020-03-04T04:27:20.000Z,agora leia recomendacoes marque amigos compart...,1235059227166527489
2,2020-03-07T00:10:11.000Z,listen to the most recent episode of my podcas...,1236081675332960258
3,2020-03-12T19:29:59.000Z,medo corona virus nao so contaminacao conseque...,1238185487426256896
4,2020-03-13T11:28:37.000Z,compartilhe vamos divulgar informacoes qualida...,1238426737958158337
...,...,...,...
163129,2021-05-03T23:55:52.000Z,doria favor coloca preferencial vacinacao pess...,1389368142288363530
163130,2021-05-03T23:56:50.000Z,tb acredito porem so apos qnd recomecarmos rec...,1389368381439107082
163131,2021-05-03T23:57:36.000Z,salvador acaba receber mil doses vacinas pfize...,1389368577132793856
163132,2021-05-03T23:57:39.000Z,mano inveja deu agora assistindo jornal nacion...,1389368588700639240


In [10]:
tweets = {'id': docs.id, 'created_at':docs.created_at}
rawTweets = pd.DataFrame(tweets) 

In [11]:
rawTweets

Unnamed: 0,id,created_at
0,1233589655133659136,2020-02-29T03:07:47.000Z
1,1235059227166527489,2020-03-04T04:27:20.000Z
2,1236081675332960258,2020-03-07T00:10:11.000Z
3,1238185487426256896,2020-03-12T19:29:59.000Z
4,1238426737958158337,2020-03-13T11:28:37.000Z
...,...,...
163129,1389368142288363530,2021-05-03T23:55:52.000Z
163130,1389368381439107082,2021-05-03T23:56:50.000Z
163131,1389368577132793856,2021-05-03T23:57:36.000Z
163132,1389368588700639240,2021-05-03T23:57:39.000Z


In [12]:
rawTweets.to_csv('./datasets/kdmile/provaxxers.csv',index=False)