# 1-  Data crawling on TwitterAPI: Full-archive search 

Documentation: https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all 

Endpoint URL: https://api.twitter.com/2/tweets/search/all


In [None]:
from dotenv import load_dotenv
import urllib.request as urllib2
import pandas as pd
import requests
import json
import time
import os

load_dotenv()
auth_token = os.environ.get('AUTH_TOKEN')
header = {'Authorization': 'Bearer '+auth_token}

class TwitterHook():

    def __init__(self, query, header = None, start_time = None, end_time = None, max_results= None):
        self.query = query
        self.header = header
        self.start_time = '2020-02-29T00%3A00%3A00Z'
        self.end_time = '2021-05-04T00%3A00%3A00Z'
        self.max_results = '500'

    def create_url(self):
        query = self.query
        start_time = self.start_time
        end_time = self.end_time        
        
        tweet_fields = "tweet.fields=author_id,id,created_at,in_reply_to_user_id,text"
        user_fields = "expansions=author_id&user.fields=id,name,username,created_at"
        start_time = (
            f"&start_time={self.start_time}"
            if self.start_time
            else ""
        )
        end_time = (
            f"&end_time={self.end_time}"
            if self.end_time
            else ""
        )
        max_results  = (
            f"&max_results={self.max_results}"
            if self.max_results
            else ""
        )
        url = "https://api.twitter.com/2/tweets/search/all?query={}&{}&{}{}{}{}".format(
               query, tweet_fields, user_fields, start_time, end_time, max_results
        )
        return url

    def connect_to_endpoint(self, url, header):
        response = requests.get(url,headers=header)
        listOfTweets = json.loads(response.content)
        return  listOfTweets


    def paginate(self, url, header, next_token=""):
        if next_token:
            full_url = f"{url}&next_token={next_token}"
            print('New Request on',full_url)
        else:
            full_url = url
            print('New Request on',full_url)
        data = self.connect_to_endpoint(full_url, header)
        yield data
        if "next_token" in data.get("meta", {}):
            yield from self.paginate(url, header, data['meta']['next_token'])


    def run(self):  
        url = self.create_url()
        yield from self.paginate(url, header)
        
        
def GetTweets(query):
    tweets = pd.DataFrame()
    for pg in TwitterHook(query).run():
        time.sleep(1)  
        
        if 'data' in pg:
            tweets =  tweets.append(pg['data'],ignore_index=True)
        else:
             print('Missing request')
        
    print('Done! Total of', len(tweets), 'tweets collected.')
    return tweets

In [None]:
tweets = GetTweets(urllib2.quote('#vacinanao -rt'))

In [None]:
#tweets.to_csv('./vacinaobrigatorianao.csv',index=False)

# 2- Text preprocessing

In [15]:
import pandas as pd

#antivaxxers = pd.read_csv('./datasets/antivaxxers/antivaxxersTweets.csv', low_memory=False)
#provaxxers = pd.read_csv('./datasets/provaxxers/provaxxersTweets.csv', low_memory=False)

provaxxers = pd.read_csv('./datasets/antivaxxers/novaccinepassports.csv', low_memory=False)

In [16]:
import nltk
from nltk import tokenize
import numpy as np 
from string import punctuation
import unidecode
stemmer = nltk.RSLPStemmer()


def proccess_text(tweets):
    
    # Removing links, mentions and hashtags
    tweets['processed_text'] = tweets.text.str.replace(r'(http\S+)', '',regex=True) \
                                          .str.replace(r'@[\w]*', '',regex=True) \
                                          .str.replace(r'#[\w]*','',regex=True) 
    print('[ok] - Removing links.')
    print('[ok] - Removing mentions.')
    print('[ok] - Removing hashtags.')

    textWords = ' '.join([text for text in tweets.processed_text])

    # Removing accent
    textWords = [unidecode.unidecode(text) for text in tweets.processed_text ]    
    print('[ok] - Removing accent.')
    
    # Creating a list of words and characters (stopwords) to be removed from the text
    #stopWords = nltk.corpus.stopwords.words("portuguese")    
    print('[ok] - Creating a list of words and characters (stopwords) to be removed from the text.')
    
    
    # Separating punctuation from words
    punctSeparator = tokenize.WordPunctTokenizer()
    punctuationList = list()
    for punct in punctuation:
        punctuationList.append(punct)
        
    #stopWords =   punctuationList + stopWords    
    stopWords =   punctuationList
    print('[ok] - Separating punctuation from words.')


    # Iterating over the text and removing stop words 
    trasnformedText = list()    
    for text in textWords:
        newText = list()   
        text = text.lower()
        textWords = punctSeparator.tokenize(text)
        for words in textWords:
             if words not in stopWords:
                #newText.append(stemmer.stem(words))
                newText.append(words)
        trasnformedText.append(' '.join(newText))
    tweets.processed_text = trasnformedText
    print('[ok] - Iterating over the text and removing stop words.')
   
    # Removing all non-text characters
    tweets.processed_text = tweets['processed_text'].str.replace(r"[^a-zA-Z#]", " ", regex=True)                                                         
    print('[ok] - Removing all non-text characters.')
   
    trasnformedText = list()
    for phrase in tweets.processed_text:
        newPhrase = list()   
        newPhrase.append(' '.join(phrase.split()))
        for words in newPhrase:
            trasnformedText.append(''.join(newPhrase))
    tweets.processed_text = trasnformedText
    
    # Removing tweets with less than three terms
    index=[x for x in tweets.index if tweets.processed_text[x].count(' ') < 3]
    tweets = tweets.drop(index)
    print('[ok] - Removing tweets with less than three terms.')

    # Removing empty lines
    removeEmpty  = tweets.processed_text != ' '
    tweets = tweets[removeEmpty]
    print('[ok] - Removing empty lines.')

    tweets.reset_index(inplace=True)
    tweets = {'created_at': tweets.created_at, 'id':tweets.id,'author_id':tweets.author_id,'in_reply_to_user_id':tweets.in_reply_to_user_id, 'text': tweets.processed_text}
    tweets = pd.DataFrame(tweets)
    tweets = tweets.sort_values(['created_at']).reset_index().drop(columns=["index"])
    
    return tweets

In [17]:
tweets = proccess_text(provaxxers)

[ok] - Removing links.
[ok] - Removing mentions.
[ok] - Removing hashtags.
[ok] - Removing accent.
[ok] - Creating a list of words and characters (stopwords) to be removed from the text.
[ok] - Separating punctuation from words.
[ok] - Iterating over the text and removing stop words.
[ok] - Removing all non-text characters.
[ok] - Removing tweets with less than three terms.
[ok] - Removing empty lines.


# 3- Topic Modeling with BERTopic

### Checking GPUs

In [19]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():        
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")    
    print('There are %d GPU(s) available.' % torch.cuda.device_count())    
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2060 SUPER


### Initializing BERTopic parameters

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(ngram_range=(2, 2), stop_words="english")

In [7]:
docs = tweets

from bertopic import BERTopic
topic_model = BERTopic(language = 'english',
                       #embedding_model=bert_model,
                       top_n_words=10,
                       #n_gram_range=(1, 2),
                       min_topic_size=50,   
                       nr_topics = 'auto',
                       #umap_model=umap_model,  
                       vectorizer_model=vectorizer_model,
                       low_memory=True,
                       calculate_probabilities=False, 
                       verbose=True)

### Generating the model

In [8]:
topics, probs = topic_model.fit_transform(docs.text)

Batches: 100%|██████████| 5771/5771 [01:04<00:00, 89.78it/s] 
2022-04-07 15:49:49,044 - BERTopic - Transformed documents to Embeddings
2022-04-07 15:55:33,959 - BERTopic - Reduced dimensionality with UMAP
2022-04-07 15:55:50,470 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2022-04-07 15:56:09,359 - BERTopic - Reduced number of topics from 419 to 325


In [9]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_hierarchy()

### Reducing the number of topics

In [10]:
newTopics, newProbs = topic_model.reduce_topics(docs.text, topics, probs, nr_topics=10) 

2022-04-07 15:57:42,146 - BERTopic - Reduced number of topics from 325 to 11


In [11]:
topic_model.visualize_topics()

In [None]:
topic_model.get_topics()

In [20]:
topic_model.get_representative_docs(0)

['don t cooperate with the incoming caste system there should be no vaccine passports verifications registrations id cards licenses or apps',
 'all double vaccinated what is the point of vaccine passports',
 'passports to show your immune status are the opposite of normal life they are also unnecessary once you are vaccinated someone else s immune status does not matter so what are they for',
 'so you have said a number of times now there are no mandates for vaccination put your words into action and support the vaccination discrimination bill',
 'as the st vaccine was administered on this day last year we were promised our rightful freedoms back exactly months on and we find ourselves fighting for them very same freedoms someone lied to us',
 'dear please read to see what happens when you implement vaccine mandates please see what it legitimizes rescind your vaccine mandate for clergy workers and volunteers',
 'do you mean the vaccine passports that we are definitely not introducing i

### Dynamic modeling

In [12]:
timestamps = docs.created_at.to_list()
tweets = docs.text.to_list()

In [13]:
topics_over_time = topic_model.topics_over_time(docs=tweets, 
                                                topics=newTopics,                                                                                           
                                                timestamps=timestamps, 
                                                global_tuning=True,
                                                evolution_tuning=True, 
                                                nr_bins=20)

19it [04:20, 13.73s/it]


In [14]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=11)