# 1- Data crawling with the Twitter API: Full-archive search

Documentation: https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all 

Endpoint URL: https://api.twitter.com/2/tweets/search/all


In [None]:
from dotenv import load_dotenv
import urllib.request as urllib2
import pandas as pd
import requests
import json
import time
import os

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# The Twitter API bearer token is read from the AUTH_TOKEN variable.
auth_token = os.environ.get('AUTH_TOKEN')
if not auth_token:
    # Fail early with an actionable message instead of the opaque TypeError
    # that concatenating 'Bearer ' with None would raise.
    raise RuntimeError(
        "AUTH_TOKEN is not set; define it in the environment or in a .env file."
    )

# Authorization header sent with every request to the API.
header = {'Authorization': 'Bearer ' + auth_token}

class TwitterHook():
    """Paginated client for the Twitter API v2 full-archive search endpoint.

    Args:
        query: Search query, already URL-encoded by the caller.
        header: HTTP headers (including authorization) sent with each request.
            When ``None``, ``run`` falls back to the module-level ``header``.
        start_time: Percent-encoded ISO 8601 lower bound. ``None`` selects
            ``DEFAULT_START_TIME``; an empty string omits the parameter.
        end_time: Percent-encoded ISO 8601 upper bound. ``None`` selects
            ``DEFAULT_END_TIME``; an empty string omits the parameter.
        max_results: Page size. ``None`` selects ``DEFAULT_MAX_RESULTS``;
            an empty string omits the parameter.
    """

    # Values used when the caller does not provide one; they match the window
    # and page size that were previously hard-coded.
    DEFAULT_START_TIME = '2020-02-29T00%3A00%3A00Z'
    DEFAULT_END_TIME = '2021-05-04T00%3A00%3A00Z'
    DEFAULT_MAX_RESULTS = '500'

    def __init__(self, query, header = None, start_time = None, end_time = None, max_results= None):
        self.query = query
        self.header = header
        # Honour caller-supplied values instead of silently discarding them.
        self.start_time = self.DEFAULT_START_TIME if start_time is None else start_time
        self.end_time = self.DEFAULT_END_TIME if end_time is None else end_time
        self.max_results = self.DEFAULT_MAX_RESULTS if max_results is None else max_results

    def create_url(self):
        """Return the full request URL for the configured query and filters."""
        tweet_fields = "tweet.fields=author_id,id,created_at,in_reply_to_user_id,text"
        user_fields = "expansions=author_id&user.fields=id,name,username,created_at"

        # Each optional filter contributes "&name=value", or nothing when unset.
        start_time = f"&start_time={self.start_time}" if self.start_time else ""
        end_time = f"&end_time={self.end_time}" if self.end_time else ""
        max_results = f"&max_results={self.max_results}" if self.max_results else ""

        return "https://api.twitter.com/2/tweets/search/all?query={}&{}&{}{}{}{}".format(
            self.query, tweet_fields, user_fields, start_time, end_time, max_results
        )

    def connect_to_endpoint(self, url, header):
        """GET *url* with *header* and return the decoded JSON body."""
        response = requests.get(url, headers=header)
        return json.loads(response.content)

    def paginate(self, url, header, next_token=""):
        """Yield one decoded response per page, following ``meta.next_token``.

        Implemented as a loop rather than recursion so that long crawls
        cannot exhaust the interpreter's recursion limit.
        """
        while True:
            full_url = f"{url}&next_token={next_token}" if next_token else url
            print('New Request on',full_url)
            data = self.connect_to_endpoint(full_url, header)
            yield data

            # Stop when the response carries no (or an empty) continuation token.
            next_token = data.get("meta", {}).get("next_token")
            if not next_token:
                break

    def run(self):
        """Yield every result page for the configured search."""
        url = self.create_url()
        # Prefer the header given to the constructor; otherwise use the
        # module-level `header`, as before.
        request_header = self.header if self.header is not None else header
        yield from self.paginate(url, request_header)
        
        
def GetTweets(query):
    """Collect every tweet returned for *query* into one DataFrame.

    Args:
        query: URL-encoded search query, passed to ``TwitterHook``.

    Returns:
        A DataFrame with one row per tweet object found under the ``data``
        key of the result pages (empty when nothing was collected).
    """
    # Accumulate rows in a list and build the frame once: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every page.
    rows = []
    for pg in TwitterHook(query).run():
        # Pause between pages (presumably to respect the API rate limit).
        time.sleep(1)

        if 'data' in pg:
            rows.extend(pg['data'])
        else:
            # Pages without a 'data' key are reported and skipped.
            print('Missing request')

    tweets = pd.DataFrame(rows)
    print('Done! Total of', len(tweets), 'tweets collected.')
    return tweets

In [None]:
tweets = GetTweets(urllib2.quote('#vacinanao -rt'))

# 2- Text preprocessing

### 2.1- Import datasets from Google Drive

In [None]:
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')
tweets =  pd.read_csv('/content/drive/MyDrive/datasets/provaxxers/your_tweets.csv')

### 2.2- Import datasets from local disk

In [None]:
import pandas as pd


# Read both stance datasets from local CSV files.
antivaxxers = pd.read_csv('./yourdirectory/antivaxxersTweets.csv', low_memory=False)
provaxxers = pd.read_csv('./yourdirectory/provaxxersTweets.csv', low_memory=False)

# Inclusive bounds of the period to keep. `created_at` is compared as text,
# which orders correctly as long as every value uses this same ISO 8601 layout.
start_date = '2020-03-11T00:59:59.000Z'
end_date = '2022-04-06T00:00:00.000Z'

# Keep only the pro-vaccine tweets created inside the window.
mask_provaxxers = provaxxers['created_at'].between(start_date, end_date)
provaxxers = provaxxers[mask_provaxxers]

# Same filter for the anti-vaccine tweets.
mask_antivaxxers = antivaxxers['created_at'].between(start_date, end_date)
antivaxxers = antivaxxers[mask_antivaxxers]

### 2.3- Pre-process function

In [None]:
import nltk
from nltk import tokenize
import numpy as np 
from string import punctuation
import unidecode
stemmer = nltk.RSLPStemmer()


def _drop_punctuation_tokens(text, tokenizer, punctuation_tokens):
    """Lowercase *text*, tokenize it and re-join the tokens that are not single punctuation marks."""
    tokens = tokenizer.tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in punctuation_tokens)


def proccess_text(tweets):
    """Clean the tweet texts and return a new, date-sorted DataFrame.

    The input frame is left untouched (no ``processed_text`` column is added
    to it); all intermediate work happens on a separate Series.

    Args:
        tweets: DataFrame with the columns ``text``, ``created_at``, ``id``,
            ``author_id`` and ``in_reply_to_user_id``. Missing texts are
            treated as empty strings and therefore filtered out.

    Returns:
        A DataFrame with the columns ``created_at``, ``id``, ``author_id``,
        ``in_reply_to_user_id`` and ``text`` (the cleaned text), sorted by
        ``created_at`` and re-indexed from 0.
    """
    # Removing links, mentions and hashtags
    processed = (
        tweets['text']
        .fillna('')
        .astype(str)
        .str.replace(r'(http\S+)', '', regex=True)
        .str.replace(r'@[\w]*', '', regex=True)
        .str.replace(r'#[\w]*', '', regex=True)
    )
    print('[ok] - Removing links.')
    print('[ok] - Removing mentions.')
    print('[ok] - Removing hashtags.')

    # Removing accent
    processed = processed.map(unidecode.unidecode)
    print('[ok] - Removing accent.')

    # Only punctuation marks are treated as stop words here; the language
    # stop-word list is intentionally not applied.
    print('[ok] - Creating a list of words and characters (stopwords) to be removed from the text.')
    tokenizer = tokenize.WordPunctTokenizer()
    punctuation_tokens = set(punctuation)  # set gives O(1) membership tests

    # Lowercasing and dropping tokens that are a single punctuation mark
    processed = processed.map(
        lambda text: _drop_punctuation_tokens(text, tokenizer, punctuation_tokens)
    )
    print('[ok] - Removing punctuation and set text to lowecase.')

    # Removing all non-text characters
    processed = processed.str.replace(r"[^a-zA-Z#]", " ", regex=True)
    print('[ok] - Removing all non-text characters.')

    # Collapse runs of whitespace and trim both ends.
    processed = processed.map(lambda phrase: ' '.join(phrase.split()))

    # Removing short tweets: a text is kept only when it contains at least
    # three spaces. NOTE(review): that means four or more terms, while the log
    # message says "less than three terms" - confirm the intended cut-off.
    long_enough = processed.str.count(' ') >= 3
    tweets = tweets[long_enough]
    processed = processed[long_enough]
    print('[ok] - Removing tweets with less than three terms.')

    # Removing empty lines
    not_blank = processed != ' '
    tweets = tweets[not_blank]
    processed = processed[not_blank]
    print('[ok] - Removing empty lines.')

    # Assemble the output columns; the Series share one index, so rows align.
    result = pd.DataFrame({
        'created_at': tweets['created_at'],
        'id': tweets['id'],
        'author_id': tweets['author_id'],
        'in_reply_to_user_id': tweets['in_reply_to_user_id'],
        'text': processed,
    })
    return result.sort_values(['created_at']).reset_index(drop=True)

In [None]:
provaxxers = proccess_text(provaxxers)

# 3- Topic Modeling with BERTopic

### 3.1- Load pre-processed data

In [27]:
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')
tweets =  pd.read_csv('/content/drive/MyDrive/datasets/provaxxers/provaxxers_processed.csv')

ModuleNotFoundError: No module named 'google'

### 3.2- Checking dependencies

In [None]:
import sys

if 'bertopic' not in sys.modules:
    print('Installing requeriment..')
    ! pip install bertopic
else:
    print('Requirement already satisfied..')

### 3.3- Checking GPUs

In [None]:
import torch

# Select the compute device first, then report which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type != "cuda":
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
!nvidia-smi

### 3.4- Checking RAM

In [None]:
from psutil import virtual_memory
# Total memory reported by psutil, in decimal gigabytes. NOTE(review): the
# message says "available" although the `total` field is what is printed.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Runtimes with at least 20 GB are reported as "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

### 3.5- Custom embeddings model

In [None]:
from sentence_transformers import SentenceTransformer, util

import torch

# Sentence-embedding model used by BERTopic. Run it on the GPU when one is
# present and fall back to the CPU otherwise, so the cell also works on
# CPU-only runtimes instead of failing on a hard-coded "cuda" device.
bert_model = SentenceTransformer(
    "all-mpnet-base-v2",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

### 3.6- Custom UMAP model

In [None]:
from umap import UMAP
import os
# Set TOKENIZERS_PARALLELISM to "false" for this process (presumably to silence
# the Hugging Face tokenizers parallelism warning - confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# UMAP configured with 15 neighbours, 5 output components and the cosine
# metric; it is handed to BERTopic as `umap_model` when the topic model is built.
umap_model = UMAP(n_neighbors=15, n_components=5, metric='cosine')

### 3.7- Custom HDBSCAN model

In [None]:
import hdbscan

# HDBSCAN clusterer handed to BERTopic as `hdbscan_model`: clusters need at
# least 65 members, distances are Euclidean, and clusters are selected with
# the 'eom' method.
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size = 65,
                                metric='euclidean', 
                                cluster_selection_method='eom')

### 3.8- Custom vectorizer model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer handed to BERTopic as `vectorizer_model`: counts bigrams only
# (ngram_range=(2, 2)) and uses scikit-learn's built-in English stop-word list.
vectorizer_model = CountVectorizer(ngram_range=(2, 2), stop_words="english")

### 3.9- Initializing BERTopic parameters

In [None]:
# Documents to model: the pre-processed tweets loaded in section 3.1.
docs = tweets

from bertopic import BERTopic

# Assemble the topic model from the custom components defined in the previous
# cells. The comma after `hdbscan_model=hdbscan_model` was missing, which made
# this cell a SyntaxError.
topic_model = BERTopic(#language = 'english',
                       embedding_model=bert_model,
                       top_n_words=10,
                       #n_gram_range=(1, 2),
                       #min_topic_size=50,
                       nr_topics = 'auto',
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       vectorizer_model=vectorizer_model,
                       low_memory=True,
                       calculate_probabilities=False,
                       verbose=True)

### 3.10- Generating the model

In [None]:
topics, probs = topic_model.fit_transform(docs.text)

### 3.11- Serialize models, topics and docs

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
topic_model.save("/content/drive/MyDrive/datasets/models/model_name")

In [None]:
topic = pd.DataFrame(topics, columns=['topic'])
topic.to_csv('/content/drive/MyDrive/datasets/models/provaxxers_all_mpnet_base_v2_topics.csv')

### 3.12- Load models, topics and docs

In [None]:
import sys
import pandas as pd
from google.colab import drive


if 'bertopic' not in sys.modules:
    print('Installing requeriment..')
    ! pip install bertopic
    from bertopic import BERTopic
else:
    from bertopic import BERTopic
    print('Requirement already satisfied..')

In [None]:
drive.mount('/content/drive')
topic_model = BERTopic.load("/content/drive/MyDrive/datasets/models/provaxxers_all_mpnet_base_v2")

In [None]:
# Reload the per-document topic assignments saved in section 3.11.
topics= pd.read_csv('/content/drive/MyDrive/datasets/models/provaxxers_all_mpnet_base_v2_topics.csv')
# "Unnamed: 0" is the index column that `to_csv` wrote alongside the data.
topics=topics.drop(columns=["Unnamed: 0"])
# Keep just the `topic` column as a Series.
topics = topics.topic

# Reload the pre-processed tweets these topic assignments refer to.
docs = pd.read_csv('/content/drive/MyDrive/datasets/provaxxers/provaxxers_processed.csv')

### 3.13- Inspect topics

In [None]:
# Give every distinct topic id an empty bucket, then file each document's
# text under the topic it was assigned to.
topic_docs = dict((topic, []) for topic in set(topics))
for topic, doc in zip(topics, docs.text):
    bucket = topic_docs[topic]
    bucket.append(doc)

In [None]:
topic_model.find_topics("canada")

In [None]:
topic_model.get_representative_docs(0)

In [None]:
topic_docs[0]

In [26]:
import json

#provaxxers
t_values_favor = []
t_values_none = []
t_values_against = []

#antivaxxesrs
#t_values_favor = [] 
#t_values_against = []
#t_values_none = []

In [None]:
# Write the documents of every topic listed in `t_values_favor` as ONE flat
# JSON array. Dumping one array per topic back-to-back produced "[...][...]",
# which JSON parsers reject as soon as more than one topic is listed.
with open('/content/drive/MyDrive/datasets/provaxxers/rotulos/favor.json', 'w') as file:
    json.dump([doc for t in t_values_favor for doc in topic_docs[t]], file)

In [None]:
# Write the documents of every topic listed in `t_values_none` as ONE flat
# JSON array. Dumping one array per topic back-to-back produced "[...][...]",
# which JSON parsers reject as soon as more than one topic is listed.
with open('/content/drive/MyDrive/datasets/provaxxers/rotulos/none.json', 'w') as file:
    json.dump([doc for t in t_values_none for doc in topic_docs[t]], file)

In [None]:
# Write the documents of every topic listed in `t_values_against` as ONE flat
# JSON array. Dumping one array per topic back-to-back produced "[...][...]",
# which JSON parsers reject as soon as more than one topic is listed.
with open('/content/drive/MyDrive/datasets/provaxxers/rotulos/against.json', 'w') as file:
    json.dump([doc for t in t_values_against for doc in topic_docs[t]], file)

### 3.14- Visualize topics

In [None]:
topic_model.visualize_topics()

### 3.15- Reducing the number of topics

In [None]:
newTopics, newProbs = topic_model.reduce_topics(docs.text, topics, probs, nr_topics=10) 

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.get_topics()

In [None]:
topic_model.get_representative_docs(0)

### 3.16- Dynamic modeling

In [None]:
timestamps = docs.created_at.to_list()
tweets = docs.text.to_list()

In [None]:
topics_over_time = topic_model.topics_over_time(docs=tweets, 
                                                topics=newTopics,                                                                                           
                                                timestamps=timestamps, 
                                                global_tuning=True,
                                                evolution_tuning=True, 
                                                nr_bins=20)

In [None]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=11)