# 1- Data crawling with the Twitter API: Full-archive search

Documentation: https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all 

Endpoint URL: https://api.twitter.com/2/tweets/search/all


In [None]:
from dotenv import load_dotenv
import urllib.request as urllib2
import pandas as pd
import requests
import json
import time
import os

# Load variables from a local .env file into the process environment, then
# read the Twitter API bearer token from it.
load_dotenv()
auth_token = os.environ.get('AUTH_TOKEN')
if not auth_token:
    # Fail early with an actionable message instead of the opaque TypeError
    # raised by concatenating a str with None.
    raise RuntimeError(
        "AUTH_TOKEN is not set; define it in the environment or in a .env file."
    )
# Authorization header sent with every request to the Twitter API.
header = {'Authorization': 'Bearer ' + auth_token}

class TwitterHook():
    """Small client for the Twitter API v2 full-archive search endpoint.

    Builds the request URL for a query and iterates over the paginated
    responses. ``query``, ``start_time`` and ``end_time`` are inserted into
    the URL verbatim, so they must already be URL-encoded by the caller.
    """

    # Values used when the caller does not supply one (kept identical to the
    # values that were previously hard-coded in ``__init__``).
    DEFAULT_START_TIME = '2020-02-29T00%3A00%3A00Z'
    DEFAULT_END_TIME = '2021-05-04T00%3A00%3A00Z'
    DEFAULT_MAX_RESULTS = '500'

    def __init__(self, query, header=None, start_time=None, end_time=None, max_results=None):
        """Store the search parameters.

        Args:
            query: URL-encoded search query.
            header: Request headers (e.g. the bearer-token Authorization
                header). When ``None``, ``run`` falls back to the
                module-level ``header``.
            start_time: URL-encoded lower time bound; ``None`` selects
                ``DEFAULT_START_TIME``, an empty string omits the parameter.
            end_time: URL-encoded upper time bound; ``None`` selects
                ``DEFAULT_END_TIME``, an empty string omits the parameter.
            max_results: Page size; ``None`` selects ``DEFAULT_MAX_RESULTS``.
        """
        self.query = query
        self.header = header
        # Honour caller-supplied values instead of silently discarding them.
        self.start_time = self.DEFAULT_START_TIME if start_time is None else start_time
        self.end_time = self.DEFAULT_END_TIME if end_time is None else end_time
        self.max_results = self.DEFAULT_MAX_RESULTS if max_results is None else max_results

    def create_url(self):
        """Return the full search URL for the stored parameters."""
        tweet_fields = "tweet.fields=author_id,id,created_at,in_reply_to_user_id,text"
        user_fields = "expansions=author_id&user.fields=id,name,username,created_at"
        # Optional parameters are only appended when they hold a truthy value.
        start_time = f"&start_time={self.start_time}" if self.start_time else ""
        end_time = f"&end_time={self.end_time}" if self.end_time else ""
        max_results = f"&max_results={self.max_results}" if self.max_results else ""
        return "https://api.twitter.com/2/tweets/search/all?query={}&{}&{}{}{}{}".format(
            self.query, tweet_fields, user_fields, start_time, end_time, max_results
        )

    def connect_to_endpoint(self, url, header):
        """GET ``url`` with ``header`` and return the decoded JSON body.

        Error responses are returned as decoded JSON as well (no status
        check), so callers must inspect the payload themselves.
        """
        response = requests.get(url, headers=header)
        return json.loads(response.content)

    def paginate(self, url, header, next_token=""):
        """Yield each response page, following ``meta.next_token``.

        Implemented as a loop rather than recursion so that result sets
        spanning many pages cannot exhaust Python's recursion limit.
        """
        while True:
            full_url = f"{url}&next_token={next_token}" if next_token else url
            print('New Request on', full_url)
            data = self.connect_to_endpoint(full_url, header)
            yield data
            next_token = data.get("meta", {}).get("next_token")
            if not next_token:
                return

    def run(self):
        """Yield every response page for the configured search."""
        url = self.create_url()
        # Prefer the header given to the constructor; fall back to the
        # module-level ``header`` to stay compatible with existing callers.
        request_header = self.header if self.header is not None else header
        yield from self.paginate(url, request_header)
        
        
def GetTweets(query):
    """Collect every tweet returned for ``query`` into one DataFrame.

    Args:
        query: URL-encoded search query, passed straight to ``TwitterHook``.

    Returns:
        A DataFrame with one row per tweet found under the ``data`` key of
        the response pages (empty when no page carried any data).
    """
    rows = []
    for pg in TwitterHook(query).run():
        # Pause between pages; presumably to stay under the endpoint's
        # rate limit.
        time.sleep(1)

        if 'data' in pg:
            rows.extend(pg['data'])
        else:
            print('Missing request')

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied all accumulated rows on every page.
    tweets = pd.DataFrame(rows)
    print('Done! Total of', len(tweets), 'tweets collected.')
    return tweets

In [None]:
# Collect every tweet matching the query. The query is percent-encoded here
# because TwitterHook.create_url inserts it into the URL as-is.
tweets = GetTweets(urllib2.quote('#vacinanao -rt'))

In [None]:
#tweets.to_csv('./vacinaobrigatorianao.csv',index=False)

# 2- Text preprocessing

### Import datasets from Google Drive

In [None]:
import pandas as pd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and hand the resulting application-default
# credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Reference the Drive file by id and save its content locally as
# provaxxersProcessed.csv.
downloaded = drive.CreateFile({'id':"1kzlbUH76yT_J4sMj8ZMf0ZsyrpNK0uhX"}) 
downloaded.GetContentFile('provaxxersProcessed.csv') 

In [None]:
# Load the CSV that the Google Drive cell saved as provaxxersProcessed.csv.
provaxxers =  pd.read_csv('./provaxxersProcessed.csv')

### Import datasets from local disk

In [2]:
import pandas as pd


# Load the raw tweet datasets from local disk. low_memory=False makes pandas
# process each file in one pass instead of chunks when inferring dtypes.
antivaxxers = pd.read_csv('./datasets/antivaxxersTweets.csv', low_memory=False)
provaxxers = pd.read_csv('./datasets/provaxxersTweets.csv', low_memory=False)

# Inclusive bounds of the study window. They are compared against
# `created_at` as plain strings (assumes that column holds timestamps in this
# same ISO-8601 format — verify against the datasets).
start_date = '2020-03-11T00:59:59.000Z'
end_date = '2022-04-06T00:00:00.000Z'

# Keep only the tweets created inside the window (both ends inclusive).
mask_provaxxers = provaxxers['created_at'].between(start_date, end_date)
provaxxers = provaxxers[mask_provaxxers]

mask_antivaxxers = antivaxxers['created_at'].between(start_date, end_date)
antivaxxers = antivaxxers[mask_antivaxxers]

### Preprocessing function

In [3]:
import nltk
from nltk import tokenize
import numpy as np 
from string import punctuation
import unidecode
# NLTK RSLP stemmer instance. Its only use in proccess_text is currently
# commented out, so stemming is not applied.
stemmer = nltk.RSLPStemmer()


def proccess_text(tweets, min_terms=4):
    """Clean the ``text`` column of a tweets DataFrame.

    Steps, in order: strip links, @mentions and #hashtags; transliterate
    accented characters to ASCII; lowercase; drop tokens that are a single
    punctuation character; replace every character that is not an ASCII
    letter or ``#`` with a space; collapse whitespace; discard tweets with
    fewer than ``min_terms`` terms.

    The input DataFrame is left untouched.

    Args:
        tweets: DataFrame with the columns ``text``, ``created_at``, ``id``,
            ``author_id`` and ``in_reply_to_user_id``.
        min_terms: Minimum number of whitespace-separated terms a cleaned
            tweet needs in order to be kept. The default of 4 reproduces the
            original filter, which dropped texts with fewer than three
            spaces (i.e. fewer than four terms).

    Returns:
        A new DataFrame with the columns ``created_at``, ``id``,
        ``author_id``, ``in_reply_to_user_id`` and ``text`` (the cleaned
        text), sorted by ``created_at`` and re-indexed from 0.
    """
    # Missing texts become empty strings so the per-row cleaning never sees
    # NaN; such rows are removed later by the term-count filter.
    cleaned = (
        tweets['text']
        .fillna('')
        .astype(str)
        .str.replace(r'(http\S+)', '', regex=True)
        .str.replace(r'@[\w]*', '', regex=True)
        .str.replace(r'#[\w]*', '', regex=True)
    )
    print('[ok] - Removing links.')
    print('[ok] - Removing mentions.')
    print('[ok] - Removing hashtags.')

    tokenizer = tokenize.WordPunctTokenizer()
    # Only punctuation is filtered; the language stop-word list is not used.
    punctuation_tokens = set(punctuation)

    def _normalize(text):
        # ASCII-fold, lowercase, tokenize, and drop single-punctuation tokens.
        tokens = tokenizer.tokenize(unidecode.unidecode(text).lower())
        return ' '.join(tok for tok in tokens if tok not in punctuation_tokens)

    cleaned = cleaned.map(_normalize)
    print('[ok] - Removing accent.')
    print('[ok] - Creating a list of words and characters (stopwords) to be removed from the text.')
    print('[ok] - Removing punctuation and set text to lowecase.')

    # Anything that is not an ASCII letter (or '#') becomes a space ...
    cleaned = cleaned.str.replace(r"[^a-zA-Z#]", " ", regex=True)
    print('[ok] - Removing all non-text characters.')

    # ... and runs of whitespace collapse to single spaces with no padding.
    cleaned = cleaned.str.split().str.join(' ')

    # Vectorized term count replaces the per-row label lookup.
    term_counts = cleaned.str.split().str.len()
    enough_terms = term_counts >= min_terms
    # The message now reports the threshold that is actually applied.
    print(f'[ok] - Removing tweets with less than {min_terms} terms.')

    # After whitespace normalization an empty text is '' (never ' ').
    non_empty = cleaned != ''
    print('[ok] - Removing empty lines.')

    # Positional boolean mask so a non-unique input index cannot break
    # alignment.
    keep = (enough_terms & non_empty).to_numpy()

    result = tweets.loc[keep, ['created_at', 'id', 'author_id', 'in_reply_to_user_id']].copy()
    result['text'] = cleaned.to_numpy()[keep]
    return result.sort_values(['created_at']).reset_index(drop=True)

In [4]:
# Clean one dataset per run; the provaxxers run is currently disabled.
#provaxxers = proccess_text(provaxxers)
antivaxxers  = proccess_text(antivaxxers)

[ok] - Removing links.
[ok] - Removing mentions.
[ok] - Removing hashtags.
[ok] - Removing accent.
[ok] - Creating a list of words and characters (stopwords) to be removed from the text.
[ok] - Removing punctuation and set texto to lowecase.
[ok] - Removing all non-text characters.
[ok] - Removing tweets with less than three terms.
[ok] - Removing empty lines.


In [5]:
# Write the cleaned antivaxxers tweets to disk without the DataFrame index;
# the provaxxers export is currently disabled.
#provaxxers.to_csv('./datasets/provaxxersProcessed.csv',index=False)
antivaxxers.to_csv('./datasets/antivaxxersProcessed.csv',index=False)

# 3- Topic Modeling with BERTopic

### Checking GPUs

In [1]:
import torch

# Select the compute device once, then report what was found.
gpu_found = torch.cuda.is_available()
device = torch.device("cuda" if gpu_found else "cpu")

if not gpu_found:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1050 Ti


In [2]:
!nvidia-smi

Tue Apr 12 16:16:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 462.31       Driver Version: 462.31       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 105... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   45C    P8    N/A /  N/A |    266MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Enable RAPIDS

In [3]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/env-check.py

Cloning into 'rapidsai-csp-utils'...
Traceback (most recent call last):
  File "rapidsai-csp-utils/colab/env-check.py", line 1, in <module>
    import pynvml
ModuleNotFoundError: No module named 'pynvml'


In [None]:
# Run the RAPIDS helper script that updates GCC, then terminate the
# interpreter process immediately (os._exit skips normal cleanup).
# Presumably this forces the runtime to restart with the new toolchain — confirm.
!bash rapidsai-csp-utils/colab/update_gcc.sh
import os
os._exit(00)

^C


In [None]:
import condacolab
condacolab.install()

In [None]:
import condacolab
condacolab.check()

In [None]:
# Install the stable RAPIDS release with the helper script, then set the
# environment variables that point to the CUDA NVVM library, the libdevice
# directory and the conda prefix.
!python rapidsai-csp-utils/colab/install_rapids.py stable
import os
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'
os.environ['CONDA_PREFIX'] = '/usr/local'

### Checking RAM

In [2]:
from psutil import virtual_memory
# Total system memory in decimal gigabytes (bytes / 1e9).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off used to label the runtime as "high-RAM".
print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

Your runtime has 17.0 gigabytes of available RAM

Not using a high-RAM runtime


### Custom BERT model

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model handed to BERTopic. The device is left unset so
# sentence-transformers selects CUDA when it is available and falls back to
# the CPU otherwise, instead of failing on machines without a GPU.
bert_model = SentenceTransformer("all-MiniLM-L6-v2")

### Custom UMAP model

In [None]:
from cuml.manifold import UMAP

# cuML UMAP configured with 15 neighbors and 5 output components.
# NOTE(review): the BERTopic constructor in this notebook has
# `umap_model=umap_model` commented out, so this object is not used there.
umap_model = UMAP(n_neighbors=15, n_components=5,verbose=True)

### Custom HDBSCAN model

In [None]:
from cuml.cluster import HDBSCAN

# cuML HDBSCAN with a minimum cluster size of 10, Euclidean distance and
# 'eom' cluster selection.
# NOTE(review): this object is not passed to the BERTopic constructor in the
# cells shown in this notebook.
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')

### Custom vectorizer model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count bigrams only (ngram_range=(2, 2)) and drop English stop words.
# NOTE(review): confirm the stop-word language matches the tweets' language.
vectorizer_model = CountVectorizer(ngram_range=(2, 2), stop_words="english")

### Initializing BERTopic parameters

In [None]:
# Documents to model. The cells below read `docs.text` and `docs.created_at`,
# so `tweets` must be a DataFrame with those columns at this point.
# NOTE(review): a later cell rebinds `tweets` to a list, so re-running this
# cell after that one makes `docs.text` fail.
docs = tweets

from bertopic import BERTopic
# BERTopic using the custom embedding model and the bigram vectorizer; UMAP
# is left at BERTopic's default because the custom one is commented out.
topic_model = BERTopic(#language = 'english',
                       embedding_model=bert_model,
                       top_n_words=10,
                       #n_gram_range=(1, 2),
                       min_topic_size=50,   
                       nr_topics = 'auto',
                       #umap_model=umap_model,  
                       vectorizer_model=vectorizer_model,
                       low_memory=True,
                       calculate_probabilities=False, 
                       verbose=True)

### Generating the model

In [None]:
# Fit the topic model on the tweet texts; `topics` and `probs` are the two
# values returned by fit_transform and are reused by the reduction step.
topics, probs = topic_model.fit_transform(docs.text)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_hierarchy()

### Reducing the number of topics

In [None]:
# Reduce the fitted model to 10 topics.
# NOTE(review): this call shape (docs, topics, probs -> two return values)
# appears to match older BERTopic releases — confirm against the installed version.
newTopics, newProbs = topic_model.reduce_topics(docs.text, topics, probs, nr_topics=10) 

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.get_topics()

In [None]:
topic_model.get_representative_docs(0)

### Dynamic modeling

In [None]:
# Parallel lists of creation timestamps and texts for topics_over_time.
# NOTE: this rebinds `tweets` (the name `docs` was assigned from) to a plain
# list of texts.
timestamps = docs.created_at.to_list()
tweets = docs.text.to_list()

In [None]:
# Compute topic representations over time using the reduced topic
# assignments, split into 20 time bins, with global and evolution tuning on.
topics_over_time = topic_model.topics_over_time(docs=tweets, 
                                                topics=newTopics,                                                                                           
                                                timestamps=timestamps, 
                                                global_tuning=True,
                                                evolution_tuning=True, 
                                                nr_bins=20)

In [None]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=11)