In [1]:
#Install Packages
!pip install snscrape
!pip install googletrans==3.1.0a0



In [2]:
#Import packages
import numpy as np
import pandas as pd
from tqdm import tqdm
import snscrape.modules.twitter as sntwitter

In [3]:
#Google Translator
from googletrans import Translator
translator = Translator()

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF
from nltk.corpus import stopwords
import re

# DATA PREPROCESSING

In [5]:
# Keep only the tweets that are neither a reply to another user nor carry an attached file (media).
prueba = sntwitter.TwitterSearchScraper('from:CarlosMaslaton since:2022-01-01 until:2022-12-31')

tweets = []

for tweet in prueba.get_items():
    # Skip replies and tweets with media; everything else is recorded.
    if tweet.inReplyToUser is not None or tweet.media is not None:
        continue
    tweets.append([
        tweet.date,
        tweet.id,
        tweet.rawContent,
        tweet.user.username,
        tweet.likeCount,
        tweet.retweetCount,
        tweet.inReplyToUser,
        tweet.media,
    ])

In [6]:
# Build the tweet table from the collected rows, one named column per collected field.
df = pd.DataFrame(tweets, columns=['date','id','rawContent','username','likeCount','retweetCount','inReplyToUser','media'])

In [9]:
# Sanity check of the scrape filter: both filter columns are expected to hold no values,
# so each value_counts() should come back empty. The second check targets 'media'
# (it previously repeated 'inReplyToUser').
print(df['inReplyToUser'].value_counts())
print('\n')
print(df['media'].value_counts())
print('\n')
print(df.shape)

Series([], Name: inReplyToUser, dtype: int64)


Series([], Name: inReplyToUser, dtype: int64)


(2413, 8)


**TWEETS TRANSLATION**

In [11]:
# Register tqdm with pandas so that progress-bar variants of apply (e.g. progress_apply) are available.
tqdm.pandas()

  from pandas import Panel


In [12]:
def trad_tweets(df):
    """Translate the 'rawContent' column of `df` to English, retrying failed rows.

    Every row is sent to the module-level `translator`. Rows whose translation
    raises are retried on the next pass, for up to 10 passes over the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'rawContent' column holding the text to translate.

    Returns
    -------
    dict
        Maps index label -> translated text. Keys follow the order of
        `df.index`, even for rows that only succeeded on a retry pass, so the
        values line up with the rows of `df` when every row was translated.
        Rows that still fail after the last pass are left out.
    """
    result = {}
    errors = set()

    for _ in range(10):

        for i in tqdm(df.index):
            if i in result:
                continue
            try:
                result[i] = translator.translate(df.loc[i, 'rawContent'], src="auto", dest="en").text
                errors.discard(i)
            except Exception:
                # Best-effort: remember the row and retry it on the next pass.
                # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
                errors.add(i)

        if not errors:
            break

    # Rebuild in index order: rows translated on a retry pass were inserted late,
    # which would otherwise scramble the order of result.values().
    return {i: result[i] for i in df.index if i in result}

In [13]:
# Wrap the {index label: translation} dict in a Series so pandas aligns it with df by index label.
# Assigning dict.values() positionally would misalign rows if the dict order differs from the
# row order, and would raise if any row failed to translate. Untranslated rows get NaN instead.
df['rawContent_eng'] = pd.Series(trad_tweets(df))

100%|██████████████████████████████████████████████████████████████████████████████| 2413/2413 [12:47<00:00,  3.15it/s]


In [15]:
# Spot-check the last rows: original text next to its English translation.
df[['rawContent','rawContent_eng']].tail()

Unnamed: 0,rawContent,rawContent_eng
2408,"Señores, el gobierno argentino se va a quedar ...","Gentlemen, the Argentine government is going t..."
2409,Una gran ventaja de la República de Haití es q...,A great advantage of the Republic of Haiti is ...
2410,@Afederico87 Justicia no.,@Afederico87 Justice no.
2411,@AlbertoABarcelo Es un cargo que puedo desempe...,@AlbertoABarcelo It is a position that I can c...
2412,El colchón que me ha tocado aquí en Haití es t...,The mattress that has touched me here in Haiti...


In [17]:
# Save the dataset to CSV in the current working directory.
df.to_csv('masla_tweets.csv')

In [45]:
# Reload the saved dataset, using the first CSV column as the index, and preview the first rows.
df = pd.read_csv('masla_tweets.csv', index_col=0)
df.head()

Unnamed: 0,date,id,rawContent,username,likeCount,retweetCount,inReplyToUser,media,rawContent_eng
0,2022-12-30 17:45:11+00:00,1608881901749809153,"En una hora, a las 1540, última sesión del año...",CarlosMaslaton,1200,20,,,"In an hour, at 1540, last session of the year ..."
1,2022-12-30 10:11:31+00:00,1608767734086762496,"Ante consultas de numerosos foristas, quiero r...",CarlosMaslaton,478,12,,,In response to inquiries from numerous forum m...
2,2022-12-30 10:01:52+00:00,1608765302631976963,El triunfo electoral del PRO-Juntos por el Cam...,CarlosMaslaton,2846,434,,,The electoral triumph of the PRO-Together for ...
3,2022-12-30 09:30:02+00:00,1608757293038055424,"No se rompan la cabeza los ""investigadores"", a...",CarlosMaslaton,446,58,,,"The ""investigators"" should not puzzle over D'A..."
4,2022-12-30 00:08:48+00:00,1608616052598124544,"La potencia de este fin de año en Argentina, q...",CarlosMaslaton,403,26,,,The power of this end of the year in Argentina...


# PREPROCESSING - LEMMATIZATION

In [46]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline; it is used below to lemmatize the translated tweets.
nlp = spacy.load('en_core_web_sm')

In [None]:
#THANKS CHATGPT!!!
#doc_series = df['rawContent_eng'].apply(lambda x: nlp(x))

In [52]:
def lemmatize_text(text):
    """Return `text` with each token replaced by its lemma, joined by single spaces.

    The text is processed with the module-level spaCy pipeline `nlp`.
    """
    lemmas = (tok.lemma_ for tok in nlp(text))
    return " ".join(lemmas)

# Lemmatize every translated tweet with a progress bar: once tqdm.pandas() has been called,
# `progress_apply` is the tqdm-aware drop-in for `apply`.
# Missing texts (NaN, e.g. after the CSV round-trip) are replaced by "" so the spaCy call does not fail.
tqdm.pandas(desc="Lemmatizing")
df['rawContent_eng_lemma'] = df['rawContent_eng'].fillna("").progress_apply(lemmatize_text)

# TFIDF Tokenization

I'm going to build two vectorizers: one that uses single words only, and another that also includes bi-grams and tri-grams (n-gram range 1 to 3).

In [54]:
# Unigram TF-IDF vectorizer: ignores terms present in more than 80% of the tweets or in fewer
# than 5 tweets. stop_words='english' selects scikit-learn's built-in English list (the same set
# exposed as ENGLISH_STOP_WORDS); passing the string rather than the frozenset also satisfies
# scikit-learn releases that validate `stop_words` as 'english' / list / None.
vectorizer = TfidfVectorizer(
    lowercase=True,
    max_df=0.8,
    min_df=5,
    stop_words='english')

In [55]:
# Regex matching @mentions (an '@' followed by letters, digits or underscores).
pattern = re.compile(r'@([A-Za-z0-9_]+)')
# Lower-case the lemmatized text, strip commas, the substring 'crazy' and @mentions,
# then fit the vectorizer and build the TF-IDF matrix.
vectors = vectorizer.fit_transform(df['rawContent_eng_lemma'].str.lower().replace([',','crazy',pattern],"", regex=True))

In [56]:
# Vocabulary terms, in the column order of `vectors`.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# newer installs need get_feature_names_out() — confirm the installed version.
feature_names = vectorizer.get_feature_names()

In [57]:
# Convert the TF-IDF matrix returned by fit_transform to a dense matrix.
dense = vectors.todense()

In [58]:
# Plain nested list (one inner list per row) so the weights can be iterated in pure Python.
denselist = dense.tolist()

--

In [59]:
# TF-IDF vectorizer over unigrams, bigrams and trigrams (ngram_range=(1, 3)): ignores terms present
# in more than 80% of the tweets or in fewer than 5 tweets. stop_words='english' selects
# scikit-learn's built-in English list (the same set exposed as ENGLISH_STOP_WORDS); passing the
# string rather than the frozenset also satisfies scikit-learn releases that validate `stop_words`
# as 'english' / list / None.
vectorizer_ngram = TfidfVectorizer(
    lowercase=True,
    max_df=0.8,
    min_df=5,
    ngram_range=(1,3),
    stop_words='english')

In [60]:
# Regex matching @mentions (an '@' followed by letters, digits or underscores).
pattern = re.compile(r'@([A-Za-z0-9_]+)')
# Same cleaning as for the single-word matrix (lower-case; strip commas, 'crazy' and @mentions),
# then fit the n-gram vectorizer and build its TF-IDF matrix.
vectors_ngram = vectorizer_ngram.fit_transform(df['rawContent_eng_lemma'].str.lower().replace([',','crazy',pattern],"", regex=True))

In [61]:
# Vocabulary terms (1- to 3-grams), in the column order of `vectors_ngram`.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# newer installs need get_feature_names_out() — confirm the installed version.
feature_names_ngram = vectorizer_ngram.get_feature_names()

In [62]:
# Convert the n-gram TF-IDF matrix returned by fit_transform to a dense matrix.
dense_ngram = vectors_ngram.todense()

In [63]:
# Plain nested list (one inner list per row) so the weights can be iterated in pure Python.
denselist_ngram = dense_ngram.tolist()

--

In [64]:
def keyword_list(dense_list, features_names):
    """Collect, for every row of weights, the feature names whose weight is positive.

    Parameters
    ----------
    dense_list : list of list of numbers
        One inner list of weights per document.
    features_names : sequence
        Name of the feature at each column position.

    Returns
    -------
    list of list
        For each row, the names at the positions whose weight is > 0, in column order.
    """
    return [
        [features_names[pos] for pos, weight in enumerate(row) if weight > 0]
        for row in dense_list
    ]

In [65]:
# Per-tweet keyword lists: the terms with a positive TF-IDF weight under each vectorizer.
keywords_singular = keyword_list(denselist, feature_names)
keywords_ngram = keyword_list(denselist_ngram, feature_names_ngram)

In [66]:
# Show the first three translated tweets next to the keywords extracted by each vectorizer.
print(
    df['rawContent_eng'][0:3],
    keywords_singular[0:3],
    keywords_ngram[0:3],
    sep='\n\n\n',
)

0    In an hour, at 1540, last session of the year ...
1    In response to inquiries from numerous forum m...
2    The electoral triumph of the PRO-Together for ...
Name: rawContent_eng, dtype: object


[['god', 'hour', 'let', 'psychologist', 'session', 'want', 'year'], ['12', 'celebrate', 'default', 'forum', 'friend', 'greet', 'inquiry', 'international', 'judaism', 'member', 'new', 'numerous', 'remind', 'response', 'support', 'want', 'year'], ['2015', 'argument', 'base', 'change', 'come', 'country', 'economic', 'electoral', 'fraud', 'good', 'jet', 'like', 'make', 'moral', 'nature', 'pay', 'peronist', 'policy', 'pro', 'propaganda', 'steal', 'time', 'triumph']]


[['god', 'god want', 'hour', 'let', 'psychologist', 'session', 'want', 'year'], ['12', 'celebrate', 'default', 'forum', 'forum member', 'friend', 'greet', 'inquiry', 'inquiry numerous', 'inquiry numerous forum', 'international', 'judaism', 'member', 'new', 'numerous', 'numerous forum', 'numerous forum member', 'remind', 'respon

# TOPIC MODELING - KMEANS

In [73]:
from sklearn.cluster import KMeans

In [88]:
# Number of clusters used by the KMeans models and by the loops that write the cluster files.
true_k = 10

In [89]:
# With n_init=1 the result depends entirely on a single random initialisation;
# a fixed random_state makes the clustering reproducible across runs.
model_singular = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=42)

In [90]:
# Cluster the tweets on the single-word TF-IDF matrix.
model_singular.fit(vectors)

KMeans(max_iter=100, n_clusters=10, n_init=1)

In [91]:
# For each cluster centroid, the column indices sorted from the largest to the smallest value.
order_centroids_singular = model_singular.cluster_centers_.argsort()[:, ::-1]

In [92]:
# Vocabulary of the single-word vectorizer, used to turn column indices into terms.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 (use get_feature_names_out() there).
terms_singular = vectorizer.get_feature_names()

In [93]:
# Shout-out: https://www.youtube.com/watch?v=i74DVqMsRWY&list=PL2VXyKi-KpYttggRATQVmgFcQst3z6OlX&index=6
# For every cluster, write a header line followed by the terms at its ten first-ranked indices,
# one term per line, and two blank lines between clusters.
with open("../topics_kmeans_singular.text", "w", encoding="utf-8") as f:
    for i in range(true_k):
        f.write(f"Cluster {i}\n")
        for ind in order_centroids_singular[i, :10]:
            f.write(f" {terms_singular[ind]}\n")
        f.write("\n\n")

In [94]:
# With n_init=1 the result depends entirely on a single random initialisation;
# a fixed random_state makes the clustering reproducible across runs.
model_ngram = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=42)

In [95]:
# Cluster the tweets on the n-gram TF-IDF matrix.
model_ngram.fit(vectors_ngram)

KMeans(max_iter=100, n_clusters=10, n_init=1)

In [96]:
# For each cluster centroid, the column indices sorted from the largest to the smallest value.
order_centroids_ngram = model_ngram.cluster_centers_.argsort()[:, ::-1]

In [97]:
# Vocabulary of the n-gram vectorizer, used to turn column indices into terms.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 (use get_feature_names_out() there).
terms_ngram = vectorizer_ngram.get_feature_names()

In [98]:
# Shout-out: https://www.youtube.com/watch?v=i74DVqMsRWY&list=PL2VXyKi-KpYttggRATQVmgFcQst3z6OlX&index=6
# For every cluster, write a header line followed by the terms at its ten first-ranked indices,
# one term per line, and two blank lines between clusters.
with open("../topics_kmeans_ngram.text", "w", encoding="utf-8") as f:
    for i in range(true_k):
        f.write(f"Cluster {i}\n")
        for ind in order_centroids_ngram[i, :10]:
            f.write(f" {terms_ngram[ind]}\n")
        f.write("\n\n")

# DIMENSIONALITY REDUCTION - PCA 

In [103]:
# Report the shape (rows, features) of both TF-IDF matrices.
print(
    f"The number of features we have working with tf-idf without biagrams are: {vectors.shape}",
    f"The number of features we have working with tf-idf with biagrams and triagrams are: {vectors_ngram.shape}",
    sep="\n\n\n",
)

The number of features we have working with tf-idf without biagrams are: (2413, 1561)


The number of features we have working with tf-idf with biagrams and triagrams are: (2413, 1889)


In [104]:
from sklearn.decomposition import PCA

In [136]:
# No feature scaling is applied beforehand, because TF-IDF values already lie between 0 and 1.
# n_components=0.95 keeps as many principal components as are needed to explain 95% of the variance.
pca_singular = PCA(n_components = 0.95)
pca_ngram = PCA(n_components = 0.95)

In [137]:
# Convert each TF-IDF matrix to a dense array, fit the PCA on it and keep the projected data.
X_pca_singular = pca_singular.fit_transform(vectors.toarray())
X_pca_ngram = pca_ngram.fit_transform(vectors_ngram.toarray())

# KMEANS with PCA (TODO: use a pipeline so that KMeans is fitted both on the PCA-reduced data and on the original data)

In [138]:
# Refit the existing KMeans estimators on the PCA-reduced data.
# This overwrites the fits previously obtained on the raw TF-IDF matrices.
model_singular.fit(X_pca_singular)
model_ngram.fit(X_pca_ngram)

KMeans(max_iter=100, n_clusters=10, n_init=1)

In [139]:
# KMeans was fitted on PCA-reduced data, so its centroids live in principal-component space and
# their column indices do NOT correspond to vocabulary terms. Map the centroids back to the
# TF-IDF term space first, so the ranked indices can be looked up in the vectorizers' vocabularies.
order_centroids_singular = pca_singular.inverse_transform(model_singular.cluster_centers_).argsort()[:, ::-1]
order_centroids_ngram = pca_ngram.inverse_transform(model_ngram.cluster_centers_).argsort()[:, ::-1]

In [140]:
# Vocabularies of both vectorizers, used to turn column indices into terms.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 (use get_feature_names_out() there).
terms_singular = vectorizer.get_feature_names()
terms_ngram = vectorizer_ngram.get_feature_names()

In [141]:
# For every cluster, write a header line followed by the entries of terms_singular found at its
# ten first-ranked indices, one per line, and two blank lines between clusters.
with open("../topics_kmeans_singular_PCA.text", "w", encoding="utf-8") as f:
    for i in range(true_k):
        f.write(f"Cluster {i}\n")
        for ind in order_centroids_singular[i, :10]:
            f.write(f" {terms_singular[ind]}\n")
        f.write("\n\n")

In [142]:
# For every cluster, write a header line followed by the entries of terms_ngram found at its
# ten first-ranked indices, one per line, and two blank lines between clusters.
with open("../topics_kmeans_ngram_PCA.text", "w", encoding="utf-8") as f:
    for i in range(true_k):
        f.write(f"Cluster {i}\n")
        for ind in order_centroids_ngram[i, :10]:
            f.write(f" {terms_ngram[ind]}\n")
        f.write("\n\n")

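TODO: check why the top words in both clusterings are numbers when working with the PCA-reduced data (likely because the centroids are expressed in PCA-component space, so their sorted indices do not correspond to vocabulary terms).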

# TOPIC MODELING - NMF

In [67]:
# NMF topic model with 5 components; random_state is fixed so the run is repeatable.
nmf_singular = NMF(n_components=5, random_state=42)

In [69]:
# Fit the topic model on the single-word TF-IDF matrix.
nmf_singular.fit(vectors)

NMF(n_components=5, random_state=42)

**TOPICS SINGULAR**

In [71]:
# Resolve the vocabulary once, instead of rebuilding the full list for every word of every topic.
nmf_terms_singular = vectorizer.get_feature_names()
for index, topic in enumerate(nmf_singular.components_):
    print(f"THE TOP 10 WORDS FOR TOPIC # {index}")
    # argsort() is ascending, so the term with the largest weight in the topic is printed last.
    print([nmf_terms_singular[i] for i in topic.argsort()[-10:]])
    print("\n")

THE TOP 10 WORDS FOR TOPIC # 0
['carlos', 'report', 'yesterday', 'interested', 'say', 'article', 'maslatón', 'note', 'mind', 'https']


THE TOP 10 WORDS FOR TOPIC # 1
['year', 'good', 'dollar', 'price', 'country', 'inflation', 'world', 'argentine', 'argentina', 'market']


THE TOP 10 WORDS FOR TOPIC # 2
['kikuchi', 'political', 'larreta', 'sombrilla', '2023', 'avanza', 'libertad', 'la', 'javier', 'milei']


THE TOP 10 WORDS FOR TOPIC # 3
['care', 'pro', 'express', 'die', 'despite', 'day', 'really', 'want', 'friday', 'love']


THE TOP 10 WORDS FOR TOPIC # 4
['numerous', 'economic', 'program', 'issue', 'interested', 'today', 'link', 'https', 'radio', 'statement']




In [68]:
# NMF topic model with 5 components; random_state is fixed so the run is repeatable.
nmf_ngram = NMF(n_components=5, random_state=42)

In [70]:
# Fit the topic model on the n-gram TF-IDF matrix.
nmf_ngram.fit(vectors_ngram)

NMF(n_components=5, random_state=42)

**TOPICS NGRAM**

In [72]:
# Resolve the vocabulary once, instead of rebuilding the full list for every word of every topic.
nmf_terms_ngram = vectorizer_ngram.get_feature_names()
for index, topic in enumerate(nmf_ngram.components_):
    print(f"THE TOP 10 WORDS FOR TOPIC # {index}")
    # argsort() is ascending, so the term with the largest weight in the topic is printed last.
    print([nmf_terms_ngram[i] for i in topic.argsort()[-10:]])
    print("\n")

THE TOP 10 WORDS FOR TOPIC # 0
['roja', 'calaca roja', '2023', 'sombrilla larreta', 'calaca', 'larreta', 'sombrilla', 'javier milei', 'javier', 'milei']


THE TOP 10 WORDS FOR TOPIC # 1
['statement today', 'today', 'interested', 'mind', 'link https', 'mind https', 'radio', 'link', 'statement', 'https']


THE TOP 10 WORDS FOR TOPIC # 2
['good', 'bull', 'dollar', 'price', 'country', 'inflation', 'world', 'argentine', 'argentina', 'market']


THE TOP 10 WORDS FOR TOPIC # 3
['javier', 'karina', 'kikuchi', 'milei', 'libertad', 'la libertad avanza', 'libertad avanza', 'avanza', 'la libertad', 'la']


THE TOP 10 WORDS FOR TOPIC # 4
['really', 'wednesday', 'care', 'express', 'despite', 'day', 'want', 'friday', 'friday love', 'love']




# TOPIC MODELING HDBSCAN

In [146]:
!pip install --upgrade pip

!pip install hdbscan
#!pip install --upgrade git+https://github.com/scikit-learn-contrib/hdbscan.git#egg=hdbscan

Collecting pip
  Downloading pip-23.0.1-py3-none-any.whl (2.1 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.1.1
    Uninstalling pip-20.1.1:
      Successfully uninstalled pip-20.1.1


ERROR: Could not install packages due to an EnvironmentError: [WinError 5] Acceso denegado: 'C:\\Users\\Usuario\\AppData\\Local\\Temp\\pip-uninstall-tvuezj7v\\pip.exe'
Consider using the `--user` option or check the permissions.



Collecting hdbscan
  Using cached hdbscan-0.8.29.tar.gz (5.2 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting joblib>=1.0
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml): started
  Building wheel for hdbscan (pyproject.toml): finished with status 'error'
Failed to build hdbscan


  error: subprocess-exited-with-error
  
  Building wheel for hdbscan (pyproject.toml) did not run successfully.
  exit code: 1
  
  [40 lines of output]
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-cpython-38
  creating build\lib.win-amd64-cpython-38\hdbscan
  copying hdbscan\flat.py -> build\lib.win-amd64-cpython-38\hdbscan
  copying hdbscan\hdbscan_.py -> build\lib.win-amd64-cpython-38\hdbscan
  copying hdbscan\plots.py -> build\lib.win-amd64-cpython-38\hdbscan
  copying hdbscan\prediction.py -> build\lib.win-amd64-cpython-38\hdbscan
  copying hdbscan\robust_single_linkage_.py -> build\lib.win-amd64-cpython-38\hdbscan
  copying hdbscan\validity.py -> build\lib.win-amd64-cpython-38\hdbscan
  copying hdbscan\__init__.py -> build\lib.win-amd64-cpython-38\hdbscan
  creating build\lib.win-amd64-cpython-38\hdbscan\tests
  copying hdbscan\tests\test_flat.py -> build\lib.win-amd64-cpython-38\hdbscan\tests
  copying hdbscan\tests\te