In [None]:
# Start writing code here# numerical computation
import numpy as np

# data processing/manipulation
import pandas as pd
pd.options.mode.chained_assignment = None
import re
import time

from unidecode import unidecode

!pip install wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# stopwords, tokenizer, stemmer
import nltk  
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer 
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Gensim
!pip install gensim
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary 
from gensim.models.ldamodel import LdaModel
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# NOTE: this rebinds STOPWORDS, replacing the wordcloud set imported above
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import remove_stopwords

# spacy for lemmatization and additional stopwords
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# scikit-learn: bag-of-words vectorizer and LDA model used by the modelling cells
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# LDA plotting tools
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn

In [None]:
# Loading each dataset
# Read the tweet export, expose the tweet body under the column name "tweet",
# and keep only the rows whose `lang` value is 'en'.
pisa_df = (
    pd.read_csv('pisa_df_unique_gen.csv', lineterminator='\n')
    .rename(columns={"text": "tweet"})
    .loc[lambda frame: frame.lang == 'en']
)

In [None]:
# Terms that clean_text() removes from every tweet: the PISA search keywords
# (generic tags first, then one tag per listed assessment year) ...
keywords = ['pisa', 'pisa4development', 'pisaoecd', 'oecdpisa']
keywords += [f'pisa{year}' for year in (2003, 2006, 2009, 2012, 2015, 2018)]

# ... plus any further terms that should not appear in the cleaned text.
unwanted = ['oecd']

In [None]:
# Fragments left behind once apostrophes have been replaced by spaces
# (e.g. "don't" -> "don t", "you'll" -> "you ll"); dropped when they stand alone.
_APOSTROPHE_LEFTOVERS = frozenset(
    ['d', 'f', 'i', 'm', 's', 't', 'u', 'y', 'el', 'en', 'la', 'll', 'pa', 've']
)


def clean_text(text):
    """Normalise the raw text of one tweet for topic modelling.

    Steps, in order:
        1. transliterate accented characters to ASCII and lowercase;
        2. drop the ``&amp;`` entity, hyperlinks, @mentions, the ``#`` symbol
           (the hashtag word is kept) and the standalone retweet marker ``rt``;
        3. replace every remaining non-word character with a space;
        4. remove stopwords (gensim ``remove_stopwords``);
        5. remove every occurrence of the module-level ``keywords`` and
           ``unwanted`` terms (substring match, longest term first);
        6. drop the standalone 1-2 letter fragments created by apostrophe
           removal, wherever they occur in the text.

    Args:
        text: raw tweet text. Anything that is not a ``str`` (e.g. ``None`` or
            a pandas NaN) is treated as an empty tweet.

    Returns:
        The cleaned text as lowercase tokens separated by single spaces
        (possibly the empty string).
    """
    if not isinstance(text, str):
        return ""

    text = unidecode(text).lower()
    text = re.sub(r'&amp;', ' ', text)

    # URLs go first so that the later substitutions cannot splice a URL onto
    # the word that follows it.
    text = re.sub(r'https?://\S+', '', text)
    # \w also covers '_', so handles such as @some_user are removed in full.
    text = re.sub(r'@\w+', '', text)
    text = text.replace('#', '')
    # \b keeps words that merely end in "rt" (report, smart, start) intact.
    text = re.sub(r'\brt\s+', '', text)
    text = re.sub(r'[^\w]', ' ', text)

    text = remove_stopwords(text)

    # Remnants of malformed/truncated links that survived the first URL pass.
    text = re.sub(r'http\S+', '', text)

    # One alternation, longest term first, so 'pisa2018' disappears as a whole
    # instead of 'pisa' being cut out and '2018' left behind. Terms are used
    # as regular-expression fragments, exactly as they were before.
    terms = sorted(
        {term for term in list(keywords) + list(unwanted) if term},
        key=lambda term: (-len(term), term),
    )
    if terms:
        text = re.sub('|'.join(f'(?:{term})' for term in terms), ' ', text)

    # Token-wise filtering also catches fragments at the very start or end of
    # the text and several fragments in a row.
    return ' '.join(
        token for token in text.split() if token not in _APOSTROPHE_LEFTOVERS
    )

In [None]:
# Single WordNet lemmatizer instance, created once and used by lemmatize_sentence().
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag yields ``None``.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # Looked up only on a match, so `wordnet` is untouched otherwise.
            return getattr(wordnet, attr_name)
    return None
        
# Lemmatization function
def lemmatize_sentence(sentence):
    """Lemmatize every token of `sentence` and return them joined by spaces.

    The sentence is tokenized and POS-tagged with NLTK. Each tag is converted
    with nltk_tag_to_wordnet_tag(); tokens whose tag has no WordNet equivalent
    are kept unchanged, all others go through the shared `lemmatizer`.
    """
    def _lemma(token, treebank_tag):
        wn_pos = nltk_tag_to_wordnet_tag(treebank_tag)
        if wn_pos is None:
            return token
        return lemmatizer.lemmatize(token, wn_pos)

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    return " ".join(_lemma(token, tag) for token, tag in tagged_tokens)

In [None]:
# Tokenize Tweets 
def tokenize_text(tweet):
    """Split `tweet` with word_tokenize and return the tokens as a new list."""
    return [token for token in word_tokenize(tweet)]


In [None]:
# Clean every tweet with clean_text() and store the result in a new column
# next to the raw text.
pisa_df['clean_tweets'] = pisa_df['tweet'].apply(clean_text)

# To compare the original and the cleaned text and see exactly what was
# removed, uncomment both lines below.
# (Kept as comments: a bare triple-quoted string here would be the cell's last
# expression and would be echoed as the cell output.)
#pd.options.display.max_colwidth = 300
#print(pisa_df[['tweet', 'clean_tweets']].head(10))

'\nif you want to compare the original and \nthe cleaned text to see what exactly was removed \nuncomment both lines below\n'

In [None]:
# Lemmatize each cleaned tweet with lemmatize_sentence(), then split the
# lemmatized text into a list of tokens with tokenize_text().
pisa_df['lemmat_tweets'] = pisa_df['clean_tweets'].apply(lemmatize_sentence)
pisa_df['tokenized_tweets'] = pisa_df['lemmat_tweets'].apply(tokenize_text)

In [None]:
# Bag-of-words vectorizer that turns the lemmatized tweets into term counts.
# CountVectorizer comes from sklearn.feature_extraction.text (imported in the
# notebook's import cell).
vectorizer = CountVectorizer(
    analyzer='word',
    min_df=3,                          # ignore terms that occur in fewer than 3 tweets
    stop_words='english',              # drop scikit-learn's built-in English stop words
    lowercase=True,                    # convert all words to lowercase
    token_pattern=r'[a-zA-Z0-9]{3,}',  # a token is a run of at least 3 alphanumerics
    max_features=5000,                 # cap the vocabulary at the 5000 most frequent terms
)


In [None]:
# Learn the vocabulary from the lemmatized tweets and build the term-count
# matrix that the LDA model is fitted on.
data_matrix = vectorizer.fit_transform(pisa_df.lemmat_tweets)

In [None]:
# Number of topics the LDA model is asked to find.
topics = 14

lda_model = LatentDirichletAllocation(
    n_components=topics,       # Number of topics
    learning_method='online',
    random_state=62,           # fixed seed so repeated runs give the same model
    n_jobs=-1,                 # Use all available CPUs
)

# perf_counter() is a monotonic clock meant for measuring elapsed time; unlike
# time.time() it is not affected by system clock adjustments during the fit.
start_time = time.perf_counter()
lda_output = lda_model.fit_transform(data_matrix)
stop_time = time.perf_counter()

# The printed figure is in minutes (elapsed seconds divided by 60).
print(f'Fitting LDA model for {topics} topics took: {(stop_time-start_time)/60:.2f}')

Fitting LDA model for 14 topics took: 3.50


In [None]:
# Let pyLDAvis render its output inline in the notebook.
pyLDAvis.enable_notebook()
# Build the topic visualisation from the fitted model, the count matrix and the
# vectorizer, using t-SNE for the 2-D topic layout.
# NOTE(review): newer pyLDAvis releases appear to ship this helper as
# `pyLDAvis.lda_model` — confirm the installed version still has `pyLDAvis.sklearn`.
p = pyLDAvis.sklearn.prepare(lda_model, data_matrix, vectorizer, mds='tsne')
# Also write the visualisation to an HTML file. The file name hard-codes the
# topic count (14) and min_df (3) used above; keep it in sync if they change.
pyLDAvis.save_html(p, 'lda_14topics_min3.html')

### Predictive model for the new tweets

In [None]:
# Topic weights for the same matrix the model was fitted on
# (presumably one row per tweet and one column per topic — TODO confirm).
topic_values = lda_model.transform(data_matrix)
# Label each tweet with the index of its highest-weight topic.
pisa_df['topic'] = topic_values.argmax(axis=1)

In [None]:
# Save the tweets together with the derived columns and the assigned topic.
# The DataFrame index is written too (no index=False), so the file gets an
# extra leading index column.
pisa_df.to_csv('pisa_df_unique_gen_topic.csv')

In [None]:
# Persist the full topic-weight matrix; np.save appends the '.npy' extension,
# so this writes 'topic_values.npy'.
np.save('topic_values', topic_values)

In [None]:
# Sanity check: (rows, columns) of the final frame after all columns were added.
pisa_df.shape

(80839, 38)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=914019af-b74b-46cd-bce4-57738580bf42' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>