# Exercise 5

### Necessary imports

In [112]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem import PorterStemmer
from nltk import pos_tag
import pyLDAvis

stop_words = stopwords.words('english')

### Dataset loading

In [85]:
df = pd.read_csv('data/blogtext.csv', usecols=['topic', 'text'], encoding='utf-8', nrows=4000)

### Pre-processing of dataset

In [90]:
def pre_process(sentence: str) -> list:
    sentence = sentence.lower()
    sentence.replace(' mail ', ' ')
    sentence.replace(' urllink ', ' ')
    tokenizer = RegexpTokenizer(r"[a-zA-Z0-9]+")
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if not w in stop_words and not w.isdigit()]
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(w) for w in filtered_words]
    nouns = [word for word, pos in pos_tag(stemmed_words) if pos.startswith('NN')]
    return ' '.join(nouns)

df['text'] = df['text'].apply(pre_process)

In [91]:
df['text'].head()

0                          page mb leader process html
1    team member der mail ruiyu xie mail bryan aald...
2    het kader van kernfusi je eigen build h bomb r...
3                                            test test
4    thank yahoo toolbar captur popup mean show pop...
Name: text, dtype: object

### I vectorize the documents using the TF-IDF vectorizer

In [92]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])

### LDA for topic modeling

In [96]:
# LDA for topic modelling
lda = LatentDirichletAllocation(n_components=20,
                                learning_decay=0.5,         
                                max_iter=50,                
                                learning_method='online',   
                                random_state=42,            
                                batch_size=5000,            
                                evaluate_every = -1,        
                                n_jobs = -1) 
lda_output = lda.fit_transform(X)

### Visualizing the topics with pyLDAvis

In [110]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda.score(X))
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda.perplexity(X))
# See model parameters
print(lda.get_params())

Log Likelihood:  -899224.3097723977
Perplexity:  3320.8834647388862
{'batch_size': 5000, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.5, 'learning_method': 'online', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 50, 'mean_change_tol': 0.001, 'n_components': 20, 'n_jobs': -1, 'perp_tol': 0.1, 'random_state': 42, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}
