# Exercise 5

### Necessary imports

In [4]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem import SnowballStemmer
from nltk import pos_tag
import re
import string
import matplotlib.pyplot as plt

stop_words = stopwords.words('english')

### Dataset loading

In [5]:
df = pd.read_csv('data/blogtext.csv', encoding='utf-8', nrows=5000)
df.drop('id', axis=1, inplace=True)

### Pre-processing of dataset

In [6]:
snow_stem = SnowballStemmer('english')  

def preprocess(sentence):
    # Maintain only content words (nouns, verbs, adjectives, adverbs) of the sentence
    sentence = pos_tag(sentence.split())
    sentence = [word for word, tag in sentence if tag.startswith('NN') or tag.startswith('VB') or tag.startswith('JJ') or tag.startswith('RB')]
    sentence = ' '.join(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace('mail', '')
    sentence = sentence.replace('urllink', '')
    sentence = sentence.replace('nbsp', '')
    sentence = sentence.replace('link', '')
    sentence = sentence.replace('url', '')

    # Removing unwanted characters
    sentence = re.sub('[^A-Za-z0-9 ]+', " ", sentence)

    # Removing whitespaces
    sentence = sentence.strip()

    # Replacing more than 1 spaces with single space
    sentence = sentence.replace(r"\s\s+", " ")
    
    # Removing HTML tags
    cleanr = re.compile('<.*?>')
    sentence = re.sub(cleanr, ' ', sentence)

    # Removing punctuation
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))

    # Removing numbers
    sentence = re.sub(r'\d+', '', sentence)

    # Removing stop words
    words = [snow_stem.stem(word) for word in sentence.split() if word not in stop_words]
    return ' '.join(words)

In [7]:
df['text_cleaned'] = df['text'].apply(preprocess)

### I vectorize the documents using the TF-IDF vectorizer

In [12]:
vectorizer = CountVectorizer(max_features=5000, min_df=10, token_pattern='[a-zA-Z0-9]{3,}')
X = vectorizer.fit_transform(df['text_cleaned'])

### LDA for topic modeling

In [13]:
# LDA for topic modelling
lda = LatentDirichletAllocation(n_components=20,
                                learning_decay=0.5,         
                                max_iter=20,                
                                learning_method='online',   
                                random_state=42,            
                                batch_size=128,            
                                evaluate_every = -1,        
                                n_jobs = -1) 
lda_output = lda.fit_transform(X)

In [14]:
terms = vectorizer.get_feature_names_out()

for index, component in enumerate(lda.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['yeah', 'sigh', 'day', 'haha', 'wanna', 'write', 'hey']
Topic 1:  ['republican', 'angi', 'bob', 'faith', 'pant', 'strong', 'boston']
Topic 2:  ['peopl', 'point', 'year', 'believ', 'respons', 'issu', 'power']
Topic 3:  ['cat', 'pillow', 'religion', 'protest', 'sex', 'nose', 'power']
Topic 4:  ['use', 'wall', 'hous', 'build', 'valu', 'window', 'new']
Topic 5:  ['song', 'name', 'band', 'play', 'nobodi', 'sing', 'listen']
Topic 6:  ['eat', 'food', 'photo', 'bar', 'ice', 'candi', 'hee']
Topic 7:  ['think', 'know', 'get', 'peopl', 'want', 'thing', 'make']
Topic 8:  ['space', 'johnathan', 'hell', 'ridicul', 'heaven', 'career', 'statement']
Topic 9:  ['look', 'car', 'see', 'eye', 'never', 'away', 'head']
Topic 10:  ['got', 'get', 'day', 'went', 'night', 'back', 'good']
Topic 11:  ['blog', 'post', 'com', 'friday', 'html', 'pic', 'check']
Topic 12:  ['red', 'white', 'lie', 'bring', 'rain', 'love', 'beauti']
Topic 13:  ['year', 'friend', 'love', 'life', 'live', 'know', 'famili']
Topic 

In [15]:
# GridSearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

search_params = {'n_components': [5, 10, 15, 20, 25], 'learning_decay': [.5, .7, .9]}
lda = LatentDirichletAllocation()
model = GridSearchCV(lda, param_grid=search_params)
model.fit(X)

# Best model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(X))

Best Model's Params:  {'learning_decay': 0.7, 'n_components': 5}
Best Log Likelihood Score:  -464108.5695584055
Model Perplexity:  1451.1484474595875


In [16]:
model.cv_results_

{'mean_fit_time': array([15.24805226, 13.92610822, 14.31365829, 14.22124124, 15.73362937,
        13.85746818, 13.13738604, 13.58199811, 13.48960199, 14.74309187,
        13.83305035, 13.03937597, 13.67054095, 15.59306135, 17.2330337 ]),
 'std_fit_time': array([1.36910063, 1.08994086, 1.31470549, 1.42872672, 1.59831267,
        1.05528171, 1.11257615, 1.20919761, 1.19643241, 1.3646233 ,
        1.04241091, 1.29813436, 1.29953219, 1.84508111, 1.57063865]),
 'mean_score_time': array([0.3147028 , 0.32480226, 0.35491614, 0.36134357, 0.41154037,
        0.29681435, 0.31809549, 0.33589187, 0.36500115, 0.42325621,
        0.29870787, 0.31541457, 0.36905551, 0.38660111, 0.47491136]),
 'std_score_time': array([0.07394804, 0.0755045 , 0.08144761, 0.10150902, 0.12444253,
        0.05833286, 0.08707437, 0.10360188, 0.10775972, 0.14053318,
        0.06585443, 0.09941303, 0.09722951, 0.08550264, 0.16515361]),
 'param_learning_decay': masked_array(data=[0.5, 0.5, 0.5, 0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.

In [17]:
for gscore in model.cv_results_:
    print(gscore)

mean_fit_time
std_fit_time
mean_score_time
std_score_time
param_learning_decay
param_n_components
params
split0_test_score
split1_test_score
split2_test_score
split3_test_score
split4_test_score
mean_test_score
std_test_score
rank_test_score


In [18]:
# Get Log Likelyhoods from Grid Search Output
n_topics = [5, 10, 15, 20, 25]
log_likelyhoods_5 = [round(gscore.mean_validation_score) for gscore in model.cv_results_ if model.cv_results_.params['learning_decay']==0.5]
log_likelyhoods_7 = [round(gscore.mean_validation_score) for gscore in model.cv_results_ if model.cv_results_.params['learning_decay']==0.7]
log_likelyhoods_9 = [round(gscore.mean_validation_score) for gscore in model.cv_results_ if model.cv_results_.params['learning_decay']==0.9]

# Show graph
plt.figure(figsize=(12, 8))
plt.plot(n_topics, log_likelyhoods_5, label='0.5')
plt.plot(n_topics, log_likelyhoods_7, label='0.7')
plt.plot(n_topics, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

AttributeError: 'dict' object has no attribute 'params'