# Exercise 5

### Necessary imports

In [15]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem import SnowballStemmer
from nltk import pos_tag
import re

stop_words = stopwords.words('english')

### Dataset loading

In [10]:
df = pd.read_csv('data/blogtext.csv', encoding='utf-8', nrows=5000)
df.drop('id', axis=1, inplace=True)

### Pre-processing of dataset

In [51]:
temp = []
snow = SnowballStemmer('english')

for sentence in df['text']:
    sentence = sentence.lower()
    sentence = sentence.replace(' mail ', ' ')
    sentence = sentence.replace(' urllink ', ' ')
    cleanr = re.compile('<.*?>')
    sentence = re.sub(cleanr, ' ', sentence)   # removing HTML tags
    sentence = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]', r'', sentence)  # removing punctuations
    words = [snow.stem(word) for word in sentence.split() if word not in stop_words]
    temp.append(words)

final_text = temp.copy()

In [54]:
sent = []
for row in final_text:
    seq = ''
    for word in row:
        seq = seq + ' ' + word
    sent.append(seq)

### I vectorize the documents using the TF-IDF vectorizer

In [69]:
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(sent)

### LDA for topic modeling

In [70]:
# LDA for topic modelling
lda = LatentDirichletAllocation(n_components=20,
                                learning_decay=0.5,         
                                max_iter=50,                
                                learning_method='online',   
                                random_state=42,            
                                batch_size=5000,            
                                evaluate_every = -1,        
                                n_jobs = -1) 
lda_output = lda.fit_transform(X)

In [71]:
terms = vectorizer.get_feature_names_out()

for index, component in enumerate(lda.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['la', 'de', 'un', 'que', 'el', 'en', 'su']
Topic 1:  ['brought', 'court', 'quizilla', 'state', 'law', 'right', 'lott']
Topic 2:  ['movi', 'http', 'film', 'new', 'mountain', 'star', 'one']
Topic 3:  ['peopl', 'would', 'us', 'one', 'world', 'bush', 'war']
Topic 4:  ['name', 'exam', 'histori', 'record', 'french', 'movi', 'us']
Topic 5:  ['said', 'omar', 'offic', 'ask', 'prison', 'say', 'day']
Topic 6:  ['john', 'bo', 'flower', 'man', 'name', 'red', 'like']
Topic 7:  ['vote', 'would', 'suv', 'jonah', 'one', 'uncl', 'republican']
Topic 8:  ['one', 'life', 'know', 'it', 'want', 'make', 'look']
Topic 9:  ['korean', 'one', 'korea', 'like', 'get', 'bomb', 'bar']
Topic 10:  ['job', 'pay', 'tax', 'pm', 'get', 'compani', 'flag']
Topic 11:  ['place', 'one', 'would', 'look', 'like', 'see', 'could']
Topic 12:  ['im', 'like', 'go', 'get', 'know', 'dont', 'think']
Topic 13:  ['love', 'birthday', 'happi', 'day', 'favorit', 'music', 'god']
Topic 14:  ['love', 'walk', 'eye', 'stop', 'way', 'one