In [1]:
import os
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Load Dataset: every line of the survey file becomes one document
survey_path = os.path.join("survey.csv")
documents_list = []
with open(survey_path, "r") as survey_file:
    # strip() removes the trailing newline and any surrounding whitespace
    documents_list.extend(raw_line.strip() for raw_line in survey_file)

In [14]:
# Initialize regex tokenizer: each token is a maximal run of word characters (\w+)
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize documents using TF-IDF.
# - lowercase / stop_words: text is lowercased and English stop words are dropped.
# - tokenizer: the NLTK regex tokenizer above replaces scikit-learn's built-in
#   tokenization, so token_pattern is explicitly set to None; leaving it at its
#   default makes scikit-learn warn that token_pattern is unused.
# - ngram_range: every n-gram from 1 up to 10 tokens becomes a feature.
#   NOTE(review): such long n-grams are mostly unique to a single line and appear
#   to dominate the topics printed later in this notebook -- consider a much
#   smaller upper bound (e.g. 2 or 3); value left unchanged here.
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range=(1, 10),
                        tokenizer=tokenizer.tokenize,
                        token_pattern=None)

# Fit the vocabulary/IDF weights and transform the documents into a sparse
# document-term matrix (one row per entry of documents_list)
train_data = tfidf.fit_transform(documents_list)

In [15]:
# Define the number of topics or components
num_components = 10

# Create SVD object (random_state fixed so the decomposition is reproducible)
lsa = TruncatedSVD(n_components=num_components, n_iter=100, random_state=42)

# Fit SVD model on data and keep the reduced representation instead of
# discarding it: one row per document, one column per component
doc_topic_matrix = lsa.fit_transform(train_data)

# Get Singular values and Components
Sigma = lsa.singular_values_
# components_ has shape (n_components, n_features), so its transpose is
# (n_features, n_components).
# NOTE(review): components_ already holds the right singular vectors as rows
# (i.e. V^T), which makes this transpose V rather than V^T; the variable name
# is kept so any later cells that reference it keep working.
V_transpose = lsa.components_.T

In [16]:
# Print the topics with their terms
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since 1.0). Fall back to the old method only on
# older installations. list() keeps `terms` a plain list in both cases.
if hasattr(tfidf, "get_feature_names_out"):
    terms = list(tfidf.get_feature_names_out())
else:
    terms = list(tfidf.get_feature_names())

# Number of highest-weighted terms shown per topic
num_top_terms = 5

for index, component in enumerate(lsa.components_):
    # Pair every vocabulary term with its weight in this component and rank
    # by weight, largest first
    ranked_terms = sorted(zip(terms, component),
                          key=lambda pair: pair[1],
                          reverse=True)
    top_terms_list = [term for term, _weight in ranked_terms[:num_top_terms]]
    print("Topic "+str(index)+": ", top_terms_list)

Topic 0:  ['15', '5', '15 5', '11', '2020']
Topic 1:  ['metropolitan', '12e 15 5 current events damage typhoon goni 2 5', '15 5 current events damage typhoon goni 2 5', '15 5 current events damage typhoon goni 2 5 74e', '15 ph manila_typhoon_goni_damage']
Topic 2:  ['68e', '68e 15', '15 5 current events damage typhoon goni 2 4 68e', '15 ph bicol_typhoon_goni_damage', '15 ph bicol_typhoon_goni_damage 5']
Topic 3:  ['caused severe winds don', 'caused severe winds don t', 'caused severe winds don t know', 'caused severe winds don t know https', 'caused severe winds don t know https storage']
Topic 4:  ['04e 15 diamondhead 3 diamondhead male 26', '04e 15 diamondhead 3 diamondhead male 26 35', '04e 15 diamondhead 3 diamondhead male 26 35 years', '04e 15 diamondhead 3 diamondhead male 26 35 years old', '15 diamondhead 3 diamondhead male 26']
Topic 5:  ['26 35 years old rural', '26 35 years old rural afford', '26 35 years old rural afford food', '35 years old rural', '35 years old rural affor