In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import gensim
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.matutils import corpus2csc

In [2]:
df = pd.read_csv("quora_questions.csv")
data = df.sample(n=1000, axis=0)
data = data['Question']

In [3]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\scnav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\scnav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\scnav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [5]:
def preprocess(text):
    text = text.lower()
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    import re
    special_chars = r'[,.:;?\(\'"\s]'
    words = [re.sub(special_chars, '', word) for word in words]
    return words

In [6]:
data = data.apply(preprocess)
dictionary = Dictionary(data)

In [7]:
dictionary.filter_extremes(no_below=5, no_above=0.5)

In [8]:
bow_corpus = [dictionary.doc2bow(text) for text in data]

In [9]:
num_topics = 5
lsamodel = LsiModel(bow_corpus, num_topics=num_topics, id2word=dictionary)

In [10]:
topics = lsamodel.show_topics(num_topics=num_topics, num_words=10)

In [11]:
top_topics = []
for topic in topics:
    top_topics.append(topic[1])

In [12]:
print("Top 5 LSA Topics:")
for i, topic in enumerate(top_topics, start=1):
    print("Topic {}: {}".format(i, topic))

Top 5 LSA Topics:
Topic 1: -0.878*"best" + -0.242*"get" + -0.144*"way" + -0.139*"s" + -0.102*"movie" + -0.101*"make" + -0.090*"learn" + -0.086*"nt" + -0.078*"online" + -0.072*"indian"
Topic 2: 0.763*"get" + -0.345*"best" + 0.254*"nt" + 0.180*"number" + 0.143*"s" + 0.124*"like" + 0.112*"message" + 0.100*"people" + 0.099*"could" + 0.089*"know"
Topic 3: 0.686*"s" + -0.411*"get" + 0.224*"nt" + 0.191*"world" + 0.167*"like" + 0.130*"ca" + -0.130*"best" + 0.128*"new" + 0.103*"good" + 0.099*"m"
Topic 4: -0.611*"make" + -0.377*"money" + -0.375*"way" + -0.370*"online" + 0.239*"s" + -0.201*"good" + 0.153*"best" + -0.116*"without" + -0.108*"using" + 0.084*"get"
Topic 5: -0.632*"nt" + 0.422*"like" + -0.282*"ca" + 0.228*"s" + -0.206*"number" + 0.205*"get" + -0.143*"stop" + 0.114*"work" + 0.106*"indian" + -0.094*"message"


In [13]:
import re

print("Top 5 LSA Topics:")
for i, topic in enumerate(top_topics, start=1):
    words = re.findall(r'"([^"]*)"', topic)
    print("Topic {}: {}".format(i, ", ".join(words)))

Top 5 LSA Topics:
Topic 1: best, get, way, s, movie, make, learn, nt, online, indian
Topic 2: get, best, nt, number, s, like, message, people, could, know
Topic 3: s, get, nt, world, like, ca, best, new, good, m
Topic 4: make, money, way, online, s, good, best, without, using, get
Topic 5: nt, like, ca, s, number, get, stop, work, indian, message
