In [1]:
import re
import nltk
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt 
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\!admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\!admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\!admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the Quora questions dataset and preview the first rows.
questions_csv = "dataset/quora_questions.csv"
df = pd.read_csv(questions_csv)
df.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [3]:
# Print the frame's index range, columns, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404289 entries, 0 to 404288
Data columns (total 1 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Question  404289 non-null  object
dtypes: object(1)
memory usage: 3.1+ MB


In [4]:
# Summary statistics for the frame (count / unique / top / freq for the text column).
df.describe()

Unnamed: 0,Question
count,404289
unique,290456
top,How do I improve my English speaking?
freq,50


In [7]:
# Exploratory data analysis: count the whitespace-separated words in each question.
df['Length'] = [len(question.split()) for question in df['Question']]

# Average question length, in words, across the whole dataset.
average_words = df['Length'].mean()
print(f'mean of the text words -> {average_words}')

# Preview the frame with the new 'Length' column.
df.head()



mean of the text words -> 10.942234391734626


Unnamed: 0,Question,Length
0,What is the step by step guide to invest in sh...,14
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,8
2,How can I increase the speed of my internet co...,14
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",13


In [12]:
# Vectorize the questions into a TF-IDF document-term matrix (one row per
# question, one column per vocabulary term) to feed the topic model.
tfidf = TfidfVectorizer(
    stop_words="english",  # use the built-in English stop-word list
    min_df=2,              # ignore terms found in fewer than 2 documents
    max_df=0.95,           # ignore terms found in more than 95% of documents
)

questions = df['Question']
dtm = tfidf.fit_transform(questions)

In [13]:
# Display the document-term matrix repr (shape, dtype, number of stored elements).
dtm

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

In [20]:
# Non-negative matrix factorization is used to group the document-term
# matrix into topics.
from sklearn.decomposition import NMF

nmf_model = NMF(
    random_state=42,   # fixed seed so the factorization is repeatable
    n_components=20,   # number of topics to extract
)

In [24]:
# Fit the NMF model on the TF-IDF document-term matrix.
nmf_model.fit(dtm)



In [25]:
# Print the 15 highest-weighted vocabulary terms of every NMF topic
# (ascending by weight, so the strongest term is printed last).
#
# The vocabulary is resolved once, outside the loop: the original rebuilt the
# full feature-name list for every single printed word.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

for index, topic in enumerate(nmf_model.components_):
    # argsort is ascending, so the last 15 indices are the largest weights.
    top_term_indices = topic.argsort()[-15:]
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # str() keeps the printed list as plain quoted words even if the
    # vocabulary comes back as NumPy string scalars.
    print([str(feature_names[i]) for i in top_term_indices])
    print('\n')



THE TOP 15 WORDS FOR TOPIC #0
['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']


THE TOP 15 WORDS FOR TOPIC #1
['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


THE TOP 15 WORDS FOR TOPIC #2
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


THE TOP 15 WORDS FOR TOPIC #3
['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']


THE TOP 15 WORDS FOR TOPIC #4
['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']


THE TOP 15 WORDS FOR TOPIC #5
['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'country', 

In [30]:
# Transform the document-term matrix with the fitted NMF model; the result is
# reduced with argmax(axis=1) in a later cell to pick one topic per row.
topic_results = nmf_model.transform(dtm)

In [31]:
# For each row, store the column index holding the largest value in topic_results.
# NOTE(review): argmax returns the first index on ties, so a row whose values
# are all equal (e.g. all zeros) resolves to index 0 — confirm that is acceptable.
df['Topic_Results'] = np.argmax(topic_results, axis=1)

In [36]:
# Human-readable label for each topic cluster index.
# The position of a label in this list is the topic index it is assigned to.
_topic_labels = [
    "Movies",         # 0
    "Work",           # 1
    "Question",       # 2
    'Internet',       # 3
    "Philosophy",     # 4
    "Company",        # 5
    "Programming",    # 6
    "Election",       # 7
    "War",            # 8
    "Sex Education",  # 9
    "Books",          # 10
    "Economy",        # 11
    "Idea",           # 12
    "Communication",  # 13
    "Fitness",        # 14
    "Travel",         # 15
    "Engagement",     # 16
    "Social Media",   # 17
    'Computer',       # 18
    "Research",       # 19
]
my_topic = dict(enumerate(_topic_labels))

In [38]:
# Translate each numeric topic index into its label via the my_topic dict
# (an index with no entry in the dict maps to NaN).
df['Topic'] = df['Topic_Results'].map(my_topic)

In [39]:
# Preview the frame with the assigned topic index and topic label columns.
df.head()

Unnamed: 0,Question,Length,Topic_Results,Topic
0,What is the step by step guide to invest in sh...,14,5,Company
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,8,16,Engagement
2,How can I increase the speed of my internet co...,14,17,Social Media
3,Why am I mentally very lonely? How can I solve...,11,11,Economy
4,"Which one dissolve in water quikly sugar, salt...",13,14,Fitness
