In [3]:
import pandas as pd

In [4]:
# Load the questions dataset from a CSV file in the current working directory.
quora = pd.read_csv('quora_questions.csv')

In [5]:
# Preview the first few rows to confirm the file loaded as expected.
quora.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


# Preprocessing


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# TF-IDF vectorizer for the question text.
tfidf = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    min_df=2,              # ignore terms found in fewer than 2 questions
    max_df=0.95,           # ignore terms found in more than 95% of questions
)

In [8]:
# Learn the vocabulary from the 'Question' column and build the TF-IDF
# document-term matrix in a single pass.
dtm = tfidf.fit_transform(quora['Question'])

In [9]:
# Show the sparse matrix summary (shape, dtype and number of stored values).
dtm

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

# Non-negative Matrix Factorization

#### Using Scikit-Learn, create an instance of NMF with 10 expected components.

In [10]:
from sklearn.decomposition import NMF

In [11]:
# NMF model that factorizes the document-term matrix into topics.
nmf_model = NMF(
    n_components=10,  # one component per expected topic
    random_state=42,  # fixed seed so repeated runs give the same topics
)

In [12]:
# Fit the factorization on the TF-IDF document-term matrix.
nmf_model.fit(dtm)

NMF(n_components=10, random_state=42)

# Print out the top 30 highest-weighted words for each of the 10 topics.

In [17]:
# Resolve the vocabulary once, before the loop. The previous version called
# tfidf.get_feature_names() inside the comprehension, rebuilding the entire
# feature list for every single word lookup (30 times per topic).
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 30 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last 30 indices are the 30 largest
    # weights for this topic, listed from weakest to strongest.
    print([feature_names[i] for i in topic.argsort()[-30:]])
    print('\n')

THE TOP 30 WORDS FOR TOPIC #0
['world', 'coaching', 'learning', 'android', 'hollywood', 'app', 'free', 'engineering', 'site', 'friend', 'thing', 'read', 'website', 'place', 'visit', 'places', 'phone', 'buy', 'lose', 'laptop', 'time', 'movie', 'ways', 'weight', '2016', 'books', 'book', 'movies', 'way', 'best']


THE TOP 30 WORDS FOR TOPIC #1
['distance', 'new', 'need', 'says', 'guy', 'weight', 'good', 'universities', 'grads', 'use', 'majors', 'recruit', 'relationship', 'differ', 'person', 'looking', 'exist', 'girl', 'look', 'compare', 'really', 'cost', 'time', 'sex', 'long', 'work', 'feel', 'like', 'mean', 'does']


THE TOP 30 WORDS FOR TOPIC #2
['topics', 'instead', 'did', 'account', 'write', 'profile', 'interview', 'googling', 'follow', 'users', 'writer', 'people', 'marked', 'search', 'use', 'add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


THE TOP 30 WORDS FOR TOPIC #3
['really',

#### Add a new column to the original quora dataframe that labels each question with one of the 10 topic categories.

In [14]:
# Re-display the first rows of the dataframe before the topic column is added.
quora.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [15]:
# Project every question onto the fitted topics. The result is expected to
# hold one row per question and one weight per topic in each row.
topic_results = nmf_model.transform(dtm)

In [16]:
# For each question, pick the index of the topic with the largest weight.
# Computed once and reused; the previous version evaluated the same argmax
# twice and threw the first result away.
topic_labels = topic_results.argmax(axis=1)

# Store the dominant topic as a new column on the original dataframe.
quora['Topic'] = topic_labels

quora.head(10)

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,0
2,How can I increase the speed of my internet co...,3
3,Why am I mentally very lonely? How can I solve...,8
4,"Which one dissolve in water quikly sugar, salt...",1
5,Astrology: I am a Capricorn Sun Cap moon and c...,1
6,Should I buy tiago?,0
7,How can I be a good geologist?,6
8,When do you use シ instead of し?,2
9,Motorola (company): Can I hack my Charter Moto...,5
