In [1]:
import string

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

In [4]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [5]:
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import en_core_web_sm

In [6]:
from tqdm import tqdm
from pprint import pprint

In [7]:
import pyLDAvis.sklearn

In [8]:
quora = pd.read_csv('../data/quora_clean.csv')

In [9]:
quora.head()

Unnamed: 0.1,Unnamed: 0,Question,Answer
0,0,Can I use Comet and bleach to clean my bathroom,You can use both but not at the same time. Com...
1,1,Which is the best full home deep cleaning serv...,Which is the best full home deep cleaning serv...
2,2,What is the best way to clean a tub in a hotel...,What is the best way to clean a tub in a hotel...
3,3,Can you use Clorox wipes on yoga mats,In the wake of the ongoing global health crisi...
4,4,How do you get white stains water out of light...,"Ok, I m not a chemist, but I have done a lot o..."


In [10]:
# replace missing answers with question text
mask = quora['Answer'].isnull()
quora['Answer'][mask] = quora['Question'][mask]

In [11]:
quora['Answer'][mask]

7           How to use disinfectants safely for COVID 19
10             Is there any completely free disk cleaner
29     When cleaning up a bedroom have you ever found...
31     Is slime cleaner a good option for cleaning a ...
36                 Should I trust hotel laundry services
                             ...                        
759         How to use disinfectants safely for COVID 19
760    What brand makes the highest quality robot vac...
761    Whats the best way to clean orange hard water ...
767            Is there any completely free disk cleaner
776    What product will best remove stains on the te...
Name: Answer, Length: 99, dtype: object

In [12]:
nlp = spacy.load('en_core_web_sm')

In [13]:
doc = nlp(quora["Answer"][3])
spacy.displacy.render(doc, style='ent',jupyter=True)

In [14]:
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

In [15]:
review = str(" ".join([i.lemma_ for i in doc]))

In [16]:
doc = nlp(review)
spacy.displacy.render(doc, style='ent',jupyter=True)

In [17]:
parser = English()
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [18]:
tqdm.pandas()
quora["processed_descriptin"] = quora["Answer"].progress_apply(spacy_tokenizer)

100%|██████████| 780/780 [00:02<00:00, 282.69it/s]


In [19]:
quora.head()

Unnamed: 0.1,Unnamed: 0,Question,Answer,processed_descriptin
0,0,Can I use Comet and bleach to clean my bathroom,You can use both but not at the same time. Com...,use time comet abrasive cleanser best removing...
1,1,Which is the best full home deep cleaning serv...,Which is the best full home deep cleaning serv...,best home deep cleaning services company dubai
2,2,What is the best way to clean a tub in a hotel...,What is the best way to clean a tub in a hotel...,best way clean tub hotel
3,3,Can you use Clorox wipes on yoga mats,In the wake of the ongoing global health crisi...,wake ongoing global health crisis consumers de...
4,4,How do you get white stains water out of light...,"Ok, I m not a chemist, but I have done a lot o...",ok m chemist lot clean white stains minerals w...


In [20]:
texts = quora['processed_descriptin']

In [21]:
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(texts)

  vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')


In [22]:
NUM_TOPICS=10

In [23]:
# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [24]:
# Non-Negative Matrix Factorization Model
nmf = NMF(n_components=NUM_TOPICS)
data_nmf = nmf.fit_transform(data_vectorized) 

In [25]:
# Latent Semantic Indexing Model using Truncated SVD
lsi = TruncatedSVD(n_components=NUM_TOPICS)
data_lsi = lsi.fit_transform(data_vectorized)

In [26]:
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]) 

In [27]:
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('softener', 12.012309626598627), ('fabric', 4.956222637047824), ('dispenser', 3.6292290260487774), ('sanitizer', 2.9243370683539496), ('fungus', 1.685067242898411), ('residues', 1.5919508553184694), ('useful', 1.483075761446645), ('chemicals', 1.2772342189495016), ('adds', 0.917706404313443), ('acts', 0.851505455815941)]
Topic 1:
[('cleaning', 539.1373832009009), ('services', 102.55756894638839), ('service', 95.32300287411628), ('best', 81.58436491628989), ('clean', 64.04645425059138), ('need', 57.82128026214431), ('gloves', 56.88206788467369), ('company', 51.23963542845148), ('house', 48.32249159720743), ('bird', 47.62552265779032)]
Topic 2:
[('water', 684.9419293851734), ('wash', 556.0888574366899), ('use', 501.68839165274255), ('clean', 494.99337951463315), ('car', 407.6090803830316), ('washing', 390.1686090589196), ('detergent', 324.7646190193857), ('soap', 304.74512521899294), ('clothes', 292.0803605089642), ('best', 287.71401860618647)]
Topic 3:
[('toilet', 

In [28]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

In [30]:
docweights = lda.transform(vectorizer.transform(texts))

In [31]:
n_top_words = 8

In [32]:
def top_words(topic, n_top_words):
    return topic.argsort()[:-n_top_words - 1:-1]  

In [33]:
def topic_table(model, feature_names, n_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        t = (topic_idx)
        topics[t] = [feature_names[i] for i in top_words(topic, n_top_words)]
    return pd.DataFrame(topics)

In [36]:
lda_fn = vectorizer.get_feature_names()

In [37]:
topic_df = topic_table(
    lda,
    lda_fn,
    n_top_words
).T

In [38]:
topic_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,softener,fabric,dispenser,sanitizer,fungus,residues,useful,chemicals
1,cleaning,services,service,best,clean,need,gloves,company
2,water,wash,use,clean,car,washing,detergent,soap
3,toilet,drain,water,flush,smell,tank,shower,sewer
4,vacuum,air,cleaning,cleaner,carpet,dust,cleaners,remove
5,program,data,clean,bootcamp,offer,new,galvanize,industry
6,clean,cloth,lens,glasses,sewage,screen,use,treatment
7,bleach,alcohol,use,water,brush,disinfectant,surface,mix
8,clean,room,cleaning,house,time,way,work,like
9,search,customers,answers,experience,brand,website,delivering,yext


In [54]:
topic_df['topic summary'] = topic_df[0] + ' ' + topic_df[1] + ' ' + topic_df[2] + ' ' +  topic_df[3] + ' ' + topic_df[4] + topic_df[5] + ' ' +  topic_df[6] + ' ' + topic_df[7]

In [55]:
summary_df = topic_df['topic summary'].reset_index()

In [57]:
summary_df.columns = ['topic','summary']

In [58]:
summary_df

Unnamed: 0,topic,summary
0,0,softener fabric dispenser sanitizer fungusresi...
1,1,cleaning services service best cleanneed glove...
2,2,water wash use clean carwashing detergent soap
3,3,toilet drain water flush smelltank shower sewer
4,4,vacuum air cleaning cleaner carpetdust cleaner...
5,5,program data clean bootcamp offernew galvanize...
6,6,clean cloth lens glasses sewagescreen use trea...
7,7,bleach alcohol use water brushdisinfectant sur...
8,8,clean room cleaning house timeway work like
9,9,search customers answers experience brandwebsi...


In [59]:
question = quora['Question'].tolist()

In [72]:
df_temp = pd.DataFrame({
    'Question':question,
    'topic':docweights.argmax(axis=1)
})

In [73]:
merged_topic = df_temp.merge(
    summary_df,
    on='topic',
    how='left'
)

In [74]:
merged_topic

Unnamed: 0,Question,topic,summary
0,Can I use Comet and bleach to clean my bathroom,9,search customers answers experience brandwebsi...
1,Which is the best full home deep cleaning serv...,1,cleaning services service best cleanneed glove...
2,What is the best way to clean a tub in a hotel...,8,clean room cleaning house timeway work like
3,Can you use Clorox wipes on yoga mats,9,search customers answers experience brandwebsi...
4,How do you get white stains water out of light...,2,water wash use clean carwashing detergent soap
...,...,...,...
775,Is disinfectant the same as bleach,9,search customers answers experience brandwebsi...
776,What product will best remove stains on the te...,2,water wash use clean carwashing detergent soap
777,Why should the toothbrush be disinfected and h...,9,search customers answers experience brandwebsi...
778,How is the YIGII paper towel holder for a bath...,2,water wash use clean carwashing detergent soap


In [75]:
df_topics = pd.merge(
    quora,
    merged_topic,
    on='Question',
    how='left'
)

In [76]:
df_topics

Unnamed: 0.1,Unnamed: 0,Question,Answer,processed_descriptin,topic,summary
0,0,Can I use Comet and bleach to clean my bathroom,You can use both but not at the same time. Com...,use time comet abrasive cleanser best removing...,9,search customers answers experience brandwebsi...
1,1,Which is the best full home deep cleaning serv...,Which is the best full home deep cleaning serv...,best home deep cleaning services company dubai,1,cleaning services service best cleanneed glove...
2,2,What is the best way to clean a tub in a hotel...,What is the best way to clean a tub in a hotel...,best way clean tub hotel,8,clean room cleaning house timeway work like
3,3,Can you use Clorox wipes on yoga mats,In the wake of the ongoing global health crisi...,wake ongoing global health crisis consumers de...,9,search customers answers experience brandwebsi...
4,4,How do you get white stains water out of light...,"Ok, I m not a chemist, but I have done a lot o...",ok m chemist lot clean white stains minerals w...,2,water wash use clean carwashing detergent soap
...,...,...,...,...,...,...
955,776,What product will best remove stains on the te...,What product will best remove stains on the te...,product best remove stains teeth drinking blac...,2,water wash use clean carwashing detergent soap
956,776,What product will best remove stains on the te...,What product will best remove stains on the te...,product best remove stains teeth drinking blac...,2,water wash use clean carwashing detergent soap
957,777,Why should the toothbrush be disinfected and h...,In the wake of the ongoing global health crisi...,wake ongoing global health crisis consumers de...,9,search customers answers experience brandwebsi...
958,778,How is the YIGII paper towel holder for a bath...,How is the YIGII paper towel holder for a bath...,yigii paper towel holder bathroom,2,water wash use clean carwashing detergent soap


In [77]:
df_topics.to_csv("quora_with_topics.csv")

In [78]:
def _create_frequency_table(text_string) -> dict:

    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable

In [90]:
topic_9 =  df_topics[df_topics.topic == 9]

In [91]:
topic_9

Unnamed: 0.1,Unnamed: 0,Question,Answer,processed_descriptin,topic,summary
0,0,Can I use Comet and bleach to clean my bathroom,You can use both but not at the same time. Com...,use time comet abrasive cleanser best removing...,9,search customers answers experience brandwebsi...
3,3,Can you use Clorox wipes on yoga mats,In the wake of the ongoing global health crisi...,wake ongoing global health crisis consumers de...,9,search customers answers experience brandwebsi...
20,11,Why is reddened charcoal extinguished when it ...,In the wake of the ongoing global health crisi...,wake ongoing global health crisis consumers de...,9,search customers answers experience brandwebsi...
33,24,Why do you use newspapers to clean windows,Paper towels and rags tend to leave lint and r...,paper towels rags tend leave lint residue glas...,9,search customers answers experience brandwebsi...
63,51,How do I remove white armpit stains from my sh...,Removing Sweaty Armpit Stains Info On The Deg...,removing sweaty armpit stains info degree webs...,9,search customers answers experience brandwebsi...
86,74,Is it safe to use dishwashing gloves for every...,Treat Covid in this way. out of your home ever...,treat covid way home touch covered st wash han...,9,search customers answers experience brandwebsi...
116,101,How do I choose the best sanitization company ...,In the wake of the ongoing global health crisi...,wake ongoing global health crisis consumers de...,9,search customers answers experience brandwebsi...
153,133,Why is water leaking from the bottom of my was...,"Wait, I know Wait for it here it comes GRAVITY...",wait know wait comes gravity win seriously man...,9,search customers answers experience brandwebsi...
214,176,Does Trump want to inject people with bleach t...,Does Trump want to inject people with bleach t...,trump want inject people bleach kill coronavirus,9,search customers answers experience brandwebsi...
248,203,Why do the rotating brushes on your hoover vac...,In the wake of the ongoing global health crisi...,wake ongoing global health crisis consumers de...,9,search customers answers experience brandwebsi...


In [102]:
article = " ".join(topic_9.Answer.to_list())

In [112]:
df_for_amy = merged_topic[['Question','topic']]

In [113]:
df_for_amy.to_csv("quora_cleaning_questions.csv")

In [114]:
cp quora_cleaning_questions.csv /Users/mikefitzgerald/Desktop/