# Topic Modelling

In [7]:
import numpy as np 
import pandas as pd
import plotly.express as px

import string
from tqdm import tqdm

import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
import gensim
nltk.download('stopwords')

from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maxim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Amazon Fine Food Reviews Data

In [21]:
df = pd.read_csv('data/amazon_food_reviews/Reviews.csv')
df.head()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [22]:
#select only the columns we need
df = df[['Summary','Text']]
#rename columns
df.columns = ['summary','text']
#drop missing values
df = df.dropna()
df.head()

Unnamed: 0,summary,text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


In [23]:
# select small portion of reviews to test how algorithm is working
df = df.sample(frac=1).reset_index(drop=True)
df = df[:10000]
df

Unnamed: 0,summary,text
0,Worst K-Cup ever!,"This is by far, the worst tasting K-Cup variet..."
1,Scrumdiddiliumptious,These are the best coffee beans I've ever tast...
2,tasty product for parents & kids,The only reason I gave this a 4 star was that ...
3,Jennifer K.,My vet recommended these chewies because i'm s...
4,My son loves this cereal!,No joke--this was the first solid food we gave...
...,...,...
9995,Great mixed with flavored coffees!,"At first glance, Timothy's White Hot Chocolate..."
9996,No more tummy troubles for my kitty,I have a cat with tummy/bowel problems but not...
9997,Good coffee,Just finished off my first Amazon packaged (50...
9998,Very tasty & healthy,I first discovered these crackers when looking...


### Pre-Processing

In [24]:
# remove stopwords with spacy
def remove_stopwords(nlp, text):
    #tokenize
    doc = nlp(text)

    # Create list of word tokens
    token_list = []
    for token in doc:
        token_list.append(token.text)

    # Create list of word tokens after removing stopwords
    filtered_sentence = [] 
    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word) 
    return ' '.join(filtered_sentence)

# lemmatize text with spacy
def lemmatize(nlp,text):

    #tokenize
    doc = nlp(text)

    lemmatized = []
    for token in doc:
        # print(token, token.lemma, token.lemma_)
        lemmatized.append(token.lemma_)

    return ' '.join(lemmatized)

#Clean text
def clean_text(nlp, text):
    
    clean = text
    #remove links
    clean = re.sub(r"http\S+", "", clean)
    #remove punctuation
    clean = clean.translate(str.maketrans('', '', string.punctuation)) #https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
    #remove stopwords
    clean = remove_stopwords(nlp, clean)
    #lemmatizing
    clean = lemmatize(nlp, clean)
    #strip whitespaces
    clean = clean.strip()
    #lowercase
    clean = clean.lower()

    return clean

In [25]:
PREPROCESS = True

#Pre-Process
if PREPROCESS:
    #Load spacy model -> needed for stopword removal and lemmatizing
    nlp = spacy.load("en_core_web_sm")
    #Enable progress tracking
    tqdm.pandas()
    #Run pre-processing on the whole dataset
    df['clean'] = df.text.progress_apply(lambda x: clean_text(nlp,x))
    #Save pre-processed data
    df.to_csv('data/amazon_food_reviews/cleaned_10k.csv')

100%|██████████| 10000/10000 [04:36<00:00, 36.16it/s]


In [26]:
df.head()

Unnamed: 0,summary,text,clean
0,Worst K-Cup ever!,"This is by far, the worst tasting K-Cup variet...",far bad tasting kcup variety try date luckily ...
1,Scrumdiddiliumptious,These are the best coffee beans I've ever tast...,good coffee bean ve tasted life careful eat night
2,tasty product for parents & kids,The only reason I gave this a 4 star was that ...,reason give 4 star bulk pack thing geared i...
3,Jennifer K.,My vet recommended these chewies because i'm s...,vet recommend chewie m awful brushing dog toot...
4,My son loves this cereal!,No joke--this was the first solid food we gave...,jokethis solid food give son go hmm hmm yum yu...


## Vectorize

In [31]:
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(df["clean"])

## LDA Topic Modelling

In [43]:
# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=10, max_iter=20, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)


iteration: 1 of max_iter: 20
iteration: 2 of max_iter: 20
iteration: 3 of max_iter: 20
iteration: 4 of max_iter: 20
iteration: 5 of max_iter: 20
iteration: 6 of max_iter: 20
iteration: 7 of max_iter: 20
iteration: 8 of max_iter: 20
iteration: 9 of max_iter: 20
iteration: 10 of max_iter: 20
iteration: 11 of max_iter: 20
iteration: 12 of max_iter: 20
iteration: 13 of max_iter: 20
iteration: 14 of max_iter: 20
iteration: 15 of max_iter: 20
iteration: 16 of max_iter: 20
iteration: 17 of max_iter: 20
iteration: 18 of max_iter: 20
iteration: 19 of max_iter: 20
iteration: 20 of max_iter: 20


## Inspect LDA Results

In [44]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]) 
        print()

In [45]:
selected_topics(lda, vectorizer)

Topic 0:
[('store', 831.010570358646), ('local', 509.7418145372918), ('product', 504.73763374478074), ('buy', 406.9158424116845), ('grocery', 395.2403264183786), ('price', 333.994721748483), ('amazon', 324.6292303286209), ('year', 293.8964448076468), ('bottle', 276.38340038966913), ('brand', 273.428664296629)]

Topic 1:
[('bag', 1028.3412593229943), ('buy', 344.62055974940904), ('chip', 300.008460236328), ('candy', 279.8525170597371), ('store', 195.06798245195745), ('price', 174.0197197463074), ('piece', 159.16246681087588), ('thank', 146.208319321236), ('good', 140.33829903697324), ('amazon', 128.16059917130877)]

Topic 2:
[('tea', 2563.1625430648187), ('drink', 1100.4932479081397), ('taste', 924.3614035480563), ('like', 846.3469375962175), ('flavor', 620.8473881638367), ('green', 520.0564338764792), ('water', 374.306991096445), ('good', 308.8207037536486), ('try', 289.0831005060041), ('ice', 255.20175782889837)]

Topic 3:
[('taste', 1746.7077016867104), ('like', 1415.8329335140531), 