# Unsupervised topic modelling on Amazon product descriptions
This notebook creates a Latent Dirichlet Allocation model to analyse the latent topics within a dataset of Amazon products and product descriptions. 

## Setting up and preparing key functions
Adding additional stopwords to the basic English configuration, namely 'from', 'product', 'item', 'about', and 'use'.

In [None]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'product', 'item', 'about', 'use'])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Checking the dataset as DataFrame

In [None]:
import pandas as pd
products = pd.read_csv('productnames.csv')
products.head(10)

Unnamed: 0,product_description
0,\n About this item 2 year shelf life. Only...
1,\n About this item \n This fits your .\n ...
2,\n About this item 2 year shelf life. Only...
3,\n About this item CLOROX BLEACH: Use Clor...
4,\n About this item \n This fits your .\n ...
5,"\n About this item Current fresh stock, Ex..."
6,\n About this item EFFICIENTLY BLEACH: Con...
7,\n About this item EFFICIENTLY BLEACH: Con...
8,\n About this item ULTRA CLEAN TOILET BOWL...
9,\n About this item EFFICIENTLY BLEACH: Con...


## Preparing the corpus and removing stopwords

In [None]:
data = products.values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

['year', 'shelf', 'life', 'activates', 'hot', 'cold', 'water', 'spills', 'splash', 'mess', 'one', 'tablet', 'mixed', 'cup', 'water', 'equal', 'cup', 'standard', 'regular', 'liquid', 'bleach', 'light', 'weight', 'easy', 'store', 'transport', 'less', 'plastic', 'waste', 'helps']


In [None]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1][0][:30])

[(0, 1), (1, 2), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1)]


## Building the initial LDA model

In [None]:
from pprint import pprint
# number of topics
num_topics = 7
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]



[(0,
  '0.019*"laundry" + 0.014*"stains" + 0.014*"detergent" + 0.011*"safe" + '
  '0.010*"stain" + 0.010*"clean" + 0.009*"water" + 0.009*"machines" + '
  '0.008*"coffee" + 0.007*"remover"'),
 (1,
  '0.009*"moisture" + 0.009*"free" + 0.008*"clean" + 0.007*"water" + '
  '0.007*"spray" + 0.007*"cleaner" + 0.006*"xa" + 0.006*"cleaning" + '
  '0.006*"made" + 0.005*"laundry"'),
 (2,
  '0.017*"laundry" + 0.011*"detergent" + 0.011*"free" + 0.011*"cleaning" + '
  '0.009*"clean" + 0.007*"fits" + 0.007*"safe" + 0.007*"washing" + '
  '0.007*"water" + 0.006*"sheets"'),
 (3,
  '0.012*"cleaning" + 0.011*"clean" + 0.009*"laundry" + 0.008*"free" + '
  '0.008*"cleaner" + 0.007*"water" + 0.007*"safe" + 0.007*"fits" + '
  '0.007*"detergent" + 0.005*"made"'),
 (4,
  '0.011*"free" + 0.010*"laundry" + 0.010*"stains" + 0.010*"clean" + '
  '0.010*"detergent" + 0.009*"stain" + 0.008*"clothes" + 0.007*"safe" + '
  '0.007*"remover" + 0.007*"cleaning"'),
 (5,
  '0.014*"laundry" + 0.011*"detergent" + 0.010*"moistur

## Importing additional packages and libraries
PyLDAvis and os are needed for visualisation of relative intertopic distance. 

In [None]:
!pip install pyldavis
import pyLDAvis.gensim
import pickle 
import pyLDAvis
import os

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyldavis
  Downloading pyLDAvis-3.4.0-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
Collecting funcy
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 KB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: funcy, joblib, pyldavis
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.1
    Uninstalling joblib-1.1.1:
      Successfully uninstalled joblib-1.1.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-profiling 3.2.0 requires joblib~=1.1.0, but 

## Preparing initial visualisation of intertopic distance

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('data_words10'+str(num_topics))

if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, 'data_words10'+ str(num_topics) +'.html') # loading data_words set instead of 'data from disk'
LDAvis_prepared

  and should_run_async(code)
  default_term_info = default_term_info.sort_values(


In [None]:
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

## Baseline coherence score (stopwords, 7 topics) = 0.343113

  and should_run_async(code)


Coherence Score:  0.34311352054621375


In [None]:
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

  and should_run_async(code)


In [None]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('/content/sample_data/results.csv', index=False)
    pbar.close()


 12%|█▏        | 66/540 [1:37:25<11:39:42, 88.57s/it]

  0%|          | 1/540 [00:06<1:01:21,  6.83s/it][A
  0%|          | 2/540 [00:15<1:13:08,  8.16s/it][A
  1%|          | 3/540 [00:23<1:11:01,  7.94s/it][A
  1%|          | 4/540 [00:33<1:18:07,  8.74s/it][A
  1%|          | 5/540 [00:43<1:20:22,  9.01s/it][A
  1%|          | 6/540 [00:50<1:15:34,  8.49s/it][A
  1%|▏         | 7/540 [01:00<1:20:18,  9.04s/it][A
  1%|▏         | 8/540 [01:11<1:25:12,  9.61s/it][A
  2%|▏         | 9/540 [01:21<1:24:40,  9.57s/it][A
  2%|▏         | 10/540 [01:32<1:29:32, 10.14s/it][A
  2%|▏         | 11/540 [01:42<1:27:58,  9.98s/it][A
  2%|▏         | 12/540 [01:50<1:23:45,  9.52s/it][A
  2%|▏         | 13/540 [02:01<1:27:14,  9.93s/it][A
  3%|▎         | 14/540 [02:12<1:31:02, 10.39s/it][A
  3%|▎         | 15/540 [02:24<1:32:55, 10.62s/it][A
  3%|▎         | 16/540 [02:31<1:25:34,  9.80s/it][A
  3%|▎         | 17/540 [02:43<1:29:37, 10.28s/it][A
  3%|▎         | 18/540 [02:53<1:28

In [None]:
results = pd.read_csv('results.csv')
results.head(10)
# checking the file with results of model configurations

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,2,0.01,0.01,0.35947
1,75% Corpus,2,0.01,0.31,0.356286
2,75% Corpus,2,0.01,0.61,0.389983
3,75% Corpus,2,0.01,0.9099999999999999,0.38345
4,75% Corpus,2,0.01,symmetric,0.382419
5,75% Corpus,2,0.31,0.01,0.37646
6,75% Corpus,2,0.31,0.31,0.359567
7,75% Corpus,2,0.31,0.61,0.382419
8,75% Corpus,2,0.31,0.9099999999999999,0.401974
9,75% Corpus,2,0.31,symmetric,0.382419


## Notes detailing the optimal parameters
Maximising the models for coherence. 

In [None]:
## graphing coherence scores over numbers of topics
    ## K = 7
    ## alpha = 0.31
    ## beta = 0.91
    ## coherence = 59.0414

## Tayloring the model to optimal parameters

In [None]:
num_topics = 7

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=0.31,
                                           eta=0.91)

  and should_run_async(code)


In [None]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)
## improved coherence score = 0.588189755

  and should_run_async(code)


Coherence Score:  0.5881897558818407


## Visualising the final model and its intertopic distance
The last cell saves the final model. 

In [None]:
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('data_words10'+str(num_topics))

if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, 'data_words10'+ str(num_topics) +'.html') # loading data_words set
LDAvis_prepared

  and should_run_async(code)
  default_term_info = default_term_info.sort_values(


In [None]:
save(lda_model)

'' was not found in history, as a file, url, nor in the user namespace.


  and should_run_async(code)
