# LDA with NLTK and Gensim

* Start by importing python libraries that we'll be using

In [1]:
%load_ext autoreload
%autoreload 2

import sqlite3
import string
import logging
import pickle

from gensim import corpora, utils, models, similarities
from collections import defaultdict

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.corpus import wordnet
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

* Let's set up some basic config

In [2]:
# Set up logging for gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Punctuation and stop words to be removed later
punctuation = set(string.punctuation)
stoplist = set(stopwords.words('english'))

# For LDA training later
dictionary = corpora.Dictionary()
lemma = WordNetLemmatizer()

NUM_PASSES=10
NUM_TOPICS=100
RANDOM_STATE=1

### Remove punctuation from text

In [3]:
def remove_punctuation(text):
    """
    Remove punctuation from text by checking each character against a set of punctation characters
    :text: string
    :return: string
    """
    return ''.join(char for char in text if char not in punctuation)

### Remove numbers from text

In [4]:
def remove_numbers(text):
    """
    Remove numbers from text as they aren't of value to our model
    :text: string
    :return: string
    """
    return ''.join(char for char in text if not char.isdigit())

### Remove stop words from text

In [5]:
def remove_stop_words(text):
    """
    Remove common words as they won't add any value to our model
    :text: string
    :return: string
    """
    return ' '.join([word for word in text.split() if word not in stoplist])

### Remove single character words

In [6]:
def remove_single_characters(text):
    """
    Remove any remaining single-character words
    :text: string
    :return: string
    """
    return ' '.join([word for word in text.split() if len(word) > 1])

### Lemmatize our document

In [7]:
def lemmatize(text):
    """
    Use NLTK lemma functionality to get the route word
    :text: string
    :return: string
    """
    return ' '.join([lemma.lemmatize(word) for word in text.split()])


### Single function to clean text

In [18]:
def get_cleaned_text(text):
    """
    Return the page with stopwords, digits, punctuation and single character words removed
    :text: string
    :return: string
    """
    # Remove \n characters (Wikipedia has a lot of them in the page content!)
    text = text.replace('\n', '')
    # Remove numbers
    text = remove_numbers(text)
    # Remove stop words
    text = remove_stop_words(text)
    # Remove punctuation
    text = remove_punctuation(text)
    # Remove single character words
    text = remove_single_characters(text)
    # Lemmatize the document
    text = lemmatize(text)
    return text

### Create a class for our content

This class will use the database that we created when we downloaded content from Wikipedia

In [19]:
class ContentStore:
    def __init__(self, db_file):
        """
        Intialise the crawl_wikipedia class, set up a
        lightweight database for storing content for later use
        :param db_file: string
        """
        self.categories = []
        # Connect to the DB db
        self.conn = sqlite3.connect(db_file)
        self.cursor = self.conn.cursor()

    def get_page_urls(self):
        """
        Retrieve a list of urls from the database
        :return: list of urls
        """
        return [row for row in self.cursor.execute("SELECT url FROM content")]
    
    def get_page_ids(self):
        """
        Retrieve a list of page ids from the database
        :return: list of page id tuples
        """
        return [row for row in self.cursor.execute("SELECT pageid FROM content")]   
    
    def get_page_by_id(self, pageid):
        """
        Retrieve the page with the specified pageid
        Note that this is of the format (pageid, ) for SQLite3 to work, for example
        to get the page with the id of 1 in our database, set pageid to ('1', )
        :pageid: tuple ('id', )
        :return: string
        """
        return str(self.cursor.execute("SELECT content FROM content WHERE pageid=?", pageid).fetchone()).lower()
    
    def get_page_url_by_id(self, pageid):
        """
        Retrieve the page with the specified pageid
        Note that this is of the format (pageid, ) for SQLite3 to work, for example
        to get the page with the id of 1 in our database, set pageid to ('1', )
        :pageid: tuple ('id', )
        :return: string
        """
        return self.cursor.execute("SELECT url FROM content WHERE pageid=?", pageid).fetchone()
            
    def __iter__(self):
        """
        Iterator for the document set stored in the database
        This is more efficient memory wise than loading the complete document set into memory
        and therefore will scale well for larger document sets (or those not available on local disk)
        :return: tuple (string, )
        """
        for pageid in self.get_page_ids():
            page = self.get_page_by_id(pageid)
            yield get_cleaned_text(page).split()

### Exploring and cleaning our content

* Access our database of Wikipedia content and get a list of all of the pages IDs

In [20]:
content = ContentStore('../data/content.db')
page_ids = content.get_page_ids()

* Let's view a page from our database

In [21]:
page = content.get_page_by_id(page_ids[0])
print(page)



* Now let's remove punctuation

In [22]:
page = page.replace('\n', '')
page = remove_punctuation(page)
print(page)

artificial intelligence ai sometimes called machine intelligence iss such as learning and problem solvingthe scope of ai is disputed as machines become increasingly capable tasks considered as requiring intelligence are often removed from the definition a phenomenon known as thennnartificial intelligence was founded as an academic discipline in 1956 and in the years since has experienced several waves of optimism followed by disappointment and the loss of funding known as an s or deep philosophical differences subfields have also been based on social factors particular institutions or the work of particular researchersnnthe traditional problems or goals of ai research includeand many othersnnnthe field was founded on the claim thatnnin the twentyfirst century ai techniques have experienced a resurgence following concurrent advances in helping to solve many challenging problems in computer science history thoughtcapablennthe study of mechanical orartificial neuronsthe field of ai resear

* Remove numbers

In [23]:
page = remove_numbers(page)
print(page)

artificial intelligence ai sometimes called machine intelligence iss such as learning and problem solvingthe scope of ai is disputed as machines become increasingly capable tasks considered as requiring intelligence are often removed from the definition a phenomenon known as thennnartificial intelligence was founded as an academic discipline in  and in the years since has experienced several waves of optimism followed by disappointment and the loss of funding known as an s or deep philosophical differences subfields have also been based on social factors particular institutions or the work of particular researchersnnthe traditional problems or goals of ai research includeand many othersnnnthe field was founded on the claim thatnnin the twentyfirst century ai techniques have experienced a resurgence following concurrent advances in helping to solve many challenging problems in computer science history thoughtcapablennthe study of mechanical orartificial neuronsthe field of ai research w

* And stop words

In [24]:
page = remove_stop_words(page)
print(page)

artificial intelligence ai sometimes called machine intelligence iss learning problem solvingthe scope ai disputed machines become increasingly capable tasks considered requiring intelligence often removed definition phenomenon known thennnartificial intelligence founded academic discipline years since experienced several waves optimism followed disappointment loss funding known deep philosophical differences subfields also based social factors particular institutions work particular researchersnnthe traditional problems goals ai research includeand many othersnnnthe field founded claim thatnnin twentyfirst century ai techniques experienced resurgence following concurrent advances helping solve many challenging problems computer science history thoughtcapablennthe study mechanical orartificial neuronsthe field ai research born atagreed writing within generationxa problem creating artificial intelligence substantially solvednnthey failed recognize difficulty remaining tasks progress slo

In [25]:
page = remove_single_characters(page)
print(page)

artificial intelligence ai sometimes called machine intelligence iss learning problem solvingthe scope ai disputed machines become increasingly capable tasks considered requiring intelligence often removed definition phenomenon known thennnartificial intelligence founded academic discipline years since experienced several waves optimism followed disappointment loss funding known deep philosophical differences subfields also based social factors particular institutions work particular researchersnnthe traditional problems goals ai research includeand many othersnnnthe field founded claim thatnnin twentyfirst century ai techniques experienced resurgence following concurrent advances helping solve many challenging problems computer science history thoughtcapablennthe study mechanical orartificial neuronsthe field ai research born atagreed writing within generationxa problem creating artificial intelligence substantially solvednnthey failed recognize difficulty remaining tasks progress slo

### Lemmatize the document

In [26]:
page = lemmatize(page)
print(page)

artificial intelligence ai sometimes called machine intelligence i learning problem solvingthe scope ai disputed machine become increasingly capable task considered requiring intelligence often removed definition phenomenon known thennnartificial intelligence founded academic discipline year since experienced several wave optimism followed disappointment loss funding known deep philosophical difference subfields also based social factor particular institution work particular researchersnnthe traditional problem goal ai research includeand many othersnnnthe field founded claim thatnnin twentyfirst century ai technique experienced resurgence following concurrent advance helping solve many challenging problem computer science history thoughtcapablennthe study mechanical orartificial neuronsthe field ai research born atagreed writing within generationxa problem creating artificial intelligence substantially solvednnthey failed recognize difficulty remaining task progress slowed response cr

### Now build a model using gensim

* Start by building a dictionary of all words found in the cleaned content

In [27]:
dictionary = corpora.Dictionary(content)

2018-08-19 15:26:18,071 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-08-19 15:26:36,007 : INFO : built Dictionary(132307 unique tokens: ['a', 'aaai', 'aally', 'abandoned', 'abduction']...) from 2727 documents (total 1294610 corpus positions)


* Ignore words that appear in less than 5 documents or more than 40% documents

(These settings are configurable and can produce very different results for the same document set)

In [28]:
dictionary.filter_extremes(no_below=5, no_above=0.4)

2018-08-19 15:26:36,167 : INFO : discarding 117937 tokens: [('aally', 1), ('accomplishn', 1), ('achievementsn', 2), ('acmnn', 2), ('actionsapply', 1), ('aerosense', 1), ('afternigor', 1), ('agar', 1), ('agentsnpublisher', 1), ('aicite', 1)]...
2018-08-19 15:26:36,168 : INFO : keeping 14370 tokens which were in no less than 5 and no more than 1090 (=40.0%) documents
2018-08-19 15:26:36,208 : INFO : resulting dictionary: Dictionary(14370 unique tokens: ['a', 'aaai', 'abandoned', 'abduction', 'ability']...)


* Now create a corpus

In [29]:
corpus = [dictionary.doc2bow(text) for text in content]

* Build our LDA model

Note this may take a while depending on how many documents you have and how many passes you set.  
An increased number of passes improves consistency of the model at the expense of longer training times.

In [30]:
lda = models.LdaModel(corpus, id2word=dictionary, random_state=RANDOM_STATE, num_topics=NUM_TOPICS, passes=NUM_PASSES)

2018-08-19 15:26:53,762 : INFO : using symmetric alpha at 0.01
2018-08-19 15:26:53,763 : INFO : using symmetric eta at 0.01
2018-08-19 15:26:53,766 : INFO : using serial LDA version on this node
2018-08-19 15:26:53,950 : INFO : running online (multi-pass) LDA training, 100 topics, 10 passes over the supplied corpus of 2727 documents, updating model once every 2000 documents, evaluating perplexity every 2727 documents, iterating 50x with a convergence threshold of 0.001000
2018-08-19 15:26:53,951 : INFO : PROGRESS: pass 0, at document #2000/2727
2018-08-19 15:26:58,020 : INFO : merging changes from 2000 documents into a model of 2727 documents
2018-08-19 15:26:58,201 : INFO : topic #52 (0.010): 0.007*"time" + 0.006*"model" + 0.005*"user" + 0.005*"information" + 0.004*"example" + 0.004*"action" + 0.004*"set" + 0.004*"use" + 0.003*"ontology" + 0.003*"true"
2018-08-19 15:26:58,203 : INFO : topic #21 (0.010): 0.007*"human" + 0.004*"model" + 0.003*"intelligence" + 0.003*"data" + 0.003*"time"

2018-08-19 15:27:18,562 : INFO : topic #26 (0.010): 0.006*"technology" + 0.006*"research" + 0.005*"music" + 0.005*"company" + 0.005*"human" + 0.005*"work" + 0.004*"project" + 0.004*"science" + 0.004*"time" + 0.004*"artificial"
2018-08-19 15:27:18,564 : INFO : topic #88 (0.010): 0.023*"lab" + 0.016*"enterprise" + 0.015*"building" + 0.011*"user" + 0.010*"application" + 0.010*"file" + 0.010*"knowledge" + 0.010*"business" + 0.009*"software" + 0.007*"interoperability"
2018-08-19 15:27:18,565 : INFO : topic #50 (0.010): 0.039*"value" + 0.032*"brainiac" + 0.021*"superman" + 0.014*"set" + 0.010*"condition" + 0.010*"and" + 0.010*"world" + 0.007*"forall" + 0.006*"probability" + 0.006*"true"
2018-08-19 15:27:18,566 : INFO : topic #67 (0.010): 0.007*"genetic" + 0.006*"september" + 0.005*"december" + 0.005*"time" + 0.005*"conference" + 0.005*"july" + 0.005*"april" + 0.005*"landing" + 0.004*"evolutionary" + 0.004*"nn"
2018-08-19 15:27:18,567 : INFO : topic diff=inf, rho=0.478721
2018-08-19 15:27:18,

2018-08-19 15:27:50,788 : INFO : PROGRESS: pass 5, at document #2727/2727
2018-08-19 15:27:52,299 : INFO : merging changes from 727 documents into a model of 2727 documents
2018-08-19 15:27:52,476 : INFO : topic #38 (0.010): 0.027*"model" + 0.014*"neuron" + 0.013*"virtual" + 0.012*"type" + 0.009*"wearable" + 0.008*"word" + 0.007*"method" + 0.007*"performance" + 0.007*"signal" + 0.007*"potential"
2018-08-19 15:27:52,477 : INFO : topic #67 (0.010): 0.007*"september" + 0.007*"genetic" + 0.007*"landing" + 0.006*"july" + 0.006*"december" + 0.005*"april" + 0.005*"time" + 0.005*"group" + 0.004*"name" + 0.004*"th"
2018-08-19 15:27:52,479 : INFO : topic #34 (0.010): 0.352*"–n" + 0.114*"cyborg" + 0.066*"nn" + 0.015*"gi" + 0.009*"tech" + 0.008*"term" + 0.007*"web" + 0.007*"aligncenter" + 0.006*"toronto" + 0.006*"artist"
2018-08-19 15:27:52,480 : INFO : topic #40 (0.010): 0.062*"semantic" + 0.043*"web" + 0.035*"owl" + 0.034*"rdf" + 0.025*"reasoner" + 0.018*"description" + 0.016*"dl" + 0.013*"softw

2018-08-19 15:28:24,754 : INFO : topic diff=inf, rho=0.310632
2018-08-19 15:28:26,938 : INFO : -8.116 per-word bound, 277.4 perplexity estimate based on a held-out corpus of 727 documents with 189680 words
2018-08-19 15:28:26,938 : INFO : PROGRESS: pass 8, at document #2727/2727
2018-08-19 15:28:28,393 : INFO : merging changes from 727 documents into a model of 2727 documents
2018-08-19 15:28:28,568 : INFO : topic #61 (0.010): 0.092*"ai" + 0.018*"human" + 0.015*"artificial" + 0.013*"brain" + 0.012*"would" + 0.012*"could" + 0.009*"personn" + 0.009*"experiment" + 0.008*"attempt" + 0.008*"ibm"
2018-08-19 15:28:28,571 : INFO : topic #36 (0.010): 0.040*"chess" + 0.021*"program" + 0.020*"university" + 0.016*"van" + 0.014*"competition" + 0.014*"game" + 0.013*"martin" + 0.012*"held" + 0.012*"david" + 0.012*"go"
2018-08-19 15:28:28,571 : INFO : topic #29 (0.010): 0.105*"uavs" + 0.055*"accessdate" + 0.052*"pilot" + 0.034*"storm" + 0.032*"police" + 0.030*"operator" + 0.027*"flying" + 0.025*"air" 

* Let's print 10 of the topics

In [31]:
lda.print_topics(10)

2018-08-19 15:28:38,916 : INFO : topic #4 (0.010): 0.046*"ai" + 0.041*"human" + 0.029*"intelligence" + 0.016*"artificial" + 0.016*"machine" + 0.015*"would" + 0.009*"singularity" + 0.008*"intelligent" + 0.008*"goal" + 0.007*"could"
2018-08-19 15:28:38,917 : INFO : topic #10 (0.010): 0.062*"robot" + 0.020*"control" + 0.019*"robotic" + 0.014*"material" + 0.012*"robotics" + 0.011*"mechanical" + 0.011*"human" + 0.009*"body" + 0.009*"designed" + 0.009*"arm"
2018-08-19 15:28:38,918 : INFO : topic #51 (0.010): 0.354*"user" + 0.121*"design" + 0.072*"interface" + 0.053*"usability" + 0.046*"designer" + 0.037*"task" + 0.024*"factor" + 0.022*"product" + 0.015*"visual" + 0.012*"prototype"
2018-08-19 15:28:38,919 : INFO : topic #66 (0.010): 0.025*"node" + 0.020*"tree" + 0.013*"variable" + 0.012*"number" + 0.012*"weight" + 0.011*"input" + 0.010*"network" + 0.009*"vector" + 0.009*"linear" + 0.008*"output"
2018-08-19 15:28:38,920 : INFO : topic #58 (0.010): 0.027*"conference" + 0.026*"proceeding" + 0.01

[(4,
  '0.046*"ai" + 0.041*"human" + 0.029*"intelligence" + 0.016*"artificial" + 0.016*"machine" + 0.015*"would" + 0.009*"singularity" + 0.008*"intelligent" + 0.008*"goal" + 0.007*"could"'),
 (10,
  '0.062*"robot" + 0.020*"control" + 0.019*"robotic" + 0.014*"material" + 0.012*"robotics" + 0.011*"mechanical" + 0.011*"human" + 0.009*"body" + 0.009*"designed" + 0.009*"arm"'),
 (51,
  '0.354*"user" + 0.121*"design" + 0.072*"interface" + 0.053*"usability" + 0.046*"designer" + 0.037*"task" + 0.024*"factor" + 0.022*"product" + 0.015*"visual" + 0.012*"prototype"'),
 (66,
  '0.025*"node" + 0.020*"tree" + 0.013*"variable" + 0.012*"number" + 0.012*"weight" + 0.011*"input" + 0.010*"network" + 0.009*"vector" + 0.009*"linear" + 0.008*"output"'),
 (58,
  '0.027*"conference" + 0.026*"proceeding" + 0.019*"vol" + 0.019*"international" + 0.015*"probability" + 0.015*"information" + 0.014*"in" + 0.013*"learning" + 0.013*"ieee" + 0.012*"science"'),
 (89,
  '0.049*"test" + 0.046*"turing" + 0.025*"human" + 0.

* And now let's save our model and dictionary

In [33]:
lda.save('../data/lda_model')
with open('../data/dictionary', "wb") as fp:
    pickle.dump(dictionary, fp)
fp.close()

2018-08-19 15:28:39,101 : INFO : saving LdaState object under ../lda_model.state, separately None
2018-08-19 15:28:39,161 : INFO : saved ../lda_model.state
2018-08-19 15:28:39,177 : INFO : saving LdaModel object under ../lda_model, separately ['expElogbeta', 'sstats']
2018-08-19 15:28:39,178 : INFO : storing np array 'expElogbeta' to ../lda_model.expElogbeta.npy
2018-08-19 15:28:39,206 : INFO : not storing attribute state
2018-08-19 15:28:39,207 : INFO : not storing attribute dispatcher
2018-08-19 15:28:39,208 : INFO : not storing attribute id2word
2018-08-19 15:28:39,212 : INFO : saved ../lda_model


### Now let's query our dataset to find related documents

* Let's start by creating a get_similarity() function

In [34]:
def get_similarity(lda, q_vec):
    index = similarities.MatrixSimilarity(lda[corpus])
    sims = index[q_vec]
    return sims
    #return set(sims)

* Let's manually create a subject we want to query the dataset for

In [35]:
query = "using deep learning for computer vision in real time"

* And see how our LDA model interprets this

Remember, we hare passing this through the same text cleaning functionality as the documents, so punctuation, stop words, etc. will all be removed.

This leaves us with the statistically important related words from our dictionary

In [37]:
bow = dictionary.doc2bow(get_cleaned_text(query).split())
words = [word for word in bow]
for word in words:
    print('{}: {}'.format(word[0], dictionary[word[0]]))

443: deep
966: learning
1394: real
1715: time
1797: using
2194: vision


* Now let's query our LDA model based on this bag of words

This will give us a vector based on our model for our query above. Note that LDA uses some randomisation and therefore you may appear to get different vectors output here if you run this multiple times.  However, the results below are generally consistent.

In [38]:
q_vec = lda[bow]
print(q_vec)

[(44, 0.4024969), (60, 0.45750308)]


* Let's view the details for the LDA topic relating to the query above

In [39]:
print(lda.print_topic(max(q_vec, key=lambda item: item[1])[0]))

0.057*"data" + 0.039*"learning" + 0.039*"network" + 0.031*"neural" + 0.016*"machine" + 0.015*"model" + 0.012*"feature" + 0.012*"input" + 0.011*"recognition" + 0.011*"layer"


* Now let's get the similarity of this query vector to the document vectors and sort in high-to-low order

In [40]:
sims = get_similarity(lda, q_vec)
sims = sorted(enumerate(sims), key=lambda item: -item[1])

2018-08-19 15:29:23,458 : INFO : creating matrix with 2727 documents and 100 features
  if np.issubdtype(vec.dtype, np.int):


* Now render the results

The gensim MatrixSimilarity function used above doesn't always give unique values, hence we can't just print the top n results.  If we do, we occasionally get duplication.

In [41]:
idx=0
pids=[]
result = 10
while result > 0:
    pageid = page_ids[sims[idx][0]]
    if pageid not in pids:
        pids.append(pageid)
        #print(pageid)
        print('Page ID {}: {}'.format(pageid[0], content.get_page_url_by_id(pageid)[0]))
        result -= 1
    idx += 1  

Page ID 54033657: https://en.wikipedia.org/wiki/Labeled_data
Page ID 6838895: https://en.wikipedia.org/wiki/N-jet
Page ID 5104401: https://en.wikipedia.org/wiki/Outline_of_computer_vision
Page ID 1222568: https://en.wikipedia.org/wiki/Helmholtz_machine
Page ID 2070605: https://en.wikipedia.org/wiki/Stochastic_neural_network
Page ID 55375136: https://en.wikipedia.org/wiki/Highway_network
Page ID 24286785: https://en.wikipedia.org/wiki/3D_data_acquisition_and_object_reconstruction
Page ID 470314: https://en.wikipedia.org/wiki/Instantaneously_trained_neural_networks
Page ID 50568903: https://en.wikipedia.org/wiki/Alex_Graves_(computer_scientist)
Page ID 40409788: https://en.wikipedia.org/wiki/Convolutional_neural_network
