# LDA with Gensim

* Start by importing python libraries that we'll be using

In [450]:
%load_ext autoreload
%autoreload 2

import sqlite3
import string
import logging

from gensim import corpora, utils, models, similarities
from collections import defaultdict

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

* Let's set up some basic config

In [451]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

punctuation = set(string.punctuation)
stoplist = set(stopwords.words('english'))

frequency = defaultdict(int)

stemmer = SnowballStemmer("english")
lemma = WordNetLemmatizer()

dictionary = corpora.Dictionary()

### Create a class for our content

This class will use the database that we created when we downloaded content from Wikipedia

In [452]:
class ContentStore:
    def __init__(self, db_file):
        """
        Intialise the crawl_wikipedia class, set up a
        lightweight database for storing content for later use
        :param db_file:
        """
        self.categories = []
        # Connect to the DB db
        self.conn = sqlite3.connect(db_file)
        self.cursor = self.conn.cursor()

    def get_page_urls(self):
        """
        Retrieve a list of urls from the database
        :return: list of urls
        """
        return [row for row in self.cursor.execute("SELECT url FROM content")]
    
    def get_page_ids(self):
        """
        Retrieve a list of page ids from the database
        :return: list of page id tuples
        """
        return [row for row in self.cursor.execute("SELECT pageid FROM content")]   
    
    def get_page_by_id(self, pageid):
        """
        Retrieve the page with the specified pageid
        Note that this is of the format (pageid, ) for SQLite3 to work, for example
        to get the page with the id of 1 in our database, set pageid to ('1', )
        :return: string
        """
        return str(self.cursor.execute("SELECT content FROM content WHERE pageid=?", pageid).fetchone()).lower()
    
    def get_page_url_by_id(self, pageid):
        """
        Retrieve the page with the specified pageid
        Note that this is of the format (pageid, ) for SQLite3 to work, for example
        to get the page with the id of 1 in our database, set pageid to ('1', )
        :return: string
        """
        return self.cursor.execute("SELECT url FROM content WHERE pageid=?", pageid).fetchone()
    
    def get_cleaned_text(self, text):
        """
        Return the page with stopwords, digits, punctuation and single character words removed
        :return: string
        """
        # Remove numbers
        text = ''.join(char for char in text if not char.isdigit())
        # Remove stop words
        text = ' '.join([word for word in text.split() if word not in stoplist])
        # Remove punctuation
        text = ''.join(char for char in text if char not in punctuation)
        # Remove single character words
        text = ' '.join([word for word in text.split() if len(word) > 1])
        return text
        
    def __iter__(self):
        for pageid in self.get_page_ids():
            page = self.get_page_by_id(pageid)
            yield self.get_cleaned_text(page).split()
        
#     def __iter__(self):
#         self.pageids = self.get_page_ids()
#         self.idx = 0
#         return self
    
#     def __next__(self):
#         if self.idx <= len(self.pageids):
#             pageid = self.pageids(self.idx)
#             page = dictionary.doc2bow(self.get_cleaned_page_by_id(pageid))
#             self.idx += 1
#             return page
#         else:
#             raise StopIteration
          

### Remove punctuation from text

In [453]:
def remove_punctuation(text):
    """
    Remove punctuation from text by checking each character against a set of punctation characters
    :return: string
    """
    return ''.join(char for char in text if char not in punctuation)

### Remove numbers from text

In [454]:
def remove_numbers(text):
    """
    Remove numbers from text as they aren't of value to our model
    :return: string
    """
    return ''.join(char for char in text if not char.isdigit())

### Remove stop words from text

In [455]:
def remove_stop_words(text):
    """
    Remove common words as they won't add any value to our model
    :return: string
    """
    return ' '.join([word for word in text.split() if word not in stoplist])

### Remove single character words

In [456]:
def remove_single_characters(text):
    """
    Remove any remaining single-character words
    :return: string
    """
    return ' '.join([word for word in text.split() if len(word) > 1])

### Lemmatize our document

In [457]:
def lemmatize(text):
    return text

### Stemmatize our document

In [458]:
def stemmatize(text):
    return text

### Exploring and cleaning our content

* Access our database of Wikipedia content and get a list of all of the pages IDs

In [459]:
content = ContentStore('../content.db')
page_ids = content.get_page_ids()

* Let's view a page from our database

In [460]:
page = content.get_page_by_id(page_ids[0])
print(page)



* Now let's remove punctuation

In [461]:
page = remove_punctuation(page)
print(page)

artificial intelligence ai sometimes called machine intelligence iss such as learning and problem solvingthe scope of ai is disputed as machines become increasingly capable tasks considered as requiring intelligence are often removed from the definition a phenomenon known as thennnartificial intelligence was founded as an academic discipline in 1956 and in the years since has experienced several waves of optimism followed by disappointment and the loss of funding known as an s or deep philosophical differences subfields have also been based on social factors particular institutions or the work of particular researchersnnthe traditional problems or goals of ai research includeand many othersnnnthe field was founded on the claim thatnnin the twentyfirst century ai techniques have experienced a resurgence following concurrent advances in helping to solve many challenging problems in computer science history thoughtcapablennthe study of mechanical orartificial neuronsthe field of ai resear

* Remove numbers

In [462]:
page = remove_numbers(page)
print(page)

artificial intelligence ai sometimes called machine intelligence iss such as learning and problem solvingthe scope of ai is disputed as machines become increasingly capable tasks considered as requiring intelligence are often removed from the definition a phenomenon known as thennnartificial intelligence was founded as an academic discipline in  and in the years since has experienced several waves of optimism followed by disappointment and the loss of funding known as an s or deep philosophical differences subfields have also been based on social factors particular institutions or the work of particular researchersnnthe traditional problems or goals of ai research includeand many othersnnnthe field was founded on the claim thatnnin the twentyfirst century ai techniques have experienced a resurgence following concurrent advances in helping to solve many challenging problems in computer science history thoughtcapablennthe study of mechanical orartificial neuronsthe field of ai research w

* And stop words

In [463]:
page = remove_stop_words(page)
print(page)

artificial intelligence ai sometimes called machine intelligence iss learning problem solvingthe scope ai disputed machines become increasingly capable tasks considered requiring intelligence often removed definition phenomenon known thennnartificial intelligence founded academic discipline years since experienced several waves optimism followed disappointment loss funding known deep philosophical differences subfields also based social factors particular institutions work particular researchersnnthe traditional problems goals ai research includeand many othersnnnthe field founded claim thatnnin twentyfirst century ai techniques experienced resurgence following concurrent advances helping solve many challenging problems computer science history thoughtcapablennthe study mechanical orartificial neuronsthe field ai research born atagreed writing within generationxa problem creating artificial intelligence substantially solvednnthey failed recognize difficulty remaining tasks progress slo

In [464]:
page = remove_single_characters(page)
print(page)

artificial intelligence ai sometimes called machine intelligence iss learning problem solvingthe scope ai disputed machines become increasingly capable tasks considered requiring intelligence often removed definition phenomenon known thennnartificial intelligence founded academic discipline years since experienced several waves optimism followed disappointment loss funding known deep philosophical differences subfields also based social factors particular institutions work particular researchersnnthe traditional problems goals ai research includeand many othersnnnthe field founded claim thatnnin twentyfirst century ai techniques experienced resurgence following concurrent advances helping solve many challenging problems computer science history thoughtcapablennthe study mechanical orartificial neuronsthe field ai research born atagreed writing within generationxa problem creating artificial intelligence substantially solvednnthey failed recognize difficulty remaining tasks progress slo

### Now build a model using gensim

* Start by building a dictionary of all words found in the cleaned content

In [465]:
dictionary = corpora.Dictionary(content)

2018-08-19 12:43:54,270 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-08-19 12:43:55,082 : INFO : built Dictionary(30607 unique tokens: ['aaai', 'aally', 'abandoned', 'abduction', 'ability']...) from 327 documents (total 162671 corpus positions)


* Ignore words that appear in less than 10 documents or more than 20% documents

(These settings are configurable and can produce very different results for the same document set)

In [466]:
dictionary.filter_extremes(no_below=10, no_above=0.2)

2018-08-19 12:43:55,135 : INFO : discarding 28741 tokens: [('aally', 1), ('abandoned', 1), ('abduction', 3), ('academy', 5), ('accelerating', 2), ('accepts', 2), ('accomplished', 6), ('accomplishn', 1), ('accordance', 4), ('accumulate', 3)]...
2018-08-19 12:43:55,136 : INFO : keeping 1866 tokens which were in no less than 10 and no more than 65 (=20.0%) documents
2018-08-19 12:43:55,143 : INFO : resulting dictionary: Dictionary(1866 unique tokens: ['aaai', 'ability', 'able', 'abstract', 'academic']...)


* Now create a corpus

In [467]:
corpus = [dictionary.doc2bow(text) for text in content]

In [538]:
lda = models.LdaModel(corpus, id2word=dictionary, random_state=1, num_topics=200, passes=50)

2018-08-19 13:05:01,700 : INFO : using symmetric alpha at 0.005
2018-08-19 13:05:01,701 : INFO : using symmetric eta at 0.005
2018-08-19 13:05:01,703 : INFO : using serial LDA version on this node
2018-08-19 13:05:01,752 : INFO : running online (multi-pass) LDA training, 200 topics, 50 passes over the supplied corpus of 327 documents, updating model once every 327 documents, evaluating perplexity every 327 documents, iterating 50x with a convergence threshold of 0.001000
2018-08-19 13:05:02,752 : INFO : -27.244 per-word bound, 158908680.4 perplexity estimate based on a held-out corpus of 327 documents with 73982 words
2018-08-19 13:05:02,754 : INFO : PROGRESS: pass 0, at document #327/327
2018-08-19 13:05:03,453 : INFO : topic #192 (0.005): 0.020*"cognitive" + 0.015*"solving" + 0.010*"communication" + 0.006*"people" + 0.005*"depth" + 0.005*"robot" + 0.005*"object" + 0.005*"vision" + 0.005*"functional" + 0.005*"person"
2018-08-19 13:05:03,456 : INFO : topic #145 (0.005): 0.013*"expert" 

2018-08-19 13:05:10,286 : INFO : topic #148 (0.005): 0.045*"tools" + 0.037*"potential" + 0.035*"concerns" + 0.034*"large" + 0.024*"benefits" + 0.024*"january" + 0.024*"rise" + 0.022*"business" + 0.019*"cars" + 0.018*"robust"
2018-08-19 13:05:10,288 : INFO : topic #189 (0.005): 0.029*"inference" + 0.022*"every" + 0.018*"node" + 0.014*"natural" + 0.013*"derived" + 0.012*"acting" + 0.012*"age" + 0.012*"terms" + 0.012*"acts" + 0.012*"user"
2018-08-19 13:05:10,289 : INFO : topic #95 (0.005): 0.014*"industrial" + 0.014*"workshop" + 0.014*"industry" + 0.014*"user" + 0.012*"related" + 0.011*"value" + 0.011*"aaai" + 0.011*"aims" + 0.011*"management" + 0.010*"national"
2018-08-19 13:05:10,292 : INFO : topic diff=inf, rho=0.377964
2018-08-19 13:05:11,153 : INFO : -8.830 per-word bound, 455.0 perplexity estimate based on a held-out corpus of 327 documents with 73982 words
2018-08-19 13:05:11,154 : INFO : PROGRESS: pass 6, at document #327/327
2018-08-19 13:05:11,726 : INFO : topic #41 (0.005): 0.0

2018-08-19 13:05:18,116 : INFO : topic #192 (0.005): 0.003*"cognitive" + 0.002*"solving" + 0.002*"communication" + 0.001*"people" + 0.001*"depth" + 0.001*"robot" + 0.001*"object" + 0.001*"vision" + 0.001*"functional" + 0.001*"person"
2018-08-19 13:05:18,119 : INFO : topic #7 (0.005): 0.024*"cognitive" + 0.017*"transformation" + 0.017*"concept" + 0.015*"code" + 0.013*"later" + 0.012*"real" + 0.011*"domain" + 0.011*"air" + 0.011*"format" + 0.011*"analytics"
2018-08-19 13:05:18,121 : INFO : topic #85 (0.005): 0.035*"probability" + 0.023*"output" + 0.020*"rules" + 0.020*"method" + 0.017*"theory" + 0.017*"fear" + 0.017*"trust" + 0.017*"prior" + 0.014*"programs" + 0.014*"complexity"
2018-08-19 13:05:18,124 : INFO : topic #76 (0.005): 0.002*"control" + 0.002*"able" + 0.002*"complexity" + 0.001*"network" + 0.001*"activation" + 0.001*"basic" + 0.001*"intervention" + 0.001*"management" + 0.001*"resources" + 0.001*"computing"
2018-08-19 13:05:18,127 : INFO : topic diff=inf, rho=0.277350
2018-08-1

2018-08-19 13:05:23,916 : INFO : PROGRESS: pass 17, at document #327/327
2018-08-19 13:05:24,256 : INFO : topic #141 (0.005): 0.001*"music" + 0.001*"retrieved" + 0.001*"arguments" + 0.001*"game" + 0.001*"theory" + 0.001*"notes" + 0.001*"semantics" + 0.001*"games" + 0.001*"play" + 0.001*"note"
2018-08-19 13:05:24,259 : INFO : topic #83 (0.005): 0.001*"brain" + 0.001*"network" + 0.001*"logic" + 0.001*"fuzzy" + 0.001*"neural" + 0.001*"neuron" + 0.001*"theory" + 0.001*"description" + 0.001*"node" + 0.001*"potential"
2018-08-19 13:05:24,263 : INFO : topic #68 (0.005): 0.085*"fuzzy" + 0.045*"sets" + 0.020*"vol" + 0.018*"ppxa–" + 0.016*"ieee" + 0.015*"logic" + 0.010*"uncertainty" + 0.009*"function" + 0.008*"trans" + 0.008*"words"
2018-08-19 13:05:24,267 : INFO : topic #147 (0.005): 0.098*"robot" + 0.077*"robotics" + 0.067*"robots" + 0.048*"cloud" + 0.032*"cognitive" + 0.026*"sensory" + 0.021*"communication" + 0.017*"control" + 0.013*"environment" + 0.012*"automation"
2018-08-19 13:05:24,270 :

2018-08-19 13:05:29,157 : INFO : topic diff=inf, rho=0.204124
2018-08-19 13:05:29,770 : INFO : -7.625 per-word bound, 197.4 perplexity estimate based on a held-out corpus of 327 documents with 73982 words
2018-08-19 13:05:29,773 : INFO : PROGRESS: pass 23, at document #327/327
2018-08-19 13:05:30,102 : INFO : topic #47 (0.005): 0.066*"skills" + 0.019*"years" + 0.017*"difficult" + 0.014*"us" + 0.013*"days" + 0.012*"need" + 0.011*"oxford" + 0.011*"others" + 0.011*"recently" + 0.011*"evolution"
2018-08-19 13:05:30,104 : INFO : topic #76 (0.005): 0.001*"control" + 0.001*"able" + 0.001*"complexity" + 0.001*"network" + 0.001*"activation" + 0.001*"basic" + 0.001*"intervention" + 0.001*"management" + 0.001*"resources" + 0.001*"computing"
2018-08-19 13:05:30,107 : INFO : topic #51 (0.005): 0.066*"reasoning" + 0.026*"approaches" + 0.018*"instance" + 0.015*"categories" + 0.013*"domains" + 0.013*"knowledgebased" + 0.013*"relations" + 0.012*"change" + 0.012*"last" + 0.012*"class"
2018-08-19 13:05:3

2018-08-19 13:05:34,941 : INFO : topic #185 (0.005): 0.064*"chess" + 0.035*"challenge" + 0.029*"programs" + 0.022*"game" + 0.019*"games" + 0.018*"computers" + 0.018*"moves" + 0.018*"player" + 0.017*"humans" + 0.016*"players"
2018-08-19 13:05:34,945 : INFO : topic diff=inf, rho=0.182574
2018-08-19 13:05:35,555 : INFO : -7.601 per-word bound, 194.2 perplexity estimate based on a held-out corpus of 327 documents with 73982 words
2018-08-19 13:05:35,558 : INFO : PROGRESS: pass 29, at document #327/327
2018-08-19 13:05:35,887 : INFO : topic #87 (0.005): 0.183*"project" + 0.084*"participants" + 0.030*"status" + 0.028*"background" + 0.025*"successfully" + 0.024*"projects" + 0.024*"according" + 0.024*"true" + 0.023*"future" + 0.021*"test"
2018-08-19 13:05:35,890 : INFO : topic #128 (0.005): 0.048*"natural" + 0.017*"tasks" + 0.017*"algorithms" + 0.015*"text" + 0.012*"statistical" + 0.011*"rules" + 0.010*"words" + 0.010*"generally" + 0.010*"realworld" + 0.009*"larger"
2018-08-19 13:05:35,894 : I

2018-08-19 13:05:40,934 : INFO : topic #24 (0.005): 0.122*"company" + 0.095*"website" + 0.089*"infobox" + 0.085*"logo" + 0.072*"founded" + 0.071*"million" + 0.060*"image" + 0.049*"companyn" + 0.041*"january" + 0.033*"released"
2018-08-19 13:05:40,938 : INFO : topic #14 (0.005): 0.031*"personal" + 0.018*"automated" + 0.018*"tasks" + 0.018*"sensors" + 0.017*"able" + 0.016*"robots" + 0.016*"behaviors" + 0.016*"user" + 0.015*"symbolic" + 0.014*"algorithmic"
2018-08-19 13:05:40,942 : INFO : topic diff=inf, rho=0.166667
2018-08-19 13:05:41,574 : INFO : -7.588 per-word bound, 192.3 perplexity estimate based on a held-out corpus of 327 documents with 73982 words
2018-08-19 13:05:41,577 : INFO : PROGRESS: pass 35, at document #327/327
2018-08-19 13:05:41,917 : INFO : topic #25 (0.005): 0.045*"held" + 0.040*"brain" + 0.035*"united" + 0.028*"cambridge" + 0.020*"project" + 0.020*"objective" + 0.015*"society" + 0.015*"second" + 0.011*"thought" + 0.011*"around"
2018-08-19 13:05:41,920 : INFO : topic

2018-08-19 13:05:46,876 : INFO : topic #107 (0.005): 0.084*"virtual" + 0.038*"training" + 0.020*"second" + 0.019*"provides" + 0.018*"worlds" + 0.014*"actions" + 0.013*"environment" + 0.012*"performing" + 0.012*"people" + 0.012*"scientist"
2018-08-19 13:05:46,879 : INFO : topic #70 (0.005): 0.051*"cite" + 0.050*"year" + 0.048*"title" + 0.040*"control" + 0.039*"journal" + 0.028*"volume" + 0.026*"publisher" + 0.025*"url" + 0.023*"issue" + 0.022*"author"
2018-08-19 13:05:46,882 : INFO : topic #59 (0.005): 0.001*"et" + 0.001*"vision" + 0.001*"image" + 0.001*"images" + 0.001*"al" + 0.001*"ieee" + 0.001*"semantic" + 0.001*"databases" + 0.001*"recognition" + 0.001*"database"
2018-08-19 13:05:46,886 : INFO : topic diff=inf, rho=0.154303
2018-08-19 13:05:47,469 : INFO : -7.577 per-word bound, 191.0 perplexity estimate based on a held-out corpus of 327 documents with 73982 words
2018-08-19 13:05:47,472 : INFO : PROGRESS: pass 41, at document #327/327
2018-08-19 13:05:47,807 : INFO : topic #166 (0

2018-08-19 13:05:52,676 : INFO : topic #61 (0.005): 0.042*"goals" + 0.038*"final" + 0.032*"goal" + 0.032*"resources" + 0.019*"value" + 0.019*"values" + 0.016*"people" + 0.016*"agent" + 0.016*"function" + 0.016*"might"
2018-08-19 13:05:52,678 : INFO : topic #183 (0.005): 0.025*"theorem" + 0.023*"current" + 0.021*"initial" + 0.021*"utility" + 0.020*"axioms" + 0.019*"environment" + 0.018*"proof" + 0.017*"reward" + 0.016*"variable" + 0.016*"mission"
2018-08-19 13:05:52,682 : INFO : topic #24 (0.005): 0.123*"company" + 0.096*"website" + 0.088*"infobox" + 0.084*"logo" + 0.071*"founded" + 0.070*"million" + 0.060*"image" + 0.049*"companyn" + 0.042*"january" + 0.034*"released"
2018-08-19 13:05:52,685 : INFO : topic #57 (0.005): 0.068*"neural" + 0.044*"networks" + 0.034*"network" + 0.022*"sequence" + 0.021*"deep" + 0.018*"last" + 0.017*"algorithm" + 0.014*"layers" + 0.012*"genetic" + 0.011*"weight"
2018-08-19 13:05:52,688 : INFO : topic diff=inf, rho=0.144338
2018-08-19 13:05:53,322 : INFO : -7.

In [539]:
lda.print_topics(20)

2018-08-19 13:05:55,732 : INFO : topic #120 (0.005): 0.038*"years" + 0.035*"year" + 0.029*"better" + 0.026*"less" + 0.024*"humans" + 0.022*"predicted" + 0.017*"performance" + 0.017*"left" + 0.016*"estimate" + 0.012*"found"
2018-08-19 13:05:55,736 : INFO : topic #52 (0.005): 0.032*"read" + 0.032*"write" + 0.020*"neural" + 0.018*"memory" + 0.017*"chess" + 0.013*"theories" + 0.011*"networks" + 0.011*"results" + 0.011*"output" + 0.010*"it"
2018-08-19 13:05:55,740 : INFO : topic #161 (0.005): 0.001*"fuzzy" + 0.001*"brain" + 0.001*"neurons" + 0.001*"neuron" + 0.001*"strong" + 0.001*"researchers" + 0.001*"potential" + 0.001*"neural" + 0.001*"action" + 0.001*"logic"
2018-08-19 13:05:55,743 : INFO : topic #95 (0.005): 0.017*"industry" + 0.017*"workshop" + 0.017*"user" + 0.017*"proceedings" + 0.015*"industrial" + 0.013*"aaai" + 0.013*"aims" + 0.013*"value" + 0.012*"national" + 0.012*"ethics"
2018-08-19 13:05:55,746 : INFO : topic #38 (0.005): 0.090*"function" + 0.090*"an" + 0.045*"computers" + 0

[(120,
  '0.038*"years" + 0.035*"year" + 0.029*"better" + 0.026*"less" + 0.024*"humans" + 0.022*"predicted" + 0.017*"performance" + 0.017*"left" + 0.016*"estimate" + 0.012*"found"'),
 (52,
  '0.032*"read" + 0.032*"write" + 0.020*"neural" + 0.018*"memory" + 0.017*"chess" + 0.013*"theories" + 0.011*"networks" + 0.011*"results" + 0.011*"output" + 0.010*"it"'),
 (161,
  '0.001*"fuzzy" + 0.001*"brain" + 0.001*"neurons" + 0.001*"neuron" + 0.001*"strong" + 0.001*"researchers" + 0.001*"potential" + 0.001*"neural" + 0.001*"action" + 0.001*"logic"'),
 (95,
  '0.017*"industry" + 0.017*"workshop" + 0.017*"user" + 0.017*"proceedings" + 0.015*"industrial" + 0.013*"aaai" + 0.013*"aims" + 0.013*"value" + 0.012*"national" + 0.012*"ethics"'),
 (38,
  '0.090*"function" + 0.090*"an" + 0.045*"computers" + 0.045*"every" + 0.045*"record" + 0.045*"maps" + 0.045*"agent" + 0.045*"act" + 0.045*"users" + 0.045*"application"'),
 (48,
  '0.040*"consciousness" + 0.033*"architecture" + 0.021*"nthe" + 0.018*"autonomou

### Now let's query our datset to find related documents

* Let's start by creating a get_similarity() function

In [540]:
def get_similarity(lda, q_vec):
    index = similarities.MatrixSimilarity(lda[corpus])
    sims = index[q_vec]
    return sims

* Let's manually create a subject we want to query the dataset for

In [541]:
query = "using deep learning for computer vision in real time"

* And see how our LDA model interprets this

Remember, we hare passing this through the same text cleaning functionality as the documents, so punctuation, stop words, etc. will all be removed.

This leaves us with the statistically important related words from our dictionary

In [542]:
bow = dictionary.doc2bow(content.get_cleaned_text(query).split())
words = [word for word in bow]
for word in words:
    print('{}: {}'.format(word[0], dictionary[word[0]]))

243: deep
709: real
1072: vision


* Now let's query our LDA model based on this bag of words

This will give us a vector based on our model for our query above

In [563]:
for i in range(5):
    print(bow)
    q_vec = lda[bow]
    print(q_vec)
    print('')

[(243, 1), (709, 1), (1072, 1)]
[(10, 0.25125), (80, 0.25125), (176, 0.25125)]

[(243, 1), (709, 1), (1072, 1)]
[(10, 0.25125), (80, 0.25125), (113, 0.25125003)]

[(243, 1), (709, 1), (1072, 1)]
[(10, 0.25125), (80, 0.25125), (113, 0.25125)]

[(243, 1), (709, 1), (1072, 1)]
[(10, 0.2339462), (18, 0.2733853), (80, 0.24641854)]

[(243, 1), (709, 1), (1072, 1)]
[(10, 0.25125), (80, 0.25125), (176, 0.25125)]



In [564]:
print(lda.print_topic(max(q_vec, key=lambda item: item[1])[0]))

0.058*"vision" + 0.041*"image" + 0.035*"depth" + 0.032*"camera" + 0.026*"images" + 0.015*"methods" + 0.012*"points" + 0.012*"point" + 0.011*"field" + 0.010*"object"


In [565]:
sims = get_similarity(lda, q_vec)

2018-08-19 13:07:42,548 : INFO : creating matrix with 327 documents and 200 features
  if np.issubdtype(vec.dtype, np.int):


In [566]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])

In [567]:
for idx in [page[0] for page in sims[:10]]:
    pageid = page_ids[idx]
    print('Page ID {}: {}'.format(pageid[0], content.get_page_url_by_id(pageid)[0]))

Page ID 46222904: https://en.wikipedia.org/wiki/Intel_RealSense
Page ID 52918812: https://en.wikipedia.org/wiki/NTU_RGB-D_dataset
Page ID 6596: https://en.wikipedia.org/wiki/Computer_vision
Page ID 34668189: https://en.wikipedia.org/wiki/3D_reconstruction_from_multiple_images
Page ID 586357: https://en.wikipedia.org/wiki/Artificial_general_intelligence
Page ID 2088095: https://en.wikipedia.org/wiki/IJCAI_Computers_and_Thought_Award
Page ID 2862: https://en.wikipedia.org/wiki/AI-complete
Page ID 1164: https://en.wikipedia.org/wiki/Artificial_intelligence
Page ID 47336626: https://en.wikipedia.org/wiki/Oriented_energy_filters
Page ID 195552: https://en.wikipedia.org/wiki/Artificial_consciousness
