# LDA with Gensim

* Start by importing python libraries that we'll be using

In [315]:
%load_ext autoreload
%autoreload 2

import sqlite3
import string
import logging

from gensim import corpora, utils, models, similarities
from collections import defaultdict

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

* Let's set up some basic config

In [316]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

punctuation = set(string.punctuation)
stoplist = set(stopwords.words('english'))

frequency = defaultdict(int)

stemmer = SnowballStemmer("english")
lemma = WordNetLemmatizer()

dictionary = corpora.Dictionary()

### Create a class for our content

This class will use the database that we created when we downloaded content from Wikipedia

In [327]:
class ContentStore:
    def __init__(self, db_file):
        """
        Intialise the crawl_wikipedia class, set up a
        lightweight database for storing content for later use
        :param db_file:
        """
        self.categories = []
        # Connect to the DB db
        self.conn = sqlite3.connect(db_file)
        self.cursor = self.conn.cursor()

    def get_page_urls(self):
        """
        Retrieve a list of urls from the database
        :return: list of urls
        """
        return [row for row in self.cursor.execute("SELECT url FROM content")]
    
    def get_page_ids(self):
        """
        Retrieve a list of page ids from the database
        :return: list of page id tuples
        """
        return [row for row in self.cursor.execute("SELECT pageid FROM content")]   
    
    def get_page_by_id(self, pageid):
        """
        Retrieve the page with the specified pageid
        Note that this is of the format (pageid, ) for SQLite3 to work, for example
        to get the page with the id of 1 in our database, set pageid to ('1', )
        :return: string
        """
        return str(self.cursor.execute("SELECT content FROM content WHERE pageid=?", pageid).fetchone()).lower()
    
    def get_page_url_by_id(self, pageid):
        """
        Retrieve the page with the specified pageid
        Note that this is of the format (pageid, ) for SQLite3 to work, for example
        to get the page with the id of 1 in our database, set pageid to ('1', )
        :return: string
        """
        return str(self.cursor.execute("SELECT url FROM content WHERE pageid=?", pageid).fetchone())
    
    def get_cleaned_text(self, text):
        """
        Return the page with stopwords, digits, punctuation and single character words removed
        :return: string
        """
        # Remove numbers
        text = ''.join(char for char in text if not char.isdigit())
        # Remove stop words
        text = ' '.join([word for word in text.split() if word not in stoplist])
        # Remove punctuation
        text = ''.join(char for char in text if char not in punctuation)
        # Remove single character words
        text = ' '.join([word for word in text.split() if len(word) > 1])
        return text
        
    def __iter__(self):
        for pageid in self.get_page_ids():
            page = self.get_page_by_id(pageid)
            yield self.get_cleaned_text(page).split()
        
#     def __iter__(self):
#         self.pageids = self.get_page_ids()
#         self.idx = 0
#         return self
    
#     def __next__(self):
#         if self.idx <= len(self.pageids):
#             pageid = self.pageids(self.idx)
#             page = dictionary.doc2bow(self.get_cleaned_page_by_id(pageid))
#             self.idx += 1
#             return page
#         else:
#             raise StopIteration
          

### Remove punctuation from text

In [328]:
def remove_punctuation(text):
    """
    Remove punctuation from text by checking each character against a set of punctation characters
    :return: string
    """
    return ''.join(char for char in text if char not in punctuation)

### Remove numbers from text

In [329]:
def remove_numbers(text):
    """
    Remove numbers from text as they aren't of value to our model
    :return: string
    """
    return ''.join(char for char in text if not char.isdigit())

### Remove stop words from text

In [330]:
def remove_stop_words(text):
    """
    Remove common words as they won't add any value to our model
    :return: string
    """
    return ' '.join([word for word in text.split() if word not in stoplist])

### Remove single character words

In [331]:
def remove_single_characters(text):
    """
    Remove any remaining single-character words
    :return: string
    """
    return ' '.join([word for word in text.split() if len(word) > 1])

### Lemmatize our document

In [332]:
def lemmatize(text):
    return text

### Stemmatize our document

In [333]:
def stemmatize(text):
    return text

### Exploring and cleaning our content

* Access our database of Wikipedia content and get a list of all of the pages IDs

In [334]:
content = ContentStore('../content.db')
page_ids = content.get_page_ids()

* Let's view a page from our database

In [335]:
page = content.get_page_by_id(page_ids[0])
print(page)

('    \'\'\'artificial intelligence\'\'\' (\'\'\'ai\'\'\'), sometimes called \'\'\'machine intelligence\'\'\', is s, such as "learning" and "problem solving". the scope of ai is disputed: as machines become increasingly capable, tasks considered as requiring "intelligence" are often removed from the definition, a phenomenon known as the .\n\n\nartificial intelligence was founded as an academic discipline in 1956, and in the years since has experienced several waves of optimism, followed by disappointment and the loss of funding (known as an " s), or deep philosophical differences. subfields have also been based on social factors (particular institutions or the work of particular researchers).\n\nthe traditional problems (or goals) of ai research include and many others.\n\n\nthe field was founded on the claim that .\n\nin the twenty-first century, ai techniques have experienced a resurgence following concurrent advances in , helping to solve many challenging problems in computer scienc

* Now let's remove punctuation

In [336]:
page = remove_punctuation(page)
print(page)

    artificial intelligence ai sometimes called machine intelligence is s such as learning and problem solving the scope of ai is disputed as machines become increasingly capable tasks considered as requiring intelligence are often removed from the definition a phenomenon known as the nnnartificial intelligence was founded as an academic discipline in 1956 and in the years since has experienced several waves of optimism followed by disappointment and the loss of funding known as an  s or deep philosophical differences subfields have also been based on social factors particular institutions or the work of particular researchersnnthe traditional problems or goals of ai research include and many othersnnnthe field was founded on the claim that nnin the twentyfirst century ai techniques have experienced a resurgence following concurrent advances in  helping to solve many challenging problems in computer science  history   thoughtcapable nnthe study of mechanical or artificial neurons the f

* Remove numbers

In [337]:
page = remove_numbers(page)
print(page)

    artificial intelligence ai sometimes called machine intelligence is s such as learning and problem solving the scope of ai is disputed as machines become increasingly capable tasks considered as requiring intelligence are often removed from the definition a phenomenon known as the nnnartificial intelligence was founded as an academic discipline in  and in the years since has experienced several waves of optimism followed by disappointment and the loss of funding known as an  s or deep philosophical differences subfields have also been based on social factors particular institutions or the work of particular researchersnnthe traditional problems or goals of ai research include and many othersnnnthe field was founded on the claim that nnin the twentyfirst century ai techniques have experienced a resurgence following concurrent advances in  helping to solve many challenging problems in computer science  history   thoughtcapable nnthe study of mechanical or artificial neurons the field

* And stop words

In [338]:
page = remove_stop_words(page)
print(page)

artificial intelligence ai sometimes called machine intelligence learning problem solving scope ai disputed machines become increasingly capable tasks considered requiring intelligence often removed definition phenomenon known nnnartificial intelligence founded academic discipline years since experienced several waves optimism followed disappointment loss funding known deep philosophical differences subfields also based social factors particular institutions work particular researchersnnthe traditional problems goals ai research include many othersnnnthe field founded claim nnin twentyfirst century ai techniques experienced resurgence following concurrent advances helping solve many challenging problems computer science history thoughtcapable nnthe study mechanical artificial neurons field ai research born agreed writing within generationxa problem creating artificial intelligence substantially solvednnthey failed recognize difficulty remaining tasks progress slowed response criticism 

In [339]:
page = remove_single_characters(page)
print(page)

artificial intelligence ai sometimes called machine intelligence learning problem solving scope ai disputed machines become increasingly capable tasks considered requiring intelligence often removed definition phenomenon known nnnartificial intelligence founded academic discipline years since experienced several waves optimism followed disappointment loss funding known deep philosophical differences subfields also based social factors particular institutions work particular researchersnnthe traditional problems goals ai research include many othersnnnthe field founded claim nnin twentyfirst century ai techniques experienced resurgence following concurrent advances helping solve many challenging problems computer science history thoughtcapable nnthe study mechanical artificial neurons field ai research born agreed writing within generationxa problem creating artificial intelligence substantially solvednnthey failed recognize difficulty remaining tasks progress slowed response criticism 

### Now build a model using gensim

* Start by building a dictionary of all words found in the cleaned content

In [340]:
dictionary = corpora.Dictionary(content)

2018-08-19 10:48:08,127 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-08-19 10:48:13,606 : INFO : built Dictionary(28094 unique tokens: ['aaai', 'abandoned', 'abduction', 'ability', 'able']...) from 2284 documents (total 1154234 corpus positions)


* Ignore words that appear in less than 20 documents or more than 10% documents

In [341]:
dictionary.filter_extremes(no_below=20, no_above=0.1)

2018-08-19 10:48:13,654 : INFO : discarding 22661 tokens: [('abandoned', 8), ('ability', 429), ('able', 457), ('academic', 233), ('accelerating', 15), ('accepts', 15), ('accomplishn', 8), ('according', 323), ('achievementsn', 8), ('acknowledge', 15)]...
2018-08-19 10:48:13,655 : INFO : keeping 5433 tokens which were in no less than 20 and no more than 228 (=10.0%) documents
2018-08-19 10:48:13,662 : INFO : resulting dictionary: Dictionary(5433 unique tokens: ['aaai', 'abduction', 'abstract', 'academy', 'accepted']...)


* Now create a corpus

In [342]:
corpus = [dictionary.doc2bow(text) for text in content]

In [343]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=100)

2018-08-19 10:48:18,550 : INFO : using symmetric alpha at 0.01
2018-08-19 10:48:18,551 : INFO : using symmetric eta at 0.01
2018-08-19 10:48:18,553 : INFO : using serial LDA version on this node
2018-08-19 10:48:18,618 : INFO : running online (single-pass) LDA training, 100 topics, 1 passes over the supplied corpus of 2284 documents, updating model once every 2000 documents, evaluating perplexity every 2284 documents, iterating 50x with a convergence threshold of 0.001000
2018-08-19 10:48:18,620 : INFO : PROGRESS: pass 0, at document #2000/2284
2018-08-19 10:48:21,796 : INFO : merging changes from 2000 documents into a model of 2284 documents
2018-08-19 10:48:21,865 : INFO : topic #9 (0.010): 0.081*"fuzzy" + 0.012*"interval" + 0.008*"ppxa–" + 0.005*"uncertainty" + 0.005*"membership" + 0.005*"no" + 0.005*"evolving" + 0.005*"scopecol" + 0.004*"trans" + 0.004*"stylewidth"
2018-08-19 10:48:21,866 : INFO : topic #81 (0.010): 0.005*"simulation" + 0.005*"genesis" + 0.004*"situated" + 0.004*"c

In [344]:
lda.print_topics(20)

2018-08-19 10:48:22,987 : INFO : topic #55 (0.010): 0.015*"fuzzy" + 0.007*"neurons" + 0.007*"transparent" + 0.007*"opaque" + 0.007*"advantages" + 0.007*"nnas" + 0.006*"medical" + 0.006*"trained" + 0.005*"learned" + 0.005*"complicated"
2018-08-19 10:48:22,990 : INFO : topic #97 (0.010): 0.023*"industrial" + 0.015*"acquisition" + 0.012*"rationality" + 0.011*"principle" + 0.007*"false" + 0.007*"safety" + 0.006*"publisher" + 0.006*"isbn" + 0.006*"newell" + 0.006*"location"
2018-08-19 10:48:22,991 : INFO : topic #87 (0.010): 0.012*"appearance" + 0.011*"students" + 0.008*"male" + 0.008*"female" + 0.007*"aligncenter" + 0.007*"stylewidth" + 0.006*"scopecol" + 0.006*"emotions" + 0.006*"enhance" + 0.006*"cognition"
2018-08-19 10:48:22,993 : INFO : topic #21 (0.010): 0.026*"legal" + 0.022*"cloud" + 0.016*"robotics" + 0.010*"law" + 0.009*"services" + 0.008*"access" + 0.007*"service" + 0.006*"corporation" + 0.005*"uncertain" + 0.005*"potential"
2018-08-19 10:48:22,994 : INFO : topic #79 (0.010): 0.

[(55,
  '0.015*"fuzzy" + 0.007*"neurons" + 0.007*"transparent" + 0.007*"opaque" + 0.007*"advantages" + 0.007*"nnas" + 0.006*"medical" + 0.006*"trained" + 0.005*"learned" + 0.005*"complicated"'),
 (97,
  '0.023*"industrial" + 0.015*"acquisition" + 0.012*"rationality" + 0.011*"principle" + 0.007*"false" + 0.007*"safety" + 0.006*"publisher" + 0.006*"isbn" + 0.006*"newell" + 0.006*"location"'),
 (87,
  '0.012*"appearance" + 0.011*"students" + 0.008*"male" + 0.008*"female" + 0.007*"aligncenter" + 0.007*"stylewidth" + 0.006*"scopecol" + 0.006*"emotions" + 0.006*"enhance" + 0.006*"cognition"'),
 (21,
  '0.026*"legal" + 0.022*"cloud" + 0.016*"robotics" + 0.010*"law" + 0.009*"services" + 0.008*"access" + 0.007*"service" + 0.006*"corporation" + 0.005*"uncertain" + 0.005*"potential"'),
 (79,
  '0.025*"aligncenter" + 0.022*"neuron" + 0.019*"neurons" + 0.015*"potential" + 0.014*"tomas" + 0.014*"sun" + 0.011*"spike" + 0.010*"connectivity" + 0.009*"victor" + 0.008*"autonomic"'),
 (9,
  '0.137*"fuzzy"

In [345]:
def get_similarity(lda, q_vec):
    index = similarities.MatrixSimilarity(lda[corpus])
    sims = index[q_vec]
    return sims

In [346]:
query = "who invented deep learning"
bow = dictionary.doc2bow(content.get_cleaned_text(query).split())

In [347]:
bow

[(346, 1), (2496, 1)]

In [348]:
print(dictionary[346], dictionary[2496])

deep invented


In [349]:
q_vec = lda[bow]
print(q_vec)

[(67, 0.67)]


In [350]:
print(lda.print_topic(max(lda_vector, key=lambda item: item[1])[0]))

0.030*"music" + 0.016*"february" + 0.014*"weak" + 0.013*"retrieved" + 0.012*"length" + 0.012*"narrow" + 0.011*"extra" + 0.007*"games" + 0.007*"oct" + 0.006*"lab"


In [351]:
sims = get_similarity(lda, q_vec)

2018-08-19 10:48:25,700 : INFO : creating matrix with 2284 documents and 100 features
  if np.issubdtype(vec.dtype, np.int):


In [352]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])

In [353]:
sims[:5]

[(83, 1.0), (208, 1.0), (409, 1.0), (534, 1.0), (735, 1.0)]

In [354]:
for idx in [4, 208, 323, 330, 534]:
    pageid = page_ids[idx]
    print('Page ID {}: {}'.format(pageid, content.get_page_url_by_id(pageid)))

Page ID ('43470933',): ('https://en.wikipedia.org/wiki/0music',)
Page ID ('53587467',): ('https://en.wikipedia.org/wiki/Outline_of_machine_learning',)
Page ID ('52968552',): ('https://en.wikipedia.org/wiki/WordDive',)
Page ID ('43470933',): ('https://en.wikipedia.org/wiki/0music',)
Page ID ('53587467',): ('https://en.wikipedia.org/wiki/Outline_of_machine_learning',)
