# LDA with Gensim

* Start by importing python libraries that we'll be using

In [138]:
%load_ext autoreload
%autoreload 2

import sqlite3
import string
import logging
from gensim import corpora
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

* Let's set up some basic config

In [139]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

punctuation = set(string.punctuation)
stoplist = set(stopwords.words('english'))

stemmer = SnowballStemmer("english")
lemma = WordNetLemmatizer()

### Create a class for our content

This class will use the database that we created when we downloaded content from Wikipedia

In [142]:
class ContentStore:
    def __init__(self, db_file):
        """
        Intialise the crawl_wikipedia class, set up a
        lightweight database for storing content for later use
        :param db_file:
        """
        self.categories = []
        # Connect to the DB db
        self.conn = sqlite3.connect(db_file)
        self.cursor = self.conn.cursor()

    def get_page_urls(self):
        """
        Retrieve a list of urls from the database
        :return: list of urls
        """
        return [row for row in self.cursor.execute("SELECT url FROM content")]
    
    def get_page_ids(self):
        """
        Retrieve a list of page ids from the database
        :return: list of page id tuples
        """
        return [row for row in self.cursor.execute("SELECT pageid FROM content")]   
    
    def get_page_by_id(self, pageid):
        """
        Retrieve the page with the specified pageid
        Note that this is of the format (pageid, ) for SQLite3 to work, for example
        to get the page with the id of 1 in our database, set pageid to ('1', )
        :return: string
        """
        return str(self.cursor.execute("SELECT content FROM content WHERE pageid=?", pageid).fetchone()).lower()
    
    def get_cleaned_page_by_id(self, pageid):
        page = self.get_page_by_id(pageid)
        # Remove numbers
        page = ''.join(char for char in page if not char.isdigit())
        # Remove stop words
        page = ' '.join([word for word in page.split() if word not in stoplist])
        # Remove punctuation
        page = ''.join(char for char in words if char not in punctuation)
        # Remove single character words
        page = ' '.join([word for word in page.split() if len(word) > 1])
        return page
        
    def __iter__(self):
        for page_id in get_page_ids():
            yield self.get_cleaned_page_by_id(page_id)
            

### Remove punctuation from text

In [143]:
def remove_punctuation(text):
    """
    Remove punctuation from text by checking each character against a set of punctation characters
    :return: string
    """
    return ''.join(char for char in text if char not in punctuation)

### Remove numbers from text

In [144]:
def remove_numbers(text):
    """
    Remove numbers from text as they aren't of value to our model
    :return: string
    """
    return ''.join(char for char in text if not char.isdigit())

### Remove stop words from text

In [145]:
def remove_stop_words(text):
    """
    Remove common words as they won't add any value to our model
    :return: string
    """
    return ' '.join([word for word in text.split() if word not in stoplist])

### Remove single character words

In [146]:
def remove_single_characters(text):
    """
    Remove any remaining single-character words
    :return: string
    """
    return ' '.join([word for word in text.split() if len(word) > 1])

### Lemmatize our document

In [147]:
def lemmatize(text):
    return text

### Stemmatize our document

In [148]:
def stemmatize(text):
    return text

### Exploring and cleaning our content

* Access our database of Wikipedia content and get a list of all of the pages IDs

In [149]:
content = ContentStore('../content.db')
page_ids = content.get_page_ids()

* Let's view a page from our database

In [150]:
page = content.get_page_by_id(page_ids[0])
print(page)

('    \'\'\'artificial intelligence\'\'\' (\'\'\'ai\'\'\'), sometimes called \'\'\'machine intelligence\'\'\', is s, such as "learning" and "problem solving". the scope of ai is disputed: as machines become increasingly capable, tasks considered as requiring "intelligence" are often removed from the definition, a phenomenon known as the .\n\n\nartificial intelligence was founded as an academic discipline in 1956, and in the years since has experienced several waves of optimism, followed by disappointment and the loss of funding (known as an " s), or deep philosophical differences. subfields have also been based on social factors (particular institutions or the work of particular researchers).\n\nthe traditional problems (or goals) of ai research include and many others.\n\n\nthe field was founded on the claim that .\n\nin the twenty-first century, ai techniques have experienced a resurgence following concurrent advances in , helping to solve many challenging problems in computer scienc

* Now let's remove punctuation

In [131]:
page = remove_punctuation(page)
print(page)

    Artificial intelligence AI sometimes called machine intelligence is s such as learning and problem solving The scope of AI is disputed as machines become increasingly capable tasks considered as requiring intelligence are often removed from the definition a phenomenon known as the nnnArtificial intelligence was founded as an academic discipline in 1956 and in the years since has experienced several waves of optimism followed by disappointment and the loss of funding known as an  s or deep philosophical differences Subfields have also been based on social factors particular institutions or the work of particular researchersnnThe traditional problems or goals of AI research include and many othersnnnThe field was founded on the claim that nnIn the twentyfirst century AI techniques have experienced a resurgence following concurrent advances in  helping to solve many challenging problems in computer science  History   Thoughtcapable nnThe study of mechanical or artificial neurons The f

* Remove numbers

In [134]:
page = remove_numbers(page)
print(page)

    Artificial intelligence AI sometimes called machine intelligence is s such as learning and problem solving The scope of AI is disputed as machines become increasingly capable tasks considered as requiring intelligence are often removed from the definition a phenomenon known as the nnnArtificial intelligence was founded as an academic discipline in  and in the years since has experienced several waves of optimism followed by disappointment and the loss of funding known as an  s or deep philosophical differences Subfields have also been based on social factors particular institutions or the work of particular researchersnnThe traditional problems or goals of AI research include and many othersnnnThe field was founded on the claim that nnIn the twentyfirst century AI techniques have experienced a resurgence following concurrent advances in  helping to solve many challenging problems in computer science  History   Thoughtcapable nnThe study of mechanical or artificial neurons The field

* And stop words

In [135]:
page = remove_stop_words(page)
print(page)

Artificial intelligence AI sometimes called machine intelligence learning problem solving The scope AI disputed machines become increasingly capable tasks considered requiring intelligence often removed definition phenomenon known nnnArtificial intelligence founded academic discipline years since experienced several waves optimism followed disappointment loss funding known deep philosophical differences Subfields also based social factors particular institutions work particular researchersnnThe traditional problems goals AI research include many othersnnnThe field founded claim nnIn twentyfirst century AI techniques experienced resurgence following concurrent advances helping solve many challenging problems computer science History Thoughtcapable nnThe study mechanical artificial neurons The field AI research born agreed writing within generationxa problem creating artificial intelligence substantially solvednnThey failed recognize difficulty remaining tasks Progress slowed response cr

In [137]:
page = remove_single_characters(page)
print(page)

Artificial intelligence AI sometimes called machine intelligence learning problem solving The scope AI disputed machines become increasingly capable tasks considered requiring intelligence often removed definition phenomenon known nnnArtificial intelligence founded academic discipline years since experienced several waves optimism followed disappointment loss funding known deep philosophical differences Subfields also based social factors particular institutions work particular researchersnnThe traditional problems goals AI research include many othersnnnThe field founded claim nnIn twentyfirst century AI techniques experienced resurgence following concurrent advances helping solve many challenging problems computer science History Thoughtcapable nnThe study mechanical artificial neurons The field AI research born agreed writing within generationxa problem creating artificial intelligence substantially solvednnThey failed recognize difficulty remaining tasks Progress slowed response cr