In [1]:
import gensim
import logging
import time
import json
from pymongo import MongoClient
import nltk
import rake

# Show INFO-level progress messages (gensim logs through the standard
# logging module). The root logger level is also set directly, in addition
# to the basicConfig call.
logging.basicConfig(level=logging.INFO, format='%(levelname)s : %(message)s')
logging.getLogger().setLevel(logging.INFO)

In [2]:
# Configuration shared by the cells below: where MongoDB lives, which
# dataset file to import, and the database / collection names.
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017/'
DATASET_FILE = 'cnn0.json'            # JSON file opened by the import cell
CNN_DATABASE = 'cnn_articles'         # database holding both collections
ARTICLES_COLLECTION = 'articles'      # collection for the imported articles
CORPUS_COLLECTION = 'corpus'          # collection for the processed corpus

In [3]:
# Database setup
# A single MongoClient is shared by both collection handles; the previous
# version created a second, independent client for the same server.
mongo_client = MongoClient(MONGO_CONNECTION_STRING)
cnn_database = mongo_client[CNN_DATABASE]
articles_collection = cnn_database[ARTICLES_COLLECTION]
corpus_collection = cnn_database[CORPUS_COLLECTION]

In [24]:
"""Gets the articles from the json file and imports them to MongoDB 
in a collection called Articles
"""
# Import the articles from DATASET_FILE into the articles collection.
#
# The cursor / count() query that used to open this cell was never used (the
# collection is dropped right below), so it has been removed.
count = 0
done = 0
start = time.time()

# Start from an empty collection so that re-running the cell replaces the
# articles instead of appending a second copy.
articles_collection.drop()

# Read the whole dataset first so the file is closed before the inserts start.
with open(DATASET_FILE) as dataset:
    articles = json.load(dataset)

count = len(articles)

for data in articles:
    # Only these three fields are copied from each dataset record.
    articles_collection.insert_one({
        "url": data["url"],
        "title": data["title"],
        "text": data["text"]
    })

    done += 1
    if done % 100 == 0:
        # Progress report every 100 inserted articles. A single parenthesized
        # argument prints identically under Python 2 and Python 3.
        end = time.time()
        print('Done ' + str(done) + ' out of ' + str(count) + ' in ' + str((end - start)))


Done 100 out of 767 in 0.0854570865631
Done 200 out of 767 in 0.115850925446
Done 300 out of 767 in 0.153527021408
Done 400 out of 767 in 0.180469036102
Done 500 out of 767 in 0.206474065781
Done 600 out of 767 in 0.234987020493
Done 700 out of 767 in 0.262418031693


In [5]:
def load_stopwords(paths=('stopwords.txt', 'stopwords2.txt')):
    """Load stopwords from one or more text files, one stopword per line.

    The default files are:

    * ``stopwords.txt`` -- stopwords list created by Gerard Salton and Chris
      Buckley for the experimental SMART information retrieval system at
      Cornell University
      (source: http://www.lextek.com/manuals/onix/stopwords2.html)
    * ``stopwords2.txt`` -- custom stopwords for CNN

    Args:
        paths: iterable of file paths to read, in order. Pass a tuple or
            list, not a bare string.

    Returns:
        dict mapping every stopword (surrounding whitespace stripped) to 1,
        intended for fast ``word in stopwords`` membership tests.

    Raises:
        IOError / OSError: if one of the files cannot be opened.
    """
    stopwords = {}

    for path in paths:
        with open(path, 'r') as f:
            # splitlines() accepts \n, \r\n and \r line endings, which is what
            # the former 'rU' mode provided; 'rU' itself is rejected by
            # Python 3.11 and later.
            for line in f.read().splitlines():
                word = line.strip()
                # Skip blank lines so the empty string is never registered
                # as a stopword.
                if word:
                    stopwords[word] = 1

    return stopwords

In [6]:
from nltk.stem.wordnet import WordNetLemmatizer

def extract_lemmatized_nouns(text, stopwords=None):
    """POS-tag the non-stopword tokens of ``text`` and lemmatize its nouns.

    The text is lower-cased and split into sentences; each sentence is
    tokenized, stopwords are removed, and the remaining tokens are POS-tagged
    with ``nltk.pos_tag``.

    Args:
        text: the article text.
        stopwords: optional container supporting ``in`` (e.g. the dict
            returned by ``load_stopwords``). When omitted the stopword files
            are loaded on every call, as before. Callers processing many
            texts can load the list once and pass it in.

    Returns:
        (words, nouns) where ``words`` is a list of
        ``{"word": token, "pos": tag}`` dicts for every kept token, in text
        order, and ``nouns`` is the list of WordNet lemmas of the tokens
        tagged ``NN`` or ``NNS``, in the same order.
    """
    if stopwords is None:
        stopwords = load_stopwords()

    lemmatizer = WordNetLemmatizer()
    words = []
    nouns = []

    for sentence in nltk.sent_tokenize(text.lower()):
        tokens = nltk.word_tokenize(sentence)
        # Use a dedicated name here: the original rebound the ``text``
        # parameter to this list inside the loop.
        kept_tokens = [token for token in tokens if token not in stopwords]

        for word, tag in nltk.pos_tag(kept_tokens):
            words.append({"word": word, "pos": tag})
            # Nouns are collected in the same pass; order matches ``words``.
            if tag in ("NN", "NNS"):
                nouns.append(lemmatizer.lemmatize(word))

    return words, nouns

In [38]:
# Tokenize, lemmatize and build the corpus
#
# Loops through all the articles in the articles collection and, for each one:
#   * runs extract_lemmatized_nouns() on the article text, which returns the
#     (word, POS tag) pairs of the kept tokens and the lemmatized nouns,
#   * runs the RAKE keyword extractor on the article text,
#   * stores url, title, text, words, nouns and the RAKE keywords as one
#     document in the corpus collection.

articles_cursor = articles_collection.find()
articlesCount = articles_cursor.count()
articles_cursor.batch_size(1000)

done = 0
start = time.time()

# NOTE(review): the three numeric arguments are presumably
# (minimum characters per word, maximum words per phrase, minimum keyword
# frequency) -- confirm against the rake module in use. The comment that used
# to sit inside the loop described 5 / 3 / 4, which does not match the values
# actually passed here (3, 2, 1).
rake_object = rake.Rake("stopwords.txt", 3, 2, 1)

# Start from an empty corpus so that re-running this cell does not append a
# duplicate of every article (the import cell drops its collection the same way).
corpus_collection.drop()

for article in articles_cursor:
    words, nouns = extract_lemmatized_nouns(article['text'])
    keywords = rake_object.run(article['text'])

    corpus_collection.insert_one({
        "url": article["url"],
        "title": article["title"],
        "text": article["text"],
        "words": words,
        "nouns": nouns,
        # Keep only the first element of each RAKE result entry.
        "rake": [k[0] for k in keywords]
    })

    done += 1
    if done % 100 == 0:
        # Progress report every 100 processed articles. A single parenthesized
        # argument prints identically under Python 2 and Python 3.
        end = time.time()
        print('Done ' + str(done) + ' out of ' + str(articlesCount) + ' in ' + str((end - start)))


Done 100 out of 767 in 36.1616191864
Done 200 out of 767 in 82.3833479881
Done 300 out of 767 in 117.313628197
Done 400 out of 767 in 153.492426157
Done 500 out of 767 in 182.082192183
Done 600 out of 767 in 213.004414082
Done 700 out of 767 in 251.6025002


In [20]:
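# Train
"""Defines the helpers used to feed the articles corpus created in the previous
step to the gensim LDA model: Dictionary builds the vocabulary (keeping at most
the 10000 most frequent tokens), Corpus serializes the bag-of-words vectors, and
Train fits and saves the model. The build cells below use 50 topics.
"""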

class Corpus(object):
    """Iterable of bag-of-words vectors built from one field of each document.

    cursor              -- rewindable iterable of dict-like documents
    articles_dictionary -- object providing doc2bow(tokens)
    corpus_path         -- path handed to BleiCorpus.serialize by serialize()
    fieldname           -- document key holding the token list ("nouns" by default)
    """

    def __init__(self, cursor, articles_dictionary, corpus_path, fieldname="nouns"):
        self.fieldname = fieldname
        self.corpus_path = corpus_path
        self.articles_dictionary = articles_dictionary
        self.cursor = cursor

    def __iter__(self):
        """Rewind the cursor, then yield doc2bow(document[fieldname]) per document."""
        documents = self.cursor
        documents.rewind()
        for document in documents:
            tokens = document[self.fieldname]
            yield self.articles_dictionary.doc2bow(tokens)

    def serialize(self):
        """Serialize this corpus to corpus_path via gensim's BleiCorpus; return self."""
        writer = gensim.corpora.BleiCorpus
        writer.serialize(self.corpus_path, self, id2word=self.articles_dictionary)
        return self


class Dictionary(object):
    """Builds a gensim dictionary from one field of each document and saves it.

    cursor          -- rewindable iterable of dict-like documents
    dictionary_path -- file the built dictionary is saved to
    fieldname       -- document key holding the token list ("nouns" by default)
    """

    def __init__(self, cursor, dictionary_path, fieldname="nouns"):
        self.fieldname = fieldname
        self.dictionary_path = dictionary_path
        self.cursor = cursor

    def build(self):
        """Build, prune, compact and save the dictionary, then return it.

        Pruning calls filter_extremes(keep_n=10000); every other
        filter_extremes argument is left at its default.
        """
        self.cursor.rewind()
        token_lists = (document[self.fieldname] for document in self.cursor)
        vocabulary = gensim.corpora.Dictionary(token_lists)
        vocabulary.filter_extremes(keep_n=10000)
        vocabulary.compactify()
        vocabulary.save(self.dictionary_path)
        return vocabulary


class Train(object):
    """Namespace for fitting and saving the LDA model; call ``Train.run(...)``.

    Declared as a new-style class, consistent with Corpus and Dictionary.
    """

    @staticmethod
    def run(lda_model_path, corpus_path, num_topics, id2word, passes=10, random_state=None):
        """Fit an LDA model on a serialized corpus and save it.

        Args:
            lda_model_path: file the fitted model is saved to.
            corpus_path: path of the corpus to load with gensim's BleiCorpus.
            num_topics: number of topics, forwarded to LdaModel.
            id2word: id-to-token mapping, forwarded to LdaModel.
            passes: number of training passes, forwarded to LdaModel
                (defaults to 10, the value that used to be hard-coded).
            random_state: optional seed forwarded to LdaModel so a run can be
                reproduced. When left as None the argument is not passed at
                all, so the call is exactly what it was before.

        Returns:
            The fitted LdaModel (already saved to ``lda_model_path``).
        """
        corpus = gensim.corpora.BleiCorpus(corpus_path)

        model_options = {"num_topics": num_topics, "id2word": id2word, "passes": passes}
        if random_state is not None:
            model_options["random_state"] = random_state

        lda = gensim.models.LdaModel(corpus, **model_options)
        lda.save(lda_model_path)
        return lda

In [21]:
# Build the dictionary, the serialized corpus and the LDA model from the
# "nouns" field of the corpus collection.
LDA_NUM_TOPICS = 50
DICTIONARY_PATH = "models/nouns_dictionary.dict"
CORPUS_PATH = "models/nouns_corpus.lda-c"
LDA_MODEL_PATH = "models/nouns_lda_model_50_topics.lda"

# The same cursor is handed to both helpers; each rewinds it before iterating.
corpus_cursor = corpus_collection.find()
dictionary = Dictionary(
    cursor=corpus_cursor,
    dictionary_path=DICTIONARY_PATH,
    fieldname='nouns',
).build()
Corpus(
    cursor=corpus_cursor,
    articles_dictionary=dictionary,
    corpus_path=CORPUS_PATH,
    fieldname='nouns',
).serialize()
# Last expression of the cell: the trained model is displayed as the output.
Train.run(
    lda_model_path=LDA_MODEL_PATH,
    corpus_path=CORPUS_PATH,
    num_topics=LDA_NUM_TOPICS,
    id2word=dictionary,
)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : built Dictionary(18289 unique tokens: [u'raining', u'danpfeiffer', u'thesixtiescnn', u'imposing', u'sinjar']...) from 767 documents (total 170546 corpus positions)
INFO : discarding 14640 tokens: [(u'commemorate', 2), (u'cnn', 488), (u'ablaze', 2), (u'expects', 3), (u'eq', 1), (u'suv', 4), (u'swap', 4), (u'knob', 2), (u'touchscreen', 1), (u'sedan', 3)]...
INFO : keeping 3649 tokens which were in no less than 5 and no more than 383 (=50.0%) documents
INFO : resulting dictionary: Dictionary(3649 unique tokens: [u'paul', u'demand', u'dynamic', u'yellow', u'fdch']...)
INFO : saving Dictionary object under models/nouns_dictionary.dict, separately None
INFO : storing corpus in Blei's LDA-C format into models/nouns_corpus.lda-c
INFO : saving vocabulary of 3649 words to models/nouns_corpus.lda-c.vocab
INFO : saving BleiCorpus index to models/nouns_corpus.lda-c.index
INFO : loaded corpus index from models/nouns_corpus.lda-c.ind

<gensim.models.ldamodel.LdaModel at 0x100776cd0>

In [19]:
# Build the dictionary, the serialized corpus and the LDA model from the
# "rake" keywords field of the corpus collection.

LDA_NUM_TOPICS = 50
DICTIONARY_PATH = "models/rake_dictionary.dict"
CORPUS_PATH = "models/rake_corpus.lda-c"
LDA_MODEL_PATH = "models/rake_lda_model_50_topics.lda"

# The same cursor is handed to both helpers; each rewinds it before iterating.
corpus_cursor = corpus_collection.find()
rake_dictionary = Dictionary(
    cursor=corpus_cursor,
    dictionary_path=DICTIONARY_PATH,
    fieldname='rake',
).build()
Corpus(
    cursor=corpus_cursor,
    articles_dictionary=rake_dictionary,
    corpus_path=CORPUS_PATH,
    fieldname='rake',
).serialize()
# Last expression of the cell: the trained model is displayed as the output.
Train.run(
    lda_model_path=LDA_MODEL_PATH,
    corpus_path=CORPUS_PATH,
    num_topics=LDA_NUM_TOPICS,
    id2word=rake_dictionary,
)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : built Dictionary(46987 unique tokens: [u'fawn', u'start eating', u'newly revised', u'monkey forest', u'woods']...) from 767 documents (total 107231 corpus positions)
INFO : discarding 43332 tokens: [(u'users reading', 1), (u'350th anniversary', 1), (u'commemorate', 2), (u'skyline', 3), (u'great fire', 2), (u'river thames', 1), (u'set ablaze', 2), (u'cnn', 473), (u'entire dashboard', 1), (u'suv concept', 1)]...
INFO : keeping 3655 tokens which were in no less than 5 and no more than 383 (=50.0%) documents
INFO : resulting dictionary: Dictionary(3655 unique tokens: [u'limited', u'make landfall', u'personally', u'facilities', u'protest']...)
INFO : saving Dictionary object under models/rake_dictionary.dict, separately None
INFO : storing corpus in Blei's LDA-C format into models/rake_corpus.lda-c
INFO : saving vocabulary of 3655 words to models/rake_corpus.lda-c.vocab
INFO : saving BleiCorpus index to models/rake_corpus.l

<gensim.models.ldamodel.LdaModel at 0x1007760d0>

In [22]:
# Predict
# Choose explicitly which trained artefacts to load instead of relying on
# whichever build cell assigned the *_PATH globals last. Run top to bottom,
# the previous version would pick up the rake paths, whereas the output
# recorded for this cell shows the nouns model being loaded.
# Set MODEL_KIND to "rake" to inspect the RAKE-keyword model instead.
MODEL_KIND = "nouns"
DICTIONARY_PATH = "models/%s_dictionary.dict" % MODEL_KIND
CORPUS_PATH = "models/%s_corpus.lda-c" % MODEL_KIND
LDA_MODEL_PATH = "models/%s_lda_model_50_topics.lda" % MODEL_KIND

corpus = gensim.corpora.BleiCorpus(CORPUS_PATH)
diction = gensim.corpora.Dictionary.load(DICTIONARY_PATH)
lda = gensim.models.LdaModel.load(LDA_MODEL_PATH)

INFO : loaded corpus index from models/nouns_corpus.lda-c.index
INFO : loading corpus from models/nouns_corpus.lda-c
INFO : loading Dictionary object from models/nouns_dictionary.dict
INFO : loading LdaModel object from models/nouns_lda_model_50_topics.lda
INFO : loading id2word recursively from models/nouns_lda_model_50_topics.lda.id2word.* with mmap=None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute dispatcher to None
INFO : loading LdaModel object from models/nouns_lda_model_50_topics.lda.state


In [23]:
import pyLDAvis.gensim as gensimvis
import pyLDAvis

# enable_notebook has to be *called*; the bare attribute reference that was
# here before evaluated the function object and discarded it, doing nothing.
pyLDAvis.enable_notebook()

# Prepare the visualization data from the loaded model, corpus and dictionary,
# then render it as the cell output.
vis_data = gensimvis.prepare(lda, corpus, diction)
pyLDAvis.display(vis_data)