# Import Libraries

In [1]:
import pandas as pd

In [2]:
from multiprocessing import cpu_count

In [3]:
import string
import os

In [4]:
%%time 
import spacy
# Load the spaCy pipeline registered under the name 'en'; later cells use it
# through the global `nlp` (entity rendering and the lemmatization pipeline).
nlp = spacy.load('en')

CPU times: user 3.86 s, sys: 420 ms, total: 4.28 s
Wall time: 2.6 s


In [5]:
# Every input file and cached artefact of this notebook lives under ./data.
base_directory = os.path.join(os.curdir, 'data')

Example

In [6]:
# Sample news paragraph used to show spaCy's named-entity annotations.
example_text = (
    'South African President Cyril Ramaphosa treaded a fine line between his ruling party’s '
    'rival factions when he named steady hands to key cabinet posts and purged some of his '
    'predecessor Jacob Zuma’s most ineffectual appointees.  '
    'While keeping a handful of Zuma loyalists in largely minor positions, Ramaphosa brought '
    'back as finance minister Nhlanhla Nene, who Zuma replaced in late 2015 in a decision that '
    'caused chaos in the markets. '
    'Pravin Gordhan, who won over investors during two stints in the same job before running '
    'afoul of Zuma, will oversee the six of the biggest state companies that are mostly '
    'financially strapped and mired in graft allegations.'
)

# Parse the paragraph and render its entities inline in the notebook.
example_doc = nlp(example_text)
spacy.displacy.render(example_doc, style='ent', minify=True, jupyter=True)

# Lemmatization Pipeline

In [7]:
# Input corpus in JSON-lines format (read with lines=True below).
data_file = os.path.join(base_directory, 'sens_data.jsonlines')

In [8]:
# Open the JSON-lines corpus as a chunked reader that hands out one record
# per chunk, so the whole file is never loaded at once.
sens_data = pd.read_json(
    data_file,
    lines=True,
    chunksize=1,
    convert_dates=True,
)

In [9]:
# Output of the lemmatization step: plain text, one lemmatized sentence per line.
lemma_file = os.path.join(base_directory, 'json_lines_lemmas.txt')

In [10]:
def is_punct_helper(token):
    """Tell whether a token should be dropped from the lemmatized output.

    A token is dropped when it is punctuation, whitespace, or number-like.
    The flags are checked in that order and the first truthy flag is
    returned; if none is truthy the value of ``like_num`` is returned.
    """
    verdict = False
    for flag_name in ('is_punct', 'is_space', 'like_num'):
        verdict = getattr(token, flag_name)
        if verdict:
            # stop at the first flag that is set; later flags are not read
            break
    return verdict


def lemmatized_sentence_corpus(generator):
    """Parse documents with spaCy and append their lemmatized sentences to ``lemma_file``.

    Each item of ``generator`` must support ``item['text'].values`` (the
    notebook passes the chunked reader ``sens_data``). The string form of
    those values has every ASCII digit and punctuation character replaced by
    a space and is then cut to ``[250:-559]`` before being parsed.

    One line is written per spaCy sentence: the space-joined lemmas of the
    tokens for which ``is_punct_helper`` is false.

    Returns None. The output file is opened in append mode, so repeated
    calls add to an existing file instead of replacing it.
    """
    # Characters blanked out before parsing: all ASCII digits and punctuation.
    # The previous expression, string.punctuation.replace("", "."), merely
    # interleaved extra '.' characters and therefore described this same set.
    # NOTE(review): the intent may have been replace(".", "") so that
    # sentence-ending periods survive - confirm before changing.
    stripped_chars = string.digits + string.punctuation
    # Built once here instead of once per document.
    blank_table = str.maketrans(stripped_chars, ' ' * len(stripped_chars))

    def clean_text(chunk):
        """Stringify a chunk's text values, blank unwanted characters, trim."""
        # NOTE(review): the fixed [250:-559] window presumably trims
        # boilerplate around each document - confirm. Strings shorter than
        # that window come out empty.
        return str(chunk['text'].values).translate(blank_table)[250:-559]

    with open(lemma_file, 'a', encoding='utf_8') as f:
        # NOTE(review): confirm the `disable` entries match component names
        # of the installed spaCy version; they are passed through unchanged.
        parsed_reviews = nlp.pipe(map(clean_text, generator),
                                  batch_size=1,
                                  n_threads=cpu_count(),
                                  disable=['is_stop', 'is_alpha', 'ner', 'tag', 'dep'])

        for parsed_review in parsed_reviews:
            for sent in parsed_review.sents:
                lemmas = [token.lemma_ for token in sent
                          if not is_punct_helper(token)]
                f.write(u' '.join(lemmas) + '\n')

In [11]:
%%time

# Run the (slow) lemmatization only when its output file is not on disk yet.
if not os.path.exists(lemma_file):
    lemmatized_sentence_corpus(sens_data)
else:
    print("A lemma file already exists. ")

A lemma file already exists. 
CPU times: user 1.5 ms, sys: 155 µs, total: 1.66 ms
Wall time: 816 µs


# Phrase Modelling
Learning combinations of tokens that together represent meaningful multi-word concepts.  
  
$\frac{count(AB) - count_{min}}{count(A) \times count(B)} \times N > threshold$

Where count is the number of times a token appears in the corpus.  
N is the total size of the corpus.  
  
 Gensim is a library for statistical analysis of sentences.  

In [12]:
%%time
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

CPU times: user 500 ms, sys: 20 ms, total: 520 ms
Wall time: 428 ms


## Unigrams

In [13]:
# Wrap the lemma file in gensim's LineSentence so later cells can iterate
# over it sentence by sentence (preview loop, bigram model training).
unigram_sentences = LineSentence(lemma_file)

In [14]:
import itertools
# Show sentences 235-239 of the unigram corpus, each followed by a line
# holding a single space.
for unigram_sentence in itertools.islice(unigram_sentences, 235, 240):
    print(u' '.join(unigram_sentence), " ", sep='\n')

position
 
no of
 
no of
 
deem value sar rs of rs at allocate grant grant date
 
rand ag kirk director of a major subsidiary pw spies director of a major subsidiary
 


## Bigrams

In [15]:
# Location of the saved bigram Phrases model (used with Phrases.save / Phrases.load below).
bigram_file = os.path.join(base_directory, 'bigram_file.txt')

In [16]:
%%time

# Train the bigram phrase model once and cache it; later runs reload the cache.
if not os.path.exists(bigram_file):
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(bigram_file)
else:
    print("A bigram file already exists. ")
    bigram_model = Phrases.load(bigram_file)

A bigram file already exists. 
CPU times: user 2.17 s, sys: 192 ms, total: 2.36 s
Wall time: 2.36 s


In [17]:
# Text file receiving the corpus after the bigram model has been applied.
bigram_sentences_filepath = os.path.join(base_directory, 'bigram_sentences_all.txt')

In [18]:
%%time
# Apply the bigram model to every unigram sentence and store the result,
# one sentence per line, unless that file was already produced.
if not os.path.exists(bigram_sentences_filepath):
    with open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        f.writelines(u' '.join(bigram_model[unigram_sentence]) + '\n'
                     for unigram_sentence in unigram_sentences)
else:
    print("A bigram file already exists. ")

A bigram file already exists. 
CPU times: user 98 µs, sys: 9 µs, total: 107 µs
Wall time: 188 µs


In [19]:
# Wrap the bigram sentence file in LineSentence so later cells can iterate
# over it sentence by sentence (preview loop, trigram model training).
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [20]:
# Show sentences 235-239 of the bigram corpus, each followed by an empty line.
for bigram_sentence in itertools.islice(bigram_sentences, 235, 240):
    print(u' '.join(bigram_sentence), end='\n\n')

position

no of

no of

deem value sar rs of rs at allocate grant grant date

rand ag_kirk director of a major_subsidiary pw spies director of a major_subsidiary



## Trigrams

In [21]:
# Location of the saved trigram Phrases model.
trigram_model_filepath = os.path.join(base_directory, 'trigram_model_all')

In [22]:
%%time

# Training is somewhat time consuming, so it only happens when no saved
# model is found; delete the saved model file to force a retrain.
if not os.path.exists(trigram_model_filepath):
    # second Phrases pass, this time over the bigram-merged sentences
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(trigram_model_filepath)
else:
    print("A trigram model already exists. ")
    # load the finished model from disk
    trigram_model = Phrases.load(trigram_model_filepath)

A trigram model already exists. 
CPU times: user 2.38 s, sys: 164 ms, total: 2.54 s
Wall time: 2.54 s


In [23]:
# Text file receiving the corpus after the trigram model has been applied.
trigram_sentences_filepath = os.path.join(base_directory, 'trigram_sentences_all.txt')

In [24]:
%%time

# Apply the trigram model to every bigram sentence and store the result,
# one sentence per line, unless that file was already produced.
if os.path.exists(trigram_sentences_filepath):
    # The guard checks the sentences file, not the model, so say so.
    print("A trigram sentences file already exists. ")
else:
    with open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')

A trigram model already exists. 
CPU times: user 212 µs, sys: 19 µs, total: 231 µs
Wall time: 153 µs


In [25]:
# Wrap the trigram sentence file in LineSentence so later cells can iterate
# over it sentence by sentence (preview loop, dictionary learning).
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [26]:
# Show sentences 235-239 of the trigram corpus, each followed by an empty line.
for trigram_sentence in itertools.islice(trigram_sentences, 235, 240):
    print(u' '.join(trigram_sentence), end='\n\n')

position

no of

no of

deem value sar rs of rs at allocate grant grant date

rand ag_kirk director of a major_subsidiary pw spies director of a major_subsidiary



## LDA

In [27]:
! pip install pyldavis



In [28]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle

In [29]:
# Location of the saved gensim Dictionary built from the trigram sentences.
trigram_dictionary_filepath = os.path.join(base_directory, 'trigram_dict_all.dict')

In [30]:
%%time

# Learning the dictionary is somewhat time consuming, so it only happens
# when no saved dictionary is found; delete the saved file to rebuild it.
if not os.path.exists(trigram_dictionary_filepath):
    # learn the vocabulary by iterating over all trigram sentences
    trigram_dictionary = Dictionary(trigram_sentences)

    # drop tokens that are very rare or too common (filter_extremes), then
    # reassign integer ids (compactify); thresholds used to be 10 and 0.4
    trigram_dictionary.filter_extremes(no_below=15, no_above=0.3)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)
else:
    print("A trigram dictionary already exists. ")
    trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

A trigram dictionary already exists. 
CPU times: user 15.3 ms, sys: 7.6 ms, total: 22.9 ms
Wall time: 22.4 ms


In [31]:
# load the finished dictionary from disk


In [32]:
# Location of the serialized bag-of-words corpus.
trigram_bow_filepath = os.path.join(base_directory, 'trigram_bow_corpus_all.mm')

In [33]:
def trigram_bow_generator(filepath):
    """Lazily convert each sentence stored in ``filepath`` to bag-of-words.

    The file is read through ``LineSentence`` and every sentence is passed
    to ``trigram_dictionary.doc2bow``; the results are yielded one by one.
    """
    yield from (trigram_dictionary.doc2bow(tokens)
                for tokens in LineSentence(filepath))

In [34]:
%%time

# Serializing the corpus is somewhat time consuming, so it only happens when
# the file is missing; delete the saved file to rebuild it.
if os.path.exists(trigram_bow_filepath):
    print("A trigram bag-of-words corpus already exists. ")
else:
    # generate bag-of-words representations for all sentences
    # and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_sentences_filepath))

# Load the corpus in both cases. Previously it was only loaded when the file
# already existed, so a first run left `trigram_bow_corpus` undefined for
# the LDA and pyLDAvis cells.
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
  

A trigram dictionary already exists. 
CPU times: user 1.17 ms, sys: 105 µs, total: 1.28 ms
Wall time: 882 µs


In [35]:
  
# load the finished bag-of-words corpus from disk


In [36]:
# Location of the saved LDA model (written by lda.save, read by LdaMulticore.load).
lda_model_filepath = os.path.join(base_directory, 'lda_model_all')

In [37]:
%%time

# Training is somewhat time consuming, so it only happens when no saved
# model is found; delete the saved model file to force a retrain.
if os.path.exists(lda_model_filepath):
    print("A lda model already exists. ")
else:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be set to the number
        # of physical cores minus one. cpu_count is a function and has to be
        # called (subtracting from the function object raises TypeError);
        # never ask for fewer than one worker on a single-core machine.
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=20,
                           id2word=trigram_dictionary,
                           workers=max(1, cpu_count() - 1))

    lda.save(lda_model_filepath)

A lda model already exists. 
CPU times: user 478 µs, sys: 43 µs, total: 521 µs
Wall time: 264 µs


In [38]:
# Load the finished LDA model from disk. This runs whether the model was
# trained in this session or found on disk by the guard above.
lda = LdaMulticore.load(lda_model_filepath)

In [39]:
def explore_topic(topic_number, topn=10, model=None):
    """Print a two-column table of the top terms of one topic.

    Parameters
    ----------
    topic_number : index of the topic to show.
    topn : number of terms to list. It is now passed on to ``show_topic``;
        previously it was ignored and 25 terms were always printed.
    model : object with a ``show_topic(topic_number, topn=...)`` method.
        Defaults to the notebook-level ``lda`` model.

    Returns None; the table is written to standard output.
    """
    if model is None:
        # resolved at call time so the function can be defined before `lda`
        model = lda

    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    for term, frequency in model.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [40]:
# Ask for the top 25 terms explicitly, matching the 25-row listing recorded
# below, instead of relying on what the helper does with its default.
explore_topic(topic_number=3, topn=25)

term                 frequency

or                   0.102
to                   0.059
be                   0.045
any                  0.043
in                   0.041
and                  0.035
offer                0.020
-PRON-               0.019
not                  0.015
other                0.014
this                 0.013
by                   0.012
no                   0.011
person               0.011
a                    0.010
make                 0.009
such                 0.009
for                  0.009
may                  0.009
announcement         0.009
security             0.008
which                0.008
if                   0.008
relevant             0.008
with                 0.007


Time to name the topics

In [41]:
# Pickle file caching the hand-assigned topic-id -> topic-name mapping.
topic_names_filepath = os.path.join(base_directory, 'topic_names.pkl')

In [42]:
# Create the topic-name mapping once and cache it as a pickle; later runs
# reload the cached mapping.
# NOTE(review): only ids 0-10 are named while the LDA model above is trained
# with num_topics=20, and the names look like placeholders - confirm.
if not os.path.exists(topic_names_filepath):
    topic_names = dict(enumerate([
        u'mexican',
        u'menu',
        u'thai',
        u'steak',
        u'donuts & appetizers',
        u'specials',
        u'soup',
        u'wings, sports bar',
        u'foreign language',
        u'las vegas',
        u'chicken',
    ]))
    with open(topic_names_filepath, 'wb') as f:
        pickle.dump(topic_names, f)
else:
    print("A topic names pickle already exists. ")
    # pickle.load executes whatever the file contains; only use trusted files
    with open(topic_names_filepath, 'rb') as f:
        topic_names = pickle.load(f)

A topic names pickle already exists. 


In [43]:
# Pickle file caching the output of pyLDAvis.gensim.prepare.
LDAvis_data_filepath = os.path.join(base_directory, 'ldavis_prepared')


In [44]:
%%time

# Preparing the visualisation data is somewhat time consuming, so it only
# happens when no pickle is found; delete the pickle to force a rebuild.
if not os.path.exists(LDAvis_data_filepath):
    LDAvis_prepared = pyLDAvis.gensim.prepare(
        lda,
        trigram_bow_corpus,
        trigram_dictionary,
        n_jobs=-1,
    )

    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    print("A LDA davis file pickle already exists. ")
    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

A LDA davis file pickle already exists. 
CPU times: user 4.78 ms, sys: 68 µs, total: 4.85 ms
Wall time: 4.48 ms


Time to visualize the model

In [45]:
# Switch pyLDAvis to notebook rendering before the visualisation is displayed.
pyLDAvis.enable_notebook(local=False)

In [46]:
# Render the prepared topic-model visualisation as this cell's output.
pyLDAvis.display(LDAvis_prepared)

In [47]:
# Target path for the HTML export of the visualisation.
LDAvis_visual_output = os.path.join(base_directory, 'ldavis_visual.html')

In [None]:
# Export the visualisation to HTML unless the export already exists.
# The guard must test the HTML output path: the prepared-data pickle always
# exists at this point, so testing it meant the export never ran.
if not os.path.exists(LDAvis_visual_output):
    print("A LDA visual is being written to disks. ")
    # Pass the path and let save_html open the file itself; it writes text,
    # which a handle opened in binary mode ('wb') would reject.
    pyLDAvis.save_html(LDAvis_prepared, LDAvis_visual_output)

In [None]:
# Serve the visualisation from a local web server. Per the recorded output
# below, this call does not return until the kernel is interrupted, and
# pyLDAvis itself recommends display()/enable_notebook() inside a notebook.
pyLDAvis.show(LDAvis_prepared)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [28/Feb/2018 22:47:42] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:47:42] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:47:42] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:47:43] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [28/Feb/2018 22:47:43] "GET /LDAvis.js HTTP/1.1" 200 -


## Describing LDA

# Word Vectors