# Import Libraries

In [1]:
import pandas as pd

In [2]:
from multiprocessing import cpu_count

In [3]:
import string
import os

In [4]:
%%time 
# Load the spaCy pipeline once; the resulting `nlp` object is reused by the
# entity-rendering example and by the lemmatization function further down.
import spacy
nlp = spacy.load('en')

CPU times: user 3.86 s, sys: 508 ms, total: 4.36 s
Wall time: 2.66 s


In [5]:
# Folder that every data and model path in this notebook is joined onto.
base_directory = os.path.join(os.curdir, 'data', 'test')

Example

In [15]:
# Parse a sample news paragraph and render its named entities inline.
spacy.displacy.render(
    nlp(
        'South African President Cyril Ramaphosa treaded a fine line between '
        'his ruling party’s rival factions when he named steady hands to key '
        'cabinet posts and purged some of his predecessor Jacob Zuma’s most '
        'ineffectual appointees.  While keeping a handful of Zuma loyalists in '
        'largely minor positions, Ramaphosa brought back as finance minister '
        'Nhlanhla Nene, who Zuma replaced in late 2015 in a decision that '
        'caused chaos in the markets. Pravin Gordhan, who won over investors '
        'during two stints in the same job before running afoul of Zuma, will '
        'oversee the six of the biggest state companies that are mostly '
        'financially strapped and mired in graft allegations.'
    ),
    style='ent',
    minify=True,
    jupyter=True,
)

# Lemmatization Pipeline

In [6]:
# JSON-lines input that is read in the next cell.
data_file = os.path.join(base_directory, 'sens_lines.jsonlines')

In [7]:
# Earlier HDF5-based alternative, kept for reference:
#sens_sample = pd.read_hdf('./data/test/big_newsdata.h5', 'sens', mode='a')

# With a chunksize, read_json returns a reader that is iterated chunk by chunk
# (one record per chunk here) instead of a fully loaded DataFrame.
sens_data = pd.read_json(
    data_file,
    convert_dates=True,
    lines=True,
    chunksize=1,
)

In [8]:
# Text file that receives one lemmatized sentence per line.
lemma_file = os.path.join(base_directory, 'json_lines_lemmas.txt')

In [9]:
def is_punct_helper(token):
    """Tell whether a token should be dropped from the lemmatized output.

    A token is dropped when any of its ``is_punct``, ``is_space`` or
    ``like_num`` flags is set, i.e. punctuation, whitespace and
    number-like tokens are all filtered.
    """
    return any(getattr(token, flag)
               for flag in ('is_punct', 'is_space', 'like_num'))


def lemmatized_sentence_corpus(generator):
    """Parse documents with spaCy and write one lemmatized sentence per line.

    This is a plain function, not a generator: nothing is yielded or
    returned. The output goes to the module-level ``lemma_file`` path, which
    is opened in write mode and therefore overwritten.

    Args:
        generator: iterable of chunks, each accessed as ``x['text'].values``
            (presumably the chunks produced by ``pd.read_json`` with a
            chunksize -- confirm against the caller).
    """
    # Characters blanked out before parsing: ASCII digits and punctuation.
    # The previous expression, string.punctuation.replace("", "."), only
    # interleaved extra "." characters, and "." is already part of
    # string.punctuation, so the set of translated characters is unchanged.
    # NOTE(review): if replace(".", "") was intended (i.e. keep full stops),
    # that would change the output -- confirm before switching.
    punc = string.digits + string.punctuation

    # Build the translation table once instead of once per document.
    blank_table = str.maketrans(punc, ' ' * len(punc))

    def _clean(chunk):
        # str() of the values container gives its bracketed, quoted repr; the
        # brackets and quotes are then blanked by the translation table.
        # NOTE(review): escape sequences in that repr would lose only their
        # backslash -- confirm whether the raw string was intended instead.
        return str(chunk['text'].values).translate(blank_table)

    with open(lemma_file, 'w', encoding='utf_8') as f:

        for parsed_review in nlp.pipe(map(_clean, generator),
                                      batch_size=10,
                                      n_threads=cpu_count(),
                                      disable=['is_stop', 'is_alpha', 'ner', 'tag', 'dep'],
                                      cleanup=True):

            for sent in parsed_review.sents:
                f.write(u' '.join([token.lemma_ for token in sent
                                   if not is_punct_helper(token)]) + '\n')

In [10]:
%%time

if os.path.exists(lemma_file):
        print("A lemma file already exists. ")
else:
    lemmatized_sentence_corpus(sens_data)

CPU times: user 20min 32s, sys: 2min, total: 22min 32s
Wall time: 12min 40s


# Phrase Modelling
Learning combinations of tokens that together represent meaningful multi-word concepts.  
  
$\frac{count(AB) - count_{min}}{count(A) \times count(B)} \times N > threshold$

Where count is the number of times a token appears in the corpus.  
N is the total size of the corpus.  
  
 Gensim is a library for statistical analysis of sentences.  

In [11]:
%%time
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

CPU times: user 693 ms, sys: 128 ms, total: 821 ms
Wall time: 1.88 s


## Unigrams

In [12]:
# Iterable over the lemma file, one tokenized sentence per line.
unigram_sentences = LineSentence(lemma_file)

In [13]:
import itertools

# Preview the corpus: the slice starts at index 1 (the first sentence is
# skipped) and stops before index 240. Each sentence is followed by a line
# holding a single space.
for unigram_sentence in itertools.islice(unigram_sentences, 1, 240):
    print(u' '.join(unigram_sentence), " ", sep='\n')

a decision of the enterprise chamber of the amsterdam court of appeal steinhoff international holdings n v
 
incorporate in the netherlands registration number share code
 
snh isin
 
nl steinhoff decision of the enterprise chamber of the amsterdam court of appeal steinhoff international holdings n v the company
 
and with -PRON- subsidiary
 
the group
 
further to the company ’s announcement of january the enterprise chamber of the amsterdam court of appeal the enterprise chamber
 
last night issue -PRON- decision in respect of proceeding bring by om handels gmbh and mw holdings gmbh
 
entity control by dr andreas seifert the seifert entities
 
a former joint venture partner of the group the dutch poco proceeding
 
background
 
the dutch poco proceeding relate to the appropriate treatment under international financial reporting standards
 
ifrs
 
regard the consolidation of poco einrichtungsmarkte gmbh
 
poco in the company ’s consolidate financial statement
 
accounts
 
the dutch poc

## Bigrams

In [16]:
# Where the trained bigram phrase model is saved.
bigram_file = os.path.join(base_directory, 'bigram_file.txt')

In [17]:
%%time

if os.path.exists(bigram_file):
        print("A bigram file already exists. ")
else:
    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_file)

CPU times: user 13.2 s, sys: 154 ms, total: 13.4 s
Wall time: 13.4 s


In [18]:
# Text file holding the corpus after the bigram model has been applied.
bigram_sentences_filepath = os.path.join(
    base_directory, 'bigram_sentences_all.txt')

In [21]:
%%time
if os.path.exists(bigram_sentences_filepath):
        print("A bigram file already exists. ")
else:
    with open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:   
        for unigram_sentence in unigram_sentences:
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')



CPU times: user 32.2 s, sys: 169 ms, total: 32.4 s
Wall time: 32.4 s


In [24]:
# Iterable over the bigram-transformed corpus file.
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [25]:
# Show sentences 230-239 of the bigram corpus, each followed by a blank line.
for bigram_sentence in itertools.islice(bigram_sentences, 230, 240):
    print(u' '.join(bigram_sentence), end='\n\n')



level derivative financial asset

level derivative financial liability

level contingent_consideration liability

level

the fair_value of the financial instrument be equal to -PRON- carrying value include in other non_current asset

include in other non_current liability and trade and other payable

there have be no transfer of financial asset or financial liability between the category of the fair_value hierarchy the fair_value of all external over the counter derivative be calculate base on the discount_rate adjustment technique the discount_rate use be derive from observable rate of return for comparable asset or liability trade in the market the credit_risk of the external counterparty be incorporate into the calculation of fair_value of financial asset and own credit_risk be incorporate in the measurement of financial liability the change in fair_value be therefore impact by the movement of the interest rate curve by the volatility of the applied credit_spread and by any change t

## Trigrams

In [28]:
# Where the trained trigram phrase model is saved.
trigram_model_filepath = os.path.join(
    base_directory, 'trigram_model_all')

In [29]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if os.path.exists(trigram_model_filepath):
        print("A trigram model already exists. ")
else:
    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)

CPU times: user 12.4 s, sys: 123 ms, total: 12.5 s
Wall time: 12.5 s


In [30]:
# Read the saved trigram model back, whether it was just trained or cached.
trigram_model = Phrases.load(trigram_model_filepath)

In [32]:
# Text file holding the corpus after the trigram model has been applied.
trigram_sentences_filepath = os.path.join(
    base_directory, 'trigram_sentences_all.txt')

In [34]:
%%time

if os.path.exists(trigram_sentences_filepath):
        print("A trigram model already exists. ")
else:
    with open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')



CPU times: user 29.3 s, sys: 96.1 ms, total: 29.4 s
Wall time: 29.4 s


In [35]:
# Iterable over the trigram-transformed corpus file.
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [36]:
# Show sentences 230-239 of the trigram corpus, each followed by a blank line.
for trigram_sentence in itertools.islice(trigram_sentences, 230, 240):
    print(u' '.join(trigram_sentence), end='\n\n')

level derivative financial asset

level derivative financial liability

level contingent_consideration liability

level

the fair_value of the financial_instrument be equal to -PRON- carrying_value include in other non_current_asset

include in other non_current_liability and trade and other_payable

there have be no transfer of financial asset or financial liability between the category of the fair_value_hierarchy the fair_value of all external over the counter derivative be calculate base on the discount_rate adjustment technique the discount_rate use be derive_from observable rate of return for comparable asset or liability trade in the market the credit_risk of the external counterparty be incorporate into the calculation of fair_value of financial asset and own credit_risk be incorporate in the measurement of financial liability the change in fair_value be therefore impact by the movement of the interest rate_curve by the volatility of the applied credit_spread and by any change t

## LDA

In [37]:
! pip install pyldavis

Collecting pyldavis
  Downloading pyLDAvis-2.1.1.tar.gz (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 429kB/s 
Collecting joblib>=0.8.4 (from pyldavis)
  Downloading joblib-0.11-py2.py3-none-any.whl (176kB)
[K    100% |████████████████████████████████| 184kB 1.1MB/s 
Collecting pytest (from pyldavis)
  Downloading pytest-3.4.1-py2.py3-none-any.whl (188kB)
[K    100% |████████████████████████████████| 194kB 1.2MB/s 
Collecting funcy (from pyldavis)
  Downloading funcy-1.10.1.tar.gz
Collecting py>=1.5.0 (from pytest->pyldavis)
  Using cached py-1.5.2-py2.py3-none-any.whl
Collecting pluggy<0.7,>=0.5 (from pytest->pyldavis)
Building wheels for collected packages: pyldavis, funcy
  Running setup.py bdist_wheel for pyldavis ... [?25ldone
[?25h  Stored in directory: /home/marcussky/.cache/pip/wheels/de/41/af/cba16e4c15ff942728f3345c8f165831b03ad7f4d87cff8b6e
  Running setup.py bdist_wheel for funcy ... [?25ldone
[?25h  Stored in directory: /home/marcussky/.cache/pip/wheel

In [None]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
# `cPickle` exists only on Python 2. This notebook runs on Python 3 (it calls
# open(..., encoding=...)), where the standard `pickle` module is the
# replacement.
import pickle

In [None]:
# Where the gensim dictionary built from the trigram corpus is saved.
trigram_dictionary_filepath = os.path.join(
    base_directory, 'trigram_dict_all.dict')

In [43]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if os.path.exists(trigram_dictionary_filepath):
        print("A trigram dictionary already exists. ")
else:
    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_sentences)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)

CPU times: user 7.86 s, sys: 21.1 ms, total: 7.88 s
Wall time: 7.87 s


In [44]:
# Read the saved dictionary back, whether it was just built or cached.
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

In [51]:
# Matrix Market file that stores the bag-of-words corpus.
trigram_bow_filepath = os.path.join(
    base_directory, 'trigram_bow_corpus_all.mm')

In [52]:
def trigram_bow_generator(filepath):
    """Lazily convert each line of a sentence file to bag-of-words form.

    The file at ``filepath`` is read through ``LineSentence`` and every
    tokenized line is passed to ``trigram_dictionary.doc2bow``; the results
    are yielded one at a time, in file order.
    """
    yield from (trigram_dictionary.doc2bow(tokens)
                for tokens in LineSentence(filepath))

In [55]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if os.path.exists(trigram_bow_filepath):
        print("A trigram dictionary already exists. ")
else:
    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_sentences_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

CPU times: user 12.4 s, sys: 293 ms, total: 12.7 s
Wall time: 12.7 s


In [56]:
# Where the trained LDA model is saved.
lda_model_filepath = os.path.join(
    base_directory, 'lda_model_all')

In [57]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if os.path.exists(lda_model_filepath):
        print("A lda model already exists. ")
else:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=50,
                           id2word=trigram_dictionary,
                           workers=3)
    
    lda.save(lda_model_filepath)

CPU times: user 3min 56s, sys: 4.59 s, total: 4min 1s
Wall time: 4min 11s


In [61]:
# Read the saved LDA model back, whether it was just trained or cached.
lda = LdaMulticore.load(lda_model_filepath)

In [62]:
def explore_topic(topic_number, topn=25):
    """Print the highest-weighted terms of one LDA topic as a two-column table.

    Args:
        topic_number: index of the topic to look up in the module-level
            ``lda`` model.
        topn: how many terms to print. This value is now forwarded to
            ``lda.show_topic``; it used to be ignored because 25 was
            hard-coded in the call.

    Returns:
        None. The table is written to standard output.
    """
    # Header row followed by a blank line.
    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [66]:
# Inspect the top terms of topic 3.
explore_topic(3)

term                 frequency

interest             0.127
december             0.094
september            0.060
date                 0.060
january              0.027
november             0.026
payment_date         0.026
on                   0.021
security             0.017
a                    0.016
at                   0.016
period               0.015
usd                  0.014
distribution         0.013
class                0.011
to                   0.011
be                   0.011
rate                 0.010
for                  0.009
payment              0.009
march                0.008
first                0.008
long_short_position  0.008
commencement         0.008
issue                0.007


Time to name the topics

In [None]:
# Human-readable label for each of the 50 topics, keyed by topic number
# (the position in the list below is the key).
# NOTE(review): these labels are restaurant themes, whereas the terms printed
# for topic 3 above are financial (interest, payment_date, ...) -- confirm
# that the labels were written for this corpus.
topic_names = dict(enumerate([
    u'mexican',
    u'menu',
    u'thai',
    u'steak',
    u'donuts & appetizers',
    u'specials',
    u'soup',
    u'wings, sports bar',
    u'foreign language',
    u'las vegas',
    u'chicken',
    u'aria buffet',
    u'noodles',
    u'ambience & seating',
    u'sushi',
    u'arizona',
    u'family',
    u'price',
    u'sweet',
    u'waiting',
    u'general',
    u'tapas',
    u'dirty',
    u'customer service',
    u'restrooms',
    u'chinese',
    u'gluten free',
    u'pizza',
    u'seafood',
    u'amazing',
    u'eat, like, know, want',
    u'bars',
    u'breakfast',
    u'location & time',
    u'italian',
    u'barbecue',
    u'arizona',
    u'indian',
    u'latin & cajun',
    u'burger & fries',
    u'vegetarian',
    u'lunch buffet',
    u'customer service',
    u'taco, ice cream',
    u'high cuisine',
    u'healthy',
    u'salad & sandwich',
    u'greek',
    u'poor experience',
    u'wine & dine',
]))

In [None]:
topic_names_filepath = os.path.join(base_directory, 'topic_names.pkl')

# pickle writes bytes, so the file has to be opened in binary mode; with the
# previous text mode ('w') pickle.dump raises TypeError on Python 3.
with open(topic_names_filepath, 'wb') as f:
    pickle.dump(topic_names, f)

In [None]:
# Where the prepared pyLDAvis data is pickled.
LDAvis_data_filepath = os.path.join(
    base_directory, 'ldavis_prepared')

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if os.path.exists(LDAvis_data_filepath):
        print("A LDAvis data already exists. ")
else:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

    with open(LDAvis_data_filepath, 'w') as f:
        pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath) as f:
    LDAvis_prepared = pickle.load(f)

Time to visualize the model

In [None]:
# Render the interactive topic visualisation as this cell's output.
pyLDAvis.display(LDAvis_prepared)

## Describing LDA

# Word Vectors