In [1]:
import re

import numpy as np

from sklearn.preprocessing import normalize
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
def print1d(template, values):
    """Print all items of `values` on one line, each formatted with `template`.

    The formatted items are written back to back (any spacing has to come from
    `template` itself) and the line is closed with a single newline. An empty
    `values` therefore prints just an empty line.
    """
    print(''.join(template.format(item) for item in values))
    
def print2d(template, valuess, blank = '', threshold = None):
    """Print a table: one output line per row of `valuess`.

    Parameters
    ----------
    template : str
        Format string applied to every printed cell, e.g. '{:9d}'.
    valuess : iterable of iterables
        The rows; every row is printed on its own line, cells back to back.
    blank : str
        Text printed instead of a cell that is suppressed by `threshold`.
    threshold : number or None
        When None, every cell is printed. Otherwise only cells strictly
        greater than `threshold` are printed and all others show `blank`.
    """
    for values in valuess:
        for value in values:
            # Identity test: None is a singleton, and `==` could be overridden
            # (or broadcast) by the threshold object.
            visible = threshold is None or value > threshold
            print(template.format(value) if visible else blank, end = '')
        print()

In [4]:
# Toy corpus: five one-sentence documents. Every sentence contains the word
# "return", used either in a financial or in a tennis context.
corpus = [
    'Investment in Deutsche Bank yields low return.',
    'My investment may return nothing.', 
    'Federer’s return was good, his volley was not.',
    'Return volley, return volley; tennis is boring.',
    'Return on investment is on a ten year high.',
    # Two further sentences, currently disabled.
    # NOTE(review): presumably these give the 7-sentence variant -- confirm.
#    'Tennis is for Federer!',
#    'Deutsche Bank may be an investment bank.'
]

In [5]:
# One bag of words per sentence: split at blanks and punctuation (the curly
# apostrophe included), drop every token of three characters or fewer -- a crude
# stand-in for stop-word removal -- and lower-case what is left.
bags = [
    [piece.lower() for piece in re.split('[ .!,;’]', sentence) if len(piece) > 3]
    for sentence in corpus
]

# Disabled alternatives to the length filter above:
#   * an explicit stop-word list, keeping a token only if token.lower() is not in
#     stop_words = ['', 'in', 'my', 'may', 's', 'was', 'his', 'not', 'is', 'on', 'a', 'ten', 'for', 'be', 'an']
#   * merging 'high' and 'low' into one token:
#     'hi/lo' if word in {'high', 'low'} else word

print2d('{:12s}', bags)

investment  deutsche    bank        yields      return      
investment  return      nothing     
federer     return      good        volley      
return      volley      return      volley      tennis      boring      
return      investment  year        high        


In [8]:
# Every distinct word of the corpus, in order of first appearance (dicts keep
# insertion order); all values start out as None.
vocabulary = {word: None for bag in bags for word in bag}

# Disabled alternative: restrict the vocabulary to a hand-picked word list.
# NOTE: as written, the loop below only rebinds `bag`; `bags` itself would stay unfiltered.
# vocabulary = dict.fromkeys(['investment', 'return', 'federer', 'volley'])
# for bag in bags: bag = [word for word in bag if word in vocabulary]

# The vocabulary as a list, so that words can be addressed by column index.
words = list(vocabulary)

print1d('{}  ', words)

investment  deutsche  bank  yields  return  nothing  federer  good  volley  tennis  boring  year  high  


In [9]:
# Document-term matrix: one row per sentence, one column per vocabulary word.
word_counts = np.zeros((len(corpus), len(vocabulary)), dtype=int)

for row, bag in enumerate(bags):
    word_counts[row] = [bag.count(word) for word in words]

# Corpus-wide total per word, written back into the existing dict as plain ints.
totals = word_counts.sum(axis=0)
for column, word in enumerate(words):
    vocabulary[word] = int(totals[column])

# Horizontal rule as wide as the table: nine characters per word plus one.
LINE = '-' * (1 + 9 * len(vocabulary))

print1d('{:>9}', vocabulary.keys())
print(LINE)
print2d('{:9d}', word_counts, blank=' ' * 9, threshold=0)   # zero counts are left blank
print(LINE)
print1d('{:9d}', vocabulary.values())

investment deutsche     bank   yields   return  nothing  federer     good   volley   tennis   boring     year     high
----------------------------------------------------------------------------------------------------------------------
        1        1        1        1        1                                                                        
        1                                   1        1                                                               
                                            1                 1        1        1                                    
                                            2                                   2        1        1                  
        1                                   1                                                              1        1
----------------------------------------------------------------------------------------------------------------------
        3        1        1        1        6        

In [11]:
n_topics = 2

# Fixed seed for the fit. Without it the result differs between runs: the topics
# can swap places, so that tables printed by different cells (or at different
# times) no longer agree on which row is which topic.
RANDOM_SEED = 0

lda = LatentDirichletAllocation(
    n_components    = n_topics,
    learning_method = 'batch',
    max_iter        = 50,
    n_jobs          = -1,
    random_state    = RANDOM_SEED,
)

lda.fit(word_counts)

# L1-normalise every topic row of the fitted components, turning the per-word
# weights of a topic into shares that add up to one.
words_in_topics = normalize(lda.components_, norm='l1')

print1d('{:>9}',   vocabulary.keys()); print(LINE)
print2d('{:9.1f}', lda.components_  ); print(LINE)
print2d('{:9.1%}', words_in_topics  ); print(LINE)

investment deutsche     bank   yields   return  nothing  federer     good   volley   tennis   boring     year     high
----------------------------------------------------------------------------------------------------------------------
      0.5      0.5      0.5      0.5      3.6      0.5      1.5      1.5      3.5      1.5      1.5      0.5      0.5
      3.5      1.5      1.5      1.5      3.4      1.5      0.5      0.5      0.5      0.5      0.5      1.5      1.5
----------------------------------------------------------------------------------------------------------------------
     3.1%     3.0%     3.0%     3.0%    21.6%     3.1%     9.0%     9.0%    21.0%     9.0%     9.0%     3.1%     3.1%
    19.0%     8.1%     8.1%     8.1%    18.6%     8.1%     2.7%     2.7%     2.7%     2.7%     2.7%     8.1%     8.1%
----------------------------------------------------------------------------------------------------------------------


In [12]:
# Topic shares of every sentence, as estimated by the fitted model.
topics_in_corpus = lda.transform(word_counts)

# Header row, then one row per sentence showing only shares above one half.
print1d('Topic{:2d}  ', range(n_topics))
print2d('{:7.0%}  ', topics_in_corpus, blank=' ' * 9, threshold=0.5)

Topic 0  Topic 1  
             91%  
             85%  
    89%           
    92%           
             89%  


In [13]:
# Word shares per sentence: the topic rows of `words_in_topics`, mixed with the
# sentence's topic shares.
words_in_corpus = topics_in_corpus @ words_in_topics

# Scale every row by the size of the sentence's bag to turn shares into counts.
length_in_corpus = list(map(len, bags))
word_counts_in_corpus = np.array(length_in_corpus)[:, np.newaxis] * words_in_corpus

# Actual counts on top, modelled counts below; small modelled values are left blank.
print1d('{:>9}', vocabulary.keys())
print(LINE)
print2d('{:9d}', word_counts, blank=' ' * 9, threshold=0)
print(LINE)
print2d('{:9.1f}', word_counts_in_corpus, blank=' ' * 9, threshold=0.334)

investment deutsche     bank   yields   return  nothing  federer     good   volley   tennis   boring     year     high
----------------------------------------------------------------------------------------------------------------------
        1        1        1        1        1                                                                        
        1                                   1        1                                                               
                                            1                 1        1        1                                    
                                            2                                   2        1        1                  
        1                                   1                                                              1        1
----------------------------------------------------------------------------------------------------------------------
      0.9      0.4      0.4      0.4      0.9      0.

In [49]:
def topic_description(words, probabilities, thresholds = (1/3, 4/5)):
    """Describe a topic as its words, listed from most to least probable.

    The words are joined by commas. After the word with which the cumulated
    probability reaches one of the `thresholds`, two blanks are inserted, so
    the list visually falls into groups.

    Parameters
    ----------
    words : sequence of str
        The vocabulary; `words[i]` belongs to `probabilities[i]`.
    probabilities : array-like of numbers
        Probability of every word within the topic.
    thresholds : iterable of numbers
        Cumulated probabilities at which a gap is inserted. A word that
        reaches several thresholds at once still yields a single gap.

    Returns
    -------
    str
        The description, without trailing blanks or commas.
    """
    thresholds = tuple(thresholds)
    fragments = []
    cumulated = 0

    # argsort is ascending, hence the reversal to start with the likeliest word.
    for w in np.argsort(probabilities)[::-1]:

        reached = cumulated + probabilities[w]
        fragments.append(words[w] + ',')

        if any(cumulated < threshold <= reached for threshold in thresholds):
            fragments.append('  ')

        cumulated = reached

    return ''.join(fragments).rstrip(' ').rstrip(',')

# One textual description per topic, in the row order of `words_in_topics`.
descriptions = [topic_description(words, topic_row) for topic_row in words_in_topics]

for description in descriptions:
    print(description)

investment,return,  yields,bank,deutsche,high,year,nothing,  good,federer,volley,boring,tennis
return,volley,  boring,tennis,good,federer,nothing,  investment,high,year,yields,bank,deutsche


In [50]:
# For every sentence: the quoted text, then one line per topic with its share and
# description. The topic holding more than half of the sentence is marked with 'X'.
for sentence, topic_shares in zip(corpus, topics_in_corpus):

    print('\n"{}"'.format(sentence))

    for share, label in zip(topic_shares, descriptions):
        if share > 0.5:
            marker = 'X '
        else:
            marker = '- '
        print('{} {:.0%} {:}'.format(marker, share, label))


"Investment in Deutsche Bank yields low return."
X  91% investment,return,  yields,bank,deutsche,high,year,nothing,  good,federer,volley,boring,tennis
-  9% return,volley,  boring,tennis,good,federer,nothing,  investment,high,year,yields,bank,deutsche

"My investment may return nothing."
X  85% investment,return,  yields,bank,deutsche,high,year,nothing,  good,federer,volley,boring,tennis
-  15% return,volley,  boring,tennis,good,federer,nothing,  investment,high,year,yields,bank,deutsche

"Federer’s return was good, his volley was not."
-  11% investment,return,  yields,bank,deutsche,high,year,nothing,  good,federer,volley,boring,tennis
X  89% return,volley,  boring,tennis,good,federer,nothing,  investment,high,year,yields,bank,deutsche

"Return volley, return volley; tennis is boring."
-  8% investment,return,  yields,bank,deutsche,high,year,nothing,  good,federer,volley,boring,tennis
X  92% return,volley,  boring,tennis,good,federer,nothing,  investment,high,year,yields,bank,deutsch

<img src='images/lda-on-returns-word-use-in-5-sentences.PNG'/>

## Lattice of the "Sentence uses word" relation
<img src='images/lda-on-returns-word-use-in-5-sentences.PNG' style='width:60%'/>

## Lattice of the "Sentence uses word" relation, given two more sentences
<img src='images/lda-on-returns-word-use-in-7-sentences.PNG' style='width:60%'/>