In [11]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

from collections import Counter

import glob
import pickle
import itertools
import numpy as np
import pandas as pd

In [6]:
# Topic-model settings read by the vectorizers and decomposition models below.
n_components = 25   # passed as n_components to NMF / LatentDirichletAllocation
n_features = 1000   # passed as max_features to the vectorizers
n_top_words = 20    # NOTE(review): not referenced by any visible cell — confirm it is still needed

In [62]:
# Load every CSV in the working directory (except the excluded file) and
# combine them into a single DataFrame.
EXCLUDED_CSV = 'aluminum_5yrs_seekingalpha.csv'

# Filtering (instead of list.remove) does not raise when the excluded file is
# absent; sorting makes the row order independent of the filesystem's listing order.
fs = sorted(f for f in glob.glob("*.csv") if f != EXCLUDED_CSV)
dfs = [pd.read_csv(f) for f in fs]

# reset_index(inplace=True) returns None, so chaining it onto concat bound
# `df` to None. The non-inplace call returns the re-indexed frame and keeps the
# per-file row labels in an `index` column.
df = pd.concat(dfs).reset_index()

In [63]:
# Stack the per-file frames; the old row labels are kept as an `index` column
# and replaced by a fresh 0..n-1 index.
df = pd.concat(dfs).reset_index()
df.head()

Unnamed: 0,index,title,date,body
0,0,"Dude, You're Not Getting DELL...Or AUTO",Mar.18.13,(At Least Not At These Prices)Despite differen...
1,1,Critical Analysis Of Dell Buyout Plan Shows Hi...,Mar.14.13,The media is currently focused on the saga of ...
2,2,The Value Of Dell Using The EBIT Multiple Valu...,Mar.14.13,Let me go through a EBIT multiple valuation me...
3,3,Dell LBO Deal Structure - Like Buying A Rental...,Feb. 7.13,Henry Blodget is pulling on exactly the right ...
4,4,Billionaire Carl Icahn Moving Into Multilevel-...,"Mar. 12, 2013 8:55 AM ET","In the view of many investors, billionaire Car..."


In [64]:
# Sanity check: (rows, columns) of the combined frame.
df.shape

(1854, 4)

In [42]:
def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted terms.

    Args:
        model: Object exposing ``components_``; each row is treated as one
            topic and must support ``argsort``.
        feature_names: Sequence mapping a column position to its term.
        n_top_words: Number of terms to list for each topic.

    Every line looks like ``Topic #<k>: w1 w2 ...`` with terms ordered from
    the largest weight down. A blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        terms = [feature_names[pos] for pos in ranked]
        print("Topic #%d: " % topic_idx + " ".join(terms))
    print()
    
def display_topics(model, feature_names, no_top_words):
    """Print each topic as a header line followed by its top terms.

    Args:
        model: Object exposing ``components_``; each row is treated as one
            topic and must support ``argsort``.
        feature_names: Sequence mapping a column position to its term.
        no_top_words: Number of terms to list for each topic.

    For topic ``k`` the output is ``Topic k:`` on one line and the
    space-separated terms, largest weight first, on the next.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:no_top_words]
        print("Topic {}:".format(topic_idx))
        print(" ".join(feature_names[pos] for pos in strongest))
        
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks one entry out of its input by key.

    ``transform`` returns ``data_dict[key]``, so the input can be anything
    that supports that subscript.
    """

    def __init__(self, key):
        # Subscript used by transform() to pick the entry.
        self.key = key

    def fit(self, x, y=None):
        """Learn nothing; return ``self`` so calls can be chained."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        selected = data_dict[self.key]
        return selected
    
class Predictor(BaseEstimator, ClassifierMixin):
    """Nearest-neighbour lookup by dot-product similarity.

    ``fit`` memorises the training matrix. ``predict`` scores every query row
    against every training row with a dot product and returns, for each query,
    the ``top_n`` training rows with the highest scores.
    """

    def __init__(self, top_n=5, debug=False):
        self.top_n = top_n
        self.debug = debug  # stored only; nothing in this class reads it

    def fit(self, X, y=None):
        """Store the training matrix (and ``y``, which is kept but never used).

        ``y`` is optional because prediction relies on ``X`` alone.
        """
        self.X = X
        self.y = y
        return self

    def _flatten(self, l):
        """Concatenate an iterable of iterables into one flat list."""
        return list(itertools.chain.from_iterable(l))

    def predict(self, X):
        """Return the best-matching training rows for every row of ``X``.

        Args:
            X: 2-D array with the same number of columns as the training matrix.

        Returns:
            Array of shape ``(n_queries, top_n, n_columns)``; along the second
            axis the matches are ordered from the highest score down.
        """
        # scores[q, t] is the dot product of query row q and training row t.
        scores = np.matmul(self.X, X.T).T
        # argsort is ascending: reverse each row, then keep the first top_n.
        # Slicing from the front also yields an empty result for top_n == 0,
        # where the former row[-top_n:] slice returned every training row.
        ranked = np.argsort(scores, axis=1)[:, ::-1][:, :self.top_n]
        return self.X[ranked]
    

In [52]:
class Topicer():
    """TF-IDF -> NMF topic model with a nearest-neighbour lookup on top.

    The ``body`` entry of the input is vectorised with TF-IDF, projected onto
    NMF topics, and the resulting topic vectors are handed to ``Predictor``.

    Args:
        debug: Forwarded to ``Predictor``.
        top_n: Number of matches ``Predictor`` returns per query row.
    """

    def __init__(self, debug=False, top_n=5):
        self.selector = ItemSelector(key='body')

        # Unigram TF-IDF; vocabulary size is capped by the module-level n_features.
        self.words = TfidfVectorizer(ngram_range=(1,1), max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')

        # NOTE(review): newer scikit-learn releases replaced NMF's `alpha`
        # with `alpha_W` / `alpha_H`; this call targets the older signature —
        # confirm against the installed version.
        self.transformer = NMF(n_components=n_components, random_state=1, 
                  alpha=.1, l1_ratio=.5, init='nndsvd')

        self.classifier = Predictor(top_n=top_n, debug=debug)

    def _preprocess_fit(self, X):
        """Fit the vectorizer and topic model on ``X`` and return its topic vectors."""
        r = self.selector.transform(X)
        r = self.words.fit_transform(r)
        # Training rows are projected with transform() after fitting, i.e. the
        # same call that _preprocess_predict uses for query rows.
        self.transformer.fit(r)
        r = self.transformer.transform(r)
        return r

    def _preprocess_predict(self, X):
        """Project ``X`` with the already-fitted vectorizer and topic model."""
        r = self.selector.transform(X)
        r = self.words.transform(r)
        r = self.transformer.transform(r)
        return r

    def fit(self, X, y=None):
        """Fit every stage on ``X``, keep the topic vectors in ``self.X``, return ``self``."""
        r = self._preprocess_fit(X)
        self.X = r
        self.y = y
        self.classifier.fit(self.X, self.y)
        return self

    def predict(self, X):
        """Return the classifier's prediction for the topic vectors of ``X``."""
        r = self._preprocess_predict(X)
        return self.classifier.predict(r)

    def _feature_names(self):
        """Return the vectorizer vocabulary across scikit-learn API generations."""
        # Prefer get_feature_names_out; fall back to the legacy accessor when
        # the vectorizer does not provide it.
        getter = getattr(self.words, "get_feature_names_out", None)
        if getter is None:
            getter = self.words.get_feature_names
        return getter()

    def display_topics(self, no_top_words):
        """Print ``Topic k:`` and the ``no_top_words`` heaviest terms of each topic."""
        model = self.transformer
        feature_names = self._feature_names()
        for topic_idx, topic in enumerate(model.components_):
            print("Topic {}:".format(topic_idx))
            print(" ".join([feature_names[i]
                            for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [53]:
# Train on the full combined frame; no rows are held out in this cell.
X_train = df

In [54]:
# Fit the topic model on the training frame. Topicer.fit returns the instance
# itself, so `r` and `t` refer to the same object.
t = Topicer()
r = t.fit(X_train)

In [55]:
# Print the 30 highest-weighted terms of every fitted topic.
t.display_topics(30)

Topic 0:
revenue year quarter growth earnings sales billion million company share revenues fiscal reported decline results eps guidance operating margin report adjusted increase total fourth segment line business net expected q3
Topic 1:
ibm business strategic imperatives revenue rometty years blue blockchain growth company cloud mainframe big machines businesses hardware billion past computing analytics international tech nyse time long buybacks declines areas technology
Topic 2:
hp pc enterprise printing whitman hpq printers split printer lenovo hewlett sales packard pcs business ceo billion hardware personal meg unit shipments businesses year shareholders market gartner quarter xerox new
Topic 3:
dividend yield payout dividends stocks income yields growth ratio earnings high paying investors years yielding current flow payments list increases year company annual increase companies look free past ratios stock
Topic 4:
dell mr michael said private deal buyout technologies firm equity 

In [None]:
        
# Pipeline formulation of the topic lookup: select the `body` entry, vectorise
# it, decompose into topics, then query with Predictor.
# NOTE(review): this cell has no execution count, and `X_test` / `y_train` are
# not defined in any visible cell — presumably they come from a
# train_test_split that is not shown; confirm before running.

# Variant 1: raw term counts (uni- and bigrams) fed into online LDA.
lda_pipeline = Pipeline([
    ('selector', ItemSelector(key='body')),
    ('words', CountVectorizer(ngram_range=(1,2), max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')),
    ('transform', LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)),
    ('classify', Predictor(top_n=5))
])

# Variant 2: TF-IDF weights (uni- and bigrams) fed into NMF.
nmf_pipeline = Pipeline([
    ('selector', ItemSelector(key='body')),
    ('words', TfidfVectorizer(ngram_range=(1,2), max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')),
    ('transform', NMF(n_components=n_components, random_state=1, 
                  alpha=.1, l1_ratio=.5, init='nndsvd')),
    ('classify', Predictor(top_n=5))
])
# Pipeline.fit returns the pipeline, so `lda` / `nmf` alias the objects above.
lda = lda_pipeline.fit(X_train, y_train)
nmf = nmf_pipeline.fit(X_train, y_train)

# Query both models with the first two held-out rows.
lda_res = lda_pipeline.predict(X_test[:2])
nmf_res = nmf_pipeline.predict(X_test[:2])
print(lda_res)
print(nmf_res)
# NOTE(review): the frame displayed earlier has no `tags` column — verify that
# X_test actually carries one.
print(X_test[:2]['tags'])