In [1]:
import random
from pprint import pprint
import gensim.corpora as corpora
from nltk.corpus import stopwords as sw
from gensim.models.ldamodel import LdaModel

from utils.processing import scrape_edgar, format_text, lem_and_stem, add_bigrams
from utils.stopwords import stopwords as custom_sw

In [2]:
# Combine NLTK's English stopword list with the project's custom stopwords
# into one set; it is handed to lem_and_stem when the documents are processed.
english_sw = set(sw.words('english'))
stopwords = english_sw.union(custom_sw)

In [3]:
from data.sec_edgar_urls import URLS_10K

In [4]:
# Retrieve the filing content for every entry in URLS_10K. The result is a
# mapping that the next cell iterates as (company, doc) pairs via .items().
# NOTE(review): presumably performs network requests — the recorded run took
# close to three minutes, so consider caching the result for re-runs.
docContents = scrape_edgar(URLS_10K)

100%|██████████| 32/32 [02:49<00:00,  8.37s/it]


In [5]:
# Company names, in the iteration order of docContents.
companies = list(docContents)

# One processed document per company, aligned index-for-index with
# `companies` (dict keys and values iterate in the same order). Each raw
# document is formatted, then run through lem_and_stem with the stopword set;
# add_bigrams is finally applied to the whole collection.
X = add_bigrams([
    lem_and_stem(format_text(raw_doc), stopwords)
    for raw_doc in docContents.values()
])

In [6]:
# Build the vocabulary (token <-> integer id) from the processed documents.
idWordDictionary = corpora.Dictionary(X)

# Drop tokens that appear in fewer than 5 documents.
# NOTE(review): filter_extremes has further parameters (no_above, keep_n)
# that stay at their defaults here, so very frequent tokens may be removed
# as well — confirm against the gensim documentation.
idWordDictionary.filter_extremes(no_below=5)

# Bag-of-words representation of every document: a list of (token_id, count).
corpus = list(map(idWordDictionary.doc2bow, X))

# Same representation with ids swapped back to tokens, for eyeballing.
# Only the first document is converted.
readableCorpus = []
for bow in corpus[:1]:
    readableCorpus.append(
        [(idWordDictionary[token_id], count) for token_id, count in bow]
    )

In [7]:
# --- LDA hyper-parameters ---------------------------------------------------
# Every tunable value lives here and is passed to LdaModel below, so what is
# declared is what actually runs. Previously `passes` (30) and `iterations`
# (400) were declared but never used: the model call hardcoded passes=100 and
# left iterations at gensim's default. The values below are the ones the
# model was really trained with, so the recorded outputs remain valid.
numTopics = 5
passes = 100  # full passes over the corpus; 1000 pretty good
iterations = 50  # gensim's default, now stated explicitly
seed = 667  # random.randint(0,1000)

# Train the topic model on the bag-of-words corpus.
lda = LdaModel(
    corpus=corpus,
    id2word=idWordDictionary,
    num_topics=numTopics,
    passes=passes,
    iterations=iterations,
    eta='auto',
    alpha='auto',
    update_every=0,  # batch learning
    random_state=seed,  # fixed seed for reproducible topics
)

In [8]:
# Group companies by their dominant topic. Every topic id gets a bucket up
# front so topics that win no company still show up (as an empty list).
cluster = {topic_id: [] for topic_id in range(numTopics)}

for company, bow in zip(companies, corpus):
    # lda[bow] yields (topic_id, weight) pairs for this document.
    topic_weights = lda[bow]
    # Pick the pair with the largest weight; on a tie the first one wins.
    dominant_topic = max(topic_weights, key=lambda pair: pair[1])[0]
    cluster[dominant_topic].append(company)

pprint(cluster)

{0: ['CHEVRON', 'HOMEDEPOT'],
 1: ['UNITEDHEALTH', 'PFIZER', 'MERCK'],
 2: ['APPLE', 'WALMART', 'AMAZON', 'CISCO', 'DISNEY'],
 3: ['VISA', 'MASTERCARD'],
 4: ['FACEBOOK', 'ALPHABET', 'PHILIPMORRIS']}


In [9]:
# Print, for each company, the (topic_id, weight) pairs the model assigns
# to its document.
for company, bow in zip(companies, corpus):
    print(company, lda[bow])

APPLE [(2, 0.9997229)]
WALMART [(2, 0.9998605)]
AMAZON [(2, 0.9994884)]
FACEBOOK [(4, 0.9992917)]
ALPHABET [(4, 0.99969685)]
VISA [(3, 0.9998805)]
UNITEDHEALTH [(1, 0.9998781)]
CHEVRON [(0, 0.99987483)]
PFIZER [(1, 0.9999062)]
CISCO [(2, 0.9998805)]
HOMEDEPOT [(0, 0.9997771)]
MERCK [(1, 0.99989176)]
MASTERCARD [(3, 0.9998624)]
DISNEY [(2, 0.999906)]
PHILIPMORRIS [(4, 0.99942845)]


In [10]:
# Show each topic as a weighted list of its top terms.
topic_terms = lda.print_topics()
pprint(topic_terms)

# Apply the trained model to the whole corpus (not used in the cells shown here).
doc_lda = lda[corpus]

[(0,
  '0.079*"field" + 0.059*"well" + 0.051*"home" + 0.044*"affili" + '
  '0.028*"capac" + 0.028*"energi" + 0.022*"australia" + 0.020*"water" + '
  '0.017*"mexico" + 0.015*"sharehold"'),
 (1,
  '0.116*"care" + 0.033*"phase" + 0.022*"insur" + 0.020*"adult" + '
  '0.017*"agenc" + 0.014*"japan" + 0.014*"coverag" + 0.014*"rule" + '
  '0.014*"section" + 0.013*"contract"'),
 (2,
  '0.036*"station" + 0.023*"cloud" + 0.020*"home" + 0.019*"video" + '
  '0.018*"execut" + 0.015*"enterpris" + 0.014*"hardwar" + 0.014*"hour" + '
  '0.014*"game" + 0.013*"shop"'),
 (3,
  '0.058*"institut" + 0.057*"card" + 0.040*"issuer" + 0.028*"fee" + '
  '0.028*"commerc" + 0.028*"core" + 0.028*"credit" + 0.021*"accept" + '
  '0.016*"fund" + 0.015*"jurisdict"'),
 (4,
  '0.019*"cloud" + 0.017*"hardwar" + 0.016*"video" + 0.013*"learn" + '
  '0.012*"climat" + 0.012*"execut" + 0.011*"trend" + 0.011*"disclosur" + '
  '0.011*"oblig" + 0.011*"instanc"')]
