In [1]:
import os
import pickle
from typing import List
from tqdm import tqdm_notebook
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
from gensim.corpora import Dictionary
from newspaper import Article
from scipy.special import kl_div
from scipy.stats import entropy
from scipy.spatial.distance import jensenshannon

In [2]:
# Fetch the NLTK corpora this notebook relies on:
#   - 'stopwords' backs stopwords.words('english') in Processor.__init__
#   - 'wordnet' backs WordNetLemmatizer, which raises LookupError on first use
#     when the corpus has not been downloaded
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/mmr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# spaCy preprocessing helper, currently not in use
def doc_func(self,doc):
    """
    Reduce a spaCy document to a string of lemmas separated by single spaces.

    doc is a spaCy document object (the result of calling nlp(text)).
    A token is skipped when it is punctuation, consists only of whitespace,
    contains a digit, has a lemma flagged as a stop word in self.nlp.vocab,
    or carries one of the part-of-speech tags listed in excluded_pos.
    """
    excluded_pos = {'ADJ','ADP','ADV','AUX','PART','SCONJ','SYM'}

    def is_noise(tok):
        # tests run in the same order as a short-circuiting `or` chain
        if tok.is_punct:
            return True
        if tok.text.isspace():
            return True
        if any(ch.isdigit() for ch in tok.text):
            return True
        if self.nlp.vocab[tok.lemma_].is_stop:
            return True
        return tok.pos_ in excluded_pos

    return ' '.join(tok.lemma_ for tok in doc if not is_noise(tok))

In [4]:
def write_to_file(text:str,base_dir:str,ticker:str,file_stub:str):

    """
    Write text to the file base_dir/ticker/file_stub.

    Parameters
    ------------

    text: str, content to write; an existing file is overwritten
    base_dir: str, root directory holding one sub-directory per ticker
    ticker: str, company ticker, used as the sub-directory name
    file_stub: str, name of the file inside the ticker directory
    """

    filename = os.path.join(base_dir,ticker,file_stub)
    # create the ticker directory on demand so a first write for a new
    # ticker does not fail with FileNotFoundError
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent,exist_ok=True)
    with open(filename,'w') as f:
        f.write(text)

In [5]:
class Processor:
    """
    Turns raw text into a string of lemmas with stop words removed.

    Stop words are the NLTK English list plus the domain-specific terms in
    _EXTRA_STOPS; they are stored lemmatized in self.stops.
    """

    # domain-specific terms appended to the NLTK English stop words
    _EXTRA_STOPS = ['mr', 'vice', 'he', 'award', 'm', 'his', 'ceo', 'she', 'joining', 
                    'grant', 'severance', 'schenkel', 'chairman', 'threshold', 'salary', 
                    'yetto', 'vesting', 'gaap', 'letter', 'neos', 'interim', 'ebay', 'lee', 
                    'cring', 'wenig', 'pbrsus', 'pbrsu', 'rsus', 'her', 'serf', 'age', 'joined', 
                    'appointment', 'bonus', 'university', 'vested', 'maximum', 'thompson', 'granted', 
                    'column', 'eip', 'peer', 'met', 'chair', 'neutral', 'fx', 'qualification',
                    'cfo', 'exercise', 'svp', 'earned', 'appointed', 'vest', 'equal', 'payout',
                    'modifier', 'unvested', 'achievement', 'shown', 'career', 'half', 'entitled',
                    'terminated', 'vp', 'eligible', 'calculated', 'school', 'counsel', 
                    'advisor', 'separation', 'acceleration', 'talent', 'accrued', 
                    'deputy', 'payouts', 'anderson', 'head', 'rsu', 'waiver', 'awarded',
                    'publiccompany', 'lieu', 'founder', 'follows', 'death', 'spent', 'iannone',
                    'mckinsey', 'led', 'accelerated', 'closing', 'summary', 'sign', 'yet', 
                    'iv', 'institute', 'affair', 'nominating', 'revoke', 'resignation', 
                    'llp', 'council', 'metric', 'consultant', 'formerly', 'execution', 
                    'harvard', 'determining', 'omidyar', 'neo', 'named', 'monthly',
                    'achieved', 'serving', 'please', 'disability', 'restricted', 'payable', 
                    'dated', 'reflected', 'secretary', 'ten', 'treated', 'brown', 'intel', 
                    'ending', 'walmart', 'prorated', 'llc', 'airplane', 'comparable', 
                    'ethic', 'disclosed', 'bates', 'hammer', 'departure', 'describes', 
                    'college', 'occurs', 'biography', 'division', 'unpaid', 'schedule', 
                    'immediately', 'elect', 'realized', 'reviewed', 'driving', 'lump', 
                    'retirement', 'assume', 'graduate', 'sum', 'worked', 'marvell', 
                    'nbcuniversal', 'farrell', 'footnote', 'science', 'studio', 
                    'terminates', 'recommendation', 'approves', 'tenure', 'engagement', 
                    'did', 'calculation', 'consecutive', 'survey', 'club', 'sam', 
                    'founded', 'advisory', 'election', 'assistant', 'paypal', 
                    'elliott', 'murphy', 'cohn', 'rationale',
                    'mitic', 'swan', 'involuntary', 'him', 'trustee', 'budget',
                    'recognize', 'preceding', 'accountable', 'bachelor', 'clawback', 'assessed', 
                    'corp', 'hired', 'driver', 'setting','client','product','special','margin',
                    'cost','reduce']

    def __init__(self):
        # placeholder for now: if spacy preprocessing is adopted, the spacy
        # nlp object would be stored here as well
        self.lemmatizer = WordNetLemmatizer()
        stop_words = stopwords.words('english')
        stop_words.extend(self._EXTRA_STOPS)
        # process() compares lemmas against this set, so the stop words
        # themselves are lemmatized before being stored
        self.stops = {self.lemmatizer.lemmatize(word) for word in stop_words}
    
    def process(self,rawtext:str):
        """
        Parameters
        ------------

        lowercases rawtext, tokenizes it on runs of word characters, drops
        tokens that contain a digit or are at most one character long,
        lemmatizes the rest and drops lemmas found in self.stops

        rawtext: str, text to be processed

        returns the surviving lemmas joined by single whitespace
        """

        tokens = RegexpTokenizer(r'\w+').tokenize(rawtext.lower())

        def usable(tok):
            return len(tok) > 1 and not any(ch.isdigit() for ch in tok)

        candidates = (self.lemmatizer.lemmatize(tok) for tok in tokens if usable(tok))
        return ' '.join(lem for lem in candidates if lem not in self.stops)


In [6]:
class Reader:
    """
    Lazily reads one text file per company ticker from
    base_dir/ticker/file_stub, either one ticker at a time (reader[ticker])
    or by iterating over com_list in order.
    """

    def __init__(self,base_dir:str, file_stub:str,
                 com_list:List[str],output_type:str,
                 dct=None):
        
        """
        Parameters
        ------------
        
        base_dir: str, base directory containing company texts
        file_stub: str, name of the files to be read, a file is constructed
                   as base_dir/ticker/file_stub
        com_list: List[str], list of company tickers to read out
        output_type: str, 'list': splits a file on whitespace 
                           'bow': runs file through dct to get bow representation
                                  must also provide gensim dictionary object in dct
                                  parameter
                          'text': returns raw text file 
                          any other value behaves like 'text'
        dct: gensim.corpora.Dictionary, for 'bow' output_type will pass text through 
             dct to get bow representation
         """
        
        self.base_dir     = base_dir
        self.file_stub    = file_stub
        self.com_list     = com_list
        self.output_type  = output_type
        self.dct          = dct

    def _to_bow(self,text):
        """Tokenize text on whitespace and convert it with self.dct.doc2bow."""
        # split() rather than split(' '): no '' token for empty files or
        # repeated spaces, and newlines do not stay attached to tokens
        return self.dct.doc2bow(text.split())

    def get_bow(self,tckr):
        """Return the bow representation of the file for ticker tckr,
        regardless of output_type (requires self.dct)."""
        return self._to_bow(self[tckr])
        
    def __getitem__(self,tckr):
        """Return the full, unmodified text of base_dir/tckr/file_stub."""
        filename = os.path.join(self.base_dir,tckr,self.file_stub)
        with open(filename,'r') as f:
            text = f.read()
        return text

    def __iter__(self):
        """Yield one item per ticker in com_list, shaped by output_type."""
        for tckr in self.com_list:
            text = self[tckr]
            if self.output_type == 'bow':
                yield self._to_bow(text)
            elif self.output_type == 'list':
                yield text.split()
            else:
                yield text

    def __len__(self):
        """Number of tickers in com_list."""
        return len(self.com_list)

In [7]:
class Converter:
    """
    Runs every text produced by a Reader through a Processor and stores the
    result as base_dir/ticker/file_stub.
    """

    def __init__(self,reader:"Reader",processor:"Processor",
                 base_dir:str,file_stub:str):
        """
        Parameters 
        ------------
        
        reader: Reader, Reader object to read files from disk and process;
                it must yield raw text (output_type 'text') because each item
                is handed to processor.process
        processor: Processor, Processor object to convert files to lemmas
        base_dir: str, base directory to write processed files to
        file_stub: str, file name for processed files
        
        """
        
        self.reader = reader
        self.processor = processor
        self.base_dir = base_dir
        self.file_stub = file_stub
    
    def convert(self):
        """
        iterates over self.reader and processes the returned text files
        writes the processed text files to disk, one per ticker in
        self.reader.com_list, creating the ticker directory when missing
        """
        print('converting...')
        # the reader yields texts in com_list order, so zip pairs each text
        # with the ticker it was read for
        for text,ticker in zip(self.reader,self.reader.com_list):
            out_dir = os.path.join(self.base_dir,ticker)
            # the output base_dir may differ from the reader's, in which
            # case the ticker directory does not exist yet
            os.makedirs(out_dir,exist_ok=True)
            filename = os.path.join(out_dir,self.file_stub)
            processed_text = self.processor.process(text)
            with open(filename,'w') as f:
                f.write(processed_text)
        print('done')

In [8]:
#Topics

def topic_to_arr(doc_topic,num_topics):
    """
    Expand a sparse topic list into a dense vector.

    doc_topic: iterable of (topic_index, weight) entries
    num_topics: length of the returned vector

    returns a numpy array of zeros with arr[topic_index] = weight for every
    entry; when an index repeats, the last entry wins
    """
    dense = np.zeros(num_topics)
    for entry in doc_topic:
        topic_id, weight = entry[0], entry[1]
        dense[topic_id] = weight
    return dense

def arr_to_topics(article_arr,threshold=0.03):
    """
    Convert a dense topic vector back into a sparse topic list.

    article_arr: sequence of topic weights, position i holding the weight
                 of topic i
    threshold: float, only weights strictly greater than this are kept
               (default 0.03)

    returns a list of (topic_index, weight) tuples in index order
    """
    return [(i,top) for i,top in enumerate(article_arr) if top > threshold]
    
class Topics:
    """
    Scores a corpus of company documents with a topic model and ranks the
    companies by how close their topic mixture is to that of an article.
    """

    def __init__(self,model,reader,processor,num_topics):
        """
        Parameters
        ------------

        model: topic model; model[bow] must return (topic_index, weight) pairs
        reader: Reader yielding one bow document per company; its com_list
                and dct attributes are used as well
        processor: Processor used to turn article text into lemmas
        num_topics: int, number of topics of the model
        """
        self.model      = model
        self.reader     = reader
        self.com_list   = self.reader.com_list
        self.num_topics = num_topics
        self.processor  = processor
        self.cutoff     = 5       # number of strongest topics kept per vector
        self.epsilon    = 0.0001  # floor value given to all other topics
    
    def score_docs(self):
        """
        Build self.topic_mat, one row of topic weights per document of
        self.reader, with self.epsilon added to every entry.
        """
        topic_mat = np.zeros([len(self.reader),self.num_topics])
        for i,doc in enumerate(self.reader):
            # use the model held by this object, not a notebook global
            doc_topic = self.model[doc]
            topic_mat[i] = topic_to_arr(doc_topic,self.num_topics)
            
        self.topic_mat = topic_mat + self.epsilon

    def _keep_top(self,arr):
        """Return a copy of arr in which every entry except the self.cutoff
        largest ones is replaced by self.epsilon."""
        trimmed = np.array(arr,dtype=float)
        to_discard = np.argsort(-trimmed)[self.cutoff:]
        trimmed[to_discard] = self.epsilon
        return trimmed
    
    def get_article_topics(self,text):
        """
        Return the dominant topics of text as (topic_index, weight) tuples:
        the vector is cut to its strongest topics, renormalized to sum to 1
        and filtered through arr_to_topics.
        """
        article_topics = self._get_topics(text)
        article_arr  = self._keep_top(topic_to_arr(article_topics,self.num_topics))
        article_arr /= np.sum(article_arr)
        return arr_to_topics(article_arr)
        
    def _get_topics(self,text):
        """Lemmatize text, convert it with the reader's dictionary and
        return the model's topic pairs for it."""
        lemmas = self.processor.process(text)
        # split() so that text reduced to nothing gives an empty bow
        # instead of a single '' token
        bow    = self.reader.dct.doc2bow(lemmas.split())
        topics = self.model[bow]
        return topics
        
    def get_top_companies(self,article_text,top_n=10):
        """
        Rank companies by Jensen-Shannon distance between the article's
        truncated topic vector and each company's truncated topic vector.

        article_text: str, raw article text
        top_n: int, number of tickers to return (default 10)

        returns the tickers of the top_n closest companies, nearest first;
        score_docs() must have been called beforehand
        """
        article_topics = self._get_topics(article_text)
        article_arr    = self._keep_top(topic_to_arr(article_topics,self.num_topics))
        
        # each row is truncated on a copy, so self.topic_mat keeps the
        # scores computed by score_docs
        distances = [jensenshannon(article_arr,self._keep_top(doc))
                     for doc in self.topic_mat]
        distance_inds = np.argsort(distances)
        return [self.com_list[i] for i in distance_inds[:top_n]]
    


In [9]:
# Load the ticker list and drop 'googl'.
# NOTE(review): pickle.load can execute arbitrary code; only load a
# valid_list.pkl that was produced by this project.
with open('valid_list.pkl','rb') as f:
    com_list = set(pickle.load(f))
# discard instead of remove: the filtered list is written back to the same
# file further down, so on a re-run 'googl' is already gone and remove()
# would raise KeyError
com_list.discard('googl')
# sorted: set iteration order for strings changes between kernel sessions,
# and the ticker order defines the row order of everything built from it
com_list = sorted(com_list)

In [66]:
# Write the filtered ticker list back to valid_list.pkl; the with-block
# guarantees the file is flushed and closed.
with open('valid_list.pkl','wb') as f:
    pickle.dump(com_list,f)

In [10]:
# Wire up the raw-text -> lemma conversion: read each company's
# matchedText.txt as plain text and write the processed result to
# lemmas.txt in the same ticker directory.
processor = Processor()
reader    = Reader(base_dir='../crawl/valid',
                   file_stub='matchedText.txt',
                   com_list=com_list,
                   output_type='text')
converter = Converter(reader=reader,
                      processor=processor,
                      base_dir='../crawl/valid',
                      file_stub='lemmas.txt')

In [11]:
# Process every company's text and write it to <base_dir>/<ticker>/lemmas.txt.
converter.convert()

converting...
done


In [12]:
# Build the gensim vocabulary from the token lists of all lemma files.
lemma_reader = Reader(base_dir='../crawl/valid',
                      file_stub='lemmas.txt',
                      com_list=com_list,
                      output_type='list')
dictionary   = Dictionary(lemma_reader)
# Prune the vocabulary: no lower bound on document frequency, drop tokens
# that occur in more than half of the documents, keep at most one million
# tokens.
dictionary.filter_extremes(no_below=0,
                           no_above=0.5,
                           keep_n=1_000_000)

In [13]:
# Streamed bag-of-words corpus: each lemma file is converted with
# `dictionary` when the reader is iterated.
corpus = Reader(base_dir='../crawl/valid',
                file_stub='lemmas.txt',
                com_list=com_list,
                output_type='bow',
                dct=dictionary)

In [14]:
# Show INFO-level log records (timestamp : level : message) in the cell
# output; the LDA training cell below reports its progress this way.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [78]:
from gensim.models import LdaModel, LdaMulticore

# Set training parameters.
num_topics = 300
chunksize = 2000
passes = 40
iterations = 500
eval_every = None  # Don't evaluate model perplexity, takes too much time.
# Fixed seed so topic numbering and weights can be reproduced on re-run.
# NOTE(review): with several worker processes the result may still vary
# slightly between runs -- confirm if exact reproducibility is required.
random_state = 42

# Make an index-to-word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='symmetric',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state
)

2020-05-03 23:18:20,398 : INFO : using symmetric alpha at 0.0033333333333333335
2020-05-03 23:18:20,402 : INFO : using serial LDA version on this node
2020-05-03 23:18:21,557 : INFO : running online LDA training, 300 topics, 40 passes over the supplied corpus of 425 documents, updating every 14000 documents, evaluating every ~0 documents, iterating 500x with a convergence threshold of 0.001000
2020-05-03 23:18:21,557 : INFO : training LDA model using 7 processes
2020-05-03 23:18:22,757 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #425/425, outstanding queue size 1
2020-05-03 23:19:08,476 : INFO : topic #190 (0.003): 0.011*"aep" + 0.009*"branch" + 0.008*"transmission" + 0.008*"utility" + 0.006*"registrant" + 0.005*"beer" + 0.005*"engine" + 0.005*"electric" + 0.005*"onsite" + 0.004*"eu"
2020-05-03 23:19:08,477 : INFO : topic #14 (0.003): 0.017*"southern" + 0.014*"electric" + 0.013*"jpmorgan" + 0.010*"chase" + 0.010*"pipeline" + 0.009*"oil" + 0.007*"utility" + 0.006*"r

2020-05-03 23:19:53,758 : INFO : topic diff=1.642902, rho=0.401205
2020-05-03 23:19:55,613 : INFO : PROGRESS: pass 6, dispatched chunk #0 = documents up to #425/425, outstanding queue size 1
2020-05-03 23:20:02,775 : INFO : topic #288 (0.003): 0.030*"clearing" + 0.017*"cme" + 0.016*"chicken" + 0.014*"food" + 0.009*"index" + 0.008*"house" + 0.008*"beef" + 0.008*"live" + 0.008*"pork" + 0.007*"prepared"
2020-05-03 23:20:02,776 : INFO : topic #127 (0.003): 0.002*"ibm" + 0.001*"cloud" + 0.000*"ai" + 0.000*"hybrid" + 0.000*"cognitive" + 0.000*"ip" + 0.000*"mission" + 0.000*"workload" + 0.000*"ecosystem" + 0.000*"computing"
2020-05-03 23:20:02,777 : INFO : topic #28 (0.003): 0.028*"coffee" + 0.023*"starbucks" + 0.012*"food" + 0.010*"beverage" + 0.010*"merchandise" + 0.009*"vehicle" + 0.008*"autozone" + 0.008*"amp" + 0.008*"card" + 0.008*"automotive"
2020-05-03 23:20:02,777 : INFO : topic #135 (0.003): 0.136*"fifth" + 0.070*"bancorp" + 0.022*"frb" + 0.017*"banking" + 0.016*"loan" + 0.013*"depo

2020-05-03 23:20:52,885 : INFO : topic #126 (0.003): 0.028*"insurer" + 0.021*"auto" + 0.020*"reinsurance" + 0.013*"progressive" + 0.010*"catastrophe" + 0.010*"vehicle" + 0.009*"write" + 0.008*"surplus" + 0.008*"underwriting" + 0.007*"layer"
2020-05-03 23:20:52,886 : INFO : topic #265 (0.003): 0.022*"medical" + 0.017*"dental" + 0.013*"animal" + 0.010*"fda" + 0.009*"physician" + 0.009*"patterson" + 0.007*"defendant" + 0.007*"complaint" + 0.007*"schein" + 0.006*"henry"
2020-05-03 23:20:52,886 : INFO : topic #140 (0.003): 0.003*"healthcare" + 0.003*"outlook" + 0.003*"compulsory" + 0.003*"round" + 0.003*"downward" + 0.003*"fifth" + 0.003*"simultaneous" + 0.003*"pursued" + 0.003*"parallel" + 0.003*"approve"
2020-05-03 23:20:52,887 : INFO : topic #235 (0.003): 0.030*"packaging" + 0.023*"beverage" + 0.017*"food" + 0.016*"paper" + 0.010*"ingredient" + 0.010*"snack" + 0.010*"raw" + 0.009*"plastic" + 0.009*"bottler" + 0.008*"retailer"
2020-05-03 23:20:52,906 : INFO : topic diff=0.618049, rho=0.27

2020-05-03 23:21:34,769 : INFO : topic #225 (0.003): 0.026*"semiconductor" + 0.025*"signal" + 0.024*"analog" + 0.012*"raw" + 0.010*"foundry" + 0.010*"wafer" + 0.007*"wireless" + 0.007*"sustainability" + 0.007*"assembly" + 0.007*"yield"
2020-05-03 23:21:34,770 : INFO : topic #296 (0.003): 0.043*"dealer" + 0.034*"broker" + 0.025*"swap" + 0.018*"volcker" + 0.015*"cftc" + 0.015*"banking" + 0.014*"bhc" + 0.013*"basel" + 0.012*"fdic" + 0.011*"adviser"
2020-05-03 23:21:34,771 : INFO : topic #105 (0.003): 0.062*"mellon" + 0.048*"bny" + 0.036*"md" + 0.021*"guide" + 0.019*"loan" + 0.017*"banking" + 0.014*"sheet" + 0.010*"presented" + 0.010*"nv" + 0.010*"sa"
2020-05-03 23:21:34,786 : INFO : topic diff=0.204276, rho=0.228143
2020-05-03 23:21:36,380 : INFO : PROGRESS: pass 19, dispatched chunk #0 = documents up to #425/425, outstanding queue size 1
2020-05-03 23:21:41,232 : INFO : topic #174 (0.003): 0.043*"healthcare" + 0.014*"tenant" + 0.013*"diagnostic" + 0.012*"hospital" + 0.012*"medicare" + 0.

2020-05-03 23:22:14,872 : INFO : topic #196 (0.003): 0.052*"hospital" + 0.032*"medicare" + 0.026*"physician" + 0.025*"patient" + 0.022*"medicaid" + 0.019*"cm" + 0.016*"medical" + 0.012*"payer" + 0.009*"reimbursement" + 0.009*"outpatient"
2020-05-03 23:22:14,888 : INFO : topic diff=0.074935, rho=0.199155
2020-05-03 23:22:16,486 : INFO : PROGRESS: pass 25, dispatched chunk #0 = documents up to #425/425, outstanding queue size 1
2020-05-03 23:22:22,410 : INFO : topic #35 (0.003): 0.000*"dialysis" + 0.000*"patient" + 0.000*"physician" + 0.000*"bxp" + 0.000*"medicare" + 0.000*"esrd" + 0.000*"medical" + 0.000*"healthcare" + 0.000*"tenant" + 0.000*"clinical"
2020-05-03 23:22:22,412 : INFO : topic #224 (0.003): 0.032*"retailer" + 0.019*"architectural" + 0.018*"plumbing" + 0.014*"decorative" + 0.013*"masco" + 0.012*"shower" + 0.012*"hardware" + 0.011*"spa" + 0.011*"lighting" + 0.010*"brass"
2020-05-03 23:22:22,412 : INFO : topic #164 (0.003): 0.046*"domain" + 0.046*"registry" + 0.042*"icann" + 

2020-05-03 23:22:57,393 : INFO : topic diff=0.033332, rho=0.178993
2020-05-03 23:22:59,156 : INFO : PROGRESS: pass 31, dispatched chunk #0 = documents up to #425/425, outstanding queue size 1
2020-05-03 23:23:04,460 : INFO : topic #41 (0.003): 0.045*"tenant" + 0.017*"rsf" + 0.016*"rental" + 0.015*"estate" + 0.014*"san" + 0.013*"redevelopment" + 0.011*"francisco" + 0.010*"campus" + 0.010*"street" + 0.009*"alexandria"
2020-05-03 23:23:04,461 : INFO : topic #122 (0.003): 0.030*"adhesive" + 0.019*"lgm" + 0.018*"rbi" + 0.017*"ihm" + 0.016*"label" + 0.015*"dennison" + 0.013*"pension" + 0.011*"avery" + 0.010*"graphic" + 0.010*"libor"
2020-05-03 23:23:04,462 : INFO : topic #268 (0.003): 0.000*"ldl" + 0.000*"lepr" + 0.000*"isogenic" + 0.000*"itching" + 0.000*"juvenile" + 0.000*"kanghong" + 0.000*"keratitis" + 0.000*"kevzara" + 0.000*"keytruda" + 0.000*"knockout"
2020-05-03 23:23:04,462 : INFO : topic #127 (0.003): 0.000*"ibm" + 0.000*"cloud" + 0.000*"ai" + 0.000*"hybrid" + 0.000*"cognitive" + 0

2020-05-03 23:23:45,903 : INFO : topic #98 (0.003): 0.039*"flooring" + 0.021*"ceramic" + 0.020*"tile" + 0.017*"carpet" + 0.013*"raw" + 0.013*"wood" + 0.013*"vinyl" + 0.012*"laminate" + 0.010*"mohawk" + 0.009*"contentsindex"
2020-05-03 23:23:45,903 : INFO : topic #281 (0.003): 0.079*"sirona" + 0.076*"dentsply" + 0.054*"dental" + 0.014*"amalgam" + 0.014*"medical" + 0.011*"fda" + 0.010*"consumable" + 0.008*"mercury" + 0.007*"circular" + 0.006*"restructuring"
2020-05-03 23:23:45,904 : INFO : topic #110 (0.003): 0.021*"drug" + 0.015*"opioid" + 0.014*"fda" + 0.012*"generic" + 0.011*"specialty" + 0.009*"pharmaceutical" + 0.009*"trial" + 0.008*"dea" + 0.008*"clinical" + 0.007*"patient"
2020-05-03 23:23:45,905 : INFO : topic #8 (0.003): 0.087*"expedia" + 0.072*"travel" + 0.041*"hotel" + 0.026*"traveler" + 0.022*"motion" + 0.019*"defendant" + 0.019*"booking" + 0.018*"accommodation" + 0.016*"diller" + 0.014*"plaintiff"
2020-05-03 23:23:45,922 : INFO : topic diff=0.016684, rho=0.161770
2020-05-03 

In [57]:
from pprint import pprint

# Each entry of top_topics is (topic representation, coherence score).
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum(coherence for _, coherence in top_topics) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -1.8096.
[([(2.0369504e-05, 'ldl'),
   (2.0369504e-05, 'lipid'),
   (2.0369504e-05, 'inds'),
   (2.0369504e-05, 'lipoprotein'),
   (2.0369504e-05, 'locus'),
   (2.0369504e-05, 'lipase'),
   (2.0369504e-05, 'lipodystrophy'),
   (2.0369504e-05, 'legalize'),
   (2.0369504e-05, 'ioi'),
   (2.0369504e-05, 'knockout'),
   (2.0369504e-05, 'keytruda'),
   (2.0369504e-05, 'kevzara'),
   (2.0369504e-05, 'keratitis'),
   (2.0369504e-05, 'kanghong'),
   (2.0369504e-05, 'juvenile'),
   (2.0369504e-05, 'itching'),
   (2.0369504e-05, 'isogenic'),
   (2.0369504e-05, 'lepr'),
   (2.0369504e-05, 'ligand'),
   (2.0369504e-05, 'leptin')],
  -0.47777508758523973),
 ([(2.0369504e-05, 'ldl'),
   (2.0369504e-05, 'lipid'),
   (2.0369504e-05, 'inds'),
   (2.0369504e-05, 'lipoprotein'),
   (2.0369504e-05, 'locus'),
   (2.0369504e-05, 'lipase'),
   (2.0369504e-05, 'lipodystrophy'),
   (2.0369504e-05, 'legalize'),
   (2.0369504e-05, 'ioi'),
   (2.0369504e-05, 'knockout'),
   (2.0369504e-05

 ([(0.045374047, 'entertainment'),
   (0.032742508, 'eone'),
   (0.031370517, 'film'),
   (0.029580422, 'television'),
   (0.023292052, 'game'),
   (0.018108964, 'toy'),
   (0.01806516, 'hasbro'),
   (0.011473095, 'gaming'),
   (0.011310219, 'retailer'),
   (0.010605124, 'child'),
   (0.009245407, 'allspark'),
   (0.008973805, 'family'),
   (0.008755903, 'picture'),
   (0.008528653, 'play'),
   (0.0077966354, 'programming'),
   (0.006518099, 'storytelling'),
   (0.006419893, 'disney'),
   (0.0061637964, 'marvel'),
   (0.006080026, 'royalty'),
   (0.0060289223, 'producer')],
  -1.2858026285057016),
 ([(0.024718622, 'semiconductor'),
   (0.02247168, 'embedded'),
   (0.021348195, 'analog'),
   (0.02022471, 'ti'),
   (0.011236895, 'automotive'),
   (0.008989944, 'processor'),
   (0.008989938, 'signal'),
   (0.007866462, 'wafer'),
   (0.00786645, 'forecast'),
   (0.007866446, 'texas'),
   (0.006742984, 'consignment'),
   (0.006742984, 'microcontrollers'),
   (0.0056194994, 'electronics'),
 

   (0.006454278, 'attorney'),
   (0.0064077876, 'drug'),
   (0.006241841, 'fda'),
   (0.005746846, 'veterinary'),
   (0.0057023666, 'minnesota'),
   (0.005559467, 'district'),
   (0.005487043, 'benco'),
   (0.0054798536, 'pharmaceutical'),
   (0.0053343065, 'schein'),
   (0.005304866, 'companion'),
   (0.0053048576, 'veterinarian')],
  -1.840153788026348),
 ([(0.014898398, 'originates'),
   (0.0148418415, 'forestar'),
   (0.012029887, 'geographically'),
   (0.010880423, 'homebuilding'),
   (0.01081832, 'superintendent'),
   (0.010790652, 'mortgage'),
   (0.010745321, 'columbus'),
   (0.010406662, 'unsuccessful'),
   (0.009649832, 'builder'),
   (0.009609515, 'county'),
   (0.009319164, 'deterioration'),
   (0.0091114575, 'listed'),
   (0.009070514, 'land'),
   (0.008641996, 'allowable'),
   (0.0072140642, 'earnest'),
   (0.0072140573, 'registering'),
   (0.0072140247, 'acoe'),
   (0.007206317, 'moines'),
   (0.0071676904, 'stormwater'),
   (0.0071308524, 'lowering')],
  -1.843701329088

In [58]:
# Sample news articles used to sanity-check the company ranking; the trailing
# comment on each line summarizes the article's subject.
u1 = "https://www.cnn.com/2020/05/03/health/us-coronavirus-sunday/index.html" # parks and protests
u2 = "https://www.cnn.com/2020/05/03/asia/north-korea-gunfire-south-dmz-intl/index.html" # korea gunfire
u3 = "https://www.cnn.com/2020/05/03/health/coronavirus-vaccine-never-developed-intl/index.html" # cv vax
u4 = "https://www.cnn.com/2020/05/03/success/landlords-rent-may-coronavirus/index.html" # landlords
u5 = "https://www.cnn.com/2020/05/03/business/cheap-clothing-fast-fashion-climate-change-intl/index.html" # clothing
u6 = "https://www.cnn.com/2020/05/02/economy/reopening-economy-child-care-wellness/index.html" # child care
u7 = "https://www.cnn.com/2020/04/30/tech/zoom-google-facebook/index.html" # google fb zoom

In [59]:
# Articles to rank companies against, in the order the results are reported.
url_list = [u1,u2,u3,u4,u5,u6,u7]

In [79]:
# Score every company document with the trained model; score_docs() fills
# topics.topic_mat, which get_top_companies() reads.
topics = Topics(model,corpus,processor,num_topics)
topics.score_docs()

In [80]:
# For every article: download and parse it with newspaper, then collect the
# tickers returned by get_top_companies for its text.
company_list = []
for url in url_list:
    article = Article(url)
    article.download()
    article.parse()
    # drop the first three space-separated chunks of the article text
    # NOTE(review): presumably strips a dateline/byline prefix -- confirm;
    # a later cell uses [2:] instead of [3:]
    cut_text = ' '.join(article.text.split(' ')[3:])
    company_list.append(topics.get_top_companies(cut_text))
    

In [81]:
# One row per entry of url_list, in the same order.
company_list

[['endp', 'mrk', 'bmy', 'mnk', 'alxn', 'lly', 'wmt', 'rost', 'bbby', 'jwn'],
 ['nwsa', 'met', 'flir', 'fb', 'mco', 'cat', 'dal', 'ntrs', 'aapl', 'pru'],
 ['mrk', 'bmy', 'alxn', 'endp', 'lly', 'zts', 'myl', 'gild', 'celg', 'vrtx'],
 ['amt', 'cci', 'kim', 'vtr', 'slg', 'swn', 'rrc', 'cnx', 'spg', 'eqt'],
 ['rl', 'vfc', 'wm', 'intu', 'pvh', 'tjx', 'pep', 'ip', 'rsg', 'kmb'],
 ['mat', 'hpq', 'dltr', 'nvda', 'wmt', 'rost', 'swn', 'rrc', 'cnx', 'eqt'],
 ['fb', 'has', 't', 'cmcsa', 'vz', 'met', 'viab', 'expd', 'msft', 'ffiv']]

In [82]:
# Persist the Topics object (model, reader, processor and topic matrix).
# NOTE(review): the file name appears to encode
# num_topics_chunksize_passes_iterations -- keep it in sync with training.
with open('topics_obj_300_2000_40_500.pkl','wb') as f:
    pickle.dump(topics,f)

In [None]:
# NOTE(review): this unexecuted cell repeats the u1..u7 definitions from the
# earlier URL cell with identical values; it looks safe to delete.
u1 = "https://www.cnn.com/2020/05/03/health/us-coronavirus-sunday/index.html" # parks and protests
u2 = "https://www.cnn.com/2020/05/03/asia/north-korea-gunfire-south-dmz-intl/index.html" # korea gunfire
u3 = "https://www.cnn.com/2020/05/03/health/coronavirus-vaccine-never-developed-intl/index.html" # cv vax
u4 = "https://www.cnn.com/2020/05/03/success/landlords-rent-may-coronavirus/index.html" # landlords
u5 = "https://www.cnn.com/2020/05/03/business/cheap-clothing-fast-fashion-climate-change-intl/index.html" # clothing
u6 = "https://www.cnn.com/2020/05/02/economy/reopening-economy-child-care-wellness/index.html" # child care
u7 = "https://www.cnn.com/2020/04/30/tech/zoom-google-facebook/index.html" # google fb zoom

In [75]:
# Topic mixture the model assigns to the stored document of ticker 'mo'.
model[corpus.get_bow('mo')]

[(94, 0.99939376)]

In [48]:
# Stored lemma text for ticker 'tsco' (indexing a Reader returns the file content).
corpus['tsco']

'item businessoverviewtractor supply company company u largest rural lifestyle retailer united state company focused supplying need recreational farmer rancher enjoy living rural lifestyle refer lifestyle well tradesman small business operate retail store name tractor supply company del feed farm supply petsense operate website name tractorsupply com petsense com store located primarily town outlying major metropolitan market rural community company one reportable industry segment retail sale product support rural lifestyle december operated retail store state tractor supply del retail store petsense retail store tractor supply store typically range size square foot inside selling space along additional outside selling space petsense store approximately square foot inside selling space tractor supply retail location use standard design new built suit location includes approximately square foot inside selling space online selling website positioned offer extended assortment product beyo

In [76]:
# Dominant topics of a single article (u4).
article = Article(u4)
article.download()
article.parse()
# drops the first two space-separated chunks of the article text
# NOTE(review): the ranking loop above drops three ([3:]) -- confirm which
# offset is intended
text = ' '.join(article.text.split(' ')[2:])
topics.get_article_topics(text)

[(18, 0.13377866926349655),
 (40, 0.08498601437826613),
 (54, 0.31753611036845636),
 (77, 0.10777079075603535),
 (94, 0.34231661168607563)]

In [28]:
# Show the article text currently held in `text`.
# NOTE(review): the saved output below comes from an earlier execution
# (In [28]) and may not match the article loaded in the cell above.
text

'California to New York, more Americans are headed outside -- some for recreation and others in protest.\n\nBut as some states loosen or let go of their stay-at-home orders, researchers predict a higher death toll from coronavirus this summer than previously expected.\n\nBy Sunday, more than 1.1 million people in the US have been infected with coronavirus, and more than 66,000 have died , according to data from Johns Hopkins University.\n\nStates such as California have stood firm on their stay-at-home orders -- and have been met with protests.\n\nBut more than 30 states have started easing some social distancing restrictions -- ranging from simply opening state parks to allowing certain businesses to restart.\n\nSome of those states let stay-at-home orders expire, with caveats restricting what businesses can open and how. Georgia, for example, still requires the elderly to stay home until June 12.\n\nOther states and counties will ease restrictions this week. In Arkansas, gyms and fit

In [77]:
# The 100 highest-weighted words of topic 94.
model.print_topic(94,topn=100)

'0.025*"tenant" + 0.012*"tobacco" + 0.012*"estate" + 0.010*"rent" + 0.009*"altria" + 0.009*"bxp" + 0.008*"rental" + 0.007*"street" + 0.007*"reit" + 0.007*"avenue" + 0.007*"san" + 0.006*"redevelopment" + 0.006*"rsf" + 0.006*"libor" + 0.005*"francisco" + 0.004*"preferred" + 0.004*"leasing" + 0.004*"loan" + 0.004*"cigarette" + 0.004*"occupancy" + 0.004*"boston" + 0.004*"green" + 0.004*"bplp" + 0.004*"residential" + 0.004*"campus" + 0.004*"manhattan" + 0.003*"unsecured" + 0.003*"alexandria" + 0.003*"unconsolidated" + 0.003*"agtech" + 0.003*"juul" + 0.003*"annualized" + 0.003*"taxable" + 0.003*"ground" + 0.003*"wine" + 0.003*"land" + 0.003*"adult" + 0.003*"holder" + 0.003*"qualify" + 0.002*"road" + 0.002*"expiration" + 0.002*"weighted" + 0.002*"rentable" + 0.002*"fda" + 0.002*"usa" + 0.002*"mortgage" + 0.002*"maryland" + 0.002*"abi" + 0.002*"washington" + 0.002*"contamination" + 0.002*"proceeds" + 0.002*"pm" + 0.002*"district" + 0.002*"hotel" + 0.002*"bond" + 0.002*"sustainability" + 0.002*

In [None]:
# Lemma string the Processor produces for the article text.
processor.process(text)

In [None]:
# Check how a single word comes out of the Processor.
processor.process('military')

In [None]:
# Integer id of the token 'military' in the dictionary (KeyError if absent).
dictionary.token2id['military']

In [None]:
# Topics in which word id 303 has probability of at least 0.001.
# NOTE(review): 303 is presumably the id of 'military' from the previous
# cell -- confirm, ids change whenever the dictionary is rebuilt.
model.get_term_topics(303,minimum_probability=0.001)