# WSJ Headline Deep Learning Model

### Adapted from / inspired by the RaRe Machine Learning blog post

## Preparing the input

In [221]:
#Imports: standard library first, then gensim
from collections import defaultdict

from gensim import corpora, logging, models, similarities

#Emit INFO-level log messages using a timestamp : level : message layout
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

#defaultdict(int): unseen keys start at 0 (not used by the cells shown in this notebook)
frequency = defaultdict(int)

In [238]:
# Build the token dictionary from the raw headlines file: one document per
# line, lower-cased and split on whitespace.
# The `with` block closes the file as soon as the Dictionary has consumed
# every line, instead of leaving the handle open for the rest of the session.
with open('rawdata/wsj1314.txt') as headlines_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in headlines_file)
print(dictionary)

Dictionary(27258 unique tokens: ['licenses', 'one-legged', 'restaurant', 'money?', 'slammed']...)


In [239]:
#remove stopwords and words that occur in only one document
stoplist = set('for a of the and to in'.split())

#ids of the stopwords that are actually present in the dictionary
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]

#dictionary.dfs pairs each token id with its document frequency; keep ids whose frequency is 1
once_ids = [
    tokenid
    for tokenid, docfreq in dictionary.dfs.items()
    if docfreq == 1
]

#drop both groups of ids in a single call
dictionary.filter_tokens(stop_ids + once_ids)

#close the gaps left in the id sequence by the removed words
dictionary.compactify()

print(dictionary)

Dictionary(25702 unique tokens: ['licenses', 'restaurant', 'money?', 'marketplaces', 'wakefield']...)


In [240]:
#save our dictionary
# Writes the dictionary in its current state (after the filtering and compactify steps above) to disk.
dictionary.save('Dictionaries/wsj1314.dict') #storing dictionary for future reference

In [241]:
#creating vectors from corpus in memory efficient way (BOW)
class MyCorpus(object):
    """Stream a text file as bag-of-words vectors, one document per line.

    The file is re-opened on every iteration, so the corpus can be traversed
    repeatedly without holding the documents in memory.
    """

    def __init__(self, fname):
        """Store the path of the text file; nothing is read until iteration."""
        self.fname = fname

    def __iter__(self):
        """Yield ``dictionary.doc2bow(tokens)`` for each line of the file.

        Tokens are the lower-cased, whitespace-separated words of the line.
        Relies on the notebook-level ``dictionary`` defined in an earlier cell.
        """
        # `with` closes the handle when the generator finishes or is closed,
        # rather than leaking one open file per pass over the corpus.
        with open(self.fname) as corpus_file:
            for line in corpus_file:
                yield dictionary.doc2bow(line.lower().split())

In [244]:
# MyCorpus only stores the path here; the file is read when the corpus is iterated.
corpus_wsj = MyCorpus('rawdata/wsj1314.txt') #not loading into memory

#saving vectorised corpus
# Iterates corpus_wsj and writes the resulting vectors to 'Corpus/wsj1314.mm'.
corpora.MmCorpus.serialize('Corpus/wsj1314.mm', corpus_wsj)

In [246]:
#loading up
# Reload the dictionary and the serialized corpus from the paths written by the cells above.
dictionary = corpora.Dictionary.load('Dictionaries/wsj1314.dict')
corpus = corpora.MmCorpus('Corpus/wsj1314.mm')

## Model creation & training

In [247]:
#create class to generate sentences from textfile
class MySentences(object):
    """Stream a text file as token lists, one sentence per line.

    The file is re-opened on every iteration, so the object can be passed to
    code that needs to traverse the sentences more than once.
    """

    def __init__(self, fname):
        """Store the path of the text file; nothing is read until iteration."""
        self.fname = fname

    def __iter__(self):
        """Yield each line as a list of lower-cased, whitespace-separated tokens.

        Assumes one document per line. A blank line yields an empty list.
        """
        # `with` closes the handle when the generator finishes or is closed,
        # rather than leaking one open file per pass over the sentences.
        with open(self.fname) as sentence_file:
            for line in sentence_file:
                yield line.lower().split()

In [251]:
# NOTE(review): this reads 'rawdata/wsj2014.txt', while the dictionary and corpus cells read 'rawdata/wsj1314.txt' -- confirm the different file is intentional.
sentences = MySentences('rawdata/wsj2014.txt') #using the MySentences class to stream our sentences
# Prints every tokenised line of the file, so the output grows with the size of the file.
for line in sentences:
    print(line)

['how', 'economists’', '2014', 'projections', 'fared']
['hyundai,', 'kia', 'expect', '2015', 'sales', 'growth', 'to', 'be', 'weakest', 'in', 'more', 'than', 'a', 'decade']
['fed:', 'etf', 'concern', 'misplaced']
['unilever,', 'p&g', 'try', 'tweaked', 'formulas,', 'higher', 'prices', 'for', 'developing', 'world']
['labor,', 'marijuana', 'and', 'pets', 'to', 'get', 'new', 'laws', 'in', '2015']
['kkr', 'to', 'earn', 'big', 'payout', 'from', 'walgreen-alliance', 'boots', 'deal']
['more', 'startups', 'aim', 'to', 'keep', 'it', 'private']
['falling', 'oil', 'price', 'poses', 'tough', 'challenge', 'for', 'west', 'african', 'rulers']
['new', 'year,', 'new', 'job?', 'read', 'this', 'first']
['overseas', 'headwinds', 'test', 'u.s.', 'economy']
['facebook', 'and', 'beijing']
['online', 'political', 'opinions', 'don’t', 'need', 'regulating']
['broken', 'deals', 'rein', 'in', 'a', 'strong', 'm&a', 'market']
['insider’s', 'guide', 'to', 'melbourne']
['mlps:', 'the', 'oil', 'market’s', 'lost', 'child

In [254]:
# NOTE(review): `size` is the word-vector dimensionality, not a number of layers; gensim 4 renamed it to `vector_size` -- confirm against the installed version.
import gensim #creating our model: 40-dimensional word vectors (size=40), min word count 2
model = gensim.models.Word2Vec(sentences, size=40, min_count=2)

In [255]:
#saving and loading the model
# Write the model to disk, then rebind `model` to the copy loaded back from the same path.
model.save('Models/wsj1314.model')

model = gensim.models.Word2Vec.load('Models/wsj1314.model')

In [146]:
#add vocab
# NOTE(review): 'rawdata/ENTERNEWFILENAME.txt' is a placeholder path; replace it with a real file before running this cell.
# NOTE(review): this cell's execution count (146) is lower than its neighbours', so it may not have been run in the session that produced the outputs shown -- confirm whether it belongs in the pipeline.
sentences2 = MySentences('rawdata/ENTERNEWFILENAME.txt') #using the MySentences class to stream our sentences
model.build_vocab(sentences2)

In [256]:
#train model (no trimming step is performed in this cell)
# NOTE(review): newer gensim releases require explicit total_examples/epochs arguments to train() -- confirm against the installed version.
model.train(sentences)

213865

In [342]:
#trying implied relationships - too few items trained on however
# Analogy-style query: 'retreat' and 'rise' count positively, 'gain' negatively; only the top match is returned.
model.most_similar(positive=['retreat',"rise"], negative=['gain'], topn=1)

[('tumble', 0.842988133430481)]

In [321]:
#what doesn't match
# Ask the model which of the four whitespace-separated words is the odd one out.
model.doesnt_match("ubs barclays goldman google".split())

'google'

In [343]:
#similarities
# Similarity score between the vectors for 'putin' and 'kremlin'.
model.similarity('putin','kremlin')

0.60948239053737385

## More advanced model with phrases

In [344]:
# Fit a Phrases transformer on the sentences, then train a second Word2Vec model on the transformed stream.
# NOTE(review): model2 is built with Word2Vec defaults, unlike `model` (size=40, min_count=2) -- confirm this is intended.
bigram_transformer = gensim.models.Phrases(sentences)
model2 = gensim.models.Word2Vec(bigram_transformer[sentences])

In [364]:
#trying implied relationships - can now use phrases
# Tokens joined by '_' (e.g. 'goldman_sachs') are phrase tokens in model2's vocabulary; the top three matches are returned.
model2.most_similar(positive=['goldman_sachs','bank'], negative=['tesla'], topn=3)

[('loses', 0.8768612146377563),
 ("mcdonald's", 0.8649779558181763),
 ('retail_sales', 0.8491551280021667)]

In [365]:
#similarities
# Similarity score between the phrase token 'goldman_sachs' and 'ubs' in model2.
model2.similarity('goldman_sachs','ubs')

0.8622937525108415

In [366]:
# Save the phrase-aware model (model2) under the phrases filename.
# `model` (the plain Word2Vec model) is already saved to 'Models/wsj1314.model'.
model2.save('Models/wsj1314phrases.model')

### Testing the TF-IDF model

In [370]:
# Load the serialized bag-of-words corpus, fit TF-IDF on it, then wrap the corpus in the fitted transformation.
corpus = corpora.MmCorpus('Corpus/wsj1314.mm')
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]# step 2 -- apply the model to the bag-of-words corpus

### Testing the LSI model

In [371]:
#reload the dictionary as before, then fit a 10-topic LSI model on the TF-IDF corpus
dictionary = corpora.Dictionary.load('Dictionaries/wsj1314.dict')
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsi = lsi[corpus_tfidf]  #bow->tfidf->fold-in-lsi

In [373]:
# Show up to 10 topics, each as a string of weighted terms.
lsi.print_topics(10)

['1.000*"10-point." + 0.000*"russia" + -0.000*"rally" + 0.000*"expectations" + -0.000*"can" + -0.000*"private" + 0.000*"gains" + -0.000*"gets" + 0.000*"cut" + 0.000*"decline"',
 '0.449*"u.s." + 0.423*"stocks" + 0.324*"asian" + 0.281*"briefing:" + 0.280*"morning" + 0.202*"on" + 0.185*"end" + 0.156*"lower" + 0.142*"higher" + 0.113*"shares"',
 '0.383*"on" + -0.255*"briefing:" + -0.255*"morning" + -0.255*"stocks" + 0.236*"profit" + -0.230*"asian" + 0.169*"as" + 0.161*"china" + 0.157*"oil" + 0.155*"prices"',
 '-0.558*"digest" + -0.555*"news" + -0.414*"watch:" + -0.342*"corporate" + -0.149*"briefing" + -0.146*"book:" + -0.123*"financial" + 0.064*"on" + 0.052*"profit" + -0.051*"world"',
 '-0.612*"profit" + -0.251*"rises" + 0.228*"new" + -0.172*"higher" + 0.170*"bank" + -0.164*"falls" + 0.149*"china" + 0.142*"u.s." + 0.137*"bonds" + 0.127*"with"',
 '-0.325*"prices" + -0.318*"oil" + 0.302*"bank" + 0.255*"profit" + 0.252*"new" + -0.248*"u.s." + -0.198*"bonds" + 0.178*"asian" + -0.171*"government

In [375]:
#save our models
# Persist the fitted LSI and TF-IDF models.
lsi.save('Models/wsj1314.lsi')
tfidf.save('Models/wsj1314.tfidf')

#load our models
# The LSI model is bound back to `lsi`, the name the earlier cells use
# (it was previously assigned to the misspelled name `lis`).
lsi = models.LsiModel.load('Models/wsj1314.lsi')
tfidf = models.TfidfModel.load('Models/wsj1314.tfidf')

# Without the corpus growing by about 10x, this level of NLP is very sketchy.