In [3]:
from config import *

In [4]:
from collections import defaultdict

import gensim
from gensim.test.utils import datapath

import glob
import gzip
import itertools
import os
import re

import matplotlib.pyplot as plt
plt.style.use('seaborn')

import numpy as np
import pandas as pd

# Use the natural language toolkit package
import nltk

# Stop Words: Load the nltk default English stopwords list:
stopwords_list = nltk.corpus.stopwords.words('english')

import json

In [None]:
def get_tokenized_articles(year):
    articles = dict()
    
    with open(TEMP_PATH + '/Inflation/Inflation_Article_Texts_v2_%s.json' % year) as f:
        yyyymm_all_articles = json.load(f)
    
    for yyyymm in yyyymm_all_articles:
        articles[yyyymm] = []
        for text in yyyymm_all_articles[yyyymm]:
            text.replace('\n', ' ')
            sentences = nltk.sent_tokenize(text)

            text_words = []
            for sentence in sentences:
                sentence = re.sub(r'[^A-Za-z.]+', ' ', sentence)
                sentence = sentence.replace('.', '') # Abbreviations - G.D.P
                sentence = sentence.lower()
                sent_words = nltk.word_tokenize(sentence)
                sent_words = [word for word in sent_words if ((len(word) > 2) and (len(word) < 20))]
                sent_words = [word for word in sent_words if (word not in stopwords_list) and word.isalpha()]
                text_words.extend(sent_words)
            articles[yyyymm].append(text_words)
    
    return articles


def get_effective_vocabulary(articles):
    """
    Articles is a dictionary containing a list of lists for each month.
    
    """
    all_words = itertools.chain.from_iterable(itertools.chain.from_iterable(articles.values()))
    
    # Get frequency counts, sort words by frequency
    frequency_count = nltk.FreqDist(all_words)
    words = np.array([word for word in frequency_count.keys()])
    word_freq = np.array([word for word in frequency_count.values()])
    freq_sort = np.argsort(word_freq)[::-1]
    word_freq_sort = word_freq[freq_sort]
    words_sorted = words[freq_sort]
    
    # Create effective vocabulary: Only keep the words that aren't the 50 most frequent, 
    # and have a frequency of at least 2.
    rank = 1
    effective_vocab = list()
    for object in words_sorted:
        if (rank >= 50):
            fc = frequency_count[object]
            if (fc > 1):
                effective_vocab.append(object)
        rank += 1
    print(len(effective_vocab))
    return effective_vocab


def get_tokenized_articles_within_effective_vocab(articles):
    effective_vocab = get_effective_vocabulary(articles)
    tok_articles_ev = []
    # Preserve the chronological order in which we are processing articless
    # And lose the dictionary structure
    keys = list(articles.keys())
    keys.sort()
    for yyyymm in keys:
        for article in articles[yyyymm]:
            article_words_ev = [word for word in article if word in effective_vocab]
            tok_articles_ev.append(article_words_ev)
    return tok_articles_ev


In [5]:
import json

In [None]:
articles_1996['199601']

In [None]:
keys = list(articles_1996.keys())

In [None]:
keys.sort()

In [None]:
articles_1996 = get_tokenized_articles('1996')

In [None]:
# Combined corpus/ dictionary to create the joint LDA model
# The idea is to model speeches from both years into a shared space, and within this space 
# determine the top topics for each of them and the shift in importance
tok_articles_ev = get_tokenized_articles_within_effective_vocab(articles_1996)
print('Tokenized art with effective vocab done!')


dictionary_all = gensim.corpora.Dictionary(tok_articles_ev)
print('Dictionary created!')
corpus_all = [dictionary_all.doc2bow(doc) for doc in tok_articles_ev]


In [None]:
len(trial)

The above processing was parallelized for each year to get yearly tokenized articles within effective vocabulary. Now the task is to combine all of these into a single gensim corpora dictionary and corpus to run the sequential LDA model.

In [6]:
years = [str(year) for year in range(START_YEAR, END_YEAR + 1)]

In [7]:
THEME = 'GDP'
TOKENIZED_ARTICLES_PATH = TEMP_PATH + '/%s/%s_Articles_Tokenized_%s.json'

all_tok_articles = []
for year in years:
    with open(TOKENIZED_ARTICLES_PATH % (THEME, THEME, year)) as f:
        all_tok_articles.extend(json.load(f))
    print(TOKENIZED_ARTICLES_PATH % (THEME, THEME, year), 'done!')

/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_1996.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_1997.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_1998.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_1999.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_2000.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_2001.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_2002.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_2003.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_2004.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_2005.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_2006.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_2007.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_2008.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_2009.json done!
/work/ms5941/NLP/Temp/GDP/GDP_Articles_Tokenized_2010.json done!
/work/ms5941/NLP/Temp/GDP

MemoryError: 

In [8]:
# Only from 1996 - 2016
len(all_tok_articles)

2416033

In [11]:
dictionary_all = gensim.corpora.Dictionary(all_tok_articles)
print('Dictionary created!')
corpus_all = [dictionary_all.doc2bow(doc) for doc in all_tok_articles]


Dictionary created!


MemoryError: 

In [None]:
# Took about 20 minutes? - FOR VOL ALONE 

In [12]:
# Storing the dict/ Corpus for future use incase of issues/ notebook crashes
dictionary_all.save(TEMP_PATH + '/%s/%s.dict' % (THEME, THEME))
# gensim.corpora.MmCorpus.serialize(TEMP_PATH + '/%s/%s.mm' % (THEME, THEME), corpus_all)

In [None]:
del all_tok_articles

In [None]:
# Get Time Slices

In [None]:
time_slices = pd.read_csv('Summary Stat Tables/%s_Article_Count.csv' % THEME, index_col=0)

In [None]:
time_slices.sort_index(inplace=True)

In [None]:
time_slices.index = pd.to_datetime(time_slices.index)

time_slices.groupby(time_slices.index.year)['No. of Volatility Articles'].sum().values

In [None]:
time_slices.groupby(time_slices.index.year)['No. of Volatility Articles'].sum().values

In [None]:
time_slices = time_slices.sort_index()['No. of Volatility Articles'].values

In [None]:
time_slices

In [None]:
# LDA Seq Model

In [None]:
from gensim.models import ldaseqmodel


In [None]:
time_slices = time_slices.groupby(time_slices.index.year)['No. of Volatility Articles'].sum().values

In [None]:
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus_all, id2word=dictionary_all, time_slice=time_slices, num_topics=10)

In [9]:
import sys

local_vars = list(locals().items())
total_mem = 0
for var, obj in local_vars:
    total_mem += sys.getsizeof(obj)/float(1024*1024)

In [10]:
total_mem

20.745285034179688

In [None]:
import time


In [None]:
tic = time.time()

In [None]:
time.time() - tic

In [None]:
len(time_slices)