In [1]:
# Imports

import json
import math
import os
import random
import warnings
from datetime import datetime

import gensim
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
import spacy
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim.test.utils import datapath

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Using the spacy medium model (shared by preprocess_text for tagging and lemmatization)
nlp = spacy.load('en_core_web_md')

# Set DTM binary path (Refer https://github.com/magsilva/dtm for binaries)
# NOTE(review): judging by the file name this is a Windows 64-bit executable;
# point this at the matching binary on other platforms.
dtm_path = 'dtm-master/bin/dtm-win64.exe'


In [2]:
# Load the whole raw article dump into memory as one string

with open('new-york-times-articles/nytimes_news_articles.txt',
          mode='r',
          encoding='utf-8') as inf:
    nyt_article_data = inf.read()

In [3]:
# Every article starts with a 'URL:' marker, so splitting on it yields one chunk per article

url_chunks = nyt_article_data.split('URL:')
# The first chunk is whatever precedes the first marker, not an article
nyt_articles = url_chunks[1:]
print("Num of articles : ", len(nyt_articles))

Num of articles :  8888


In [4]:
# Text preprocessing

# spaCy's English stopword set, extended with a few corpus-specific filler words
stopwords = spacy.lang.en.stop_words.STOP_WORDS | {'want', 'new', 'tell', 'use'}

def preprocess_text(text,
                    valid_pos=('NOUN', 'VERB', 'ADJ', 'ADV')):
    '''
    Clean one raw article chunk and extract the metadata encoded in its URL.

    Tasks performed:
       - Replace newlines, tabs and non-ASCII characters with spaces
       - Parse the leading URL to fetch the published date and the
         document type (e.g. sports)
       - Eliminate stopwords, filter on POS tags and lemmatize the article body

    Parameters
    ----------
    text : str
        Article chunk whose first whitespace-separated token is the article URL.
    valid_pos : collection of str
        POS tags of the tokens to keep.

    Returns
    -------
    pd.Series
        [url, url_date, url_topic, processed_doc]. processed_doc is the
        space-joined lemmas, or np.nan when no token survives the filters.
        The URL fields are np.nan when the chunk is empty, and url_topic is
        np.nan when the URL has too few path segments.
    '''

    # Remove junk characters
    text = text.replace('\n', ' ').replace('\t', ' ')
    text = ''.join([i if ord(i) < 128 else ' ' for i in text])

    # split() without arguments already ignores leading/trailing whitespace
    text_split = text.split()
    if not text_split:
        # Nothing to parse: keep the row, mark every field as missing
        return pd.Series([np.nan, np.nan, np.nan, np.nan])

    # Parse URL: http://<host>/<yyyy>/<mm>/<dd>/<topic>/...
    url = text_split[0]
    url_parts = url.split('/')
    url_date = '-'.join(url_parts[3:6])
    url_topic = url_parts[6] if len(url_parts) > 6 else np.nan

    # Only the article body is analysed; the URL is metadata and must never
    # end up among the content tokens.
    doc = nlp(' '.join(text_split[1:]))

    # Lemmatization & Stopword removal & Few POS tags removal
    processed_doc = [
        word.lemma_
        for word in doc
        if word.pos_ in valid_pos
        and word.lemma_ != '-PRON-'
        and word.lemma_ not in stopwords
        and len(word.lemma_) > 2
    ]

    # An article with no surviving token is flagged with NaN so it can be dropped later
    processed_text = ' '.join(processed_doc) if processed_doc else np.nan

    return pd.Series([url, url_date, url_topic, processed_text])

In [0]:
# Keep the raw articles and everything derived from them in a single DataFrame

st = datetime.now()

frame_columns = ['Published Date', 'URL Topic', 'URL', 'Orig Article', 'Processed Article']
data = pd.DataFrame({'Orig Article': nyt_articles}, columns=frame_columns)

# preprocess_text returns its fields in this order
parsed_columns = ['URL', 'Published Date', 'URL Topic', 'Processed Article']
data[parsed_columns] = data['Orig Article'].apply(preprocess_text)

et = datetime.now()
print("Time taken : ", str(et-st))

In [0]:
# To save data to disk to reduce preprocessing time on later reuse.
# index=False keeps the row index out of the file, so reloading it with
# pd.read_csv does not add an extra 'Unnamed: 0' column.

# data.to_csv('processed_article_data.csv', index=False)

In [5]:
# To load processed article data from disk
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the output below look like
# row indexes written by earlier to_csv calls and read back as data; passing
# index_col=0 here would avoid that -- confirm against the CSV on disk.

# data = pd.read_csv("processed_article_data.csv")
data

Unnamed: 0.1,Unnamed: 0,Published Date,URL Topic,URL,Orig Article,Processed Article
0,0,2016-06-30,sports,http://www.nytimes.com/2016/06/30/sports/baseb...,http://www.nytimes.com/2016/06/30/sports/base...,pitching afloat half season offensive woe prod...
1,1,2016-06-30,nyregion,http://www.nytimes.com/2016/06/30/nyregion/may...,http://www.nytimes.com/2016/06/30/nyregion/ma...,counsel chief legal adviser resign month posit...
2,2,2016-06-30,nyregion,http://www.nytimes.com/2016/06/30/nyregion/thr...,http://www.nytimes.com/2016/06/30/nyregion/th...,early morning hour year group gunman street ga...
3,3,2016-06-30,nyregion,http://www.nytimes.com/2016/06/30/nyregion/tek...,http://www.nytimes.com/2016/06/30/nyregion/te...,thing start sell service device glass cube ecl...
4,4,2016-06-30,sports,http://www.nytimes.com/2016/06/30/sports/olymp...,http://www.nytimes.com/2016/06/30/sports/olym...,swimming trial spectacle build strap sport bro...
5,5,2016-06-30,sports,http://www.nytimes.com/2016/06/30/sports/olymp...,http://www.nytimes.com/2016/06/30/sports/olym...,omaha race olympic trial heat backstroke frees...
6,6,2016-06-30,business,http://www.nytimes.com/2016/06/30/business/dea...,http://www.nytimes.com/2016/06/30/business/de...,deal buy unite film studio premium cable home ...
7,7,2016-06-30,nyregion,http://www.nytimes.com/2016/06/30/nyregion/poo...,http://www.nytimes.com/2016/06/30/nyregion/po...,slate color light slant skylight woman enter c...
8,8,2016-06-30,sports,http://www.nytimes.com/2016/06/30/sports/baske...,http://www.nytimes.com/2016/06/30/sports/bask...,coach deliver truth afternoon team summer leag...
9,9,2016-06-30,nyregion,http://www.nytimes.com/2016/06/30/nyregion/lat...,http://www.nytimes.com/2016/06/30/nyregion/la...,rejoice apparent victory democratic primary ch...


In [6]:
# Function to strip URL from article (Missed during pre-processing)

def strip_url(text):
    """Drop the first whitespace-separated token (the URL) and re-join the rest with single spaces."""
    words = text.split()
    remaining = words[1:]
    return ' '.join(remaining)

# NOTE: not idempotent -- strip_url removes the first token each time it runs, so
# re-running this cell drops one more leading word from every article.
data['Orig Article'] = data['Orig Article'].apply(strip_url)

In [7]:
# Articles for which preprocessing produced no tokens ('Processed Article' is NaN)

missing_processed = data['Processed Article'].isna()
data_na = data.loc[missing_processed]
data_na

Unnamed: 0.1,Unnamed: 0,Published Date,URL Topic,URL,Orig Article,Processed Article
3978,3978,2016-05-28,arts,http://www.nytimes.com/2016/05/28/arts/design/...,,
6109,6109,2016-05-11,universal,http://www.nytimes.com/2016/05/11/universal/ko...,처음 본 그녀의 모습은 빙판 위에서 스케이트를 타는 모습도 아니었다. 토끼 귀 모양...,
7096,7096,2016-05-03,world,http://www.nytimes.com/2016/05/03/world/middle...,,
7108,7108,2016-05-02,fashion,http://www.nytimes.com/2016/05/02/fashion/met-...,,
8606,8606,2016-04-20,universal,http://www.nytimes.com/2016/04/20/universal/ko...,말레이시아 쿠알라룸푸르 — 브라질이나 바르셀로나와 어깨를 나란히 하는 건 어려운 일...,
8869,8869,2016-04-18,arts,http://www.nytimes.com/2016/04/18/arts/design/...,,


In [8]:
# Keep only the articles that still have processed text

data = data.dropna(subset=['Processed Article']).copy()
print("Number of articles after eliminating Nan values : ", len(data))

Number of articles after eliminating Nan values :  8882


In [9]:
# Parse 'Published Date' into datetimes, then order the articles chronologically

published = pd.to_datetime(data['Published Date'], format='%Y-%m-%d')
data['Published Date'] = published
data = data.sort_values(by='Published Date')

In [10]:
# Convert text to tokens
def prep_corpus_tokens(text):
    """Split a processed article on whitespace and return its list of tokens."""
    tokens = text.split()
    return tokens

# One token list per article, stored alongside the text it was derived from
processed_articles = data['Processed Article']
data['Tokens'] = processed_articles.map(prep_corpus_tokens)
data

Unnamed: 0.1,Unnamed: 0,Published Date,URL Topic,URL,Orig Article,Processed Article,Tokens
4609,4609,2016-02-24,arts,http://www.nytimes.com/2016/02/24/arts/music/k...,"This article was updated on May 23, 2016, to r...",article update reflect development dispute mon...,"[article, update, reflect, development, disput..."
8887,8887,2016-04-15,us,http://www.nytimes.com/2016/04/15/us/californi...,"PALOS VERDES ESTATES, Calif. — From high atop ...",palos high oceanside cliff shimmer blue green ...,"[palos, high, oceanside, cliff, shimmer, blue,..."
8878,8878,2016-04-17,us,http://www.nytimes.com/2016/04/17/us/student-s...,A college student who came to the United State...,college student come iraqi refugee remove flig...,"[college, student, come, iraqi, refugee, remov..."
8829,8829,2016-04-18,sports,http://www.nytimes.com/2016/04/18/sports/hocke...,Andreas Athanasiou and Henrik Zetterberg score...,score save beat night cut deficit team round p...,"[score, save, beat, night, cut, deficit, team,..."
8830,8830,2016-04-18,business,http://www.nytimes.com/2016/04/18/business/ama...,SEATTLE — Amazon is introducing new options to...,introduce option subscribe membership service ...,"[introduce, option, subscribe, membership, ser..."
8831,8831,2016-04-18,business,http://www.nytimes.com/2016/04/18/business/med...,WHEN Seventh Generation began selling environm...,begin sell environmentally sustainable product...,"[begin, sell, environmentally, sustainable, pr..."
8832,8832,2016-04-18,business,http://www.nytimes.com/2016/04/18/business/dea...,WASHINGTON — Executives of the International M...,executive mutter darkly tap phone greek govern...,"[executive, mutter, darkly, tap, phone, greek,..."
8833,8833,2016-04-18,business,http://www.nytimes.com/2016/04/18/business/med...,"Earlier this month, a couple of inventive youn...",earlier month couple inventive young getter ti...,"[earlier, month, couple, inventive, young, get..."
8834,8834,2016-04-18,business,http://www.nytimes.com/2016/04/18/business/med...,The business of online news has never been for...,business online news forgive recent week simme...,"[business, online, news, forgive, recent, week..."
8835,8835,2016-04-18,world,http://www.nytimes.com/2016/04/18/world/americ...,When Angela Collins and her partner wanted to ...,partner child reach sperm bank look potential ...,"[partner, child, reach, sperm, bank, look, pot..."


In [11]:
# The URL section tags give a rough idea of which topics could exist in the data set
# (listed in order of first appearance)
data['URL Topic'].unique().tolist()

['arts',
 'us',
 'sports',
 'business',
 'world',
 'nyregion',
 'health',
 'nytnow',
 't-magazine',
 'insider',
 'magazine',
 'dining',
 'science',
 'pageoneplus',
 'movies',
 'books',
 'technology',
 'theater',
 'fashion',
 'upshot',
 'your-money',
 'realestate',
 'travel',
 'automobiles',
 'style',
 'jobs',
 'universal',
 'education']

In [12]:
# Bucket the articles into three time frames: up to April, May, and June
published_on = data['Published Date']

apr_data = data[published_on <= '2016-04-30']
may_data = data[(published_on > '2016-04-30') & (published_on <= '2016-05-31')]
jun_data = data[(published_on > '2016-05-31') & (published_on <= '2016-06-30')]

num_apr_articles, num_may_articles, num_jun_articles = (
    len(apr_data), len(may_data), len(jun_data)
)

print("Number of articles published upto April : ", num_apr_articles)
print("Number of articles published in May : ", num_may_articles)
print("Number of articles published in June : ", num_jun_articles)

Number of articles published upto April :  1531
Number of articles published in May :  3730
Number of articles published in June :  3621


In [15]:
# Setting time frames for the DTM model: the number of articles in each consecutive
# period (up to April, May, June), in the same order as the date-sorted articles.
time_slice = [num_apr_articles, num_may_articles, num_jun_articles]

# Bag-of-words corpus (with its dictionary of word indices) over pre-tokenized texts
class DTMcorpus(corpora.textcorpus.TextCorpus):
    """TextCorpus whose input is used directly as the collection of texts."""

    def __len__(self):
        """Return the length of the stored input."""
        return len(self.input)

    def get_texts(self):
        """Return the stored input unchanged."""
        return self.input

# Detect bigram phrases and build the training corpus from the transformed token lists
corpus_tokens = data['Tokens'].tolist()

bigram = gensim.models.Phrases(corpus_tokens, min_count=5, threshold=100)

train_texts = []
for article_tokens in corpus_tokens:
    train_texts.append(bigram[article_tokens])

train_corpus = DTMcorpus(train_texts)


In [None]:
# Training the DTM model for different topic values and saving each model to disk

# Make sure the output directory exists before training starts; otherwise the
# save at the end of the first (long-running) iteration fails and the model is lost.
os.makedirs("models", exist_ok=True)

for i in [7, 10, 12, 15, 17, 20]:
    print("*******")
    print("For num topic = ", i)

    st = datetime.now()
    dtm_model = DtmModel(dtm_path, train_corpus, time_slice, num_topics=i,
                         id2word=train_corpus.dictionary, initialize_lda=True)
    et = datetime.now()
    print("Time taken : ", str(et-st))

    # One file per topic count; the coherence cell loads the models back by this name
    file_name = "models/lda_all_train_set_defd2v_" + str(i) + "t_model"
    dtm_model.save(file_name)


In [16]:
# Empty results table: the topic count, then one UMass and one C_V column per time slice
columns = ['Num_Topics'] + [
    '%s_TimeSlice%d' % (measure, slice_idx)
    for measure in ('UMass', 'C_V')
    for slice_idx in range(3)
]

res_df = pd.DataFrame(columns=columns)

In [18]:
# Load the built models and check the coherence values for each topic size

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append no longer exists in pandas 2.x, and rebuilding the table from
# scratch means re-running this cell cannot pile up duplicate rows.
coherence_rows = []

for i in [7, 10, 12, 15, 17, 20]:
    print("*******")
    print("For num topic = ", i)

    file_name = "models/lda_all_train_set_defd2v_" + str(i) + "t_model"

    # Load the model from the disk
    dtm_model = DtmModel.load(file_name)

    # Coherence on training data, one u_mass and one c_v value per time slice
    umass_scores = []
    cv_scores = []
    for t in range(len(time_slice)):
        # The topics of a time slice are the same for both measures: compute them once
        topics_wrapper = dtm_model.dtm_coherence(time=t)

        umass_model = CoherenceModel(topics=topics_wrapper, corpus=train_corpus,
                                     dictionary=train_corpus.dictionary, coherence='u_mass')
        umass_scores.append(umass_model.get_coherence())

        cv_model = CoherenceModel(topics=topics_wrapper, texts=train_texts,
                                  dictionary=train_corpus.dictionary, coherence='c_v')
        cv_scores.append(cv_model.get_coherence())

    # Same column order as `columns`: topic count, all u_mass values, all c_v values
    df_params = [i] + umass_scores + cv_scores
    coherence_rows.append(dict(zip(columns, df_params)))

# Num_Topics stays an integer column here (row-wise append used to upcast it to float)
res_df = pd.DataFrame(coherence_rows, columns=columns)

*******
For num topic =  7
*******
For num topic =  10
*******
For num topic =  12
*******
For num topic =  15
*******
For num topic =  17
*******
For num topic =  20


### Comparing coherence values

In [19]:
# One row per candidate topic count; columns hold the u_mass and c_v coherence per time slice
res_df

Unnamed: 0,Num_Topics,UMass_TimeSlice0,UMass_TimeSlice1,UMass_TimeSlice2,C_V_TimeSlice0,C_V_TimeSlice1,C_V_TimeSlice2
0,7.0,-1.120907,-1.103981,-1.152554,0.483971,0.480354,0.489685
1,10.0,-1.293016,-1.313774,-1.326166,0.514217,0.516964,0.520814
2,12.0,-1.385266,-1.37659,-1.38117,0.55377,0.549746,0.554894
3,15.0,-1.434529,-1.442001,-1.439955,0.558677,0.559173,0.557121
4,17.0,-1.425454,-1.406204,-1.401271,0.558658,0.555011,0.55172
5,20.0,-1.656916,-1.652923,-1.6385,0.553621,0.554234,0.552368


In [20]:
# Persist the coherence table; with the default settings the row index is written
# as an unnamed first column of the CSV.
res_df.to_csv('Coherence_values.csv')

In [35]:
# Based on the above coherence table, load the best model, example shown below
# (this example loads the model saved for 12 topics)
file_name = "models/lda_all_train_set_defd2v_12t_model"
dtm_model = DtmModel.load(file_name)

In [36]:
# Visualization inputs for time slice 2 (the last of the three periods)
vis = dtm_model.dtm_vis(time=2, corpus=train_corpus)

# Name the positional results by the pyLDAvis argument each one feeds
doc_topic, topic_term, doc_lengths, term_frequency, vocab = (
    vis[0], vis[1], vis[2], vis[3], vis[4]
)

vis_wrapper = pyLDAvis.prepare(
    topic_term_dists=topic_term,
    doc_topic_dists=doc_topic,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency,
)
pyLDAvis.display(vis_wrapper)