In [1]:

# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# logging for gensim (set to INFO)
import logging
# Configure the root logger: every record at INFO or above is emitted as
# "<timestamp> : <level name> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
import json
import requests
import bs4
import json
import gensim
from collections import defaultdict
import time 
import pandas as pd


In [2]:
# Load the article records from NYTjson.json into `text`.
# The file is decoded explicitly as UTF-8 (the JSON standard encoding) so the
# result does not depend on the platform's default locale; the article text
# contains non-ASCII punctuation such as curly quotes.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('NYTjson.json', encoding='utf-8') as json_data:
    text = json.load(json_data)

In [3]:
# Build the article frame and lower-case its two free-text columns in a single
# chained expression, so `df` is produced directly from `text` in one step.
df = pd.DataFrame(text).assign(
    full_text=lambda frame: frame['full_text'].str.lower(),
    headline=lambda frame: frame['headline'].str.lower(),
)

In [4]:
# Preview only the first rows; displaying the whole frame bloats the notebook
# output without adding information.
df.head(10)

Unnamed: 0,about,date,full_text,headline
0,Trump,1989-12-31T00,""" there was a moment in dec...","ideas & trends; a decade ends, and the nouvell..."
1,Trump,1989-12-31T00,""" dear david, when you and ...","""how'd i do?"""
10,Trump,1989-12-20T00,""" the trump shuttle, the ai...","company news; trump shuttle, pilots reach pact"
100,Trump,1989-09-26T00,""" new york city's business ...",dinkins gaining support among business executives
1000,Trump,1998-12-27T00,"old malls, shopping center...",commercial property; old malls demolished to m...
10000,Clinton,2014-09-24T15,", 'the premiere of a short film by the directo...",kathryn bigelow joins new york film festival l...
10001,Clinton,2014-09-24T14,", 'i can’t sit here today and say if we had do...",clinton not saying i told you so
10002,Clinton,2014-09-24T12,", 'charitable commitments have been made. impo...","at clintons’ event, a really close watch on re..."
10003,Clinton,2014-09-24T07,", 'obama on a war footing at the united nation...",today in politics
10004,Clinton,2014-09-23T17,", 'as many democrats attack companies that tak...",bill clinton on inversions: ‘this is their money’


In [5]:

# Bag-of-words vectorizer over the article bodies:
#   * tokens are runs of two or more lowercase ASCII letters,
#   * unigrams and bigrams are both counted,
#   * English stop words are dropped,
#   * a term is kept only if it occurs in at least 2 documents (min_df)
#     and in at most 2% of all documents (max_df).
count_vectorizer = CountVectorizer(
    analyzer='word',
    token_pattern='\\b[a-z][a-z]+\\b',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.02,
)
# Learn the vocabulary from the lower-cased full text (fit returns the vectorizer).
count_vectorizer.fit(df.full_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.02, max_features=None, min_df=2,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
# Count n-grams per article, then flip the matrix so that rows are terms and
# columns are documents.
ng_vecs = count_vectorizer.transform(df.full_text).T
ng_vecs.shape

(753365, 12412)

In [7]:
# Wrap the sparse count matrix as a gensim corpus; each column of ng_vecs is
# treated as one document (the library default, spelled out here).
corpus = matutils.Sparse2Corpus(ng_vecs, documents_columns=True)

In [9]:

# Invert the vectorizer's term -> index vocabulary into an index -> term lookup.
id2word = {index: term for term, index in count_vectorizer.vocabulary_.items()}

In [10]:
# Number of entries in the index -> term mapping (i.e. the vocabulary size).
len(id2word)

753365

In [11]:
# Train a 10-topic LDA model, making 10 passes over the corpus.
# LDA training is stochastic; a fixed random_state makes the fitted topics
# repeatable when the notebook is re-run from a fresh kernel.
lda = models.LdaModel(
    corpus,
    id2word=id2word,
    num_topics=10,
    passes=10,
    random_state=42,
)

In [12]:
# Show the 5 most significant terms for each of the 10 topics.
lda.print_topics(num_topics=10, num_words=5)

[(0,
  '0.003*mr starr + 0.003*palestinian + 0.002*palestinians + 0.002*http + 0.002*netanyahu'),
 (1,
  '0.001*gail + 0.001*hedge + 0.001*brooks + 0.001*griffin + 0.001*healthcare'),
 (2, '0.003*tyson + 0.002*renta + 0.002*la renta + 0.001*mr la + 0.001*weiss'),
 (3,
  '0.003*iranian + 0.003*sanctions + 0.002*taliban + 0.002*panetta + 0.002*pakistani'),
 (4,
  '0.002*fares + 0.002*mr bloomberg + 0.001*york post + 0.001*mr koch + 0.001*shuttle'),
 (5,
  '0.002*mr cuomo + 0.002*pirro + 0.001*myanmar + 0.001*ms pirro + 0.001*mr pataki'),
 (6,
  '0.001*mcauliffe + 0.001*midterm + 0.001*mr edwards + 0.001*gillibrand + 0.001*mr christie'),
 (7,
  '0.001*tamil + 0.001*secret service + 0.001*wedding + 0.000*oprah + 0.000*lanka'),
 (8,
  '0.001*hagel + 0.001*confirmation + 0.001*emissions + 0.001*sotomayor + 0.001*mr hagel'),
 (9,
  '0.001*random + 0.001*knopf + 0.001*random house + 0.001*duchess + 0.001*org')]