Adapted from the document-clustering tutorial at http://brandonrose.org/clustering

In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import csv
import pickle

In [2]:
# Load the pickled NYT article records.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# "allNYT.p" if it comes from a trusted source.
with open("allNYT.p", "rb") as pickle_file:  # context manager closes the handle
    df = pickle.load(pickle_file)


In [3]:
# Wrap the unpickled object in a DataFrame so its fields are addressable as columns
df = pd.DataFrame(data=df)

In [4]:
# Full article text, one entry per row of df
articles = df["full_text"]

In [5]:
# Headline column, row-aligned with `articles`
headlines = df["headline"]

In [29]:
# Remaining metadata columns carried through to the cluster summary frame
date = df["date"]
about = df["about"]

In [8]:

# NLTK's English stop-word list, kept in a variable called 'stopwords'
stopword_corpus = nltk.corpus.stopwords
stopwords = stopword_corpus.words('english')

In [11]:
# Boilerplate snippets (promo blurbs, newsletter text, print-edition footers)
# to be treated as noise. Note that these are whole phrases, not single tokens.
rubbish = [
    'New York City’s top public schools must become more diverse.',
    'Room for Debate asks whether shorefront homeowners should have to open their land to all comers.',
    '\n',
    '"Advertisement"',
    'Find out what you need to know about the 2016 presidential race today, and get politics news updates via Facebook, Twitter and the First Draft newsletter.',
    '\nLike it? Hate it? Have a tip for us? Send any and all feedback to firstdraft@nytimes.com.\n',
    "\n\t\t\tWe send out a newsletter around 7 a.m. eastern time each weekday that serves as a look-ahead to the day's political events and expected stories.\n\t\t\t",
    'Go to Home Page »',
    ' version of this article appears in print on',
    " with the headline: Ethanol Mandate Faces Growing Resistance, Even in Iowa .  Order Reprints| Today's Paper|Subscribe\n\n",
]

In [13]:
# Add each boilerplate string as its own stop-word entry.
# (append() would have inserted the whole list as one nested element, which
# can never match a token.)
stopwords.extend(rubbish)

In [14]:
from nltk.stem.snowball import SnowballStemmer

# Single shared English Snowball stemmer, used by tokenize_and_stem
stemmer = SnowballStemmer(language="english")

In [15]:
# Tokenizer + stemmer: returns the list of stems found in the text it is passed

def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return their stems as a list.

    The text is tokenized by sentence and then by word, so punctuation ends
    up as separate tokens. Any token without at least one ASCII letter
    (numbers, raw punctuation) is dropped; the survivors are passed through
    the module-level ``stemmer`` in their original order.
    """
    contains_letter = re.compile('[a-zA-Z]').search
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if contains_letter(word):
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Return the lowercased word tokens of ``text`` that contain a letter.

    Tokenization is by sentence, then by word, so punctuation becomes
    separate tokens; tokens with no ASCII letter (numeric tokens, raw
    punctuation) are filtered out. No stemming is applied.
    """
    lowered_words = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [word for word in lowered_words if re.search('[a-zA-Z]', word)]


In [16]:

# Build two flat vocabulary lists over every article: one of stems and one of
# plain lowercased tokens (flat lists, not one list per article).
totalvocab_stemmed = []
totalvocab_tokenized = []
for article_text in articles:
    # stems for this article, added to the running flat list
    totalvocab_stemmed += tokenize_and_stem(article_text)
    # matching unstemmed tokens for the same article
    totalvocab_tokenized += tokenize_only(article_text)

In [18]:
# Lookup table: index = stem, 'words' column = the lowercased token at the same position
vocab_frame = pd.DataFrame(data={'words': totalvocab_tokenized}, index=totalvocab_stemmed)
n_items = str(len(vocab_frame.index))
print('there are ' + n_items + ' items in vocab_frame')

there are 12244646 items in vocab_frame


In [19]:
# Peek at the first five stem -> token rows
print(vocab_frame.head(5))

         words
\n          \n
there    there
was        was
a            a
moment  moment


In [20]:

from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(articles) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 8min 7s, sys: 9.29 s, total: 8min 16s
Wall time: 8min 22s
(12944, 278)


In [21]:
# Vocabulary terms in column order of tfidf_matrix.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on old installs. Either way `terms` is a plain list.
try:
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
except AttributeError:
    terms = tfidf_vectorizer.get_feature_names()

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between documents (1 - similarity).
# The result is a dense n_docs x n_docs array, so memory grows quadratically
# with the number of articles.
dist = 1 - cosine_similarity(tfidf_matrix)

<function print>

In [23]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 8.83 s, sys: 160 ms, total: 8.99 s
Wall time: 9.02 s


In [25]:

# sklearn.externals.joblib was removed in scikit-learn 0.23; use the
# standalone joblib package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the fitted model, then reload it from disk so the cells below work from
# the persisted copy. To reuse a previously saved model without overwriting
# it, comment out the dump line.
joblib.dump(km, 'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [33]:
nyt = {
    'headline': headlines,
    'articles': articles,
    'cluster': clusters,
    'about': about,
    'date': date,
}

# headlines/articles/about/date are pandas Series. When a dict of Series is
# passed together with an explicit index, DataFrame aligns each Series to that
# index by label, so the values would be looked up against the cluster ids
# instead of being placed row by row. Converting to plain lists makes the
# construction positional. 'articles' is left out of the frame on purpose
# (it is not in `columns`).
frame = pd.DataFrame(
    {key: list(values) for key, values in nyt.items()},
    index=[clusters],
    columns=['headline', 'cluster', 'date', 'about'],
)

In [34]:
frame['cluster'].value_counts() #number of articles per cluster (clusters from 0 to 4)

2    3740
0    3558
3    3253
4    1845
1     548
Name: cluster, dtype: int64