In [190]:
from sklearn.datasets import load_files 

import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

from __future__ import print_function

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.externals import joblib
from sklearn.cluster import KMeans

In [191]:
# preparar funciones de procesamiento de texto

def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [192]:
dataset = load_files('data/txt_sentoken', shuffle=False)

synopses = []
N = len(dataset['data'])

for i in range(N):
    synopses.append(dataset['data'][i].decode('utf-8').strip())
    

In [193]:
#synopses[0]

In [194]:
totalvocab_tokenized = []

for i in synopses:
    allwords_tokenized = tokenize_and_stem(i)
    totalvocab_tokenized.extend(allwords_tokenized)
    
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_tokenized)

In [195]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(synopses) #fit the vectorizer to synopses
print(tfidf_matrix.shape)

terms = tfidf_vectorizer.get_feature_names()

(2000, 165)


In [196]:
#print(terms)

In [197]:
num_clusters = 5

km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [198]:
print (clusters)

# Recuento del número de elementos en cada cluster
for i in range(num_clusters):
    print ('El cluster %i tiene %i elementos' % (i, clusters.count(i)))

[0, 0, 0, 4, 2, 0, 0, 4, 3, 0, 3, 0, 1, 0, 0, 2, 0, 0, 4, 0, 0, 0, 1, 0, 2, 0, 3, 1, 0, 0, 2, 0, 1, 2, 3, 2, 2, 0, 0, 2, 3, 1, 3, 4, 4, 0, 1, 0, 1, 3, 4, 2, 3, 0, 4, 0, 0, 0, 0, 3, 0, 0, 2, 0, 1, 0, 1, 0, 4, 2, 0, 0, 0, 2, 0, 1, 2, 4, 2, 0, 2, 2, 3, 3, 2, 0, 2, 0, 2, 3, 3, 2, 0, 1, 3, 4, 4, 3, 0, 0, 0, 3, 0, 4, 3, 2, 2, 0, 2, 2, 0, 2, 1, 0, 0, 1, 1, 2, 2, 2, 3, 4, 4, 0, 0, 0, 4, 0, 3, 4, 0, 0, 2, 0, 0, 2, 2, 0, 0, 2, 0, 1, 4, 1, 1, 2, 3, 3, 0, 4, 1, 2, 0, 4, 1, 0, 4, 3, 1, 4, 2, 1, 2, 0, 4, 4, 0, 4, 4, 0, 0, 3, 0, 0, 1, 0, 1, 3, 3, 2, 0, 2, 1, 2, 0, 0, 2, 2, 3, 0, 3, 0, 0, 2, 2, 2, 4, 3, 3, 2, 2, 3, 0, 4, 0, 0, 3, 0, 2, 1, 1, 4, 0, 3, 1, 2, 0, 4, 4, 2, 2, 2, 1, 1, 0, 0, 4, 0, 0, 4, 0, 1, 2, 3, 0, 2, 4, 2, 0, 2, 2, 0, 0, 3, 0, 0, 4, 2, 1, 0, 3, 0, 0, 4, 4, 0, 0, 4, 4, 1, 1, 3, 3, 4, 2, 0, 1, 2, 1, 0, 4, 0, 4, 3, 0, 4, 0, 0, 2, 0, 0, 0, 0, 4, 0, 2, 0, 0, 0, 0, 4, 3, 0, 1, 3, 0, 1, 1, 4, 2, 0, 2, 0, 3, 0, 0, 0, 0, 2, 4, 0, 4, 4, 1, 1, 2, 3, 3, 0, 3, 1, 3, 0, 0, 3, 2, 0, 2, 3, 3, 0, 0, 0, 

In [199]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_tokenized)

terms = tfidf_vectorizer.get_feature_names()

dist = 1 - cosine_similarity(tfidf_matrix)

num_clusters = 10
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()


In [200]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]     
        
for i in range(num_clusters):
    print("*** Cluster %d:" % i, end='\n\n')
    
    print("WORDS /// ", end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=' / ')
    print('\n\n')


Top terms per cluster:

*** Cluster 0:

WORDS ///  love /  charact /  year /  like /  end /  best / 


*** Cluster 1:

WORDS ///  life /  world /  live /  charact /  man /  like / 


*** Cluster 2:

WORDS ///  origin /  star /  effect /  time /  like /  charact / 


*** Cluster 3:

WORDS ///  bad /  guy /  just /  like /  good /  plot / 


*** Cluster 4:

WORDS ///  stori /  charact /  doe /  make /  like /  just / 


*** Cluster 5:

WORDS ///  charact /  play /  perform /  scene /  role /  veri / 


*** Cluster 6:

WORDS ///  like /  just /  doe /  thing /  make /  realli / 


*** Cluster 7:

WORDS ///  john /  like /  stori /  make /  new /  time / 


*** Cluster 8:

WORDS ///  action /  scene /  like /  charact /  plot /  make / 


*** Cluster 9:

WORDS ///  comedi /  funni /  laugh /  like /  just /  make / 


