In [0]:
# Reference: http://www.lumenai.fr/blog/quick-review-on-text-clustering-and-text-similarity-approaches

In [11]:
import collections
import nltk
from nltk import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint


def word_tokenizer(text):
    """Tokenize ``text``, drop English stop words, and stem what remains.

    Args:
        text: The string to tokenize.

    Returns:
        A list of Porter-stemmed tokens, in their original order, with any
        token found in NLTK's English stop-word list removed. The stop-word
        comparison is case-sensitive and is applied before stemming.
    """
    # Build the stop-word set once per call: the original looked the list up
    # via stopwords.words('english') and scanned it linearly for every token.
    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in english_stop_words
    ]


def cluster_sentences(sentences, nb_of_clusters=5, random_state=None):
    """Cluster sentences with k-means over their TF-IDF representation.

    Args:
        sentences: Iterable of strings; each one is treated as a document.
        nb_of_clusters: Number of clusters requested from ``KMeans``.
        random_state: Optional seed forwarded to ``KMeans``. Pass an int to
            get the same cluster assignment on every run; the default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        A plain dict mapping each cluster label (int) to the list of indices
        into ``sentences`` that were assigned to that cluster.
    """
    # word_tokenizer already removes English stop words before stemming, so
    # no stop_words list is passed here. Handing the vectorizer the unstemmed
    # list next to a stemming tokenizer is what makes scikit-learn warn that
    # the stop words "may be inconsistent with your preprocessing".
    # NOTE: a non-stop-word whose stem happens to equal a stop word (e.g.
    # "owns" -> "own") is now kept instead of being silently dropped.
    # token_pattern=None states explicitly that the regex tokenizer is unused
    # because a custom tokenizer is supplied.
    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                       token_pattern=None,
                                       max_df=0.9,
                                       min_df=0.1,
                                       lowercase=True)
    # Build the TF-IDF matrix: one row per sentence.
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

    kmeans = KMeans(n_clusters=nb_of_clusters, random_state=random_state)
    kmeans.fit(tfidf_matrix)

    # Invert the per-sentence label array into label -> [sentence indices].
    clusters = collections.defaultdict(list)
    for i, label in enumerate(kmeans.labels_):
        # labels_ holds NumPy integers; store plain ints as keys.
        clusters[int(label)].append(i)

    return dict(clusters)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
if __name__ == "__main__":
    # Demo: cluster five short sentences into three groups and print the
    # members of each group.
    sentences = [
        "Nature is beautiful",
        "I like green apples",
        "We should protect the trees",
        "Fruit trees provide fruits",
        "Green apples are tasty",
    ]
    nclusters = 3
    clusters = cluster_sentences(sentences, nclusters)

    # `clusters` maps a cluster label to indices into `sentences`.
    for cluster in range(nclusters):
        print("cluster ", cluster, ":")
        for i, sentence in enumerate(clusters[cluster]):
            # `sentence` is an index, not the text itself.
            print("\tsentence ", i, ": ", sentences[sentence])

cluster  0 :
	sentence  0 :  Nature is beautiful
cluster  1 :
	sentence  0 :  I like green apples
	sentence  1 :  Green apples are tasty
cluster  2 :
	sentence  0 :  We should protect the trees
	sentence  1 :  Fruit trees provide fruits


  'stop_words.' % sorted(inconsistent))
