# 使用 scikit-learn 进行 KMeans 文本聚类

参考资料：

https://blog.razrlele.com/p/1614

In [1]:
import jieba 
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.cluster import KMeans
 
def jieba_tokenize(text):
    """Split ``text`` into words with jieba and return them as a list.

    Passed to ``TfidfVectorizer`` as its ``tokenizer`` callable, so it is
    called once per document with that document's raw string.
    """
    tokens = jieba.lcut(text)
    return tokens

In [3]:
# TfidfVectorizer settings used here:
#   tokenizer:     callable that splits each document into tokens; the
#                  jieba-based jieba_tokenize is used for word segmentation.
#   token_pattern: set to None because the custom tokenizer replaces the
#                  regex-based splitting. Leaving the default in place makes
#                  scikit-learn warn that token_pattern will not be used.
#   lowercase:     whether to lowercase the text before tokenizing. Kept
#                  False, as recommended for Chinese text processing.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=jieba_tokenize,
    token_pattern=None,
    lowercase=False,
)

In [4]:
# Corpus of short texts to be clustered.
text_list = [
    "今天天气真好啊啊啊啊",
    "小明上了清华大学",
    "我今天拿到了 Google 的 Offer ",
    "清华大学在自然语言处理方面真厉害",
]


In [5]:
# Fit the vectorizer on the corpus and transform the same texts into their
# TF-IDF representation in a single call.
tfidf_matrix = tfidf_vectorizer.fit_transform(text_list)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/n5/97d6sf9j1m5gs27gb7zl153c0000gn/T/jieba.cache
Loading model cost 1.194 seconds.
Prefix dict has been built succesfully.


In [6]:
# Number of clusters (K) to partition the texts into.
num_clusters = 3

# n_jobs is intentionally not passed: KMeans deprecated it in scikit-learn
# 0.23 and removed it in 1.0, where passing it raises TypeError.
# random_state fixes the centroid initialisation so that re-running the
# notebook reproduces the same cluster assignment.
km_cluster = KMeans(
    n_clusters=num_clusters,
    init='k-means++',
    n_init=40,
    max_iter=300,
    random_state=42,
)

In [8]:
# Notes on the KMeans parameters of km_cluster:
#   n_clusters: the value of K.
#   max_iter:   maximum number of iterations for a single initialisation.
#   n_init:     number of times the initial centroids are re-chosen.
#   init:       algorithm used to choose the initial centroids.
#   n_jobs:     number of processes; -1 means use every CPU. This parameter
#               only exists in older scikit-learn (deprecated in 0.23,
#               removed in 1.0). Where it exists, one initialisation always
#               runs in a single process; parallelism applies only across
#               different initialisations. For example, with n_init=10 and
#               n_jobs=40 on a 20-CPU server that could run 40 processes,
#               only 10 processes are actually started.

# Fit the model and return the cluster index assigned to each text.
result = km_cluster.fit_predict(tfidf_matrix)

In [10]:
# Show the cluster index assigned to each text.
result

array([0, 1, 2, 1], dtype=int32)

In [None]:
# Demonstration only: each call below refits the estimator it is called on.
tfidf_matrix = tfidf_vectorizer.fit_transform(text_list)
# The line above is equivalent to the two lines below.
tfidf_vectorizer.fit(text_list)
tfidf_matrix = tfidf_vectorizer.transform(text_list)

result = km_cluster.fit_predict(tfidf_matrix)
# The line above is equivalent to the two lines below.
km_cluster.fit(tfidf_matrix)
result = km_cluster.predict(tfidf_matrix)

## 持久化

In [None]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; joblib is imported directly instead. The fallback keeps very old
# installations working.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted vectorizer and clustering model (paths are relative to
# the current working directory).
joblib.dump(tfidf_vectorizer, 'tfidf_fit_result.pkl')
joblib.dump(km_cluster, 'km_cluster_fit_result.pkl')

# A later run can load them directly instead of refitting.
# NOTE: joblib files are pickle-based, so only load files from a trusted
# source. The vectorizer stores its tokenizer by reference, so a function
# named jieba_tokenize must be defined before loading in a fresh process.
tfidf_vectorizer = joblib.load('tfidf_fit_result.pkl')
km_cluster = joblib.load('km_cluster_fit_result.pkl')