In [1]:
import numpy as np
import pandas as pd
import jieba as jb
import math
import operator

In [2]:
data = pd.read_json('twitter_big_long.json', lines=True)
stopwords = open('stopword.txt', mode='r', encoding='utf-8').read().split('\n')
data.head()

Unnamed: 0,text,cluster
0,octob inform camp refuge migrant calai known j...,0
1,catalyst trust major initi join iie creat plat...,0
2,woman candid want improv chanc elect grab seat...,0
3,octob argu loss octob local gyumri vanadzor se...,0
4,term servic end user licens agreement agreemen...,0


In [3]:
data.describe

<bound method NDFrame.describe of                                                    text  cluster
0     octob inform camp refuge migrant calai known j...        0
1     catalyst trust major initi join iie creat plat...        0
2     woman candid want improv chanc elect grab seat...        0
3     octob argu loss octob local gyumri vanadzor se...        0
4     term servic end user licens agreement agreemen...        0
...                                                 ...      ...
1495  submit ingrid carlqvist gateston institut perp...        5
1496  ahead major solidar march london weekend bob d...        5
1497  poster mock child refuge displai wall commun k...        5
1498  jame joyc rescu migrant nautic mile tripoli li...        5
1499  bahn bildet vier der lassen sich demnach zum e...        5

[1500 rows x 2 columns]>

In [4]:
data['cluster'].unique()

array([0, 2, 5], dtype=int64)

In [5]:
data_words = data['text'].apply(lambda x:str(x).split())
data_words.head()
# print(type(data_words[0]))

0    [octob, inform, camp, refuge, migrant, calai, ...
1    [catalyst, trust, major, initi, join, iie, cre...
2    [woman, candid, want, improv, chanc, elect, gr...
3    [octob, argu, loss, octob, local, gyumri, vana...
4    [term, servic, end, user, licens, agreement, a...
Name: text, dtype: object

In [6]:
def word_fre(X):
    n = X.shape[0]
    data_set = []
    for i in range(n):
        words = X.values[i]
        words_set = {}
        for word in words:
            if word not in stopwords: # 去停用词
                data_set.append(word)
                if word in words_set.keys():
                    words_set[word] += 1
                else:
                    words_set[word] = 1
        X.values[i] = words_set
    return X, set(data_set)
        # print(type(X.values[i]))

In [7]:
data_fre, data_set= word_fre(data_words)
data_fre.head()

0    {'octob': 2, 'inform': 3, 'camp': 2, 'refuge':...
1    {'catalyst': 7, 'trust': 6, 'major': 1, 'initi...
2    {'woman': 2, 'candid': 2, 'improv': 1, 'chanc'...
3    {'octob': 2, 'argu': 1, 'loss': 1, 'local': 1,...
4    {'term': 22, 'servic': 20, 'end': 1, 'user': 2...
Name: text, dtype: object

In [8]:
# 计算每个词的idf
def idf(data_set, X):
    data_idf = {i: 0 for i in data_set}
    for i in data_set:
        for j in range(X.shape[0]):
            if i in X.values[j]:
                data_idf[i] += 1
        data_idf[i] = math.log(X.shape[0] / (data_idf[i] + 1))
    return data_idf

In [9]:
# 计算文档中每个词的词频tf
def tf(X, data_fre):
    for i in range(X.shape[0]):
        n = len(X.values[i])
        for j in data_fre.values[i]:
            data_fre.values[i][j] = data_fre.values[i][j] / n
    return data_fre

In [10]:
data_tf = tf(data_words, data_fre)
print(data_tf.head())
data_idf = idf(data_set, data_words)

0    {'octob': 0.012738853503184714, 'inform': 0.01...
1    {'catalyst': 0.02364864864864865, 'trust': 0.0...
2    {'woman': 0.058823529411764705, 'candid': 0.05...
3    {'octob': 0.046511627906976744, 'argu': 0.0232...
4    {'term': 0.04573804573804574, 'servic': 0.0415...
Name: text, dtype: object


In [11]:
# 计算tf * idf
def tf_idf(x_tf, x_idf):
    n = x_tf.shape[0]
    for i in range(n):
        for word in x_tf.values[i]:
            x_tf.values[i][word] = x_tf.values[i][word] * x_idf[word]
        sorted(x_tf.values[i].items(), key=operator.itemgetter(1),reverse=True) # 按照tf*idf的值降序排列
    return x_tf

In [12]:
data_tf_idf = tf_idf(data_tf, data_idf)

In [13]:
data_tf_idf

0       {'octob': 0.03155335643488947, 'inform': 0.034...
1       {'catalyst': 0.13057508927376935, 'trust': 0.0...
2       {'woman': 0.12092500088603059, 'candid': 0.151...
3       {'octob': 0.11520644093668946, 'argu': 0.04879...
4       {'term': 0.09308175886260968, 'servic': 0.0740...
                              ...                        
1495    {'submit': 0.008742177918843358, 'ingrid': 0.0...
1496    {'ahead': 0.012137590890147688, 'major': 0.014...
1497    {'poster': 0.08355805229024695, 'mock': 0.0440...
1498    {'jame': 0.17283070808965334, 'joyc': 0.241022...
1499    {'bahn': 0.3478085617551021, 'bildet': 0.04226...
Name: text, Length: 1500, dtype: object

In [14]:
weight = [] # 权重矩阵
word = []   # 文本矩阵
for i in range(data_tf_idf.shape[0]):
    w1 = []
    w2 = []
    # 取前10个为特征项
    c = 0
    for j in data_tf_idf.values[i]:
        if c == 10:
            break
        w1.append(data_tf_idf.values[i][j])
        w2.append(j)
        c += 1
    weight.append(w1)
    word.append(w2)
    
# print(weight)

In [15]:
print(type(weight))

<class 'list'>


In [16]:
# 计算向量的距离
def getEuclidean(point1, point2):
    dimension = len(point1)
    dist = 0.0
    for i in range(dimension):
        dist += (point1[i] - point2[i]) ** 2
    return dist

In [17]:
# k_means 实现
import random
def k_means(dataset, k, n_iter):
    # 初始化k个质心向量
    index = random.sample(list(range(len(dataset))), k) # 随机选择 k 个下标
    vectors = []
    for i in index:
        vectors.append(dataset[i])
    
    # 初始化标签
    labels = []
    for i in range(len(dataset)):
        labels.append(-1)
    
    C = []
    
    # 重复迭代
    while n_iter > 0:
        C = []
        for i in range(k):
            C.append([])
            
        for id, val in enumerate(dataset): # 遍历所有的dataset，离质心最近的归为质心一类
            classlabel = -1
            min_dist = 1e6
            for i, point in enumerate(vectors):
                dist = getEuclidean(val, point)
                if dist < min_dist:
                    classlabel = i
                    min_dist = dist
            C[classlabel].append(val)
            labels[id] = classlabel
        
        # 重新计算质心
        for i, cluster in enumerate(C):
            clusterheart = []
            dimension = len(dataset[0])
            for j in range(dimension):
                clusterheart.append(0)
            for item in cluster:
                for j, val in enumerate(item):
                    clusterheart[j] += val / len(cluster)
            vectors[i] = clusterheart
        
        n_iter -= 1
    return C, labels

In [18]:
C, C_labels = k_means(weight, 6, 300)

In [20]:
np.array(C_labels)

array([2, 0, 0, ..., 2, 0, 0])

In [21]:
# 进行k-means用sklearn
from sklearn.cluster import KMeans
clf = KMeans(n_clusters=6, max_iter=1000) # n_clusters 簇类个数， max_iter最大迭代数 
s = clf.fit(weight)
print(s)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1000,
       n_clusters=6, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)


In [22]:
clf.labels_

array([0, 1, 1, ..., 0, 5, 2])

In [23]:
labels_true = data['cluster'].to_list()
labels_pred = list(clf.labels_)

In [24]:
from sklearn import metrics
rand_index_score1 = metrics.adjusted_rand_score(labels_true, C_labels) # 实现函数k_means 的兰德指数
print(rand_index_score1)

0.0006469772183506995


In [25]:
rand_index_score2 = metrics.adjusted_rand_score(labels_true, labels_pred) # 调用 sklearn 的兰德指数
print(rand_index_score2)

0.0011942025428728605
