In [None]:
# Mount Google Drive at /content/drive so the Drive paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
def loadData(filename, N):
    """Read a JSON object from ``filename`` and return its first ``N`` entries.

    Parameters
    ----------
    filename : str
        Path of a UTF-8 encoded JSON file whose top-level value is an object
        (it is iterated with ``.items()``).
    N : int
        Maximum number of entries to keep, counted in the order the parsed
        object yields them.

    Returns
    -------
    tuple of (list, list)
        ``(sentences, vectors)``: the keys and the corresponding values of
        the kept entries, as two parallel lists.
    """
    with open(filename, encoding="utf-8") as handle:
        mapping = json.load(handle)

    sentences, vectors = [], []
    # Positions are 1-based so the loop stops right after the N-th entry.
    for position, (key, value) in enumerate(mapping.items(), start=1):
        if position > N:
            break
        sentences.append(key)
        vectors.append(value)
    return sentences, vectors

In [None]:
def plotData(data, model=None):
    """Plot the first two columns of ``data``: raw, then coloured by cluster.

    Two figures are drawn. The first is a plain scatter of every point. The
    second is a 10x8 figure in which the members of each cluster and that
    cluster's centre share one colour from the Spectral colormap.

    Parameters
    ----------
    data : array-like with at least two columns
        The points to draw. Only columns 0 and 1 are used. The input is
        converted with ``np.asarray`` so a DataFrame works as well as an
        ndarray.
    model : object, optional
        Fitted clustering estimator exposing ``labels_`` and
        ``cluster_centers_``. When omitted, the notebook-level ``kmeans``
        variable is used, which keeps existing ``plotData(data)`` calls
        working.

    Returns
    -------
    None
        The figures are displayed with ``plt.show()``.
    """
    if model is None:
        # Backward-compatible fallback to the notebook-level model.
        model = kmeans

    points = np.asarray(data)
    labels = model.labels_
    centers = model.cluster_centers_

    # Raw points on their own explicit figure rather than an implicit one.
    _, raw_ax = plt.subplots()
    raw_ax.scatter(points[:, 0], points[:, 1], marker='.')

    fig, ax = plt.subplots(figsize=(10, 8))
    # One colour per distinct label actually present in the fit.
    colors = plt.cm.Spectral(np.linspace(0, 1, len(set(labels))))
    for k, (cluster_center, col) in enumerate(zip(centers, colors)):
        my_members = (labels == k)
        ax.plot(points[my_members, 0], points[my_members, 1], 'w',
                markerfacecolor=col, marker='.', markersize=10)
        ax.plot(cluster_center[0], cluster_center[1], 'o',
                markerfacecolor=col, markeredgecolor='k', markersize=6)
    plt.show()

In [None]:
# def savePredict(sentences, pre_label, file_out):
#     result = {}
#     for i in range(0, len(sentences)):
#         result[sentences[i]] = int(pre_label[i])

#     with open(file_out, "w", encoding="utf-8") as fout:
#         json.dump(result, fout, ensure_ascii=False)

In [None]:
def savePredict(sentences, pre_label, file_out):
    """Group ``sentences`` by predicted label and write the groups as JSON.

    Parameters
    ----------
    sentences : sequence
        Items to group; ``sentences[i]`` belongs to ``pre_label[i]``.
    pre_label : sequence
        One label per sentence. Each label is converted with ``int()``
        before being used as a dictionary key.
    file_out : str
        Destination path. The file is written as UTF-8 with
        ``ensure_ascii=False``, so non-ASCII text is stored unescaped.

    Returns
    -------
    None
    """
    grouped = {}
    for position in range(len(sentences)):
        label = int(pre_label[position])
        # setdefault creates the label's list the first time it is seen.
        grouped.setdefault(label, []).append(sentences[position])

    with open(file_out, "w", encoding="utf-8") as fout:
        json.dump(grouped, fout, ensure_ascii=False)

In [None]:
# Input JSON on the mounted Drive; loadData keeps at most the first 100000 entries.
filename = "/content/drive/MyDrive/Colab_Notebooks/Mining of massive datasets/FinalExam/fasttext_vec100.json"
sentences, vectors = loadData(filename, 100000)

In [None]:
# Put the loaded vectors into a DataFrame, one row per loaded entry.
data = pd.DataFrame(vectors)

In [None]:
# Sanity check: (rows, columns) of the loaded data.
data.shape

(100000, 100)

In [None]:
# Standardise every column with StandardScaler's default settings.
scaler = StandardScaler()
data_std = scaler.fit_transform(data)

In [None]:
# Reduce the standardised data to 50 principal components.
pca = PCA(n_components = 50)
data_pca = pca.fit_transform(data_std)

In [None]:
# Sanity check: (rows, columns) after the PCA projection.
data_pca.shape

(100000, 50)

In [None]:
# Fit k-means with 15 clusters on the PCA output; random_state=0 fixes the seed.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn versions -- confirm and consider pinning it for reproducibility.
kmeans = KMeans(n_clusters=15, random_state=0).fit(data_pca)

In [None]:
# Show the fitted cluster centres (one row per cluster, in PCA space).
print('Centers found by scikit-learn:')
print(kmeans.cluster_centers_)
# Label every row of the same data the model was fitted on.
pred_label = kmeans.predict(data_pca)
# plotData(data_pca)

Centers found by scikit-learn:
[[-3.00160676e+00  8.47758901e-01 -1.97548141e+00  1.38717906e-01
  -4.23867340e-01 -2.90750224e-01  5.43811803e-01 -1.62619787e-01
  -3.32098265e-02 -5.66832257e-02  2.84989926e-01  2.40232401e-01
   5.56222284e-02 -1.28108030e-01 -2.18290459e-01  2.39354861e-01
   1.02163542e-01  5.45585564e-02  2.11164700e-01  4.16033390e-02
   2.07407649e-01  3.05670664e-02  4.40970018e-02 -1.24039082e-01
   4.46770624e-02 -9.26679659e-02  8.21382324e-02  3.22830983e-02
   3.48171644e-02 -2.90911894e-02  2.62361492e-02  4.89892883e-02
  -2.23184670e-02  1.31064175e-01 -2.88816747e-02  3.44055917e-02
   4.17847136e-04 -3.54981924e-02 -2.00477136e-02  1.24854383e-02
   1.47770297e-02 -4.27559463e-02 -2.55570566e-02 -1.05337095e-01
   8.95921187e-02  8.88428972e-03  1.46716009e-02  5.29874246e-02
  -2.49174710e-02 -4.78708742e-02]
 [-1.97111160e+00  3.45361180e+00  1.00625225e+00 -7.32117649e-01
  -8.35000284e-01  2.09907736e-01 -7.18270043e-01 -1.75457774e-01
  -5.33869

In [None]:
# Write the label -> sentences grouping as JSON on the mounted Drive.
file_out = "/content/drive/MyDrive/Colab_Notebooks/Mining of massive datasets/FinalExam/Label_pre_Kmeans_100000_50d.json"
savePredict(sentences, pred_label, file_out)