# ARTICLE CLUSTERING

In [6]:
import warnings
warnings.filterwarnings("ignore")

import os
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import sklearn.metrics.cluster as metrics
import numpy as np
import pickle

def cluster_articles(data):
    """Cluster article vectors with k-means, before and after PCA reduction.

    K-means with 10 clusters is fitted twice using identical settings: once
    on the raw vectors and once on the vectors projected onto their 10
    leading principal components. Both clusterings are then scored against
    the reference labels.

    Args:
        data: Mapping with a ``"vectors"`` entry (one feature vector per
            article) and a ``"group"`` entry (one reference label per
            article). Both entries are converted with ``np.asarray``.
            NOTE(review): the ``_100`` suffix on the result keys suggests
            the raw vectors are 100-dimensional -- confirm against the data.

    Returns:
        dict with the following keys:

        * ``"nobs_100"`` / ``"nobs_10"``: list of 10 ints, the number of
          articles assigned to each cluster label (index = label) for the
          raw and the PCA-reduced clustering respectively.
        * ``"pca_explained"``: ``explained_variance_[0]`` of the fitted PCA
          as a Python float, i.e. the variance along the first component
          (the absolute value, not ``explained_variance_ratio_``).
        * ``"cs_100"`` / ``"cs_10"``: completeness score of each clustering
          with ``data["group"]`` as the true labels.
        * ``"vms_100"`` / ``"vms_10"``: V-measure score of each clustering
          with ``data["group"]`` as the true labels.
    """
    n_clusters = 10

    X = np.asarray(data["vectors"])
    y = np.asarray(data["group"])

    # Same hyper-parameters for both fits so the two runs are comparable.
    km_args = dict(
        n_clusters=n_clusters,
        random_state=2,
        tol=0.05,
        max_iter=50,
    )

    km_full = KMeans(**km_args).fit(X)
    labels_100 = km_full.labels_
    # minlength keeps one slot per cluster even when the highest-numbered
    # cluster(s) end up with no members; plain bincount would silently
    # return a shorter list in that case.
    nobs_100 = np.bincount(labels_100, minlength=n_clusters).tolist()

    pca = PCA(n_components=10, random_state=2)
    X_reduced = pca.fit_transform(X)
    pca_explained = float(pca.explained_variance_[0])

    km_red = KMeans(**km_args).fit(X_reduced)
    labels_10 = km_red.labels_
    nobs_10 = np.bincount(labels_10, minlength=n_clusters).tolist()

    # Argument order matters for completeness: (labels_true, labels_pred).
    cs_100 = metrics.completeness_score(y, labels_100)
    cs_10 = metrics.completeness_score(y, labels_10)
    vms_100 = metrics.v_measure_score(y, labels_100)
    vms_10 = metrics.v_measure_score(y, labels_10)

    return {
        "nobs_100": nobs_100,
        "nobs_10": nobs_10,
        "pca_explained": pca_explained,
        "cs_100": cs_100,
        "cs_10": cs_10,
        "vms_100": vms_100,
        "vms_10": vms_10,
    }

In [7]:
# Location of the pickled article dataset on the author's machine.
data_directory = r"C:\Users\sb013698\Desktop\ML Test\Datasets\Article Clustering"
documents_path = os.path.join(data_directory, "documents.p")

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(documents_path, "rb") as handle:
    data = pickle.load(handle)

# Run both clusterings and show the collected statistics.
result_dict = cluster_articles(data)
print(result_dict)

{'nobs_100': [586, 1132, 1680, 1152, 1156, 1486, 1357, 580, 1017, 1168], 'nobs_10': [1099, 1181, 1195, 1161, 1506, 1191, 630, 895, 713, 1743], 'pca_explained': 0.7096365690231323, 'cs_100': 0.6901578857723636, 'cs_10': 0.4801995751023586, 'vms_100': 0.7093524278356562, 'vms_10': 0.494180250439118}


# END