* kmeans
* affinity propagation
* hdbscan

Is there a way to identify significant change here?
Both static embeddings and centroid BERTs are "means" over several usages.


In [1]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import AffinityPropagation
from sklearn.metrics import silhouette_score
from sklearn.decomposition import TruncatedSVD
from scipy.special import rel_entr
from scipy.special import kl_div
from scipy.spatial.distance import jensenshannon
from scipy.stats import pearsonr, spearmanr
from pathlib import Path
import matplotlib.pyplot as plt
from collections import Counter
import time

from sklearn.feature_extraction.text import CountVectorizer
from ctfidf import CTFIDFVectorizer

In [2]:
# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py

In [3]:
def get_vec(file):
    """Read a tab-separated ``year<TAB>vector`` file.

    Each line holds an integer year, a tab, and the vector components
    separated by single spaces.

    Returns:
        (years, vectors): a list of ints and a numpy array with one row
        per line of the file.

    Raises:
        ValueError: if a line does not have exactly two tab-separated
            fields, or a field cannot be parsed as a number.
    """
    years, rows = [], []
    with open(file) as handle:
        for raw_line in handle:
            year_field, vector_field = raw_line.strip("\n").split("\t")
            components = list(map(float, vector_field.split(" ")))
            years.append(int(year_field))
            rows.append(components)
    return years, np.array(rows)

In [4]:
def rename_for_ctx(path, v_root = "/home/max/Corpora/flashback-pol-time/yearly/contexts_per_term/vectors/"):
    """Map a vector-file path to the path of its context file.

    Every occurrence of "vectors" in the path is replaced by "files", and
    any ``<entry>/`` segment is removed, where ``<entry>`` is the name of an
    entry found in `v_root`.

    Args:
        path: path (str or Path) of a vector file.
        v_root: directory whose entry names are stripped from the path.

    Returns:
        The rewritten path as a ``Path``.
    """
    ctx_path = str(path).replace("vectors", "files")
    for entry in os.listdir(v_root):
        ctx_path = ctx_path.replace(f"{entry}/", "")
    return Path(ctx_path)

In [5]:
# Location of the stop-word list (one word per line), relative to the
# notebook's working directory.
sw_file = "../data/utils/stopwords-sv.txt"
def load_stop_words(file):
    """Load a stop-word list with one word per line.

    The file is decoded as UTF-8 explicitly so that non-ASCII words are
    read the same way regardless of the platform's default locale.

    Args:
        file: path (str or Path) to the stop-word file.

    Returns:
        A list of strings, one per line, with newline characters removed.
    """
    with open(Path(file), encoding = "utf-8") as f:
        return [line.strip("\n") for line in f]
# Loaded once when the cell runs; multi_clust() passes this list to
# cluster_representation() as `stop_words`.
STOP_WORDS = load_stop_words(sw_file)

In [6]:
def get_ctx(file):
    """Read the context strings from a ``year<TAB>context`` file.

    The year column is parsed off and discarded. Only the first tab on a
    line separates the two fields, so a context that itself contains tab
    characters is kept intact instead of raising. The file is decoded as
    UTF-8 explicitly so the result does not depend on the system locale.

    Args:
        file: path to the context file.

    Returns:
        A list with one context string per line, in file order.

    Raises:
        ValueError: if a line contains no tab at all.
    """
    contexts = []
    with open(file, encoding = "utf-8") as f:
        for line in f:
            # maxsplit = 1: everything after the first tab is the context.
            _year, context = line.strip("\n").split("\t", 1)
            contexts.append(context)
    return contexts

In [7]:
# https://stats.stackexchange.com/questions/99171/why-is-euclidean-distance-not-a-good-metric-in-high-dimensions/
# https://towardsdatascience.com/k-means-algorithm-for-high-dimensional-data-clustering-714c6980daa9
# https://medium.com/swlh/k-means-clustering-on-high-dimensional-data-d2151e1a4240

def dimensionality_reduction(data, method = "SVD", k = 100, random_seed = 10):
    """Reduce `data` to `k` dimensions and report the explained variance.

    Args:
        data: matrix handed to the reducer's ``fit_transform``.
        method: reduction method; only "SVD" (``TruncatedSVD``) is
            implemented.
        k: number of components to keep.
        random_seed: seed passed to the reducer as ``random_state``.

    Returns:
        (reduced, log): the transformed data and the one-line summary that
        is also printed.

    Raises:
        ValueError: if `method` is not a supported method. Previously an
            unknown method fell through and returned ``None``, which only
            failed later when the caller unpacked the result.
    """
    if method != "SVD":
        raise ValueError(f"Unsupported dimensionality reduction method: {method!r}")

    svd = TruncatedSVD(n_components = k, random_state = random_seed)
    reduced = svd.fit_transform(data)

    # Share of variance retained by the k components, as a whole percentage.
    explained = round(svd.explained_variance_ratio_.sum() * 100)
    log = f"For reduction to n_components = {k}, {explained} % explained"
    print(log)

    return reduced, log

In [8]:
_CLUSTERING_METHODS = ("kmeans", "agglomerative_cosine")


def _fit_cluster_labels(data, method, n_clusters, random_seed, initialization):
    """Fit one model of the requested kind and return its label array."""
    if method == "kmeans":
        model = KMeans(n_clusters = n_clusters, random_state = random_seed, n_init = initialization)
    else:  # "agglomerative_cosine" (validated by the caller)
        # Note linkage = "complete"
        model = AgglomerativeClustering(n_clusters = n_clusters, metric = "cosine", linkage = "complete")
    return model.fit(data).labels_


def clustering(
    data,
    method = "kmeans",
    k = None,
    silohouette = True,
    k_range = (2, 10),
    k_step = 1,
    return_only_best = True,
    initialization = "auto",
    random_seed = 10,
    verbose = True,
    return_as_list = True # json.dumps() will complain about numpy objects as keys
):
    """Cluster `data`, either with a fixed k or with k chosen by silhouette score.

    Args:
        data: samples to cluster.
        method: "kmeans" or "agglomerative_cosine" (cosine metric, complete
            linkage).
        k: number of clusters; used only when `silohouette` is False.
        silohouette: if True, fit one model per candidate k and keep the
            one with the highest average silhouette score (the first such
            k on ties). No metric is passed to ``silhouette_score``, so its
            default metric is used for both methods.
        k_range: inclusive (start, stop) range of candidate k values.
        k_step: step between candidate k values.
        return_only_best: accepted for compatibility; not used.
        initialization: forwarded to ``KMeans`` as ``n_init``.
        random_seed: forwarded to ``KMeans`` as ``random_state``.
        verbose: print the per-k scores and the best k.
        return_as_list: convert the labels to a list of Python ints.

    Returns:
        (labels, log). With silhouette selection `log` is the list of
        printed messages; with a fixed k it is the empty string.

    Raises:
        ValueError: for an unknown `method`, an empty candidate range, or a
            missing `k` when `silohouette` is False.

    Changes from the previous version: the "best k" message now honours
    `k_step` (it used to index into a step-1 range), an unknown `method`
    is rejected up front instead of failing on an unbound model, and the
    fixed-k branch respects `method` instead of always running k-means.
    """
    if method not in _CLUSTERING_METHODS:
        raise ValueError(f"Unknown clustering method: {method!r}")

    if not silohouette:  # predefined k
        if k is None:
            raise ValueError("k must be given when silohouette is False")
        labels = _fit_cluster_labels(data, method, k, random_seed, initialization)
        if return_as_list:
            labels = [int(val) for val in labels]
        return labels, ""

    log = []
    k_start, k_stop = k_range
    candidate_ks = list(range(k_start, k_stop + 1, k_step))
    if not candidate_ks:
        raise ValueError(f"k_range = {k_range} with k_step = {k_step} yields no candidate k")

    scores = []
    labelings = []
    for n_clusters in candidate_ks:
        labels = _fit_cluster_labels(data, method, n_clusters, random_seed, initialization)
        silhouette_avg = silhouette_score(data, labels)

        scores.append(silhouette_avg)
        labelings.append(labels)

        txt = f"The average silhouette_score for k = {n_clusters} is: {silhouette_avg}."
        log.append(txt)
        if verbose:
            print(txt)

    best_score = max(scores)
    idx_best = scores.index(best_score)

    # Look the winning k up in the same list that was iterated, so that the
    # message stays correct for k_step != 1.
    txt = f"Best score for k = {candidate_ks[idx_best]} ({best_score:.5f})."
    log.append(txt)
    if verbose:
        print(txt)

    best_model = labelings[idx_best]
    if return_as_list:
        best_model = [int(val) for val in best_model]

    return best_model, log

In [9]:
# https://scikit-learn.org/stable/auto_examples/cluster/plot_affinity_propagation.html
def aff_prop(data, random_seed = 10):
    """Cluster `data` with affinity propagation and print a short report.

    Prints the sample count, the elapsed fitting time as whole minutes and
    seconds, the number of distinct labels, the silhouette score, and the
    size of every cluster.

    Args:
        data: samples to cluster.
        random_seed: passed to ``AffinityPropagation`` as ``random_state``.

    Returns:
        The label array of the fitted model.
    """
    t_begin = time.time()
    print("N =", len(data))

    labels = AffinityPropagation(random_state=random_seed).fit(data).labels_

    elapsed = time.time() - t_begin
    print(int(elapsed / 60), int(elapsed % 60))
    print("No. labels:", len(set(labels)))
    print("Silhoette score:", silhouette_score(data, labels))

    for label, size in Counter(labels).items():
        print(label, size)

    return labels

In [10]:
def cluster_representation(clusters, contexts, top_k = 50, stop_words = None):
    """Describe each cluster by its `top_k` highest-scoring c-TF-IDF words.

    All contexts of a cluster are joined into one document, the documents
    are count-vectorised, and the counts are re-weighted with
    ``CTFIDFVectorizer``.

    Args:
        clusters: one cluster label per context.
        contexts: one context string per label.
        top_k: number of words to keep per cluster.
        stop_words: forwarded to ``CountVectorizer``.

    Returns:
        dict mapping each cluster label to a list of words, ordered from
        lowest to highest score among the selected `top_k`.

    Raises:
        ValueError: if `clusters` and `contexts` differ in length (this was
            an ``assert`` before, which disappears under ``python -O``).
    """
    if len(clusters) != len(contexts):
        raise ValueError(
            f"clusters and contexts differ in length: {len(clusters)} != {len(contexts)}"
        )

    df = pd.DataFrame({"Cluster": clusters, "Context": contexts})
    docs_per_class = df.groupby(["Cluster"], as_index=False).agg({"Context": " ".join})
    count_vectorizer = CountVectorizer(stop_words = stop_words)
    count = count_vectorizer.fit_transform(docs_per_class.Context)
    words = count_vectorizer.get_feature_names_out()
    ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(df)).toarray()

    # Row i of `ctfidf` belongs to row i of `docs_per_class`. Index by that
    # position rather than by the label value, which is only equivalent
    # when the labels happen to be exactly 0..n-1 (a label of -1, or a gap
    # in the labels, would silently select the wrong row).
    words_per_class = {
        label: [words[index] for index in ctfidf[row].argsort()[-top_k:]]
        for row, label in enumerate(docs_per_class.Cluster)
    }

    return words_per_class

In [11]:
def visualize(data, labels):
    """Scatter-plot `data` in two SVD dimensions, coloured by `labels`.

    Args:
        data: samples to project (passed to dimensionality_reduction).
        labels: one colour value per sample.
    """
    # dimensionality_reduction() returns (reduced, log). The previous code
    # iterated over that tuple itself instead of over the rows of the
    # reduced matrix, so the plot never received the projected points.
    XR, _ = dimensionality_reduction(data, k=2)
    plt.scatter(XR[:, 0], XR[:, 1], c=labels, alpha = 0.6, s=10)

In [12]:
def npd(labels, years, min_count = 10): # Nominal Probability Distribution
    """Compute, per year, the proportion of items carrying each label.

    Args:
        labels: one label per item.
        years: one year per item, aligned with `labels`.
        min_count: years with this many items or fewer are left out.

    Returns:
        dict mapping year -> list of proportions. Every list covers all
        distinct labels in sorted order, so position i refers to the same
        label for every year. Years appear in order of first occurrence.
    """
    distinct_labels = set(labels)
    tallies = {yr: dict.fromkeys(distinct_labels, 0) for yr in set(years)}

    for yr, lab in zip(years, labels):
        tallies[yr][lab] += 1

    distributions = {}
    # dict.fromkeys() de-duplicates while keeping first-occurrence order.
    for yr in dict.fromkeys(years):
        per_label = tallies[yr]
        total = sum(per_label.values())
        if total > min_count:
            # Sorting the labels keeps vector positions comparable across years.
            distributions[yr] = [per_label[lab] / total for lab in sorted(per_label)]

    return distributions

In [13]:
# https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
# https://www.statology.org/kl-divergence-python/
# https://towardsdatascience.com/kl-divergence-python-example-b87069e4b810
def kl(p, q):
    """Return the Kullback-Leibler divergence D_KL(p || q) in nats.

    ``rel_entr(x, y)`` computes ``x * log(x / y)`` elementwise, so the
    divergence of `p` from `q` is ``sum(rel_entr(p, q))``. The previous
    implementation passed the arguments in the opposite order and
    therefore returned D_KL(q || p).

    Args:
        p: probability vector.
        q: probability vector of the same length as `p`.
    """
    return sum(rel_entr(p, q))

In [14]:
def get_transitions(start_year, end_year):
    """Return the consecutive-year pairs from `start_year` up to `end_year`.

    Example: ``get_transitions(2000, 2002)`` gives
    ``[(2000, 2001), (2001, 2002)]``.
    """
    from_years = range(start_year, end_year)
    to_years = range(start_year + 1, end_year + 1)
    return list(zip(from_years, to_years))

In [15]:
def get_jsd(pdv_data, transitions = None):
    """Measure the change between the label distributions of paired years.

    Uses ``scipy.spatial.distance.jensenshannon``, which scipy documents as
    the Jensen-Shannon *distance* (the square root of the divergence).

    Args:
        pdv_data: dict mapping year -> probability vector (see npd()).
        transitions: iterable of (year_i, year_j) pairs. Defaults to the
            consecutive-year pairs of ``get_transitions(2000, 2022)``. The
            default is now built on each call rather than once at
            definition time, so no list object is shared between calls.

    Returns:
        (jsd, Yrs, Chg): a dict keyed "yi_yj", the list of target years yj,
        and the list of change values. A pair is skipped when either year
        is missing from `pdv_data`.
    """
    if transitions is None:
        transitions = get_transitions(2000, 2022)

    jsd = dict()
    Yrs = []
    Chg = []

    for yi, yj in transitions:
        if yi not in pdv_data or yj not in pdv_data:
            continue

        change = jensenshannon(pdv_data[yi], pdv_data[yj])

        Yrs.append(yj)
        Chg.append(change)
        jsd[f"{yi}_{yj}"] = change

    return jsd, Yrs, Chg

In [16]:
def populator(clustering, years):
    """Bundle the per-year distributions and their year-to-year change.

    Args:
        clustering: one cluster label per item.
        years: one year per item, aligned with `clustering`.

    Returns:
        dict with the keys "npd" (output of npd()), "jsd", "years" and
        "changes" (the three outputs of get_jsd() over the 2000-2022
        consecutive-year transitions).
    """
    distributions = npd(clustering, years)
    jsd, change_years, changes = get_jsd(
        distributions, transitions = get_transitions(2000, 2022)
    )
    return {
        "npd": distributions,
        "jsd": jsd,
        "years": change_years,
        "changes": changes,
    }

In [17]:
def _summarize_run(labels, years, contexts, log, vx_log = None):
    """Assemble the result record stored for one clustering variant."""
    summary = populator(labels, years)
    summary["cluster_counts"] = Counter(labels)
    summary["words_per_class"] = cluster_representation(labels, contexts, stop_words=STOP_WORDS)
    summary["log"] = log
    summary["vx_log"] = vx_log
    return summary


def multi_clust(file, max_agglomerative_n = 12000):
    """Run every clustering variant on one term's vector file.

    Variants (keys of the returned dict):
        "kms"     k-means on the full vectors, k chosen by silhouette score
        "km5"     k-means on the full vectors, k = 5
        "acs"     agglomerative (cosine) on the full vectors
        "kmsr50"  k-means on the 50-dimensional SVD reduction
        "acsr50"  agglomerative (cosine) on the 50-dimensional SVD reduction

    Args:
        file: path of the vector file; the matching context file is located
            with rename_for_ctx().
        max_agglomerative_n: the agglomerative variants run only when the
            file has fewer usages than this; otherwise they are skipped
            and their keys are absent from the result. Defaults to the
            previously hard-coded 12000.

    Returns:
        dict mapping variant name -> record with the keys produced by
        populator() plus "cluster_counts", "words_per_class", "log" and
        "vx_log" (the reduction summary, or None for unreduced variants).
    """
    multi_results = {}

    years, X = get_vec(file)
    print("Length:", len(years))

    contexts = get_ctx(rename_for_ctx(file))

    Xr, vx_log = dimensionality_reduction(X, k=50)

    run_agglomerative = len(years) < max_agglomerative_n

    print()
    print("KMS")
    kms, log = clustering(X)
    multi_results["kms"] = _summarize_run(kms, years, contexts, log)

    print()
    print("KM5")
    km5, log = clustering(X, k = 5, silohouette = False)
    multi_results["km5"] = _summarize_run(km5, years, contexts, log)

    print()
    if run_agglomerative:
        print("ACS")
        acs, log = clustering(X, method = "agglomerative_cosine")
        multi_results["acs"] = _summarize_run(acs, years, contexts, log)
    else:
        print("Skipped ACS")

    print()
    print("KMS-R50")
    kmsr50, log = clustering(Xr)
    multi_results["kmsr50"] = _summarize_run(kmsr50, years, contexts, log, vx_log)

    print()
    if run_agglomerative:
        print("ACS-R50")
        acsr50, log = clustering(Xr, method = "agglomerative_cosine")
        multi_results["acsr50"] = _summarize_run(acsr50, years, contexts, log, vx_log)
    else:
        print("Skipped ACS-R50")

    #afp    = aff_prop(X)

    return multi_results

In [18]:
def viscor(data):
    """Plot each method's change curve and print their rank correlations.

    Args:
        data: dict mapping method name -> record with "years" and
            "changes" lists (as returned by multi_clust()).

    For every method the change values are plotted against the years in a
    separate figure. Afterwards the Spearman correlation between the
    methods' change series is printed as a method-by-method table.
    """
    change_series = []

    for method, record in data.items():
        print()
        print(method)
        year_axis = record["years"]
        changes = record["changes"]
        change_series.append(changes)
        plt.plot(year_axis, changes)
        plt.show()

    # axis=1: each row (one method's series) is treated as a variable.
    corr, _pval = spearmanr(change_series, axis=1)
    table = pd.DataFrame(corr, columns = data.keys(), index = data.keys())
    print(table)

In [19]:
def multi_term(directory, stop = None):
    """Run multi_clust() on the files of `directory`, in sorted name order.

    Args:
        directory: directory containing one vector file per term.
        stop: if given, only the first `stop` files are processed.

    Returns:
        dict mapping the file name with ".txt" removed to the result of
        multi_clust() for that file.
    """
    big_r = dict()

    # This used to be Path("Directory:", directory). pathlib drops earlier
    # segments when a later one is absolute, so that only worked by accident
    # for absolute paths; a relative path became "Directory:/<path>".
    directory = Path(directory)
    print(directory)

    files = sorted(os.listdir(directory))
    if stop is not None:
        files = files[:stop]

    for file in files:
        name = file.replace(".txt", "")
        print()
        print(f"--- {name} ---")
        big_r[name] = multi_clust(directory/file)

    return big_r

In [20]:
# Cluster every term file in the sts_fbmodel vector directory
# (stop = None: no limit on the number of files processed).
my_results = multi_term(
    directory = "/home/max/Corpora/flashback-pol-time/yearly/contexts_per_term/vectors/sts_fbmodel/", 
    stop = None
)

/home/max/Corpora/flashback-pol-time/yearly/contexts_per_term/vectors/sts_fbmodel

--- A1_globalistisk ---
Length: 5817
For reduction to n_components = 50, 64 % explained

KMS
The average silhouette_score for k = 2 is: 0.034393790763544166.
The average silhouette_score for k = 3 is: 0.03269933546246315.
The average silhouette_score for k = 4 is: 0.027281517289711645.
The average silhouette_score for k = 5 is: 0.027227930192089904.
The average silhouette_score for k = 6 is: 0.025703384573126428.
The average silhouette_score for k = 7 is: 0.023767564512540276.
The average silhouette_score for k = 8 is: 0.023333251758767836.
The average silhouette_score for k = 9 is: 0.020985796274486677.
The average silhouette_score for k = 10 is: 0.019889826904382624.
Best score for k = 2 (0.03439).

KM5

ACS
The average silhouette_score for k = 2 is: 0.013386476501816005.
The average silhouette_score for k = 3 is: 0.009199829419153047.
The average silhouette_score for k = 4 is: 0.0033588663175745494.
T

The average silhouette_score for k = 6 is: 0.02287856406306221.
The average silhouette_score for k = 7 is: 0.02050608664891606.
The average silhouette_score for k = 8 is: 0.021202141934058238.
The average silhouette_score for k = 9 is: 0.018426687491052807.
The average silhouette_score for k = 10 is: 0.019737909098028108.
Best score for k = 2 (0.03476).

KM5

Skipped ACS

KMS-R50
The average silhouette_score for k = 2 is: 0.054162241575415424.
The average silhouette_score for k = 3 is: 0.0483532503934591.
The average silhouette_score for k = 4 is: 0.04348720932175275.
The average silhouette_score for k = 5 is: 0.04062131374348817.
The average silhouette_score for k = 6 is: 0.04044359434480617.
The average silhouette_score for k = 7 is: 0.040231337729230154.
The average silhouette_score for k = 8 is: 0.040429731234189824.
The average silhouette_score for k = 9 is: 0.04069563021719949.
The average silhouette_score for k = 10 is: 0.04061013367912646.
Best score for k = 2 (0.05416).

Skipp


ACS
The average silhouette_score for k = 2 is: 0.08870454277410064.
The average silhouette_score for k = 3 is: 0.07144780948639828.
The average silhouette_score for k = 4 is: 0.10295721007984614.
The average silhouette_score for k = 5 is: 0.10389210094255166.
The average silhouette_score for k = 6 is: 0.09388728021921634.
The average silhouette_score for k = 7 is: 0.08399378320815107.
The average silhouette_score for k = 8 is: 0.0866960089111212.
The average silhouette_score for k = 9 is: 0.08157679842607786.
The average silhouette_score for k = 10 is: 0.0844204207021763.
Best score for k = 5 (0.10389).

KMS-R50
The average silhouette_score for k = 2 is: 0.08362734587779333.
The average silhouette_score for k = 3 is: 0.09392442269123842.
The average silhouette_score for k = 4 is: 0.0958196716837536.
The average silhouette_score for k = 5 is: 0.10410452098819095.
The average silhouette_score for k = 6 is: 0.09737777451018122.
The average silhouette_score for k = 7 is: 0.154042393132756

The average silhouette_score for k = 9 is: 0.023201659162303753.
The average silhouette_score for k = 10 is: 0.02279399889714053.
Best score for k = 2 (0.07183).

--- V1_kulturberika ---
Length: 2314
For reduction to n_components = 50, 64 % explained

KMS
The average silhouette_score for k = 2 is: 0.024648356983370808.
The average silhouette_score for k = 3 is: 0.02996691879832212.
The average silhouette_score for k = 4 is: 0.03013012796220948.
The average silhouette_score for k = 5 is: 0.03006969061789358.
The average silhouette_score for k = 6 is: 0.02696063738554801.
The average silhouette_score for k = 7 is: 0.02265504786500871.
The average silhouette_score for k = 8 is: 0.02406176066092212.
The average silhouette_score for k = 9 is: 0.026450997618379907.
The average silhouette_score for k = 10 is: 0.025062580004827143.
Best score for k = 4 (0.03013).

KM5

ACS
The average silhouette_score for k = 2 is: 0.018559643334767683.
The average silhouette_score for k = 3 is: 0.005820445419

In [None]:
#my_results

In [21]:
file_out  = "/home/max/Results/fb_pol-yearly-bert/sts_fbmodel/cluster_data.json"
# Serialise before opening the file: open(..., "w") truncates immediately,
# so a failure inside json.dumps() (e.g. on a non-serialisable numpy object)
# would otherwise wipe an existing results file.
serialized_results = json.dumps(my_results, indent = 4)
with open(file_out, "w") as f:
    f.write(serialized_results)

In [None]:
#file = Path("/home/max/Corpora/flashback-pol-time/yearly/contexts_per_term/vectors/sts_fbmodel/N1_förortsgäng.txt")
#file = Path("/home/max/Corpora/flashback-pol-time/yearly/contexts_per_term/vectors/sts_fbmodel/A1_globalistisk.txt")
#file = Path("/home/max/Corpora/flashback-pol-time/yearly/contexts_per_term/vectors/sts_fbmodel/N1_globalist.txt")
#file = Path("/home/max/Corpora/flashback-pol-time/yearly/contexts_per_term/vectors/sts_fbmodel/N1_återvandring.txt")

In [None]:
# Run all clustering variants on a single term file for interactive inspection.
mc = multi_clust(Path("/home/max/Corpora/flashback-pol-time/yearly/contexts_per_term/vectors/sts_fbmodel/N1_förortsgäng.txt"))

In [None]:
# Plot each variant's change curve and print the correlations between variants.
viscor(mc)

In [None]:
# Show which result fields are stored for the "kms" variant.
mc["kms"].keys()

In [None]:
# Number of items assigned to each cluster label in the "kms" variant.
mc["kms"]["cluster_counts"]

In [None]:
# Top c-TF-IDF words per cluster for the "kms" variant.
mc["kms"]["words_per_class"]

In [None]:
# Display the complete result dict for the term.
mc

In [None]:
# Same analysis for another term; the result is only displayed, not stored.
multi_clust(Path("/home/max/Corpora/flashback-pol-time/yearly/contexts_per_term/vectors/sts_fbmodel/N1_återvandring.txt"))

In [None]:
# Same analysis for another term; the result is only displayed, not stored.
multi_clust(Path("/home/max/Corpora/flashback-pol-time/yearly/contexts_per_term/vectors/sts_fbmodel/V1_kulturberika.txt"))

In [None]:
#model = clustering(X)
#model = clustering(X, method = "agglomerative_cosine")
#model = kmeans(X, k_range = (2, 12))
#best_model = kmeans(Xr, k = 10, silohouette=False)
#model = aff_prop(Xr)

In [None]:
#visualize(X, model)

In [None]:
# Original code adapted from
# https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py
#
# Sweep the number of SVD components and plot the share of variance each
# setting explains.
# NOTE(review): `X` is not assigned at top level in the visible cells (it is
# only a local inside multi_clust) — confirm it is defined before running.

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

EV = []  # explained-variance ratio sum for each entry of `ks`
ks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

for k in ks:
    # SVD followed by row normalisation; only the SVD step is inspected below.
    lsa = make_pipeline(TruncatedSVD(n_components=k), Normalizer(copy=False))
    t0 = time.time()
    X_lsa = lsa.fit_transform(X)
    explained_variance = lsa[0].explained_variance_ratio_.sum()
    EV.append(explained_variance)

    print("k =", k)
    print(f"LSA done in {time.time() - t0:.3f} s")
    print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

# Explained variance as a function of the number of components.
plt.plot(ks, EV)