## **SCD - Sense based**

- AP and KM Clusters
- Jensen Shannon Divergence
- Shannon-Entropy
- https://github.com/glnmario/cwr4lsc/blob/master/change_metrics.py
- https://github.com/matejMartinc/scalable_semantic_shift/blob/master/measure_semantic_shift.py

In [None]:
import pickle
from tqdm.notebook import tqdm
import pandas as pd
from collections import Counter, defaultdict
from itertools import chain, groupby, combinations
import os
import numpy as np

from scipy.stats import entropy

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
sns.set(style='whitegrid', context='notebook')

journals = ["pr", "pra", "prb", "prc", "prd", "pre", "prl", "rmp"]
years = [year for year in range(1924, 2023)]

In [None]:
def load_cluster_df(target_word, journal, slice_width, emb_type, cluster_type):
    """Read the pickled clustering DataFrame for a target word.

    Only ``target_word == "virtual"`` is handled; any other word falls
    through and yields ``None``. The cluster column name is translated to
    the short tag used in the file names ("km" / "ap"); unknown names are
    used unchanged. A falsy ``journal`` selects the "all" file.
    ``slice_width`` is accepted but not used here.
    """
    if target_word != "virtual":
        return None

    if cluster_type in ("k_means", "k_means_filtered"):
        cluster_type = "km"
    elif cluster_type in ("ap_cluster", "ap_cluster_filtered"):
        cluster_type = "ap"

    journal_tag = journal if journal else 'all'
    return pd.read_pickle(f"../../data/clustering/all_years/{emb_type}_{cluster_type}_clustering_{journal_tag}.pkl")

def create_slice_df(data_df, year, slice_width, journal):
    """Return the rows of ``data_df`` inside one time slice.

    The slice covers ``year <= data_df.year < year + slice_width``. When
    ``journal`` is truthy the rows are additionally restricted to that
    journal; otherwise all journals are kept.
    """
    in_window = (data_df.year >= year) & (data_df.year < year + slice_width)
    if not journal:
        return data_df.loc[in_window]
    return data_df.loc[in_window & (data_df.journal == journal)]
    
# Probability distribution over clusters for one time interval
def get_probability_distribution(slice_df, cluster_type, n_clusters):
    """Return the relative frequency of each cluster label in ``slice_df``.

    The result is a list of length ``n_clusters``; position ``i`` holds the
    share of rows whose ``cluster_type`` column equals ``i``, or 0 when that
    label does not occur in the slice.
    """
    frequencies = slice_df[cluster_type].value_counts(normalize = True)
    distribution = []
    for cluster in range(n_clusters):
        if cluster in frequencies:
            distribution.append(frequencies[cluster])
        else:
            distribution.append(0)
    return distribution

# Jensen-Shannon divergence between two cluster distributions P and Q.
# JSD is symmetric, so the order of the arguments does not matter.
def compute_jsd(p, q):
    """Return the Jensen-Shannon divergence of ``p`` and ``q``.

    Both arguments are probability distributions over clusters. The value
    is the mean of the two relative entropies (via ``scipy.stats.entropy``)
    of ``p`` and ``q`` against their midpoint distribution.
    """
    dist_p, dist_q = np.asarray(p), np.asarray(q)
    midpoint = (dist_p + dist_q) / 2
    divergence_p = entropy(dist_p, midpoint)
    divergence_q = entropy(dist_q, midpoint)
    return (divergence_p + divergence_q) / 2

# Entropy difference between two cluster distributions P and Q.
# Input order matters: the result is H(Q) - H(P).
def compute_entropy_difference(p, q, n_clusters):
    """Return the normalized entropy of ``q`` minus that of ``p``.

    Args:
        p: Probability distribution over clusters (earlier interval).
        q: Probability distribution over clusters (later interval).
        n_clusters: Number of clusters; each entropy is divided by
            ``log(n_clusters)`` so that it lies in [0, 1].

    Returns:
        ``H(q) / log(n_clusters) - H(p) / log(n_clusters)``. When
        ``n_clusters <= 1`` the entropies are left unnormalized, because
        ``log(1) == 0`` would turn the division into 0/0 (NaN).
    """
    p = np.asarray(p)
    q = np.asarray(q)
    # With a single cluster the maximum entropy is 0, so there is nothing to
    # normalize against; dividing by 1 keeps the raw entropy values.
    max_entropy = np.log(n_clusters) if n_clusters > 1 else 1.0
    entropy_p = entropy(p) / max_entropy
    entropy_q = entropy(q) / max_entropy
    return entropy_q - entropy_p

def compute_normalized_entropy(p, n_clusters):
    """Return the Shannon entropy of ``p`` normalized by ``log(n_clusters)``.

    Args:
        p: Probability distribution over clusters.
        n_clusters: Number of clusters; the entropy is divided by
            ``log(n_clusters)`` so that it lies in [0, 1].

    Returns:
        The normalized entropy. When ``n_clusters <= 1`` the raw entropy is
        returned, because ``log(1) == 0`` would turn the division into 0/0
        (NaN).
    """
    p = np.asarray(p)
    # With a single cluster the maximum entropy is 0, so there is nothing to
    # normalize against; dividing by 1 keeps the raw entropy value.
    max_entropy = np.log(n_clusters) if n_clusters > 1 else 1.0
    return entropy(p) / max_entropy

In [None]:
# Report which clustering result files are still missing on disk

embedding_types = ["token_emb"] #["token_emb", "sentence_emb"]
cluster_types = ["km"] #["ap", "km"]

print("Clustering not done for:\n")

# One file is expected per (embedding type, cluster type, journal) combination,
# with "all" standing for the clustering over all journals.
for emb_type in embedding_types:
    for cluster_type in cluster_types:
        for journal in ["all"] + journals:
            clustering_path = f"../../data/clustering/all_years/{emb_type}_{cluster_type}_clustering_{journal}.pkl"
            if os.path.isfile(clustering_path):
                continue
            print(f"{emb_type}_{cluster_type}_clustering_{journal}")

In [None]:
# Sense-based semantic change detection: for every combination of cluster
# type, embedding type, slice width and journal, compute the Jensen-Shannon
# divergence between consecutive time slices and the normalized entropy of
# each slice, then pickle both as single-column DataFrames.
target_word = "virtual"

cluster_types = ["k_means", "k_means_filtered"] #["k_means", "k_means_filtered", "ap_cluster", "ap_cluster_filtered"]
embedding_types = ["token_emb"] # ["token_emb", "sentence_emb"]
time_slices = [1, 3, 5]

# Cluster column name -> short tag used in the clustering file names
# (the same translation load_cluster_df applies when it builds the path).
cluster_file_tags = {
    "k_means": "km",
    "k_means_filtered": "km",
    "ap_cluster": "ap",
    "ap_cluster_filtered": "ap",
}

for cluster_type in tqdm(cluster_types):

    # Resolved once per cluster type; an unmapped name is used as-is so the
    # check below can never pick up a stale tag from a previous iteration.
    ct = cluster_file_tags.get(cluster_type, cluster_type)

    for emb_type in tqdm(embedding_types):

        for slice_width in tqdm(time_slices):

            # None stands for "all journals together"
            for journal in journals + [None]:

                journal_tag = journal if journal else 'all'

                # Check if clustering is done; report and skip if not
                if not os.path.isfile(f"../../data/clustering/all_years/{emb_type}_{ct}_clustering_{journal_tag}.pkl"):
                    print(f"{emb_type}_{ct}_clustering_{journal_tag}")
                    continue

                cluster_df = load_cluster_df(target_word, journal, slice_width, emb_type, cluster_type)
                # NOTE(review): get_probability_distribution only looks at the
                # labels 0..n_clusters-1, so this assumes the cluster labels
                # are contiguous integers starting at 0 -- confirm for the
                # *_filtered columns.
                n_clusters = cluster_df[cluster_type].unique().size
                if journal:
                    journal_rows = cluster_df.loc[cluster_df.journal == journal]
                    year_start = journal_rows.year.min()
                    year_end = journal_rows.year.max()
                else:
                    year_start = cluster_df.year.min()
                    year_end = cluster_df.year.max()

                jsd_dict = {}
                ed_dict = {}
                entropy_dict = {}

                # Initialize first time step
                slice_df = create_slice_df(cluster_df, year_start, slice_width, journal)
                p = get_probability_distribution(slice_df, cluster_type, n_clusters)

                for year in range(year_start+slice_width, year_end+1, slice_width):

                    slice_label = f"{year}_{year+slice_width-1}"

                    slice_df = create_slice_df(cluster_df, year, slice_width, journal)
                    q = get_probability_distribution(slice_df, cluster_type, n_clusters)

                    # JSD between the previous slice (p) and the current one (q)
                    jsd = compute_jsd(p, q)
                    #ed = compute_entropy_difference(p, q, n_clusters)
                    # Entropy of the slice the label refers to (q). It used to
                    # be computed from p, which stored every value one slice
                    # too late.
                    entropy_q = compute_normalized_entropy(q, n_clusters)

                    jsd_dict[slice_label] = jsd
                    #ed_dict[slice_label] = ed
                    entropy_dict[slice_label] = entropy_q

                    # The current slice becomes the reference for the next step
                    p = q

                jsd_df = pd.DataFrame.from_dict(jsd_dict, orient="index", columns = [f"{cluster_type}_{emb_type}_{journal_tag}_{slice_width}_jsd"])
                jsd_df.to_pickle(f"../../data/scd/sb/jsd/{target_word}_jsd_sb_{cluster_type}_{emb_type}_{journal_tag}_{slice_width}.pkl")

                #ed_df = pd.DataFrame.from_dict(ed_dict, orient="index", columns = [f"{cluster_type}_{emb_type}_{journal_tag}_{slice_width}_ed"])
                #ed_df.to_pickle(f"../../data/scd/sb/ed/{target_word}_ed_sb_{cluster_type}_{emb_type}_{journal_tag}_{slice_width}.pkl")

                entropy_df = pd.DataFrame.from_dict(entropy_dict, orient="index", columns = [f"{cluster_type}_{emb_type}_{journal_tag}_{slice_width}_entropy"])
                entropy_df.to_pickle(f"../../data/scd/sb/entropy/{target_word}_entropy_sb_{cluster_type}_{emb_type}_{journal_tag}_{slice_width}.pkl")