In [1]:
import numpy as np
from scipy.stats import multivariate_normal, multivariate_t
import os
import torch
import pandas as pd
from sklearn.cluster import KMeans
from preprocess import get_csv
from sklearn.metrics.pairwise import cosine_similarity
import logging
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def kl_divergence(X0, X1):
    mean0 = np.mean(X0, axis=0)
    cov0 = np.cov(X0, rowvar=False)
    mean1 = np.mean(X1, axis=0)
    cov1 = np.cov(X1, rowvar=False)

    print(np.mean(cov0))
    print(np.max(cov0))
    print(np.mean(cov1))
    print(np.max(cov1))

    mvn0 = multivariate_normal(mean=mean0, cov=cov0, allow_singular = True)
    mvn1 = multivariate_normal(mean=mean1, cov=cov1, allow_singular = True)
    
    # X = mvn0.rvs(size=1000)
    
    pdf0 = mvn0.logpdf(X0)
    pdf1 = mvn1.logpdf(X0)
    
    for i in range(pdf0.shape[0]):
        if pdf0[i] == -np.inf:
            pdf0[i] = -1e5
    for i in range(pdf1.shape[0]):
        if pdf1[i] == -np.inf:
            pdf1[i] = -1e5
    
    kl_div = np.mean(pdf0-pdf1)
    
    return kl_div
    

In [3]:
def cos_similarity(X0, X1):
    X0 = X0/np.linalg.norm(X0, axis=1)[:, np.newaxis]
    X1 = X1/np.linalg.norm(X1, axis=1)[:, np.newaxis]
    
    cos_sim = cosine_similarity(X0, X1)
    avg_sim = np.mean(cos_sim)
    
    return avg_sim

In [4]:
embeddings_path = 'embeddings.npy'

# Check if the file exists
if not os.path.exists(embeddings_path):
    # If the file does not exist, call the get_embedding method
    print("Creating CSV from JSON file.")
    corpus_embeddings = model.encode(corpus, show_progress_bar=True)
    np.save("./embeddings.npy", corpus_embeddings, allow_pickle=True)
else:
    print("The file already exists.")
    corpus_embeddings = np.load("./embeddings.npy", allow_pickle=True)

print(corpus_embeddings.shape)

Creating CSV from JSON file.


NameError: name 'model' is not defined

In [None]:
clusters = np.load("./clusters.npy", allow_pickle=True)

In [None]:
# kldivergence

kld = np.zeros((30, 30))

for i in range(10):
    for j in range(10):
        kld[i,j] = kl_divergence(corpus_embeddings[np.where(clusters==i)[0]], corpus_embeddings[np.where(clusters==j)[0]])

In [None]:
cos = np.zeros((30, 30))

for i in range(10):
    for j in range(10):
        cos[i,j] = cos_similarity(corpus_embeddings[np.where(clusters==i)[0]], corpus_embeddings[np.where(clusters==j)[0]])

In [None]:
def create_sim_matrix(sim, title = 'hello', name = 'hi.png', fmt=".0f"):
    folder_name = "Matrix"

    # Create the folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    
    # Check if the data is a square matrix
    if sim.shape[0] != sim.shape[1]:
        logging.info("Data is not in the correct format. It should be a square matrix.")
        return
    
    # Reverse the order of the columns
    # data = data[data.columns[::-1]]

    plt.figure()
    heatmap = sns.heatmap(sim, annot=True, cmap='PiYG', fmt = fmt)
    plt.title(title)
    plt.tight_layout()

    # Save the matrix to the specified folder
    save_path = os.path.join(folder_name, name)
    heatmap.figure.savefig(save_path)
    logging.info(f"matrix saved to {save_path}")

    # Optionally, display the matrix
    plt.show()

In [None]:
create_sim_matrix(kld[:10,:10], title='kld', name='kld_matrix.png')
create_sim_matrix(cos[:10,:10], title='cos', name='cos_matrix.png', fmt=".3f")

In [None]:
years = np.load("./years.npy", allow_pickle=True)

In [None]:
divide_years = [1986, 1996, 2006, 2016, 2026]

cos_years = np.zeros((30, len(divide_years)-1, len(divide_years)-1))

for i in range(10):
    for j in range(len(divide_years)-1):
        for k in range(len(divide_years)-1):
            cos_years[i,j,k] = cos_similarity(corpus_embeddings[np.where((clusters==i) & (years>=divide_years[j]) & (years<divide_years[j+1]))[0]], 
                                              corpus_embeddings[np.where((clusters==i) & (years>=divide_years[k]) & (years<divide_years[k+1]))[0]])


In [None]:
print(cos_years[4,:,:])

In [None]:
def create_year_matrix(sim, title = 'hello', name = 'hi.png', fmt=".0f"):
    folder_name = "Matrix/years"

    # Create the folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    
    # Check if the data is a square matrix
    if sim.shape[0] != sim.shape[1]:
        logging.info("Data is not in the correct format. It should be a square matrix.")
        return
    
    # Reverse the order of the columns
    # data = data[data.columns[::-1]]
    plt.figure()
    heatmap = sns.heatmap(sim, vmin=0, vmax=1, annot=True, cmap='PiYG', fmt = fmt)
    plt.title(title)
    plt.tight_layout()

    # Save the matrix to the specified folder
    save_path = os.path.join(folder_name, name)
    heatmap.figure.savefig(save_path)
    logging.info(f"matrix saved to {save_path}")

    # Optionally, display the matrix
    plt.show()

In [None]:
for i in range(10):
    create_year_matrix(cos_years[i], title=f'cos_matrix_{i}', name=f'cos_matrix_{i}.png', fmt=".3f")