## **SCD - Form based**

- Average Inner Distance (AID) and Inverted Cosine Similarity over Word Prototypes (PRT)
- https://github.com/glnmario/cwr4lsc/blob/master/change_metrics.py

In [None]:
import pandas as pd
import panel as pn
from tqdm.notebook import tqdm
from itertools import chain
import os
from collections import Counter
import numpy as np

from scipy.spatial.distance import cosine
from scipy.spatial.distance import cdist, pdist
import json
import random
import datetime

import sys

# Journal codes used to split the corpus; each one is compared against the
# `journal` column of the embeddings frame further down.
# NOTE(review): presumably APS journal abbreviations -- confirm against the data.
journals = [
    "pr",
    "pra",
    "prb",
    "prc",
    "prd",
    "pre",
    "prl",
    "rmp",
]

In [None]:
# Word form whose embeddings are analysed in the cells below.
target_word = "virtual"

if target_word == "virtual":
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    embeddings_path = "../../data/embeddings/virtual_fulltexts_virtual_token_embeddings.pkl"
    df = pd.read_pickle(embeddings_path)

    # Keep only the rows whose token is exactly "virtual".
    df = df[df["token"] == "virtual"].copy()

    # Join the elements of every `sentence` entry with "_" into one string;
    # this string is used later to identify and drop duplicate sentences.
    df["joint_sents"] = df["sentence"].map("_".join)

In [None]:
def categorize_year(year, min_year, slice_width):
    """Return the label of the time slice that contains *year*.

    Slices are consecutive windows of ``slice_width`` years, the first of
    which starts at ``min_year``.

    Args:
        year: Year to classify.
        min_year: First year of the first slice.
        slice_width: Number of years covered by each slice.

    Returns:
        A string ``"<first year>-<last year>"`` with both bounds inclusive,
        e.g. ``"2003-2005"`` for ``categorize_year(2004, 2000, 3)``.
    """
    # Number of complete slices between min_year and year (floor division).
    slices_elapsed = (year - min_year) // slice_width
    first_year = min_year + slice_width * slices_elapsed
    last_year = first_year + slice_width - 1
    return f"{first_year}-{last_year}"

# Report how many embeddings were loaded for the target word.
n_embeddings = len(df)
print(f"Anzahl {target_word} embeddings:", n_embeddings)

if target_word == "virtual":
    # Rows sharing the same joined sentence string are counted once.
    n_unique_sentences = df.drop_duplicates(subset="joint_sents").shape[0]
    print(f"Anzahl Satzembeddings für Sätze mit {target_word}:", n_unique_sentences)

### **Average Inner Distance**

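A measure of polysemy: the mean pairwise distance between all embeddings of the target word within one time slice.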

In [None]:
# Average Inner Distance (AID): for every embedding type, slice width and
# journal, compute the mean pairwise distance between all embeddings that fall
# into the same time slice, and pickle one result table per configuration.
if target_word == "virtual":
    embedding_types = ["token_emb"]
time_slices = [1, 3, 5]  # slice widths, in years
metric = "euclidean"  # distance metric handed to scipy's pdist

for emb_type in tqdm(embedding_types):

    for slice_width in tqdm(time_slices):

        for journal in ["all"] + journals:

            # Maps slice label -> AID value for this configuration.
            result_dict = {}

            # "all" pools every journal; otherwise restrict to a single one.
            if journal == "all":
                journal_df = df.copy()
            else:
                journal_df = df.loc[df.journal == journal].copy()
            # Slices are anchored at the earliest year present in this subset.
            min_year = journal_df.year.min()

            # Create year slices:
            journal_df['year_slice'] = journal_df['year'].apply(lambda x: categorize_year(x, min_year, slice_width))

            # groupby only yields slices that actually contain rows.
            for sli, years_slice_df in journal_df.groupby("year_slice"):

                if len(years_slice_df) < 2:
                    # A single embedding has no pairwise distances. Record NaN
                    # explicitly rather than averaging an empty distance
                    # array, which gives NaN only via a RuntimeWarning.
                    aid = np.nan
                else:
                    aid = np.mean(pdist(years_slice_df[emb_type].to_list(), metric=metric))
                result_dict[sli] = aid

            result_df = pd.DataFrame.from_dict(result_dict, orient="index", columns = [f"{emb_type}_{journal}_{slice_width}"])

            # Create the target directory up front so a missing folder does not
            # abort the run after the distances have already been computed.
            output_dir = f"../../data/scd/fb/aid_{target_word}"
            os.makedirs(output_dir, exist_ok=True)
            result_df.to_pickle(f"{output_dir}/{target_word}_aid_{emb_type}_{journal}_{slice_width}.pkl")

### **Inverted Cosine Similarity over Word Prototypes (PRT)**

- à la Kutuzov et al. (2020)

In [None]:
# PRT: for every embedding type, slice width and journal, average the
# embeddings of each time slice into one prototype vector and compare each
# prototype with the one from the preceding slice. One result table is pickled
# per configuration.
if target_word == "virtual":
    embedding_types = ["token_emb"]
time_slices = [1, 3, 5]  # slice widths, in years
# True: store the reciprocal of the cosine similarity; False: store the
# cosine similarity itself.
inverted = True

for emb_type in tqdm(embedding_types):

    for slice_width in tqdm(time_slices):

        for journal in ["all"] + journals:

            # Maps slice label -> score against the preceding slice.
            result_dict = {}

            # "all" pools every journal; otherwise restrict to a single one.
            if journal == "all":
                journal_df = df.copy()
            else:
                journal_df = df.loc[df.journal == journal].copy()
            # Slices are anchored at the earliest year present in this subset.
            min_year = journal_df.year.min()

            # Sentence embeddings are shared by all rows of the same sentence,
            # so keep each sentence only once.
            if emb_type == "sentence_emb":
                journal_df = journal_df.drop_duplicates("joint_sents")

            # Create year slices:
            journal_df['year_slice'] = journal_df['year'].apply(lambda x: categorize_year(x, min_year, slice_width))

            # Prototype of the previous slice; None until the first slice has
            # been processed (the first slice has nothing to be compared to).
            last_type_embedding = None

            # groupby only yields slices that contain rows, so each prototype
            # is compared with the previous non-empty slice.
            for sli, years_slice_df in journal_df.groupby("year_slice"):

                # Make Type Embeddings for Time Slice
                current_type_embedding = np.mean(years_slice_df[emb_type].to_list(), axis=0)

                if last_type_embedding is not None:
                    # scipy's `cosine` is a distance, so 1 - distance is the
                    # cosine similarity.
                    similarity = 1 - cosine(last_type_embedding, current_type_embedding)
                    cosine_sim = 1 / similarity if inverted else similarity
                    result_dict[sli] = cosine_sim

                last_type_embedding = current_type_embedding

            result_df = pd.DataFrame.from_dict(result_dict, orient="index", columns = [f"{emb_type}_{journal}_{slice_width}"])

            # Create the target directory up front so a missing folder does not
            # abort the run after the scores have already been computed.
            output_dir = f"../../data/scd/fb/prt_{target_word}"
            os.makedirs(output_dir, exist_ok=True)
            result_df.to_pickle(f"{output_dir}/{target_word}_prt_fb_{emb_type}_{journal}_{slice_width}.pkl")