```
for model in models:
    previous_year = None
    for i, file in enumerate(files, start = 1):
        this_year = dict()
        for term, vector in file:
            APPEND(vector, this_year.term.vectors)
        
        this_year.term.mean  = MEAN(this_year.term.vectors)        # centroid
        this_year.term.width = MEAN_SIMILARITY(this_year.term.vectors) # average pairwise cosine similarity
        
        SAVE(this_year.term.mean, emb_file)
        SAVE(this_year.term.width, wdt_file)
        
        if i > 1:
            SAVE(COS_CHANGE(previous_year.term.mean, this_year.term.mean), gch_file)
            SAVE(COS_SIM(previous_year.term.mean, this_year.term.mean), gsim_file)
            SAVE(CHNG_WIDTH(previous_year.term.width, this_year.term.width), chw_file)
            
            for c in n_controls:
                mega = previous_year.term.vectors + this_year.term.vectors
                shuffle(mega)
                cutoff = int(len(mega)/2)
                ctrl1 = mega[:cutoff]
                ctrl2 = mega[cutoff:]

                SAVE(COS_CHANGE(ctrl1, ctrl2), chng_file_ctrl_c)
                SAVE(COS_SIM(ctrl1, ctrl2), sim_file_ctrl_c)
                
            del mega
            
        previous_year = this_year 

```

In [1]:
import os
from pathlib import Path
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as sk_cos
import pandas as pd
import random
import tracemalloc
import time

In [2]:
# Seed the stdlib RNG so the random.shuffle calls used for the control splits
# give the same permutations on every run.
random.seed(111)

In [3]:
def np_cos(v1, v2):
    """Return the cosine similarity of two vectors.

    The value is the dot product of `v1` and `v2` divided by the product of
    their Euclidean norms.
    """
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product

In [4]:
def angular_distance(v1, v2):
    """Return the angle between two vectors as a fraction of pi.

    The cosine similarity from `np_cos` is converted to an angle with
    `np.arccos` and divided by pi, so the result is 0.0 for vectors pointing
    the same way and 1.0 for opposite vectors.
    """
    # Rounding can push the cosine marginally outside [-1, 1] (e.g. for
    # identical vectors); np.arccos returns nan there, so clamp first.
    cosine = np.clip(np_cos(v1, v2), -1.0, 1.0)
    return np.arccos(cosine) / np.pi

In [5]:
def upc_mean0(lst, function):
    """Mean of Unique Pairwise Comparisons (reference implementation).

    Calls ``function(x, y)`` once for every unordered pair of distinct
    positions in `lst` and returns the arithmetic mean of the results.
    This is a plain Python double loop and therefore slow for long lists.

    Args:
        lst: Sliceable sequence of items to compare.
        function: Two-argument callable returning a number.

    Returns:
        The mean over all pairs, or ``np.nan`` when `lst` has fewer than two
        items (there is no pair to compare).
    """
    if len(lst) < 2:
        return np.nan

    # start=1 makes lst[i:] begin just after x, so each pair is visited once
    # and an item is never compared with itself.
    scores = [
        function(x, y)
        for i, x in enumerate(lst, start=1)
        for y in lst[i:]
    ]
    return sum(scores) / len(scores)

In [6]:
def upc_mean(lst, function=sk_cos):
    """Mean of Unique Pairwise Comparisons, computed from a pairwise matrix.

    Args:
        lst: Sequence of vectors.
        function: Callable that takes the whole sequence and returns a square
            matrix of pairwise scores (default: sklearn's cosine_similarity).

    Returns:
        The mean of the entries strictly above the diagonal, i.e. one value
        per unordered pair, or ``np.nan`` when `lst` has fewer than two items.
    """
    n_items = len(lst)
    if n_items < 2:
        return np.nan

    matrix = np.asarray(function(lst))

    # Average the strict upper triangle directly instead of subtracting an
    # assumed diagonal of exactly 1.0 from the full sum: the diagonal is not
    # guaranteed to sum to n_items (rounding, or scores whose self-comparison
    # is not 1), and any deviation would leak into the result.
    pair_scores = matrix[np.triu_indices(n_items, k=1)]
    return pair_scores.mean()
    

In [17]:
def iline(line):
    """Parse one tab-separated input line into ``(term, vector)``.

    The first field is the term; if it contains more than one
    whitespace-separated word, its first three words are joined with "_".
    The last field is split on whitespace and converted to a list of floats.
    """
    fields = line.strip("\n").split("\t")

    term = fields[0]
    words = term.split()
    if len(words) > 1:
        term = "_".join(words[:3])

    vector = list(map(float, fields[-1].split()))
    return term, vector

In [8]:
def centroid(vectors):
    """Return the element-wise mean (centroid) of a collection of vectors."""
    stacked = np.array(vectors)
    # axis=0 averages across vectors, leaving one value per dimension.
    return stacked.mean(axis=0)

In [9]:
def status_report(memory0, t0, prefix=""):
    """Print a one-line progress report that overwrites itself.

    Shows the memory currently traced by `tracemalloc` (in MB) and the time
    elapsed since `t0`. The line ends with a carriage return so the next
    report is written over it.

    Args:
        memory0: Accepted for interface compatibility; not used. The original
            code derived a delta from it that was never reported.
        t0: Start time as returned by ``time.time()``.
        prefix: Text placed at the start of the report.
    """
    norm, unit = (1000000, "MB")
    current, _peak = tracemalloc.get_traced_memory()
    memory = round(current / norm, 1)

    t_delta = time.time() - t0
    m = int(t_delta / 60)
    s = int(t_delta % 60)
    print(f"#{prefix}--memory={memory} {unit}; {m} m {s} s.                   ", end="\r")

In [21]:
def _report_spread_anomaly(vecs, spread):
    """Print diagnostics for a term whose spread exceeds the cosine bound of 1."""
    n_vecs = len(vecs)
    mtrx = sk_cos(vecs)

    print("Something went wrong!")
    print("Spread =", spread)
    print("L", n_vecs)
    print("UPC0:", upc_mean0(vecs, function=np_cos))
    print(pd.DataFrame(mtrx).round(3))
    print("Matrix sum", mtrx.sum())
    print("Adjusted Sum", (mtrx.sum() - n_vecs) / 2)
    print("Adjusted N", ((n_vecs * n_vecs) - n_vecs) / 2)
    print()


def semantic_change(model, out_dir, n_controls):
    """Compute yearly centroids, spread, and year-to-year change for one model.

    Every regular file in `model` is read in sorted name order; each line is
    parsed with `iline` and its vector is grouped under its term. Per file
    the function writes

    * ``centroid/<file>``: term and the centroid of its vectors,
    * ``spread/<file>``: term and the mean pairwise similarity (`upc_mean`),

    and, from the second file on, for the terms shared with the previous file

    * ``cosine_change/<prev>_<this>_genuine.txt``: angular distance between
      the two centroids,
    * ``cosine_sim/<prev>_<this>_genuine.txt``: cosine similarity between them,
    * ``..._control<c>.txt`` in both folders for c = 1..n_controls: the same
      two measures between the centroids of two random halves of the pooled
      vectors of both years.

    Args:
        model: Directory holding one vector file per period.
        out_dir: Directory for the results; the four sub-folders are created
            if missing.
        n_controls: Number of shuffled control splits per pair of periods.

    Returns:
        None. If a spread above 1.000001 is found, diagnostics are printed
        and the function returns early, leaving later output unwritten.

    Side effects: starts `tracemalloc` (left running on return) and prints
    progress via `status_report`.
    """
    tracemalloc.start()
    memory0 = (0, 0)
    t0 = time.time()

    model = Path(model)
    out_dir = Path(out_dir)
    # open(..., "w") does not create parent folders, so make them up front.
    for sub_dir in ("centroid", "spread", "cosine_change", "cosine_sim"):
        (out_dir / sub_dir).mkdir(parents=True, exist_ok=True)

    # Ignore sub-directories (e.g. notebook checkpoint folders); they cannot
    # be opened as vector files.
    files = sorted(name for name in os.listdir(model) if (model / name).is_file())
    previous_year = {}

    for i, file in enumerate(files, start=1):
        print()
        print(model, file)

        # str.strip(".txt") removes any of the characters '.', 't', 'x' from
        # both ends rather than the suffix, which mangles names that start or
        # end with those letters; cut the exact suffix instead.
        year = file[:-len(".txt")] if file.endswith(".txt") else file
        data = {}
        this_year = {"year": year, "data": data}

        # Get data
        with open(model / file, "r") as f:
            for j, line in enumerate(f):

                if j % 300 == 0:
                    status_report(memory0, t0, prefix=f"Line {j} of file")

                if not line.strip():
                    # A blank line would become an empty term with an empty
                    # vector and break the similarity computation later.
                    continue

                term, vector = iline(line)
                if term in data:
                    data[term]["vector"].append(vector)
                else:
                    data[term] = {"vector": [vector]}

        status_report(memory0, t0, prefix="Input done")

        # Centroid, Spread
        with open(out_dir / f"centroid/{file}", "w") as emb, \
                open(out_dir / f"spread/{file}", "w") as wdt:
            for term in sorted(data):
                vectors = data[term]["vector"]

                cntr = centroid(vectors)
                data[term]["mean"] = cntr
                # NOTE(review): cntr is written with numpy's default array
                # formatting (brackets, wrapped over several lines for long
                # vectors), not the space-separated format iline reads.
                # Confirm this is what downstream readers expect.
                emb.write(f"{term}\t{cntr}\n")

                status_report(memory0, t0, prefix=f"Starting UPC: {term}")

                spread = upc_mean(vectors)
                # Temporary guard (marked for removal in the original): a mean
                # cosine similarity clearly above 1 indicates a numerical
                # problem, so report it and stop.
                if spread > 1.000001:
                    _report_spread_anomaly(vectors, spread)
                    return
                data[term]["spread"] = spread
                wdt.write(f"{term}\t{spread}\n")

        status_report(memory0, t0, prefix="Centroids, Spread done")

        # Change and controls
        if i > 1:
            yi = previous_year["year"]
            yj = year
            prev_data = previous_year["data"]

            shared_terms = [term for term in data if term in prev_data]

            # Genuine change and genuine similarity
            with open(out_dir / f"cosine_change/{yi}_{yj}_genuine.txt", "w") as f_chg, \
                    open(out_dir / f"cosine_sim/{yi}_{yj}_genuine.txt", "w") as f_sim:
                for term in shared_terms:
                    prev_mean = prev_data[term]["mean"]
                    this_mean = data[term]["mean"]
                    f_chg.write(f"{term}\t{angular_distance(prev_mean, this_mean)}\n")
                    f_sim.write(f"{term}\t{np_cos(prev_mean, this_mean)}\n")

            status_report(memory0, t0, prefix="Start Control")

            # Controls: pool both years' vectors, split them at random into
            # two halves and measure the change between the halves. The
            # shuffles run in the same order as before (control by control,
            # term by term), so seeded results are unchanged; each control is
            # written out immediately instead of being kept in memory.
            for c in range(1, n_controls + 1):
                status_report(memory0, t0, prefix=f"Control: {c}")
                with open(out_dir / f"cosine_change/{yi}_{yj}_control{c}.txt", "w") as f_chg, \
                        open(out_dir / f"cosine_sim/{yi}_{yj}_control{c}.txt", "w") as f_sim:
                    for term in shared_terms:
                        # Concatenation builds a new list, so shuffling does
                        # not disturb the stored per-year vectors.
                        mega = prev_data[term]["vector"] + data[term]["vector"]
                        random.shuffle(mega)
                        cutoff = len(mega) // 2
                        mean1 = centroid(mega[:cutoff])
                        mean2 = centroid(mega[cutoff:])

                        f_chg.write(f"{term}\t{angular_distance(mean1, mean2)}\n")
                        f_sim.write(f"{term}\t{np_cos(mean1, mean2)}\n")

        # this_year is rebuilt from scratch on the next iteration, so a plain
        # reference is enough.
        previous_year = this_year
        


In [22]:
def multiple(models, results, n_controls):
    """Run `semantic_change` for every model directory inside `models`.

    Args:
        models: Directory whose sub-directories each hold one model's files.
        results: Directory under which each model gets a result folder of
            the same name.
        n_controls: Number of control splits, passed through unchanged.
    """
    models = Path(models)
    results = Path(results)

    # os.listdir returns entries in arbitrary order. Sorting keeps the
    # processing order, and with it the sequence of seeded random shuffles
    # inside semantic_change, the same from run to run.
    for name in sorted(os.listdir(models)):
        if not (models / name).is_dir():
            # Stray files next to the model folders are not models.
            continue
        semantic_change(
            model=models / name,
            out_dir=results / name,
            n_controls=n_controls,
        )
        
    

In [23]:
# Process every model folder under the vectors directory, writing each
# model's results to a folder of the same name, with 10 control splits.
multiple(
    models = Path("/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors"), 
    results = Path("/home/max/Results/fb_pol-yearly-bert"),
    n_controls = 10
)


/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/sts_fbmodel_big_40epochs 2000.txt
#Centroids, Spread done--memory=4.1 MB; 0 m 0 s.                               
/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/sts_fbmodel_big_40epochs 2001.txt
#Control: 10--memory=5.7 MB; 0 m 0 s.                                       
/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/sts_fbmodel_big_40epochs 2002.txt
#Control: 10--memory=3.9 MB; 0 m 1 s.                                       
/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/sts_fbmodel_big_40epochs 2003.txt
#Control: 10--memory=3.1 MB; 0 m 1 s.                                       
/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/sts_fbmodel_big_40epochs 2004.txt
#Control: 10--memory=4.1 MB; 0 m 1 s.                                          
/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/sts_fbmodel_big_40epochs 2005.txt
#Control: 10--memory=6.4 MB; 0 m 2 s. 

#Centroids, Spread done--memory=4.0 MB; 0 m 0 s.                               
/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/sentence-bert-swedish-cased 2001.txt
#Control: 10--memory=5.5 MB; 0 m 0 s.                                       
/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/sentence-bert-swedish-cased 2002.txt
#Control: 10--memory=3.8 MB; 0 m 1 s.                                       
/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/sentence-bert-swedish-cased 2003.txt
#Control: 10--memory=2.9 MB; 0 m 1 s.                                       
/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/sentence-bert-swedish-cased 2004.txt
#Control: 10--memory=4.0 MB; 0 m 1 s.                                          
/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/sentence-bert-swedish-cased 2005.txt
#Control: 10--memory=6.4 MB; 0 m 2 s.                                          
/home/max/Corpora/flashback-pol-time/ye

In [None]:
# Run a single model folder (fb_nli) directly, with 10 control splits.
semantic_change(Path("/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/fb_nli"), 
                Path("/home/max/Results/fb_pol-yearly-bert/fb_nli"),
               10)

In [None]:
# Scratch check: range(1, 10) yields 1 through 9 (the upper bound is exclusive).
for c in range(1, 10):
    print(c)

In [None]:
# Scratch check: random.shuffle reorders my_list in place, so each pass
# prints the list in whatever order the latest shuffle left it.
my_list = [1,2,3,4,5,6,7,8,9]
for i in range(10):
    random.shuffle(my_list)
    print(my_list)

In [None]:
# Scratch example: cosine similarity of two small vectors, computed by hand
# as dot(A, B) / (|A| * |B|).
import numpy as np
from numpy.linalg import norm
 
A = np.array([2,1,2,3,2,9])
B = np.array([3,4,2,4,5,5])
 
print("A:", A)
print("B:", B)
 
cosine = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine)

In [None]:
def add(a, b):
    """Return the result of ``a + b``."""
    result = a + b
    return result

In [None]:
# `add` takes two arguments, so it has to go through the pair-by-pair
# upc_mean0; upc_mean calls its function once with the whole list and would
# raise a TypeError here.
upc_mean0([1,2,3,4,5], add)

In [None]:
# Scratch example: four small vectors to compare the hand-written pairwise
# cosine (np_cos) with sklearn's full similarity matrix (sk_cos).
A = np.array([2,1,2,3,2,9])
B = np.array([3,4,2,4,5,5])
C = np.array([6,6,6,9,9,9])
D = np.array([9,6,6,9,9,1])

print(np_cos(A, B))
print(sk_cos([A, B, C, D]))



In [None]:
# Scratch check: np.mean(..., axis=0) averages column-wise; the printed line
# computes the same three column means by hand for comparison.
arr = np.array([[1,2,3], [4,5,6], [7,8,9]])
print(sum([1,4,7])/len([1,4,7]), sum([2,5,8])/len([2,5,8]), sum([3,6,9])/len([3,6,9]))
np.mean(arr, axis=0)

In [None]:
# Put the pairwise cosine matrix of A-D into a labelled DataFrame for display.
df = pd.DataFrame(sk_cos([A, B, C, D]), columns=["A", "B", "C", "D"], index=["A", "B", "C", "D"])
df

In [None]:
# np_cos compares two vectors at a time, so use the pair-by-pair upc_mean0;
# upc_mean calls its function once with the whole list and would raise a
# TypeError here.
upc_mean0([A,B,C,D], np_cos)

In [None]:
# Single-item input: upc_mean returns nan before the comparison function is
# ever called, because there is no pair to compare.
upc_mean([A], angular_distance)

In [None]:
# Manual average of three values, presumably pairwise similarities copied
# from an earlier output, for checking a result by hand.
(0.81885047 + 0.82045058 + 0.98572812) / 3

In [None]:
# Scratch check of the pairing pattern: for each x, sum it with every item
# from its own position onward (enumerate starts at 0 here, so x is also
# paired with itself).
[[x+y for y in my_list[i:]] for i, x in enumerate(my_list)]

In [None]:
# Reshape the square similarity table into long form: one row per (v1, v2)
# pair with the value in a 'cos' column.
long = df.unstack()
long.index.rename(['v1', 'v2'], inplace=True)
#df = df.to_frame("Cos").reset_index()
long = long.to_frame('cos').reset_index()

In [None]:
# Keep only rows that compare two different vectors (drops the self-pairs).
long[long["v1"] != long["v2"]]
# long_form['Country A'] != long_form['Country B']

In [None]:
# Show every row whose first vector is "A".
long[long["v1"]=="A"]