In [1]:
import os
from pathlib import Path
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as sk_cos
import pandas as pd
import random
import tracemalloc
import time

In [2]:
# Seed Python's `random` module so that the `random.shuffle` calls made later
# in this notebook draw the same sequence on every fresh run.
random.seed(111)

In [3]:
def np_cos(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product divided by the product of the two
    Euclidean norms.
    """
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product

In [4]:
def angular_distance(v1, v2):
    """Return the angular distance between two vectors, scaled to [0, 1].

    The result is ``arccos(cosine similarity) / pi``: 0 for vectors pointing
    in the same direction, 0.5 for orthogonal vectors, 1 for opposite ones.
    """
    # Floating-point rounding can push the cosine of (near-)parallel vectors
    # marginally outside [-1, 1], where np.arccos returns NaN. Clamp first.
    cos = np.clip(np_cos(v1, v2), -1.0, 1.0)
    return np.arccos(cos) / np.pi

In [5]:
def upc_mean0(lst, function): # Mean of Unique Pairwise Comparison
    """Return the mean of ``function(x, y)`` over every unordered pair in ``lst``.

    Reference implementation: it calls ``function`` once per pair, so it is
    very inefficient for long lists.

    Parameters
    ----------
    lst : sequence
        Items to compare; must support ``len`` and slicing.
    function : callable
        Two-argument comparison applied to each pair.

    Returns
    -------
    The mean comparison value, or ``np.nan`` when ``lst`` holds fewer than
    two items (there is no pair to compare).
    """
    # Fewer than two items means no pairs; this also protects the division
    # below against an empty list.
    if len(lst) < 2:
        return np.nan

    # start=1 makes lst[i:] begin right after x, so each unordered pair is
    # visited exactly once and no item is compared with itself.
    scores = [
        function(x, y)
        for i, x in enumerate(lst, start=1)
        for y in lst[i:]
    ]

    return sum(scores) / len(scores)

In [6]:
def upc_mean(lst, function=sk_cos): # Mean of Unique Pairwise Comparison (UPC)
    """Return the mean pairwise comparison value, excluding self-comparisons.

    Parameters
    ----------
    lst : sequence of vectors
        Items to compare with each other.
    function : callable
        Maps ``lst`` to a square matrix whose entry (i, j) compares item i
        with item j. The matrix is assumed to be symmetric, so averaging all
        off-diagonal entries equals averaging the unique pairs.

    Returns
    -------
    The mean of the off-diagonal entries, or ``np.nan`` when ``lst`` holds
    fewer than two items (there is no pair to compare).
    """
    L = len(lst)
    if L < 2:
        return np.nan

    matrix = np.asarray(function(lst))

    # Drop the self-comparisons by subtracting the actual diagonal. Assuming
    # it sums to exactly L is wrong whenever a diagonal entry is not exactly
    # 1.0 (e.g. rounding, or a comparison that does not score identity as 1).
    off_diagonal_sum = matrix.sum() - np.trace(matrix)

    # L*L - L off-diagonal entries; each unique pair appears twice in both
    # the sum and the count, so the factor 2 cancels.
    return off_diagonal_sum / (L * L - L)
    

In [7]:
def iline(line):
    """Parse one tab-separated input line into ``(term, vector)``.

    The term is the first field; a multi-word term is reduced to its first
    three words joined by underscores. The vector is the last field, read as
    whitespace-separated floats.
    """
    fields = line.strip("\n").split("\t")

    term = fields[0]
    words = term.split()
    if len(words) > 1:
        term = "_".join(words[:3])

    vector = list(map(float, fields[-1].split()))
    return term, vector

In [8]:
def centroid(vectors):
    """Return the element-wise mean (centroid) of a collection of vectors."""
    return np.asarray(vectors).mean(axis=0)

In [9]:
def status_report(memory0, t0, prefix=""):
    """Print a one-line progress report that overwrites itself in place.

    Parameters
    ----------
    memory0 : tuple
        Accepted for backward compatibility only; it is not used. (The
        earlier body computed a delta from it and rebound the local name,
        neither of which had any visible effect.)
    t0 : float
        Start time as returned by ``time.time()``.
    prefix : str
        Text shown at the start of the report.

    The line shows the memory currently traced by ``tracemalloc`` (in MB)
    and the time elapsed since ``t0``. It ends with a carriage return rather
    than a newline so the next report overwrites it.
    """
    norm, unit = (1000000, "MB")
    current_bytes = tracemalloc.get_traced_memory()[0]
    memory = round(current_bytes / norm, 1)

    t_delta = time.time() - t0
    m = int(t_delta/60)
    s = int(t_delta%60)
    print(f"#{prefix}--memory={memory} {unit}; {m} m {s} s.                   ", end="\r")

In [10]:
# Remove the line-width limit NumPy uses when deciding where to break the
# printed representation of an array.
np.set_printoptions(linewidth=np.inf)

In [11]:
def get_vector(year, term, directory):
    """Load the vectors stored for ``term`` in ``year``.

    Reads ``<directory>/<term>_<year>.tmp`` and returns a list with one
    float array per line, each parsed from space-separated numbers.
    """
    path = directory / f"{term}_{year}.tmp"
    with open(path, "r") as handle:
        return [np.fromstring(row, dtype=float, sep=' ') for row in handle]
    

In [12]:
# Maps compound-form term labels (keys) to the label of the base term they are
# folded into (values). Applied to each parsed term in `semantic_change` when
# its `merge_cmp` flag is set, e.g. "N1C_berikareX" --> "N1_berikare".
merger = {
    "N1C_berikareX": "N1_berikare",
    "N1C_förortsgängX": "N1_förortsgäng",
    "N1C_globalistX": "N1_globalist",
    "N1C_kulturberikarX": "N1_kulturberikare",
    "N1C_återvandringsX": "N1_återvandring",
    "N2C_återvandrarX": "N2_återvandrare",
}

In [13]:
def _read_year_file(path, merge_cmp, x_help, memory0, t0, encoding):
    """Read one year file and group its vectors by (normalised) term.

    Returns ``{term: {"vector": [vector, ...]}}``. Lines whose term is empty
    are skipped.
    """
    data = {}
    with open(path, "r", encoding=encoding) as f:
        for j, line in enumerate(f):

            if j % 300 == 0:
                status_report(memory0, t0, prefix=f"Line {j} of file")

            term, vector = iline(line)

            if merge_cmp:
                term = merger.get(term, term)

            # Ad hoc: should be solved in the data by dwts.paradigm
            if x_help and term.startswith("x_hjälpa"):
                term = "x_hjälpa"

            if term == "":
                continue
            data.setdefault(term, {"vector": []})["vector"].append(vector)
    return data


def _pool_and_spread(this_year, file, out_dir, tmp_dir, pool_by, memory0, t0, encoding):
    """Write pooled vector and spread per term; spill the raw vectors to tmp files.

    For every term the raw vectors are replaced in ``this_year["data"][term]``
    by the keys ``"mean"`` (pooled vector) and ``"spread"``.
    """
    data = this_year["data"]
    with open(out_dir / "centroid" / file, "w", encoding=encoding) as emb, \
         open(out_dir / "spread" / file, "w", encoding=encoding) as wdt:
        for term in sorted(data):

            status_report(memory0, t0, prefix=f"Pool, Spread: {term}")

            entry = data[term]
            # Take the raw vectors out of the dict so that they can be
            # released as soon as they have been written to the tmp file.
            vectors = entry.pop("vector")

            pool = pool_by(vectors)
            status_report(memory0, t0, prefix="Pool - check!")
            # high spread close to 1 (similarity low); low spread close to 0 (similarity high)
            spread = 1 - upc_mean(vectors)
            status_report(memory0, t0, prefix="UPC - check!")

            # Plain numbers only, read back by get_vector for the controls.
            with open(tmp_dir / f"{term}_{this_year['year']}.tmp", "w") as f:
                for v in vectors:
                    f.write(" ".join([str(val) for val in v]) + "\n")

            del vectors

            entry["mean"] = pool
            pool_as_str = " ".join([str(v) for v in pool.tolist()])
            emb.write(f"{term}\t{pool_as_str}\n")

            status_report(memory0, t0, prefix=f"Starting UPC: {term}")

            ################  TO BE REMOVED   ##################
            if spread > 1.000001:
                # There is some rounding issue ... Noble et al observed something similar; identical vectors have cos_sin > 1.0
                print()
                print(f"Something went wrong! ({term})")
                print("Spread =", spread)
                print()
            #####################################################

            entry["spread"] = spread
            wdt.write(f"{term}\t{spread}\n")


def _write_genuine(previous_year, this_year, shared_terms, out_dir, encoding):
    """Write angular change and cosine similarity between the two years' pooled vectors."""
    yi = previous_year["year"]
    yj = this_year["year"]

    # Genuine change
    with open(out_dir / "cosine_change" / f"{yi}_{yj}_genuine.txt", "w", encoding=encoding) as f:
        for term in shared_terms:
            gch = angular_distance(previous_year["data"][term]["mean"], this_year["data"][term]["mean"])
            f.write(f"{term}\t{gch}\n")

    # Genuine similarity
    with open(out_dir / "cosine_sim" / f"{yi}_{yj}_genuine.txt", "w", encoding=encoding) as f:
        for term in shared_terms:
            sim = np_cos(previous_year["data"][term]["mean"], this_year["data"][term]["mean"])
            f.write(f"{term}\t{sim}\n")


def _write_controls(yi, yj, shared_terms, n_controls, pool_by, out_dir, tmp_dir, memory0, t0, encoding):
    """Write ``n_controls`` control comparisons built from shuffled, pooled vectors.

    For each control and term, the vectors of both years are read back from
    the tmp files, shuffled together, split in half, and the two halves are
    pooled and compared.
    """
    # The (control, term) loop order determines how the seeded random stream
    # is consumed; keep it as is so results stay comparable with earlier runs.
    # The tmp files are re-read for every control instead of being cached.
    for c in range(1, n_controls + 1):
        status_report(memory0, t0, prefix=f"Control: {c}")

        rows = []
        for term in shared_terms:
            mega = get_vector(year = yi, term = term, directory = tmp_dir)
            mega.extend(get_vector(year = yj, term = term, directory = tmp_dir))

            random.shuffle(mega)

            cutoff = int(len(mega)/2)
            mean1 = pool_by(mega[:cutoff])
            mean2 = pool_by(mega[cutoff:])

            # can you calculate rectified value for spread?

            rows.append((term, angular_distance(mean1, mean2), np_cos(mean1, mean2)))

            status_report(memory0, t0)

            del mega

            status_report(memory0, t0)

        with open(out_dir / "cosine_change" / f"{yi}_{yj}_control{c}.txt", "w", encoding=encoding) as f:
            for term, cch, _ in rows:
                f.write(f"{term}\t{cch}\n")

        with open(out_dir / "cosine_sim" / f"{yi}_{yj}_control{c}.txt", "w", encoding=encoding) as f:
            for term, _, csim in rows:
                f.write(f"{term}\t{csim}\n")


def semantic_change(
    model, 
    out_dir, 
    n_controls, 
    pool_by = centroid, 
    merge_cmp = False, # Merge compounds, e.g. N1C_berikareX --> N1_berikare (there is a risk BERT clustering picks up on this)
    x_help = True, # Ad hoc! Should be solved in the data!! E.g. "X_hjälpa dem på plats" should be solved by paradigm file
    tmp_dir = "/home/max/tmp/tmp",
    encoding = "utf-8",
):
    """Compute per-year pooled vectors, spread, and year-to-year change for one model.

    The files in ``model`` are processed in sorted name order, one file per
    period; the period label is the file name with ``".txt"`` removed.

    Parameters
    ----------
    model : str or Path
        Directory holding the per-year vector files.
    out_dir : str or Path
        Result directory. The sub-directories ``centroid``, ``spread``,
        ``cosine_change`` and ``cosine_sim`` are created if missing.
    n_controls : int
        Number of shuffled control comparisons per pair of consecutive years.
    pool_by : callable
        Pools a list of vectors into a single vector.
    merge_cmp : bool
        Rename terms according to the module-level ``merger`` mapping.
    x_help : bool
        Collapse every term starting with ``"x_hjälpa"`` into ``"x_hjälpa"``.
    tmp_dir : str or Path
        Directory for the per-term ``<term>_<year>.tmp`` vector files; created
        if missing.
        NOTE(review): the default is a machine-specific absolute path.
    encoding : str or None
        Text encoding of the year files read and the result files written.
        Assumes the data is UTF-8 -- TODO confirm. Pass ``None`` for the
        platform default, which is what was used before this parameter
        existed. An explicit encoding matters because terms are compared
        with non-ASCII literals.

    Writes, per year file: ``centroid/<file>`` (term, pooled vector) and
    ``spread/<file>`` (term, spread). Per pair of consecutive years
    ``yi``/``yj``: ``cosine_change/`` and ``cosine_sim/`` files named
    ``{yi}_{yj}_genuine.txt`` and ``{yi}_{yj}_control{n}.txt``, restricted to
    terms present in both years. Returns ``None``.
    """
    model = Path(model)
    out_dir = Path(out_dir)
    tmp_dir = Path(tmp_dir)
    files = sorted(os.listdir(model))

    # Make sure every directory that is written to exists.
    for sub in ("centroid", "spread", "cosine_change", "cosine_sim"):
        (out_dir / sub).mkdir(parents=True, exist_ok=True)
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # Only stop tracing on exit if it was started here, so that a caller who
    # already traces memory is left undisturbed.
    started_tracing = not tracemalloc.is_tracing()
    if started_tracing:
        tracemalloc.start()
    memory0 = (0,0)
    t0 = time.time()

    try:
        previous_year = {}

        for i, file in enumerate(files, start=1):
            print()
            print(model, file)

            # Get data
            this_year = {
                "year": file.replace(".txt", ""),
                "data": _read_year_file(model / file, merge_cmp, x_help, memory0, t0, encoding),
            }
            status_report(memory0, t0, prefix="Input done")

            # Pool, Spread
            _pool_and_spread(this_year, file, out_dir, tmp_dir, pool_by, memory0, t0, encoding)
            status_report(memory0, t0, prefix="Centroids, Spread done")

            # Change and controls
            if i > 1:
                shared_terms = [term for term in this_year["data"] if term in previous_year["data"]]

                _write_genuine(previous_year, this_year, shared_terms, out_dir, encoding)

                status_report(memory0, t0, prefix="Start Control")

                _write_controls(
                    previous_year["year"], this_year["year"], shared_terms,
                    n_controls, pool_by, out_dir, tmp_dir, memory0, t0, encoding,
                )

            previous_year = this_year
    finally:
        if started_tracing:
            tracemalloc.stop()
        


In [14]:
def multiple(models, results, n_controls, merge_compounds, tmp):
    """Run ``semantic_change`` for every model directory inside ``models``.

    Parameters
    ----------
    models : str or Path
        Directory with one sub-directory per model.
    results : str or Path
        Directory under which each model gets a result directory of the
        same name.
    n_controls, merge_compounds, tmp
        Forwarded to ``semantic_change`` as ``n_controls``, ``merge_cmp`` and
        ``tmp_dir``.
    """
    models = Path(models)
    results = Path(results)

    # os.listdir returns entries in arbitrary order. Sorting fixes the order
    # in which the models are processed, which matters because they all
    # consume the same seeded random stream.
    for name in sorted(os.listdir(models)):
        # Skip stray files: semantic_change expects a directory of year files.
        if not (models / name).is_dir():
            continue
        semantic_change(
            model = models / name, 
            out_dir = results / name, 
            n_controls = n_controls,
            merge_cmp = merge_compounds, 
            tmp_dir=tmp
        )
        
    

In [15]:
# Run the full pipeline for the mT5-xl vectors with 10 shuffled controls per
# pair of consecutive years.
# NOTE(review): `tmp_dir` is a machine-specific absolute path; adjust it before
# re-running on another machine.
semantic_change(
    model = Path("../data/vectors/fb_pol/mt5-xl/"), 
    out_dir = Path("../Results/fb_pol-yearly-berT5/mT5-xl/"), 
    n_controls = 10, 
    pool_by = centroid, 
    merge_cmp = True, # Merge compounds, e.g. N1C_berikareX --> N1_berikare (there is a risk BERT clustering picks up on this)
    x_help = True, # Ad hoc! Should be solved in the data!! E.g. "X_hjälpa dem på plats" should be solved by paradigm file
    tmp_dir = "C:/Users/xbohma/Desktop/tmp/"
)


..\data\vectors\fb_pol\mt5-xl 2000.txt
#Centroids, Spread done--memory=0.5 MB; 0 m 1 s.                            
..\data\vectors\fb_pol\mt5-xl 2001.txt
#--memory=0.8 MB; 0 m 4 s.                                                  
..\data\vectors\fb_pol\mt5-xl 2002.txt
#--memory=1.3 MB; 0 m 7 s.                                                  
..\data\vectors\fb_pol\mt5-xl 2003.txt
#--memory=0.7 MB; 0 m 9 s.                                                  
..\data\vectors\fb_pol\mt5-xl 2004.txt
#--memory=0.7 MB; 0 m 13 s.                                                  
..\data\vectors\fb_pol\mt5-xl 2005.txt
#--memory=0.9 MB; 0 m 21 s.                                                   
..\data\vectors\fb_pol\mt5-xl 2006.txt
#--memory=1.1 MB; 0 m 38 s.                                                     
..\data\vectors\fb_pol\mt5-xl 2007.txt
#--memory=1.1 MB; 1 m 22 s.                                                      
..\data\vectors\fb_pol\mt5-xl 2008.txt
#--memory=1.1 MB; 3 