#### Procedure (pseudocode)
```
DWEs = {förortsgäng, återvandring, globalist, berika}
selection_strategies = {really_naive, naive_no_overlap, top1, top3, ...}
models = {sbert_kb, ...}

for DWE in DWEs:
    for strategy in selection_strategies:
        for model in models:

            IN_vectors, OUT_vectors = select(replacement_vectors_of_dwe, strategy)
            avg_IN_vec = mean(IN_vectors)
            avg_OUT_vec = mean(OUT_vectors)

            for year in years:
                dwe_vect_at_year = get_vec(DWE, year, model)

                IN_dimension_mean = cosine_similarity(avg_IN_vec, dwe_vect_at_year)
                IN_dimension_pairwise_mean = mean(cosine_similarity(IN_vectors, dwe_vect_at_year))

                OUT_dimension_mean = cosine_similarity(avg_OUT_vec, dwe_vect_at_year)
                OUT_dimension_pairwise_mean = mean(cosine_similarity(OUT_vectors, dwe_vect_at_year))  

                norm_dimension_mean = normalizer(IN_dimension_mean, OUT_dimension_mean)
                norm_dimension_pairwise_mean = normalizer(IN_dimension_pairwise_mean, OUT_dimension_pairwise_mean)

                # e.g. softmax or simply normalize(x, y) = x / (x+y)
```

#### Will get you something like (for each model)...

|DWE            |Selection strategy|Method Dimension|Year<sub>1</sub>|...|Year<sub>*n*</sub>|
|---------------|------------------|----------------|----------------|---|------------------|
|DWE<sub>1</sub>|Really naive      |Mean            |...             |...|...               |
|DWE<sub>1</sub>|Really naive      |Pairwise mean   |...             |...|...               |
|DWE<sub>1</sub>|Really naive      |Normalized      |...             |...|...               |
|DWE<sub>1</sub>|Naive no overlap  |Mean            |...             |...|...               |
|DWE<sub>1</sub>|Naive no overlap  |Pairwise mean   |...             |...|...               |
|DWE<sub>1</sub>|Naive no overlap  |Normalized      |...             |...|...               |
|DWE<sub>1</sub>|Top1              |...             |...             |...|...               |
|...            |...               |...             |...             |...|...               |
|DWE<sub>2</sub>|...               |...             |...             |...|...               |


In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import pandas as pd
from difflib import SequenceMatcher
from pathlib import Path
from sklearn.utils.extmath import softmax
from collections import Counter
import json
from gensim.models import KeyedVectors
import time
from datetime import datetime
import logging

In [2]:
import unimorph

In [3]:
import stanza
# Swedish Stanza pipeline with tokenizer, POS tagger and lemmatizer.
# `lemmatize()` calls this module-level object once per word.
nlp = stanza.Pipeline(lang='sv', processors='tokenize,pos,lemma')

2024-06-03 10:46:05 INFO: Loading these models for language: sv (Swedish):
| Processor | Package   |
-------------------------
| tokenize  | talbanken |
| pos       | talbanken |
| lemma     | talbanken |

2024-06-03 10:46:05 INFO: Use device: cpu
2024-06-03 10:46:05 INFO: Loading: tokenize
2024-06-03 10:46:06 INFO: Loading: pos
2024-06-03 10:46:07 INFO: Loading: lemma
2024-06-03 10:46:07 INFO: Done loading processors!


In [4]:
def keyness(trg, ref, min_frq=3, verbose=True):  # Consider metric
    """Score every term of `trg` by an odds-ratio style keyness against `ref`.

    Both inputs are mappings from term to frequency. Each frequency is
    normalised by the *number of types* (``len``) of its mapping, not by the
    sum of frequencies.

    Parameters
    ----------
    trg : mapping of term -> frequency (target)
    ref : mapping of term -> frequency (reference)
    min_frq : terms whose target frequency is below this value are skipped
    verbose : if true, print the 20 scored terms with the highest target
        frequency

    Returns
    -------
    dict mapping term -> keyness; ``np.inf`` for terms absent from `ref`.
    """
    n_trg_types = len(trg)
    n_ref_types = len(ref)

    scores = {}
    for term, trg_count in trg.items():
        if trg_count < min_frq:
            continue
        if term in ref:
            # Odds Ratio (OR)
            scores[term] = (trg_count / n_trg_types) / (ref[term] / n_ref_types)
        else:
            scores[term] = np.inf

    if verbose:
        # Rank by target frequency (not by keyness) and show the top 20.
        shown = sorted(scores, key=lambda term: trg[term], reverse=True)[:20]
        for word in shown:
            trg_freq = trg[word]
            score = scores[word]
            ref_freq = ref[word] if word in ref else 0
            print(f"{word:<20}{trg_freq:<4}{(trg_freq/n_trg_types):<6.3f}{ref_freq:<4}{(ref_freq/n_ref_types):<6.3f}{score:.4}")

    return scores
    

In [5]:
def inspect(
    df,            # Replacement Dataframe
    dwe,           # Dog Whistle Expression
    meaning,       # 1 for ingroup, 2 for outgroup
    phase,         # 1 for first phase of data collection, 2 for second phase
    sw = None,     # stopwords
    punct = None,  # remove punctuations
    verbose = True,
    multi = False, # Keep the multi-word units of the replacements
    rel_freq = False, # use relative frequncies freq / no. of documents
    lower_input = True 
):
    """Count in how many replacement texts each term occurs.

    If `df` is a pandas DataFrame, the texts are taken from column
    ``f"{dwe}_text_w{phase}"`` for the rows where ``f"{dwe}_w{phase}_C"``
    equals `meaning`. Any other `df` is iterated directly as a collection of
    strings, and `dwe`, `meaning` and `phase` are then ignored.

    Each text is optionally lower-cased, stripped of the strings in `punct`,
    split on whitespace and filtered against `sw`. With `multi` the remaining
    tokens are joined with "_" into a single unit. A term is counted at most
    once per text.

    Returns a Counter of term -> document count, or of term -> document
    count / number of texts when `rel_freq` is true.
    """
    if type(df) is pd.DataFrame:
        wanted_rows = df[f"{dwe}_w{phase}_C"] == meaning
        texts = df.loc[wanted_rows, f"{dwe}_text_w{phase}"]
    else:
        texts = df

    doc_freq = Counter()
    for text in texts:
        if lower_input:
            text = text.lower()

        if punct is not None:
            for mark in punct:
                text = text.replace(mark, "")

        tokens = text.split()
        if sw is not None:
            tokens = [tok for tok in tokens if tok not in sw]

        if multi:
            tokens = ["_".join(tokens)]

        # Obs. terms are only counted once per "document"
        doc_freq.update(set(tokens))

    if rel_freq:
        doc_freq = Counter({term: n / len(texts) for term, n in doc_freq.items()})

    if verbose:
        for term, freq in doc_freq.most_common(15):
            print(f"{term:<30}{freq}")
        print("-----------------------")
        print("Total no. of types:", len(doc_freq))

    return doc_freq

In [6]:
def select_A(
    df,             # Replacement Dataframe
    dwe,            # Dog Whistle Expression
    phase = "both", # 1 for first phase of data collection, 2 for second phase, "both" for both
    sw = None,      # stopwords
    punct = None,   # remove punctuations
    k = None,
    min_freq = None,
    min_OR = None,
    empty_intersect = False
):
    """Select in-group and out-group replacement terms for one DWE.

    Relative document frequencies are computed with `inspect()` for the
    in-group (meaning 1) and out-group (meaning 2) texts, and odds ratios
    between the two groups with `keyness()`. The candidate lists are then
    narrowed by the optional filters, in this order:

    1. `empty_intersect`: drop terms that occur in the other group.
    2. `min_freq`: keep terms with relative frequency >= threshold.
    3. `min_OR`: keep terms with odds ratio >= threshold.
    4. `k`: keep the `k` most frequent surviving terms.

    `k`, `min_freq` and `min_OR` may each be a single value applied to both
    groups or an ``(in, out)`` tuple.

    Returns ``(A_in, A_out)``, two lists of terms.
    """

    def split_pair(setting):
        # A tuple means (in-group value, out-group value); anything else is shared.
        return setting if type(setting) is tuple else (setting, setting)

    k_in, k_out = split_pair(k)
    min_freq_in, min_freq_out = split_pair(min_freq)
    min_OR_in, min_OR_out = split_pair(min_OR)

    if phase == "both":

        def pooled_texts(meaning):
            # Texts from both collection rounds that were coded with `meaning`.
            per_round = [
                df.loc[df[f"{dwe}_w{rnd}_C"] == meaning, f"{dwe}_text_w{rnd}"]
                for rnd in (1, 2)
            ]
            return pd.concat(per_round).to_list()

        in_texts = pooled_texts(1)
        out_texts = pooled_texts(2)
        ingroup = inspect(in_texts, dwe, None, None, sw, punct, verbose = False, rel_freq = True)
        outgroup = inspect(out_texts, dwe, None, None, sw, punct, verbose = False, rel_freq = True)
    else:
        ingroup = inspect(df, dwe, 1, phase, sw, punct, verbose = False, rel_freq = True)
        outgroup = inspect(df, dwe, 2, phase, sw, punct, verbose = False, rel_freq = True)

    # min_frq = -1 keeps every term, so both keyness dicts cover all candidates.
    keyness_in2out = keyness(ingroup, outgroup, verbose = False, min_frq = -1)
    keyness_out2in = keyness(outgroup, ingroup, verbose = False, min_frq = -1)

    A_in = list(ingroup.keys())
    A_out = list(outgroup.keys())

    if empty_intersect:
        A_in = [term for term in A_in if term not in outgroup.keys()]
        A_out = [term for term in A_out if term not in ingroup.keys()]

    if min_freq != None:
        A_in = [term for term in A_in if ingroup[term] >= min_freq_in]
        A_out = [term for term in A_out if outgroup[term] >= min_freq_out]

    if min_OR != None:
        A_in = [term for term in A_in if keyness_in2out[term] >= min_OR_in]
        # too strict to have the same threshold for both
        A_out = [term for term in A_out if keyness_out2in[term] >= min_OR_out]

    if k != None:
        by_freq_in = sorted(ingroup.items(), key = lambda item: item[1], reverse = True)
        by_freq_out = sorted(outgroup.items(), key = lambda item: item[1], reverse = True)
        A_in = [term for term, _ in by_freq_in if term in A_in][:k_in]
        A_out = [term for term, _ in by_freq_out if term in A_out][:k_out]

    return A_in, A_out

In [10]:
#     df,            # Replacement Dataframe
#     dwe,           # Dog Whistle Expression
#     phase,         # 1 for first phase of data collection, 2 for second phase, "both" for both
#     sw = None,     # stopwords
#     punct = None,  # remove punctuations
#     k = None,
#     min_freq = None,
#     min_OR = None,
#     empty_intersect = False

def strat2select(mode, dwe, wh_rnds, path_dfA, stopwords, punct, verbose = True):
    """
    Select the in-group / out-group material for `dwe` according to `mode`.

    Parameters
    ----------
    mode : selection strategy, one of
        "rn"   -- really naive: return the replacement *vectors* themselves,
        "nno"  -- naive, no overlap between the groups,
        "top1" -- most frequent term per group, no overlap,
        "top3" -- three most frequent terms per group, no overlap,
        "ms1"  -- top three terms with odds ratio >= 2.0, overlap allowed.
    dwe : name of the dog whistle expression as used in the column names of
        the replacement data.
    wh_rnds : list of round names; ``["first_round"]`` selects phase 1,
        ``["second_round"]`` phase 2, anything else both phases.
    path_dfA : path to the tab-separated replacement data.
    stopwords, punct : forwarded to `select_A()` as `sw` and `punct`.
    verbose : if true, log the selected terms when fewer than four in-group
        terms were selected.

    Returns
    -------
    For "rn": two numpy arrays (in-group vectors, out-group vectors).
    For every other mode: two lists of terms ``(Aigt, Aogt)`` from `select_A()`.

    Raises
    ------
    ValueError
        If `mode` is not one of the strategies listed above.
    """

    if mode == "rn":    # Really naive; probably the most sensible for SBERT
        # NOTE(review): `load_replacements`, `model` and `data_path` are neither
        # parameters of this function nor defined in the visible part of the
        # file; this branch only runs if they exist at module level -- confirm.
        igt_vectors = []
        for rnd in wh_rnds:
            _, vecs = zip(*load_replacements(dwe, "ingroup", rnd, model, data_path))
            igt_vectors.extend(vecs)

        ogt_vectors = []
        for rnd in wh_rnds:
            _, vecs = zip(*load_replacements(dwe, "outgroup", rnd, model, data_path))
            ogt_vectors.extend(vecs)

        return np.array(igt_vectors), np.array(ogt_vectors)

    # Keyword arguments for select_A() that distinguish the strategies.
    strategy_kwargs = {
        "nno":  {"empty_intersect": True},                          # Naive No Overlap
        "top1": {"k": 1, "empty_intersect": True},                  # Top 1 (no overlap)
        "top3": {"k": 3, "empty_intersect": True},                  # Top 3 (no overlap)
        "ms1":  {"k": 3, "min_OR": 2.0, "empty_intersect": False},  # Multiple Selection
    }
    if mode not in strategy_kwargs:
        # Fail before any file is read rather than with an UnboundLocalError later.
        raise ValueError(
            f"Unknown selection strategy {mode!r}; "
            f"expected 'rn' or one of {sorted(strategy_kwargs)}."
        )

    def lowercase_strings(cell):
        return cell.lower() if isinstance(cell, str) else cell

    dfA = pd.read_csv(path_dfA, sep="\t") # check parameters
    # DataFrame.applymap is deprecated in newer pandas in favour of
    # DataFrame.map; use whichever the installed version provides.
    elementwise = dfA.map if hasattr(dfA, "map") else dfA.applymap
    dfA = elementwise(lowercase_strings)

    # A single chain: with two separate `if`s the first-round case was always
    # overwritten by the `else` of the second test.
    if wh_rnds == ["first_round"]:
        PHASE = 1
    elif wh_rnds == ["second_round"]:
        PHASE = 2
    else:
        PHASE = "both"

    Aigt, Aogt = select_A(
        df = dfA,
        dwe = dwe,
        phase = PHASE,
        sw = stopwords,
        punct = punct,
        **strategy_kwargs[mode]
    )

    if verbose and len(Aigt) < 4:
        logging.info(f"Aigt: {', '.join(Aigt)}")
        logging.info(f"Aogt: {', '.join(Aogt)}")

    return Aigt, Aogt

In [11]:
def PairwiseMeanSimilarity(v, v_list, METHOD = cosine_similarity):
    """Return the mean of all pairwise scores between `v` and `v_list`.

    `METHOD` is called as ``METHOD(v, v_list)`` and must return an object
    with a ``.mean()`` method; by default it is `cosine_similarity`.
    """
    return METHOD(v, v_list).mean()

In [12]:
def angular_distance(v1, v2):
    """Return the angular distance ``arccos(cos_sim) / pi`` between `v1` and `v2`.

    The inputs are passed unchanged to `cosine_similarity`, and the result has
    the shape of that similarity matrix. The formula follows Noble et al.

    Cosine similarities are clipped to [-1, 1] before `arccos` is applied:
    floating-point rounding can produce values such as 1.0000000000000002 for
    (near-)identical vectors, for which `arccos` would return NaN.
    """
    similarity = np.clip(cosine_similarity(v1, v2), -1.0, 1.0)

    return np.arccos(similarity) / np.pi  # Noble et al

In [13]:
def similar_string(a, b):
    """Return the `difflib.SequenceMatcher` similarity ratio of `a` and `b`."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [14]:
def repl_dwe(dwe, rule = None, verbose = True):
    """Map a corpus DWE label to the DWE name used in the replacement data.

    If `rule` is given it is used as a lookup table (``rule[dwe]``).
    Otherwise the part of `dwe` after the last "_" is compared with a fixed
    list of candidate names using `similar_string()`, and the first candidate
    with the highest score is returned. If no candidate scores above zero,
    None is returned.

    With `verbose`, the inferred name and its score are logged.
    """
    if rule is not None:
        return rule[dwe]

    # infer!
    candidates = ["forortsgang", "aterinvandring", "berikar", "globalister"]
    stem = dwe.split("_")[-1]

    scored = [(similar_string(stem, candidate), candidate) for candidate in candidates]
    # max() returns the first maximal element, i.e. ties go to the earlier candidate.
    best_score, best_guess = max(scored, key = lambda pair: pair[0])
    if not best_score > 0:
        best_score, best_guess = 0, None

    if verbose:
        logging.info(f"Inference for {stem}: {best_guess} (score = {best_score:.2f}).")

    return best_guess


In [15]:
class Config:
    """Empty attribute container for one `sgns_builder()` run.

    The initializer sets nothing; callers assign the settings as attributes
    after construction. `sgns_builder()` reads: `log_path`, `log_prefix`,
    `first_year`, `last_year`, `dwes`, `Astrategies`, `Bstrategies`,
    `wh_rounds`, `dfA_path`, `stopwords`, `punct`, `results_format`,
    `add_correlations`, `results_path`, `sgns_path`, `vocab_path`,
    `use_saldo` and `saldo_path`.
    """

    def __init__(self):
        # Intentionally empty: every setting is attached by the caller.
        pass
        

In [16]:
def get_keyed_vec(term, keyed_vecs):
    """Return ``keyed_vecs[term]`` if `term` is in `keyed_vecs`, else None."""
    return keyed_vecs[term] if term in keyed_vecs else None

In [17]:
def load_vocab(filename):
    """Read a vocabulary file into a dict of word -> frequency.

    Each non-blank line must consist of exactly two whitespace-separated
    fields: the word and its integer frequency. Blank lines are skipped.

    The file is read as UTF-8 so that the result does not depend on the
    platform's default encoding.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly two fields, or if the
        frequency is not an integer.
    """
    vocab = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:  # stream the file instead of loading all lines at once
            if not line.strip():
                continue
            w, frq = line.rstrip('\n').split()
            vocab[w] = int(frq)
    return vocab

In [18]:
def lemmatize(A):
    """
    Lemmatize a list of words with the module-level Stanza pipeline `nlp`.

    Each word is run through `nlp` on its own and the lemma of the first
    word of the first sentence is used.

    Returns a pair ``(lemmas, forms_by_lemma)``: the list of distinct lemmas
    in order of first occurrence, and a dict mapping each lemma to the set of
    input words that produced it.
    """
    forms_by_lemma = {}

    for word in A:
        analysis = nlp(word)
        # it seems stanza always returns something
        lemma = analysis.sentences[0].words[0].lemma

        # The Stanza lemmatizer for Swedish infers the lemma "jud" for
        # "judar"; override it with "jude" until the lemmatizer itself can
        # be updated.
        if lemma == "jud":
            lemma = "jude"

        forms_by_lemma.setdefault(lemma, set()).add(word)

    return list(forms_by_lemma.keys()), forms_by_lemma


In [19]:
def wf_expand(B, A_mod, use_saldo, saldo, verbose = True):
    """
    Expand each lemma in `B` to a set of word forms.

    The sources are tried in this order:

    1. `unimorph.inflect_word(lemma, lang="swe")`: the second tab-separated
       field of every non-empty output line.
    2. `saldo[lemma]`, only if `use_saldo` is true and the lemma is a key.
    3. `A_mod[lemma]`, the original word forms recorded for the lemma.

    Returns a list of sets, one per lemma and in the order of `B`.
    """
    expanded = []

    for lemma in B:
        # Try first unimorph
        inflection_table = unimorph.inflect_word(lemma, lang="swe")
        unimorph_forms = {
            row.split("\t")[1] for row in inflection_table.split("\n") if row != ""
        }
        if unimorph_forms:
            expanded.append(unimorph_forms)
            continue

        if use_saldo:
            # Try Saldo
            if lemma in saldo:
                expanded.append(set(saldo[lemma]))
                continue
            if verbose:
                logging.info(f"For {lemma}, neither `unimorph` nor `saldo` found nothing.")
            # Logged regardless of `verbose`.
            logging.info(f"Amod to rescue...{set(A_mod[lemma])}")

        # Last resort: the forms the lemma was originally derived from.
        expanded.append(set(A_mod[lemma]))

    return expanded


In [20]:
def a2b2vec(A, strategy, wv, vocab, use_saldo, saldo):
    """Turn the selected terms `A` into an array of word vectors.

    Parameters
    ----------
    A : list of selected terms.
    strategy : how terms are mapped to word forms before the vector lookup:
        "lazy"   -- use the terms of `A` as they are;
        "greedy" -- lemmatize, expand to word forms, use every form;
        "topK"   -- as greedy, but per lexeme keep only the K forms with the
                    highest frequency in `vocab` (e.g. "top1", "top3");
        "minX"   -- as greedy, but per lexeme keep only forms that account
                    for at least the proportion X of the lexeme's total
                    frequency in `vocab` (e.g. "min0.2").
        Any other value selects nothing.
    wv : word -> vector lookup supporting ``in`` and ``[]``.
    vocab : dict of word -> frequency.
    use_saldo, saldo : forwarded to `wf_expand()`.

    Returns
    -------
    A numpy array with one row per selected word form, or None if no vector
    was found.

    Proportions for "minX" are computed per lexeme. They are not looked up
    through a dict keyed by the lexeme's first form, so two lexemes that
    share a word form can no longer overwrite each other's proportions or
    cause a KeyError.
    """
    if strategy == "lazy": # do nothing; take them as they are
        vecs = [wv[w] for w in A if w in wv]

    else:
        B, A_mod = lemmatize(A)
        # A list of sets, one per lexeme, so the most common forms can be picked.
        B = wf_expand(B, A_mod, use_saldo, saldo)
        # Keep only forms that have both a vector and a corpus frequency.
        B = [[w for w in lexeme if w in wv and w in vocab] for lexeme in B]

        selected = []

        if strategy == "greedy": # hungry
            logging.info(f"B: {B}")
            selected = [w for lexeme in B for w in lexeme]

        elif strategy.startswith("top"): # e.g. top1, top3, etc.
            k = int(strategy.replace("top", ""))
            for lexeme in B:
                ranked = sorted(lexeme, key = lambda w: vocab[w], reverse = True)
                selected.extend(ranked[:k])
            logging.info(f"B: {', '.join(selected)}")

        elif strategy.startswith("min"): # e.g. min0.1
            threshold = float(strategy.replace("min", ""))
            for lexeme in B:
                lexeme_total = sum(vocab[w] for w in lexeme)
                if lexeme_total == 0:
                    # Empty lexeme or only zero counts: no proportion is defined.
                    continue
                selected.extend(
                    w for w in lexeme if vocab[w] / lexeme_total >= threshold
                )
            logging.info(f"B: {', '.join(selected)}")

        vecs = [wv[w] for w in selected]

    if not vecs:
        return None

    return np.array(vecs)

In [29]:
# Names of the scores computed per (DWE, A-strategy, B-strategy, year).
# The order determines the row order (long format) and the column order
# (wide format) of the results file.
_SGNS_METHODS = (
    "I-cnt",
    "I-ang",
    "I-pwn",
    "O-cnt",
    "O-pwn",
    "O-ang",
    "cnt-ssc",
    "cnt-smx",
    "pwn-ssc",
    "pwn-smx",
    "ang-ssc",
    "ang-smx",
    "sep-ang",
    "sep-euc",
)


def _group_scores(dwe_row, group_vecs):
    """Return (centroid, cnt, pwn, ang) for one group of vectors.

    `cnt` is the cosine similarity between the DWE and the group centroid,
    `pwn` the mean pairwise similarity between the DWE and the group vectors,
    and `ang` one minus the angular distance between the DWE and the centroid.
    `dwe_row` and the returned centroid are 2-D arrays with a single row.
    """
    centroid = group_vecs.mean(axis=0).reshape(1, -1)
    cnt = cosine_similarity(dwe_row, centroid)[0][0]
    pwn = PairwiseMeanSimilarity(dwe_row, group_vecs)
    # https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/cosdist.htm
    ang = 1 - angular_distance(dwe_row, centroid)[0][0]
    return centroid, cnt, pwn, ang


def _year_scores(dwe_vector, ingroup_vecs, outgroup_vecs):
    """Return a dict with one entry per method in `_SGNS_METHODS` for one year.

    Scores that cannot be computed because the in-group and/or out-group
    vectors are missing (not a numpy array) are None. Every method always
    gets an entry, so the per-year lists of all methods stay aligned.
    """
    scores = dict.fromkeys(_SGNS_METHODS)
    dwe_row = dwe_vector.reshape(1, -1)
    has_ingroup = isinstance(ingroup_vecs, np.ndarray)
    has_outgroup = isinstance(outgroup_vecs, np.ndarray)

    if has_ingroup:
        in_centroid, i_cnt, i_pwn, i_ang = _group_scores(dwe_row, ingroup_vecs)
        scores["I-cnt"] = i_cnt
        scores["I-pwn"] = i_pwn
        scores["I-ang"] = i_ang

    if has_outgroup:
        out_centroid, o_cnt, o_pwn, o_ang = _group_scores(dwe_row, outgroup_vecs)
        scores["O-cnt"] = o_cnt
        scores["O-pwn"] = o_pwn
        scores["O-ang"] = o_ang

    if has_ingroup and has_outgroup:
        # Normalised in-group share: simple ratio x / (x + y) and softmax.
        scores["cnt-ssc"] = i_cnt / (i_cnt + o_cnt)
        scores["cnt-smx"] = softmax([[i_cnt, o_cnt]])[0][0]
        scores["pwn-ssc"] = i_pwn / (i_pwn + o_pwn)
        scores["pwn-smx"] = softmax([[i_pwn, o_pwn]])[0][0]
        scores["ang-ssc"] = i_ang / (i_ang + o_ang)
        scores["ang-smx"] = softmax([[i_ang, o_ang]])[0][0]

        # Separation between the two centroids.
        scores["sep-ang"] = angular_distance(in_centroid, out_centroid)[0][0]
        scores["sep-euc"] = euclidean_distances(in_centroid, out_centroid)[0][0]

    return scores


def sgns_builder(config):
    """Compute in-group/out-group similarity scores per year and write a CSV.

    For every DWE in `config.dwes`, every A-strategy (term selection, see
    `strat2select()`) and every B-strategy (word-form expansion, see
    `a2b2vec()`), the yearly word2vec model ``config.sgns_path / "<year>.w2v"``
    and vocabulary ``config.vocab_path / "<year>.txt"`` are loaded and the
    scores listed in `_SGNS_METHODS` are computed for the years
    `config.first_year` .. `config.last_year` (inclusive). Scores that cannot
    be computed for a year are stored as None.

    Output
    ------
    * A log file ``config.log_path / f"{config.log_prefix}_sgns_builder.log"``
      (overwritten), mirrored to the console. Existing root logging handlers
      are removed first.
    * A CSV at `config.results_path`. With ``config.results_format == "long"``
      there is one row per (DWE, A-strategy, B-strategy, method) and one
      column per year; otherwise ("wide") one row per (DWE, A-strategy,
      B-strategy) and one column per method and year.

    Raises
    ------
    NotImplementedError
        If long format is combined with ``config.add_correlations``: the
        correlation columns are not computed, so the run is rejected up
        front instead of failing when the table is assembled at the end.
    """
    long_format = config.results_format == "long"
    if long_format and config.add_correlations:
        raise NotImplementedError(
            "config.add_correlations is not implemented: no correlation "
            "values are computed for the result rows."
        )

    t0 = time.time()

    # Reconfigure logging from scratch so repeated runs do not stack handlers.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(
        level=logging.INFO,
        handlers=[
            logging.FileHandler(
                config.log_path / f"{config.log_prefix}_sgns_builder.log",
                mode="w",
                encoding="utf-8",
            ),
            logging.StreamHandler(),
        ])

    logging.info(vars(config))
    results = []

    # Obs! range() excludes its end point, hence the + 1.
    years = [str(year) for year in range(config.first_year, config.last_year + 1)]
    years.sort()

    if config.use_saldo:
        with open(config.saldo_path, encoding="utf-8") as f:
            saldo = json.load(f)
    else:
        saldo = None

    with open(config.stopwords, encoding="utf-8") as f:
        stopwords = [w.strip("\n") for w in f]

    for progress, dwe in enumerate(config.dwes, start = 1):
        t = time.time() - t0
        logging.info(f"PROCESSING {progress} OF {len(config.dwes)}: '{dwe}'; {int(t/60)} m. {int(t%60)} s.")
        dwe_in_replacement_test = repl_dwe(dwe)

        for a_strategy in config.Astrategies:
            logging.info(f"A-Strategy: {a_strategy}")

            Aigt, Aogt = strat2select(
                mode      = a_strategy,
                dwe       = dwe_in_replacement_test,
                wh_rnds   = config.wh_rounds,
                path_dfA  = config.dfA_path,
                stopwords = stopwords,
                punct     = config.punct,
            )

            # d[b_strategy][method] is the list of yearly values, in `years` order.
            d = {b: {method: [] for method in _SGNS_METHODS} for b in config.Bstrategies}

            for year in years:
                # NOTE: the model and vocabulary of a year are reloaded for
                # every (DWE, A-strategy) combination.
                wv = KeyedVectors.load_word2vec_format(config.sgns_path / f"{year}.w2v")
                vocab = load_vocab(config.vocab_path / f"{year}.txt")

                dwe_vector = get_keyed_vec(dwe, wv)

                for b_strategy in config.Bstrategies:
                    logging.info(f"{year} :: B-strategy: {b_strategy}")

                    if isinstance(dwe_vector, np.ndarray):
                        logging.info("In-group")
                        INGROUPvec = a2b2vec(Aigt, b_strategy, wv, vocab, config.use_saldo, saldo)
                        logging.info("Out-group")
                        OUTGROUPvec = a2b2vec(Aogt, b_strategy, wv, vocab, config.use_saldo, saldo)
                        scores = _year_scores(dwe_vector, INGROUPvec, OUTGROUPvec)
                    else:
                        # The DWE has no vector this year: nothing can be computed.
                        scores = dict.fromkeys(_SGNS_METHODS)

                    for method, value in scores.items():
                        d[b_strategy][method].append(value)

            if long_format:
                for b_strategy, per_method in d.items():
                    for method, values in per_method.items():
                        results.append([dwe, a_strategy, b_strategy, method, *values])
            else: # if results_format == "wide"
                for b_strategy, per_method in d.items():
                    line = [dwe, a_strategy, b_strategy]
                    for values in per_method.values():
                        line.extend(values)
                    results.append(line)

    if long_format:
        features = ["DWE", "A-Strategy", "B-Strategy", "Method"] + years
    else: # if wide
        features = ["DWE", "A-Strategy", "B-Strategy"]
        for method in _SGNS_METHODS:
            features.extend(f"{method}_{year}" for year in years)

    df = pd.DataFrame(results, columns = features)

    df.to_csv(config.results_path)

    t = time.time() - t0
    logging.info(f"Done! {int(t/60)} m. {int(t%60)} s.")
                    

## Run

## Window sizes, 100 dim

### w5, 10, 15 (Flashback)

In [None]:
# Run sgns_builder() once per window size on the 100-dimensional Flashback
# models. Every setting is attached to a fresh Config; sgns_builder() logs
# vars(config), so the assignment order below is also the order in the log.
for WS in [5, 10, 15]:
    config = Config()
    #WS=5

    config.log_prefix = f"fb-w{WS}"
    config.log_path   = Path(f"/home/max/Results/rplc_w{WS}/log")
    # Inclusive range of yearly models to process.
    config.first_year = 2000
    config.last_year  = 2022
    #config.last_year  = 2005
    config.dwes       = [
                        "V1_berika",
                        "N1_berikare",
                        "V1_kulturberika",
                        "N1_kulturberikare",
                        "N1_globalist",
                        "A1_globalistisk",
                        "N1_återvandring",
                        "V1_återvandra",
                        #"V1_hjälpa_på_plats",
                        "N1_förortsgäng"
                        ]
    #config.Astrategies = ["top1", "top3", "ms1"] # add "rn", "nno" but need to fix code in function
    config.Astrategies = ["top3", "ms1"] # add "rn", "nno" but need to fix code in function
    #config.Bstrategies = ["lazy", "greedy", "top1", "top3", "min0.5", "min0.2"]
    config.Bstrategies = ["lazy", "greedy", "top3", "min0.2"]
    config.wh_rounds  = ["first_round", "second_round"]
    config.dfA_path   = Path("/home/max/Documents/research/replacement_data/panel_wide_onlyreplace.csv")
    config.stopwords  = Path("../../data/utils/stopwords-sv.txt")
    # NOTE(review): ")" is listed twice and "(" is absent -- possibly a typo
    # for "("; confirm before changing, since it affects the selected terms.
    config.punct      = [",", "?", ".", "!", ";", "”", '"', ")", ")", "&", "=", "'"]
#     config.data_path   = Path(f"/home/max/Results/rplc_w{WS}/data")
    config.results_format = "long"
    config.add_correlations = False
    config.results_path = Path(f"/home/max/Results/rplc_w{WS}/results/fb_sgns-w{WS}_results.csv")
    config.sgns_path = Path(f"/home/max/Results/fb_pol-yearly-rad3-w{WS}-d100/models") # <- 100 dim!
    config.use_saldo = True
    config.saldo_path = Path("/home/max/Datasets/saldom.json")
    config.vocab_path = Path("/home/max/Corpora/flashback-pol-time/yearly/fb-pt-radical3/vocab")

    sgns_builder(config)

## Window sizes, 200 dim

### w5, 10, 15 (Flashback)

In [None]:
# Run sgns_builder() for window size 5 on the 200-dimensional Flashback
# models. Same settings as the 100-dimensional run except for the "-200"
# suffix in the log/results locations and the "-d200" model directory.
for WS in [5]:
    config = Config()
    #WS=5

    config.log_prefix = f"fb-w{WS}-200"
    config.log_path   = Path(f"/home/max/Results/rplc_w{WS}-200/log")
    # Inclusive range of yearly models to process.
    config.first_year = 2000
    config.last_year  = 2022
    #config.last_year  = 2005
    config.dwes       = [
                        "V1_berika",
                        "N1_berikare",
                        "V1_kulturberika",
                        "N1_kulturberikare",
                        "N1_globalist",
                        "A1_globalistisk",
                        "N1_återvandring",
                        "V1_återvandra",
                        #"V1_hjälpa_på_plats",
                        "N1_förortsgäng"
                        ]
    #config.Astrategies = ["top1", "top3", "ms1"] # add "rn", "nno" but need to fix code in function
    config.Astrategies = ["top3", "ms1"] # add "rn", "nno" but need to fix code in function
    #config.Bstrategies = ["lazy", "greedy", "top1", "top3", "min0.5", "min0.2"]
    config.Bstrategies = ["lazy", "greedy", "top3", "min0.2"]
    config.wh_rounds  = ["first_round", "second_round"]
    config.dfA_path   = Path("/home/max/Documents/research/replacement_data/panel_wide_onlyreplace.csv")
    config.stopwords  = Path("../../data/utils/stopwords-sv.txt")
    # NOTE(review): ")" is listed twice and "(" is absent -- possibly a typo
    # for "("; confirm before changing, since it affects the selected terms.
    config.punct      = [",", "?", ".", "!", ";", "”", '"', ")", ")", "&", "=", "'"]
#     config.data_path   = Path(f"/home/max/Results/rplc_w{WS}/data")
    config.results_format = "long"
    config.add_correlations = False
    config.results_path = Path(f"/home/max/Results/rplc_w{WS}-200/results/fb_sgns-w{WS}-200_results.csv")
    config.sgns_path = Path(f"/home/max/Results/fb_pol-yearly-rad3-w{WS}-d200/models") # <- 200 dim!
    config.use_saldo = True
    config.saldo_path = Path("/home/max/Datasets/saldom.json")
    config.vocab_path = Path("/home/max/Corpora/flashback-pol-time/yearly/fb-pt-radical3/vocab")

    sgns_builder(config)

In [None]:
# Run sgns_builder() for window sizes 10 and 15 on the 200-dimensional
# Flashback models. Same settings as the 100-dimensional run except for the
# "-200" suffix in the log/results locations and the "-d200" model directory.
for WS in [10, 15]:
    config = Config()
    #WS=5

    config.log_prefix = f"fb-w{WS}-200"
    config.log_path   = Path(f"/home/max/Results/rplc_w{WS}-200/log")
    # Inclusive range of yearly models to process.
    config.first_year = 2000
    config.last_year  = 2022
    #config.last_year  = 2005
    config.dwes       = [
                        "V1_berika",
                        "N1_berikare",
                        "V1_kulturberika",
                        "N1_kulturberikare",
                        "N1_globalist",
                        "A1_globalistisk",
                        "N1_återvandring",
                        "V1_återvandra",
                        #"V1_hjälpa_på_plats",
                        "N1_förortsgäng"
                        ]
    #config.Astrategies = ["top1", "top3", "ms1"] # add "rn", "nno" but need to fix code in function
    config.Astrategies = ["top3", "ms1"] # add "rn", "nno" but need to fix code in function
    #config.Bstrategies = ["lazy", "greedy", "top1", "top3", "min0.5", "min0.2"]
    config.Bstrategies = ["lazy", "greedy", "top3", "min0.2"]
    config.wh_rounds  = ["first_round", "second_round"]
    config.dfA_path   = Path("/home/max/Documents/research/replacement_data/panel_wide_onlyreplace.csv")
    config.stopwords  = Path("../../data/utils/stopwords-sv.txt")
    # NOTE(review): ")" is listed twice and "(" is absent -- possibly a typo
    # for "("; confirm before changing, since it affects the selected terms.
    config.punct      = [",", "?", ".", "!", ";", "”", '"', ")", ")", "&", "=", "'"]
#     config.data_path   = Path(f"/home/max/Results/rplc_w{WS}/data")
    config.results_format = "long"
    config.add_correlations = False
    config.results_path = Path(f"/home/max/Results/rplc_w{WS}-200/results/fb_sgns-w{WS}-200_results.csv")
    config.sgns_path = Path(f"/home/max/Results/fb_pol-yearly-rad3-w{WS}-d200/models") # <- 200 dim!
    config.use_saldo = True
    config.saldo_path = Path("/home/max/Datasets/saldom.json")
    config.vocab_path = Path("/home/max/Corpora/flashback-pol-time/yearly/fb-pt-radical3/vocab")

    sgns_builder(config)