In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import re
from util import load_metric
from scipy.stats import spearmanr, pearsonr, zscore, rankdata
#from collections import Counter

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>div.output_area pre {white-space: pre;}</style>"))

In [3]:
pd.set_option('display.expand_frame_repr', False)

In [4]:
pd.set_option('display.max_columns', None)

## Functions

In [5]:
def jaccard(a, b):
    return len(a.intersection(b)) / len(a.union(b))

In [6]:
def q_show(df, var, k = 40, as_table = False, transition=True, min_freq=None, return_md=False):
    """
    Given a dataframe, shows the top k words for variable var each year.
    Note: as_table only sypported for transition variables!
    """
    
    COLUMNS = [col for col in sorted(df.columns) if col.startswith(var)]

    if min_freq == None:    
        WORDS = [df[col].sort_values(ascending=False)[:k].index for col in COLUMNS]
    else:
        TRANSITIONS = [tuple(col.split("_")[-1].split(":")) for col in COLUMNS]
        WORDS = [df[(df[f"frq_{trs[0]}"] >= min_freq) & (df[f"frq_{trs[1]}"] >= min_freq)][col].sort_values(ascending=False)[:k].index for col, trs in zip(COLUMNS, TRANSITIONS)]
    
    ser = [(c, d) for c, d in zip(COLUMNS, WORDS)]
    
    md = ""
    
    for i, (col, s) in enumerate(ser):
        if i > 0:
            jac = round(jaccard(set(s), ser[i-1][-1]), 2)
        else:
            jac = None
        
        print(); md += "\n"
        print(col, "jaccard =", jac); md += f"{col} jaccard = {jac}"

        if transition:
            if as_table == False:
                print(s)
            else:
                trans  = col.split("_")[-1]
                ti, tj = tuple(trans.split(":"))
                table  = [] 
                for word in s:
                    v    = df.loc[word][col]
                    f_ti = int(df.loc[word][f"frq_{ti}"])
                    f_tj = int(df.loc[word][f"frq_{tj}"])
                    gch  = df.loc[word][f"gch_{trans}"]
                    m    = df.loc[word][f"mccc_{trans}"]
                    std  = df.loc[word][f"stdc_{trans}"]
                    columns = ["Word", var.upper(), "n_i", "n_j", "GCH", "Mctrl", "Sctrl"]
                    table.append([word, v, f_ti, f_tj, gch, m, std])
                print(pd.DataFrame(table, columns=columns).dropna().round(3))
                md += pd.DataFrame(table, columns=columns).dropna().round(3).to_markdown()
        
        else:
            year = col.split("_")[-1]
            table = []
            for word in s:
                v    = df.loc[word][col]
                f = int(df.loc[word][f"frq_{year}"])
                columns = ["Word", var.upper(), "Freq"]
                table.append([word, v, f])
            print(pd.DataFrame(table, columns=columns).dropna().round(3))
            md += pd.DataFrame(table, columns=columns).dropna().round(3).to_markdown()
    if return_md:
        return md

In [7]:
def change_show(df, var, targets, th=4.781, return_md = False, min_freq=10):
    """
    Given a dataframe, shows the value for a variable of target 
    each transition/year.
    Provide threshold to only show the targets meeting the threshold
    """
    
    md = ""

    for col in sorted([col for col in df.columns if col.startswith(var)]):
        trans  = col.split("_")[-1]
        ti, tj = tuple(trans.split(":"))
        table  = [] 
        
        for word in targets:
            v    = df.loc[word][col]
            f_ti = int(df.loc[word][f"frq_{ti}"])
            f_tj = int(df.loc[word][f"frq_{tj}"])
            if min_freq != None:
                if f_ti < min_freq or f_tj < min_freq:
                    continue
            gch  = df.loc[word][f"gch_{trans}"]
            m    = df.loc[word][f"mccc_{trans}"]
            std  = df.loc[word][f"stdc_{trans}"]            
            
            if th != None:
                if v > th:
                    table.append([word, v, f_ti, f_tj, gch, m, std])
            else:
                table.append([word, v, f_ti, f_tj, gch, m, std])
        
        columns = ["Word", "Value", f"n_{ti}", f"n_{tj}", f"gch_{ti}:{tj}", f"M_{ti}:{tj}", f"Std_{ti}:{tj}"]
        if table != []:
            display = pd.DataFrame(table, columns=columns)
            print(); md += "\n" 
            print(col); md += f"{col}\n"
            print(display); md += display.to_markdown()
    
    if return_md:
        return md

In [8]:
def v_renamer(var_string):
    var_string = var_string.split("_")
    yi, yj     = tuple(var_string[-1].split(":"))
    yi         = yi[-2:]
    yj         = yj[-2:]
    return f"{yi}:{yj}"

In [9]:
def overview(
    df, 
    var, 
    targets, 
    prefixes = ("N", "A", "V"), 
    th=4.781, 
    transition = True, 
    min_freq = 10, 
    return_md = False,
    rounder = 3
):
    """ 
    Similar to change_show but:
    * Show data as one table
    * Only show variable (change_show display additional data)
    * Provide th to show True/False
    """
    
    cols = sorted([col for col in df.columns if col.startswith(var)])
    targets = [w for w in targets if w.startswith(prefixes)]
    targets.sort()
    if transition:
        renamer = {k: v_renamer(k) for k in cols}
    else:
        renamer = {k: k.split("_")[-1][-2:] for k in cols}
    
    md = ""
    
    if min_freq != None and transition:
        df = df.copy()
        transitions = find_transitions(df, "df", var)
        for trg in targets:
            for ti, tj in transitions:
                if df.loc[trg][f"frq_{ti}"] < min_freq or df.loc[trg][f"frq_{tj}"] < min_freq:
                    df.at[trg, f"{var}_{ti}:{tj}"] = 0

    if min_freq != None and transition == False:
        df = df.copy()
        years = sorted([int(col.split("_")[-1]) for col in cols])
        for trg in targets:
            for year in years:
                if df.loc[trg][f"frq_{year}"] < min_freq:
                    df.at[trg, f"{var}_{year}"] = 0        
                       
    
    if th != None:
        out = df.loc[targets, cols] > th
        out.rename(columns = renamer, inplace = True)
        print(var.upper()); md += var.upper() + "\n"
        print(out); md += out.to_markdown()
        print("SUM:", out.sum().sum()); md += f"\nSUM: {out.sum().sum()}\n"
    else:
        out = df.loc[targets, cols].round(rounder)
        out.rename(columns = renamer, inplace = True)
        print(var.upper()); md += var.upper() + "\n"
        print(out); md += out.to_markdown()
        print("SUM:", out.sum().sum()); md += f"\nSUM: {out.sum().sum()}\n"
    
    if return_md:
        return md

In [10]:
def read_csv(path):
    return pd.read_csv(path, sep=";", index_col=0)

In [11]:
def get_dwts(df, path):
    with open(Path(path), "r") as f:
        dwt_roots = [w.strip("\n") for w in f.readlines()]
    dwt_regex = re.compile(f"({'|'.join(dwt_roots)})")    
    dwts = [str(w) for w in df.index if re.search(dwt_regex, str(w)) != None]
    return dwts    

In [12]:
def get_variables(df):
    """
    Summeraises variables of a dataframe
    """
    
    yr_prefix = set()
    tr_prefix = set()
    years = set()
    transitions = set()
    
    for v in df.columns:
        prefix, suffix = tuple(v.split("_"))
        if ":" in suffix:
            tr_prefix.add(prefix)
            transitions.add(suffix)
        else:
            yr_prefix.add(prefix)
            years.add(suffix)
    
    return {
        "yr_prefix": yr_prefix, 
        "tr_prefix": tr_prefix,
        "years": years,
        "transitions": transitions
        }    

In [13]:
def checker(word, transition, controls_dir, n_ctrl=10, variable="cosine_change"):
    """
    Goes to original data, shows the control change/similarity of a word 
    at a transition.
    param word
    param transition    tupple of ti and tj 
    param controls_dir  where to find controls
    param variable      "cosine_change" or "cosine_sim"
    """
    
    ti, tj = transition
    basename = Path(controls_dir) / variable 
    
    filenames = [f"{ti}_{tj}_control{n}.txt" for n in range(1, n_ctrl+1)]
    
    values = []
    
    for file in filenames:
        data = load_metric(basename / file)
        value = data[word] if word in data else "NO MEASURE"
        values.append(value)
        print(file, value)
        
    return values   

In [14]:
def ncd(DATA, CORPUS, VAR, VAL): # No Change Detector

    corpus = Path(CORPUS)
    transitions = find_transitions(corpus / "vocab")    

    for yi, yj in transitions:
        print()
        print(f"{yi}:{yj}")
        
        A = list(DATA[DATA[f"{VAR}_{yi}:{yj}"] == VAL].index)
        print("No change (A):", len(A))
        B = list(DATA[DATA[f"{VAR}_{yi}:{yj}"] != VAL].index)
        print("Other (B):", len(B))
        print()
        print("{: <20} {}".format("A", "B"))
        print("{: <20} {}".format("---", "---"))
        for w1, w2 in zip(A[:100], B[:100]):
            print(f"{w1: <20} {w2}")

In [15]:
def find_transitions(source, mode="file", var = None):
    """
    List transitions. 
    For mode = "file", expected source: filepath
    For mode = "df", expected source: pandas DataFrame; provide varible var
    """
    if mode == "file":
        years = [int(file.strip(".txt")) for file in os.listdir(source)]
        years.sort()
        transitions = [(year, years[i]) for i, year in enumerate(years[:-1], start=1)]
    if mode == "df":
        cols = [col for col in source.columns if col.startswith(var)]
        cols.sort()
        transitions = [tuple(col.split("_")[-1].split(":")) for col in cols]
    return transitions

In [16]:
def w_overlap_checker(corpus, th_c):
    
    corpus = Path(corpus)
    transitions = find_transitions(corpus / "vocab")
    
    for yi, yj in transitions:
        print()
        print(f"{yi}:{yj}")
        
        voc_a = load_metric(corpus / f"vocab/{yi}.txt")
        voc_b = load_metric(corpus / f"vocab/{yj}.txt")

        voc_a = {w: c for w, c in voc_a.items() if c >= th_c}
        voc_b = {w: c for w, c in voc_b.items() if c >= th_c}
        print(f"{yi}:", len(voc_a))
        print(f"{yj}:", len(voc_b))
        print(f"{yi} and {yj}:", len([w for w in voc_a.keys() if w in voc_b.keys()]))
        print(f"{yi} or {yj}:", len(set(voc_a.keys()).union(set(voc_b.keys()))))
        print(f"{yi} - {yj}:", len(set(voc_a.keys()).difference(set(voc_b.keys()))))

In [17]:
def trend(df, var, norm=None, transition = True, metric="pearson", prefixes = ("N", "A", "V")):
    
    if transition:
        T = find_transitions(source=df, mode = "df", var=var)
    else:
        cols = [col for col in df.columns if col.startswith(var)]
        cols.sort()
        T = [int(col.split("_")[-1]) for col in cols]

    Y = [n for n in range(len(T))]
    table = []
    for w in df.index:
        
        if prefixes != None:
            if not w.startswith(prefixes):
                continue
        
        if transition:
            X = df[[f"{var}_{ti}:{tj}" for ti, tj in T]].loc[w]
        else:
            X = df[[f"{var}_{t}" for t in T]].loc[w]
        
        
        valid = [(x, y) for x, y in zip(X, Y) if not pd.isna(x)]

        N = len(valid)

        if N < 2:
            v = np.nan
            p = np.nan
        else:
            X, Y = zip(*valid)
            if norm != None:
                X = norm(X)

            if metric == "pearson":
                R_data = pearsonr(X, Y)

            if metric == "spearman":
                R_data = spearmanr(X, Y)
                
            v = R_data.statistic
            p = R_data.pvalue

        table.append([w, round(v, 3), round(p, 5), N])
    correlation = pd.DataFrame(table, columns=["Word", "Trend", "p", "N"])
        
    return correlation
    

In [18]:
def correlation(df, var1, var2, mode=1, norm1=None, norm2=None, metric="pearson"):    
    """
    
    ...
    param norm1  function to normalize/transform var1 with (default None); provide function 
                 e.g. zscore or np.log 
    """

    transitions = find_transitions(df, "df", var1) 
    # Assumes `var1` and `var2` are both transition variables
    # Consider implement `var1cut` parameter as in `universal_correlation`

    if mode == 1:
        correlation = df[[f"{var1}_{ti}:{tj}" for ti, tj in transitions]].corrwith(df[[f"{var2}_{ti}:{tj}" for ti, tj in transitions]], axis=1)
    
    if mode == 2:
        table = []
        for w in df.index:
            valid = []
            X = df[[f"{var1}_{ti}:{tj}" for ti, tj in transitions]].loc[w]
            Y = df[[f"{var2}_{ti}:{tj}" for ti, tj in transitions]].loc[w]
            for x, y in zip(X, Y):
                if pd.isna(x):
                    continue
                if pd.isna(y):
                    continue
                valid.append((x, y))
            
            N = len(valid)
            
            if N < 2:
                v = np.nan
                p = np.nan
            else:
                X, Y = zip(*valid)
                if norm1 != None:
                    X = norm1(X)
                if norm2 != None:
                    Y = norm2(Y)
                if metric == "pearson":
                    R_data = pearsonr(X, Y)
                    v = R_data.statistic
                    p = R_data.pvalue

                if metric == "spearman":
                    R_data = spearmanr(X, Y)
                    v = R_data.statistic
                    p = R_data.pvalue
                    
            table.append([w, round(v, 2), round(p, 2), N])
        correlation = pd.DataFrame(table, columns=["Word", "Correlation", "p", "N"])
        
    return correlation

In [19]:
def nonaninf(x, y):
    if pd.isna(x):
        return False
    if pd.isna(y):
        return False   
    if abs(x) == np.inf:
        return False
    if abs(y) == np.inf:
        return False
    return True

In [20]:
def nonalist(lst):
    for x in lst:
        if pd.isna(x):
            return False
    return True

In [21]:
def collect_var(df, var, varcut):
    X = []
    varcol = sorted([col for col in df.columns if col.startswith(var)])
    if varcut != None:
        del varcol[varcut]
    for col in varcol:
        X.extend(list(df[col]))
    return X    

In [22]:
def universal_correlation(
    df, 
    var1, 
    var2, 
    var1cut = None, 
    var2cut = None, 
    norm1=None, 
    norm2=None, 
    metric="pearson",
    min_freq = None
):
    # https://stackoverflow.com/questions/16031056/how-to-form-tuple-column-from-two-columns-in-pandas
    
    if min_freq != None:
        df = df.copy()
        
        t1 = find_transitions(df, "df", var1)
        t2 = find_transitions(df, "df", var2)

        if var1cut != None:
            transitions = [t for t in t2]

        for trg in [w for w in df.index]:
            for ti, tj in transitions:
                if df.loc[trg][f"frq_{ti}"] < min_freq or df.loc[trg][f"frq_{tj}"] < min_freq:
                    df.at[trg, f"{var2}_{ti}:{tj}"] = np.nan     
    
    X = collect_var(df, var1, var1cut)
    Y = collect_var(df, var2, var2cut)
            
    print("Length:")
    print("X:", len(X))
    print("Y:", len(Y))
    
    X, Y = zip(*[(x, y) for x, y in zip(X, Y) if nonaninf(x,y)])
        
    if norm1 != None:
        X = norm1(X)
    if norm2 != None:
        Y = norm2(Y)
    
    if metric == "pearson":
        R_data = pearsonr(X, Y)
    if metric == "spearman":
        R_data = spearmanr(X, Y)
        
    v = R_data.statistic
    p = R_data.pvalue
    
    return v, p

In [23]:
def model_comparison(
    dfs,
    mnames,
    var, 
    norm=None, 
    targets = Path("../data/utils/dwts.txt"), # =get_dwts(df_yearly_dwt, dwt_path)
    prefix = ("N", "A", "V"),
    mode="universal", # no other mode supported at the moment ...
    word=None, # not supported at the moment ... 
    metric="pearson",
):
    
    cols = [col for col in dfs[0].columns if col.startswith(var)] # based on first df
    trgs = [trg for trg in get_dwts(dfs[0], targets)]              # based on first df
    print("Targets:", ", ".join(trgs))
    trgs = [trg for trg in trgs if trg.startswith(prefix)] if prefix != None else trgs
    
    _XY = []
    if mode == "universal":    
        for model, mname in zip(dfs, mnames):
            this_model = []
            for col in cols:
                this_model.extend(model.loc[trgs, col])
            print("Length", mname, len(this_model))
            _XY.append(this_model)
    
    _XY = list(zip(*[xyz for xyz in zip(*_XY) if nonalist(xyz)])) # note: xyz is a variable for a tuple
    
    print("Length (no NaN):")
    for vector in _XY:
        print(len(vector))
    
    _XY = [norm(model) for model in _XY] if norm != None else _XY
    
    if metric == "pearson":
        R_data = np.corrcoef(_XY)
    if metric == "spearman":
        _XY = [rankdata(model) for model in _XY]
        R_data = np.corrcoef(_XY)
        
    out = pd.DataFrame(R_data, columns=mnames, index=mnames).round(3)

    return out

In [24]:
def visualize(df, var):
    pass

## Files

In [25]:
#file_path = Path("../../dw_results/fb_pol-yearly-radical3.csv")
#file_path = Path("fb_pol-yearly-radical3.csv")
results_dir = Path("../../dw_results")

In [26]:
files = sorted(os.listdir(results_dir))
_ = [print(file) for file in files]

bert-v0
fb_pol-time_bin-bert-fb_nli.csv
fb_pol-time_bin-bert-sentence-bert-swedish-cased.csv
fb_pol-time_bin-bert-sts_fbmodel.csv
fb_pol-time_bin-bert-sts_fbmodel_big_40epochs.csv
fb_pol-yearly-bert-sentence-bert-swedish-cased.csv
fb_pol-yearly-bert-sts_fbmodel.csv
fb_pol-yearly-bert-sts_fbmodel_big_40epochs.csv
fb_pol-yearly-radical3-full.csv
fb_pol-yearly-radical3-full.csv.bz2
fb_pol-yearly-radical3-restricted.csv
neighbors.csv


In [27]:
dwt_path = "../data/utils/dwts.txt"

In [28]:
#crp_tib = Path("/srv/data/gusbohom/root/corpora/toypol/time_bin/radical3/")
corpus = Path("/home/max/Corpora/flashback-pol-time/yearly/fb-pt-radical3")

In [29]:
df_yearly = read_csv(results_dir / "fb_pol-yearly-radical3-full.csv")

In [30]:
df_yearly_dwt = read_csv(results_dir / "fb_pol-yearly-radical3-restricted.csv")

In [31]:
# df_bert_nli = read_csv(results_dir / "fb_pol-time_bin-bert-fb_nli.csv") # Missing! What happened?

In [32]:
df_bert_sts = read_csv(results_dir / "fb_pol-yearly-bert-sts_fbmodel.csv")

In [33]:
df_bert_big = read_csv(results_dir / "fb_pol-yearly-bert-sts_fbmodel_big_40epochs.csv")

In [34]:
df_bert_kb  = read_csv(results_dir / "fb_pol-yearly-bert-sentence-bert-swedish-cased.csv")

In [35]:
df_tbn_sts = read_csv(results_dir / "fb_pol-time_bin-bert-sts_fbmodel.csv")

In [36]:
#toypol = read_csv(Path("../../toypol-time_bin.csv"))

In [37]:
#df_time_bin = read_csv(results_dir / "fb_pol-time_bin-radical3-full.csv")

In [38]:
#df_time_bin_dwt = read_csv(results_dir / "fb_pol-time_bin-radical3-restricted.csv")

## Model comparison

In [39]:
for w in sorted([w for w in df_yearly_dwt.index if w.startswith(("N", "A", "V"))]):
    print(w)

A1_globalistisk
N1_berikare
N1_förortsgäng
N1_globalist
N1_kulturberikare
N1_återvandring
N2_återvandrare
V1_berika
V1_hjälpa_på_plats
V1_kulturberika
V1_återvandra


In [40]:
for w in sorted([w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))]):
    print(w)

A1_globalistisk
N1C_berikareX
N1C_berikareX;
N1C_globalistX
N1C_globalistX;
N1C_kulturberikarX
N1C_återvandringsX
N1C_återvandringsX;
N1_berikare
N1_berikare;
N1_förortsgäng
N1_globalist
N1_globalist;
N1_kulturberikare
N1_kulturberikare;
N1_återvandring
N1_återvandring;
N2C_återvandrarX
N2_återvandrare
N2_återvandrare;
V1_berika
V1_berika;
V1_hjälpa_på_plats
V1_hjälpa_på_plats;
V1_kulturberika
V1_återvandra


In [41]:
print(model_comparison(
#     dfs=[df_yearly_dwt, df_bert_nli, df_bert_sts, df_bert_big, df_bert_kb],
#     mnames=["SGNS", "NLI", "STS", "BIG", "KB"],
    dfs=[df_yearly_dwt, df_bert_sts, df_bert_big, df_bert_kb],
    mnames=["SGNS", "STS", "BIG", "KB"],    
    var="rch", 
    norm=None, 
    targets = Path("../data/utils/dwts.txt"), # =get_dwts(df_yearly_dwt, dwt_path)
    prefix = ("N", "A", "V"),
    mode="universal", # no other mode supported at the moment ...
    word=None, # not supported at the moment ... 
    metric="pearson"
).to_markdown())

Targets: V1_berika, N1_kulturberikare, N1_berikare, V1_kulturberika, N1_förortsgäng, N1_globalist, N1_återvandring, A1_globalistisk, antiglobalister, V1_återvandra, oberikat, V1_hjälpa_på_plats, oberikade, oberikad, N2_återvandrare, antiglobalist, antiglobalistisk, P1_självständig_utrikespolitik, biståndsåtervandring, antiglobalistiska, antiglobalistiskt, vänsterglobalister, massåtervandring, återvandras, massåtervandringen, vänsterglobalisterna
Length SGNS 242
Length STS 242
Length BIG 242
Length KB 242
Length (no NaN):
154
154
154
154
|      |   SGNS |   STS |   BIG |    KB |
|:-----|-------:|------:|------:|------:|
| SGNS |  1     | 0.551 | 0.505 | 0.447 |
| STS  |  0.551 | 1     | 0.877 | 0.801 |
| BIG  |  0.505 | 0.877 | 1     | 0.752 |
| KB   |  0.447 | 0.801 | 0.752 | 1     |


## Q-Show

### Check

### Yearly

#### SGNS

In [42]:
q_show(df_yearly, "rch", as_table = True)
# q_show("rsim")
# q_show("gch")
# q_show("gsim")


rch_2000:2001 jaccard = None
           Word      RCH   n_i   n_j    GCH  Mctrl  Sctrl
0      bidragit  235.060    11    14  0.168  0.080  0.000
1   individuell   25.213    10    11  0.162  0.122  0.002
2    avslöjande   22.048    11    10  0.076  0.054  0.001
3             $   21.444    17    22  0.206  0.027  0.008
4     interests   17.947    12    10  0.035  0.024  0.001
5   självkänsla   14.758    10    10  0.114  0.100  0.001
6     gåsfonden   14.399    13    93  0.227  0.037  0.013
7            bo   13.569   123   225  0.336  0.081  0.018
8            re   12.918    47   367  0.209  0.079  0.010
9       gyllene   11.471    11    10  0.113  0.080  0.003
10        about   10.528    93    44  0.047  0.023  0.002
11      element   10.152   109    35  0.403  0.174  0.022
12        björn    9.600    37  1472  0.289  0.117  0.017
13   tillgången    9.378    10    10  0.104  0.092  0.001
14        svara    9.269   305   493  0.340  0.141  0.020
15       tensta    9.213    12    10  0.20

             Word     RCH  n_i  n_j    GCH  Mctrl  Sctrl
0           klura  21.557   10   10  0.163  0.143  0.001
1      candidates  17.907   15   86  0.186  0.054  0.007
2              fi  14.765   12  585  0.344  0.166  0.012
3      accepterar  13.419  135  231  0.221  0.158  0.004
4            axis  13.298   10  149  0.212  0.060  0.011
5             eva  10.630   27  193  0.274  0.148  0.011
6    socialist.nu  10.381   10   70  0.266  0.119  0.014
7         applied   9.472   10   10  0.035  0.028  0.001
8          nyttan   9.439   17   50  0.250  0.137  0.011
9           varpå   9.390   42   53  0.273  0.189  0.009
10      skeptiskt   8.955   11   10  0.160  0.116  0.005
11       förståss   8.917   10   12  0.177  0.143  0.004
12      förfallet   8.745   13   25  0.273  0.169  0.011
13         skydda   8.499  182  358  0.167  0.130  0.004
14       odugliga   8.480   17   38  0.219  0.154  0.007
15          ironi   8.344   92  140  0.175  0.136  0.004
16  sofistikerade   8.341   11 

                   Word     RCH   n_i   n_j    GCH  Mctrl  Sctrl
0                 minds  19.947    25    69  0.276  0.123  0.007
1                giladi  16.179    50   986  0.283  0.170  0.007
2                magyar  15.270    14   172  0.332  0.181  0.009
3                    pp  13.508    69  2271  0.285  0.165  0.008
4                svanar  13.082    15    53  0.253  0.133  0.009
5                habibi  12.747    15    32  0.289  0.176  0.008
6               euvalet  12.437    88   591  0.221  0.167  0.004
7                 ipred  12.378   139   502  0.285  0.185  0.008
8               griffin  12.219    10   188  0.330  0.180  0.012
9              teologin  12.196    11    10  0.171  0.153  0.001
10               romano  11.562    13    62  0.324  0.182  0.012
11                slida  11.464    10    11  0.143  0.131  0.001
12  bosnienhercegovinas  11.427    10    34  0.184  0.123  0.005
13                 saab  11.184   102  1356  0.249  0.169  0.007
14                cobra  

                     Word     RCH   n_i   n_j    GCH  Mctrl  Sctrl
0               morfologi  94.033    10    11  0.107  0.098  0.000
1              rullatorer  39.292    10    10  0.156  0.127  0.001
2              spökstäder  36.946    10    10  0.156  0.134  0.001
3                   ipsos  18.125    28   380  0.289  0.154  0.007
4                   sarin  16.532    10   301  0.289  0.161  0.007
5               megafonen  15.800    11   142  0.324  0.180  0.009
6               ringvägen  15.758    10    12  0.155  0.120  0.002
7                 svoboda  14.522    14   118  0.278  0.155  0.008
8             startbidrag  13.582    11    98  0.289  0.119  0.012
9                     hen  12.920  3071  2220  0.239  0.194  0.003
10               sjöström  12.714    10   136  0.335  0.168  0.013
11            tvmottagare  12.397    20   121  0.218  0.123  0.007
12                    pyd  12.254    29   484  0.247  0.171  0.006
13                  raqqa  12.006    58   346  0.193  0.131  0

                    Word     RCH    n_i    n_j    GCH  Mctrl  Sctrl
0                   guam  26.865     19    199  0.332  0.144  0.007
1                    afs  25.637     22    430  0.382  0.192  0.007
2                   lies  18.719     88    129  0.250  0.145  0.005
3                 åhléns  16.659    205    558  0.318  0.164  0.009
4               ärvilket  16.229     11     10  0.159  0.151  0.000
5            afghanernas  15.324     36    130  0.314  0.212  0.006
6                     nk  13.979    183   2384  0.261  0.169  0.006
7                   fake  13.885    225   2733  0.269  0.179  0.006
8                trump´s  13.400     11     67  0.316  0.185  0.009
9                  korea  13.263    444   1960  0.187  0.135  0.004
10                 grand  13.031   1165    242  0.262  0.163  0.007
11            manchester  12.930     45    270  0.293  0.182  0.008
12        drottninggatan  12.300    105   1338  0.246  0.161  0.007
13          brandstyrkan  12.093     10     11  

              Word     RCH   n_i   n_j    GCH  Mctrl  Sctrl
0             kyle  38.177    14   416  0.406  0.181  0.006
1          viruset  31.639    21  6633  0.289  0.132  0.005
2               eb  30.229    36   553  0.361  0.178  0.006
3   nedstängningar  29.849    11   168  0.330  0.182  0.005
4            floyd  29.838    14   934  0.396  0.164  0.007
5             ebba  25.127  1581  4884  0.304  0.125  0.007
6         pandemin  23.284    10  2945  0.373  0.180  0.008
7          smittan  23.135    20  4242  0.258  0.133  0.005
8         smittade  20.109    40  3784  0.280  0.134  0.007
9              who  19.553  1740  3570  0.235  0.165  0.003
10         mäklare  19.117    33   808  0.275  0.144  0.007
11              nc  18.560    23   394  0.300  0.144  0.008
12           cuomo  18.557    17   224  0.307  0.175  0.007
13     immuniteten  18.457    29   147  0.367  0.179  0.010
14              nv  18.423    13   178  0.334  0.144  0.010
15           nyser  18.082    12   117  

In [43]:
checker(
    word="gyllene", 
    transition=(2000,2001), 
    controls_dir="/home/max/Results/fb_pol-yearly-rad3", 
    n_ctrl=10, 
    variable="cosine_change")

2000_2001_control1.txt NO MEASURE
2000_2001_control2.txt 0.08135257661342621
2000_2001_control3.txt NO MEASURE
2000_2001_control4.txt NO MEASURE
2000_2001_control5.txt NO MEASURE
2000_2001_control6.txt NO MEASURE
2000_2001_control7.txt 0.08152643591165543
2000_2001_control8.txt NO MEASURE
2000_2001_control9.txt 0.07663091272115707
2000_2001_control10.txt NO MEASURE


['NO MEASURE',
 0.08135257661342621,
 'NO MEASURE',
 'NO MEASURE',
 'NO MEASURE',
 'NO MEASURE',
 0.08152643591165543,
 'NO MEASURE',
 0.07663091272115707,
 'NO MEASURE']

In [44]:
q_show(df_yearly, "gch", as_table = True, min_freq=50)


gch_2000:2001 jaccard = None
            Word    GCH    n_i    n_j    GCH  Mctrl  Sctrl
0          svara  0.340    305    493  0.340  0.141  0.020
1            som  0.340  27860  18475  0.340  0.269  0.119
2              b  0.339    125   1257  0.339  0.211  0.036
3             bo  0.336    123    225  0.336  0.081  0.018
4       cannabis  0.330    145    114  0.330  0.114  0.024
5         talmud  0.325     64     75  0.325  0.165  0.020
6             ns  0.310    390   5653  0.310  0.221  0.015
7            zog  0.296    115    182  0.296  0.128  0.028
8        finland  0.294     72    144  0.294  0.172  0.018
9       religion  0.293    175     78  0.293  0.188  0.022
10          ditt  0.292    773    496  0.292  0.219  0.053
11  organisation  0.292    160    147  0.292  0.192  0.022
12           nsf  0.287    216    607  0.287  0.204  0.081
13             )  0.286    233    169  0.286  0.148  0.027
14          hand  0.284    367    247  0.284  0.124  0.028
15           ett  0.284   

           Word    GCH  n_i  n_j    GCH  Mctrl  Sctrl
0         quote  0.316  191  181  0.316  0.161  0.021
1      uttryckt  0.291   57  109  0.291  0.215  0.011
2          raka  0.289   51   97  0.289  0.200  0.019
3       snarast  0.289   92  128  0.289  0.208  0.025
4      relation  0.274   78   96  0.274  0.233  0.020
5           ang  0.274   96   95  0.274  0.244  0.025
6          japp  0.273  133  174  0.273  0.181  0.012
7       syftade  0.272   81  137  0.272  0.228  0.016
8           arm  0.272   51   61  0.272  0.259  0.023
9     informera  0.271   52   50  0.271  0.194  0.017
10         skär  0.270   60   78  0.270  0.223  0.017
11   uppskattar  0.270   80  110  0.270  0.221  0.021
12          sk.  0.269   59   65  0.269  0.192  0.018
13     hyckleri  0.268  124  110  0.268  0.186  0.010
14          iom  0.266   74   99  0.266  0.222  0.018
15         vana  0.264   75  112  0.264  0.204  0.015
16          s.k  0.263   71  109  0.263  0.196  0.016
17        rykte  0.263   51 

            Word    GCH   n_i   n_j    GCH  Mctrl  Sctrl
0           mena  0.326   454  1371  0.326  0.234  0.015
1           dude  0.309    51   218  0.309  0.249  0.014
2    hursomhelst  0.302   680   675  0.302  0.293  0.012
3         babben  0.298    64   275  0.298  0.222  0.012
4         t.o.m.  0.296   700   792  0.296  0.268  0.020
5        pågrund  0.294   132   249  0.294  0.257  0.012
6       förståss  0.293   156   185  0.293  0.308  0.019
7           jepp  0.291   368   417  0.291  0.295  0.020
8   gissningsvis  0.290   106   180  0.290  0.295  0.018
9            iof  0.290   332   485  0.290  0.268  0.020
10        rentav  0.288   244   439  0.288  0.280  0.014
11           iom  0.288   504   567  0.288  0.270  0.018
12            pp  0.285    69  2271  0.285  0.165  0.008
13   tillexempel  0.285   123   170  0.285  0.299  0.011
14         ipred  0.285   139   502  0.285  0.185  0.008
15        giladi  0.283    50   986  0.283  0.170  0.007
16         t.o.m  0.281   466  

              Word    GCH   n_i   n_j    GCH  Mctrl  Sctrl
0             japp  0.327  1064  1105  0.327  0.297  0.017
1             nåja  0.323   730   591  0.323  0.301  0.016
2             jepp  0.313   228   397  0.313  0.289  0.012
3              isf  0.312   689   551  0.312  0.282  0.015
4        kontentan  0.309   256   333  0.309  0.293  0.015
5      hursomhelst  0.302   751   648  0.302  0.305  0.017
6          javisst  0.302   570   514  0.302  0.304  0.018
7            t.o.m  0.300   569   645  0.300  0.299  0.014
8          mustafa  0.299   148  1093  0.299  0.165  0.011
9              iom  0.299   656   626  0.299  0.276  0.015
10           d.v.s  0.295   256   321  0.295  0.294  0.012
11           tvärt  0.294  1119  1009  0.294  0.236  0.011
12              gg  0.293   221   479  0.293  0.205  0.014
13          likväl  0.292  1011   992  0.292  0.271  0.009
14       avseendet  0.291   299   277  0.291  0.278  0.014
15        tveklöst  0.290   291   273  0.290  0.294  0.0

           Word    GCH   n_i   n_j    GCH  Mctrl  Sctrl
0          pool  0.324    86   378  0.324  0.209  0.012
1   självfallet  0.323  1083   787  0.323  0.278  0.008
2        åhléns  0.318   205   558  0.318  0.164  0.009
3          japp  0.299  1504  1040  0.299  0.307  0.011
4        trotts  0.297   199   170  0.297  0.294  0.016
5        åhlens  0.296    97   464  0.296  0.172  0.010
6           iom  0.296   670   484  0.296  0.281  0.015
7        fastän  0.294   329   322  0.294  0.286  0.014
8     hazarerna  0.294   160   700  0.294  0.218  0.013
9          jodå  0.293   573   476  0.293  0.279  0.017
10       likväl  0.288   795   640  0.288  0.290  0.015
11           ty  0.288   586   623  0.288  0.239  0.006
12  hursomhelst  0.288   536   504  0.288  0.304  0.016
13          tim  0.285   149   395  0.285  0.214  0.007
14      isåfall  0.284   843   757  0.284  0.271  0.018
15     förmodar  0.283   392   337  0.283  0.306  0.017
16         iofs  0.283  1629  1317  0.283  0.269

            Word    GCH   n_i   n_j    GCH  Mctrl  Sctrl
0            oxå  0.335   341   980  0.335  0.259  0.009
1             hb  0.323    53  1665  0.323  0.171  0.007
2           japp  0.315  1333  1376  0.315  0.286  0.012
3             jb  0.311    84   713  0.311  0.190  0.010
4             ma  0.307   120   430  0.307  0.223  0.015
5        sprutor  0.306    59  1386  0.306  0.171  0.007
6           nåja  0.304   515   598  0.304  0.303  0.017
7         d.v.s.  0.301   494   632  0.301  0.272  0.014
8              )  0.299   261   477  0.299  0.297  0.014
9         spruta  0.295   171  1699  0.295  0.173  0.006
10      förmodar  0.294   314   379  0.294  0.280  0.015
11          endå  0.292   175   363  0.292  0.242  0.013
12           tja  0.291  1300  1309  0.291  0.283  0.017
13         t.o.m  0.291   413   434  0.291  0.286  0.015
14        trotts  0.289   186   261  0.289  0.277  0.016
15  gissningsvis  0.288   436   414  0.288  0.293  0.019
16        t.o.m.  0.288  1007  

#### NLI

#### STS (small)

In [45]:
q_show(df_bert_sts, "rch", as_table = True)


rch_2000:2001 jaccard = None
                Word    RCH  n_i  n_j    GCH  Mctrl  Sctrl
0  N1_kulturberikare  8.141   43   27  0.156  0.082  0.009
1          V1_berika  4.429   43   22  0.182  0.110  0.016
2           X_berika  2.276    6    4  0.297  0.215  0.035
3        N1_berikare  2.216   19    5  0.256  0.158  0.042
4    V1_kulturberika  0.385    7    4  0.235  0.223  0.028

rch_2001:2002 jaccard = 1.0
                Word    RCH  n_i  n_j    GCH  Mctrl  Sctrl
0  N1_kulturberikare  3.763   27   10  0.180  0.113  0.017
1          V1_berika  3.137   22   28  0.179  0.127  0.016
2    V1_kulturberika  0.788    4    2  0.283  0.271  0.015
3        N1_berikare  0.786    5    3  0.273  0.262  0.014

rch_2002:2003 jaccard = 1.0
                Word    RCH  n_i  n_j    GCH  Mctrl  Sctrl
0        N1_berikare  5.264    3    8  0.330  0.191  0.025
1          V1_berika  4.496   28   22  0.170  0.121  0.010
2  N1_kulturberikare  3.935   10    6  0.211  0.179  0.008
3       N1_globalist  0.905


rch_2012:2013 jaccard = 1.0
                              Word    RCH   n_i   n_j    GCH  Mctrl  Sctrl
0                N1_kulturberikare  6.528   587   463  0.032  0.024  0.001
1                        V1_berika  6.340  1600  1331  0.026  0.016  0.001
2                     N1_globalist  6.114   536   519  0.035  0.025  0.001
3                  V1_kulturberika  4.265   245   190  0.047  0.037  0.002
4                      X_återvandr  4.171     3    14  0.212  0.151  0.014
5                      N1_berikare  3.153   408   266  0.038  0.033  0.002
6                   N1C_globalistX  2.866   251   323  0.041  0.032  0.003
7                    V1_återvandra  2.751    42    63  0.083  0.066  0.006
8                  N1_återvandring  2.702   106   125  0.057  0.043  0.005
9                  N2_återvandrare  2.693     3     9  0.207  0.175  0.011
10                     X_globalist  2.557    34    43  0.115  0.101  0.005
11                  N1_förortsgäng  2.124     8    13  0.177  0.159  0.

                  Word     RCH   n_i   n_j    GCH  Mctrl  Sctrl
0         N1_globalist  19.340  3645  3340  0.020  0.011  0.000
1      N1_återvandring  13.051  2619  1296  0.031  0.011  0.001
2   N1C_återvandringsX  12.223  1161   344  0.044  0.019  0.002
3        V1_återvandra   8.683   303   136  0.052  0.033  0.002
4      N2_återvandrare   5.005    38    10  0.127  0.099  0.005
5      A1_globalistisk   4.085   782   839  0.027  0.020  0.002
6            V1_berika   3.551  1018  1059  0.025  0.019  0.002
7          X_återvandr   3.323    70    40  0.088  0.068  0.006
8             X_berika   3.190    34    47  0.115  0.100  0.004
9   V1_hjälpa_på_plats   2.936    71    45  0.071  0.057  0.005
10   N1_kulturberikare   2.718   238   193  0.043  0.035  0.003
11         X_globalist   2.321   165   146  0.061  0.051  0.004
12    N2C_återvandrarX   1.427     6     6  0.200  0.179  0.014
13      N1C_globalistX   1.201  1075   799  0.021  0.019  0.001
14       N1C_berikareX   1.135     3    

In [46]:
q_show(df_bert_sts, "gch", as_table = True)


gch_2000:2001 jaccard = None
                Word    GCH  n_i  n_j    GCH  Mctrl  Sctrl
0           X_berika  0.297    6    4  0.297  0.215  0.035
1        N1_berikare  0.256   19    5  0.256  0.158  0.042
2    V1_kulturberika  0.235    7    4  0.235  0.223  0.028
3          V1_berika  0.182   43   22  0.182  0.110  0.016
4  N1_kulturberikare  0.156   43   27  0.156  0.082  0.009

gch_2001:2002 jaccard = 1.0
                Word    GCH  n_i  n_j    GCH  Mctrl  Sctrl
0    V1_kulturberika  0.283    4    2  0.283  0.271  0.015
1        N1_berikare  0.273    5    3  0.273  0.262  0.014
2  N1_kulturberikare  0.180   27   10  0.180  0.113  0.017
3          V1_berika  0.179   22   28  0.179  0.127  0.016

gch_2002:2003 jaccard = 1.0
                Word    GCH  n_i  n_j    GCH  Mctrl  Sctrl
0       N1_globalist  0.384    1    1  0.384  0.384  0.000
1        N1_berikare  0.330    3    8  0.330  0.191  0.025
2  N1_kulturberikare  0.211   10    6  0.211  0.179  0.008
3          V1_berika  0.170

                              Word    GCH   n_i   n_j    GCH  Mctrl  Sctrl
0                  N2_återvandrare  0.367     1     3  0.367  0.310  0.003
1   P1_självständig_utrikespolitik  0.225     4     5  0.225  0.226  0.015
2                   N1_förortsgäng  0.193     9     8  0.193  0.173  0.015
3               N1C_kulturberikarX  0.157    13    16  0.157  0.141  0.013
4                    N1C_berikareX  0.119    23    33  0.119  0.121  0.007
5                      X_globalist  0.107    46    34  0.107  0.099  0.009
6               N1C_återvandringsX  0.101    16    60  0.101  0.069  0.007
7                    V1_återvandra  0.099    36    42  0.099  0.080  0.007
8                         X_berika  0.079    98   111  0.079  0.062  0.003
9                  N1_återvandring  0.064    38   106  0.064  0.054  0.006
10              V1_hjälpa_på_plats  0.058    33    47  0.058  0.061  0.005
11                    N1_globalist  0.055   253   536  0.055  0.031  0.002
12                  N1C_g

                                       Word    GCH   n_i   n_j    GCH  Mctrl  Sctrl
0                        N1C_kulturberikarX  0.299     1     3  0.299  0.292  0.012
1                             N1C_berikareX  0.233     9     3  0.233  0.205  0.014
2                               X_återvandr  0.216     2    70  0.216  0.085  0.006
3                           N2_återvandrare  0.206     4    38  0.206  0.106  0.012
4   P1_ordning_och_reda_i_flyktingpolitiken  0.183     8     6  0.183  0.148  0.021
5                            N1_förortsgäng  0.132    14    24  0.132  0.128  0.010
6                                  X_berika  0.124    35    34  0.124  0.114  0.006
7                        N1C_återvandringsX  0.109    55  1161  0.109  0.021  0.002
8                             V1_återvandra  0.091    27   303  0.091  0.039  0.003
9                           V1_kulturberika  0.074    59    71  0.074  0.069  0.006
10                              X_globalist  0.063   151   165  0.063  0.053

In [47]:
q_show(df_bert_sts, "spr", as_table = True, transition=False)


spr_2000 jaccard = None
                Word    SPR  Freq
0          V1_berika  0.638    43
1           X_berika  0.596     6
2    V1_kulturberika  0.592     7
3        N1_berikare  0.590    19
4  N1_kulturberikare  0.588    43
5     N1_förortsgäng  0.340     8

spr_2001 jaccard = 1.0
                Word    SPR  Freq
0        N1_berikare  0.712     5
1          V1_berika  0.680    22
2    V1_kulturberika  0.583     4
3           X_berika  0.577     4
4  N1_kulturberikare  0.568    27

spr_2002 jaccard = 1.0
                             Word    SPR  Freq
0                       V1_berika  0.656    28
1  P1_självständig_utrikespolitik  0.648     2
2                 V1_kulturberika  0.591     2
3                  N1_förortsgäng  0.591     2
4               N1_kulturberikare  0.590    10
5                     N1_berikare  0.459     3

spr_2003 jaccard = 1.0
                Word    SPR  Freq
0  N1_kulturberikare  0.603     6
1          V1_berika  0.600    22
2        N1_berikare  0.491   

                                       Word    SPR  Freq
0                               X_återvandr  0.689     2
1                                  X_berika  0.680    62
2                             N1C_berikareX  0.670    13
3                              N1_globalist  0.662  1641
4                               X_globalist  0.655    87
5                        N1C_kulturberikarX  0.647     4
6                                 V1_berika  0.647  1329
7                               N1_berikare  0.642   149
8                            N1C_globalistX  0.625   451
9                           A1_globalistisk  0.622   336
10                           N1_förortsgäng  0.622    12
11                        N1_kulturberikare  0.598   188
12                            V1_återvandra  0.593    21
13                          V1_kulturberika  0.584   105
14                          N2_återvandrare  0.555     5
15                       V1_hjälpa_på_plats  0.539    67
16                          N1_

In [48]:
q_show(df_bert_sts, "anospr", as_table = True, transition=False)


anospr_2002 jaccard = None
                Word  ANOSPR  Freq
0  N1_kulturberikare   0.879    10
1    V1_kulturberika   0.573     2
2          V1_berika  -0.105    28
3        N1_berikare  -2.212     3

anospr_2003 jaccard = 1.0
                Word  ANOSPR  Freq
0  N1_kulturberikare   1.721     6
1        N1_berikare  -0.756     8
2          V1_berika  -2.743    22

anospr_2004 jaccard = 1.0
                Word  ANOSPR  Freq
0    V1_kulturberika  13.112     2
1           X_berika   4.282     4
2          V1_berika  -0.521    77
3  N1_kulturberikare  -0.713    35

anospr_2005 jaccard = 1.0
                Word  ANOSPR  Freq
0           X_berika   3.366     6
1  N1_kulturberikare   2.502    22
2        N1_berikare   0.891     6
3     N1_förortsgäng   0.391     2
4          V1_berika   0.227    86
5    V1_kulturberika  -1.361    11

anospr_2006 jaccard = 1.0
                Word  ANOSPR  Freq
0       N1_globalist   0.983    57
1        N1_berikare   0.627    20
2          V1_berika   0

                                       Word  ANOSPR  Freq
0                            N1C_globalistX   1.652   520
1                           N1_återvandring   1.554   122
2                               X_globalist   1.269   151
3                           A1_globalistisk   1.116   526
4                            N1_förortsgäng   1.015    14
5                              N1_globalist   0.658  2339
6                                  X_berika   0.617    35
7                         N1_kulturberikare   0.568   115
8                           N2_återvandrare   0.478     4
9                               N1_berikare   0.403   120
10                            V1_återvandra   0.390    27
11                       V1_hjälpa_på_plats   0.158    60
12                       N1C_återvandringsX   0.152    55
13                                V1_berika   0.072   841
14                          V1_kulturberika  -0.104    59
15                              X_återvandr  -0.541     2
16  P1_ordning

#### STS (big)

#### KB

In [49]:
#q_show(toypol, "rch", as_table = True)

In [50]:
#q_show(toypol, "gsim", as_table = True)

### Time bin

In [51]:
#q_show(df_time_bin, "rch", as_table = True)

In [52]:
q_show(df_tbn_sts, "rch", as_table = True)


rch_2003:2007 jaccard = None
                              Word      RCH  n_i   n_j    GCH  Mctrl  Sctrl
0                        V1_berika  101.638  455  5923  0.082  0.010  0.001
1                   N1C_globalistX   67.424    1   384  0.283  0.038  0.003
2                      N1_berikare   52.883   34  1953  0.093  0.019  0.001
3                    N1C_berikareX   51.851    3   250  0.228  0.056  0.003
4                N1_kulturberikare   41.973  138  2762  0.044  0.013  0.001
5                         X_berika   25.111   19   446  0.139  0.043  0.004
6                  V1_kulturberika   17.618   30   902  0.079  0.026  0.003
7                  N1_återvandring   16.346   22   277  0.093  0.036  0.003
8                  A1_globalistisk   14.326   12   189  0.132  0.056  0.005
9                      X_globalist   13.461   14    53  0.177  0.096  0.006
10                    N1_globalist   13.072   93   541  0.078  0.037  0.003
11              V1_hjälpa_på_plats   12.744    3    77  0.

## Change Show

```
def overview(
    df, 
    var, 
    targets, 
    prefixes = ("N", "A", "V"), 
    th=4.781, 
    transition = True, 
    min_freq = 10, 
    return_md = False
)
```

### Yearly

#### SGNS - only DWTS

In [53]:
_ = overview(df_yearly_dwt, "rch", get_dwts(df_yearly_dwt, dwt_path), return_md=True)
print(_)

RCH
                    00:01  01:02  02:03  03:04  04:05  05:06  06:07  07:08  08:09  09:10  10:11  11:12  12:13  13:14  14:15  15:16  16:17  17:18  18:19  19:20  20:21  21:22
A1_globalistisk     False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False
N1_berikare         False  False  False  False  False  False   True  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False
N1_förortsgäng      False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False
N1_globalist        False  False  False  False  False  False  False   True  False  False  False  False  False  False  False  False  False  False  False  False  False  False
N1_kulturberikare   False  False  False  False  False  False   True  False  False   True  False  False  False  False  False  False 

In [54]:
# change_show(df, var, targets, th=4.781)
change_show(df_yearly_dwt, "rch", get_dwts(df_yearly_dwt, dwt_path))


rch_2006:2007
                Word     Value  n_2006  n_2007  gch_2006:2007  M_2006:2007  Std_2006:2007
0  N1_kulturberikare  5.140348      51     255       0.278955     0.218979       0.011125
1        N1_berikare  7.390323      14     185       0.290620     0.205861       0.010935

rch_2007:2008
           Word     Value  n_2007  n_2008  gch_2007:2008  M_2007:2008  Std_2007:2008
0  N1_globalist  9.722278      26     294       0.288232     0.191171       0.009519

rch_2009:2010
                Word     Value  n_2009  n_2010  gch_2009:2010  M_2009:2010  Std_2009:2010
0  N1_kulturberikare  8.116137     525     925       0.289864     0.223601       0.007784

rch_2017:2018
              Word     Value  n_2017  n_2018  gch_2017:2018  M_2017:2018  Std_2017:2018
0  N1_återvandring  11.08269     165    3288        0.19051     0.145217       0.003897

rch_2018:2019
              Word     Value  n_2018  n_2019  gch_2018:2019  M_2018:2019  Std_2018:2019
0  N1_återvandring  6.600887    3288    1

In [55]:
change_show(df_yearly_dwt, "rch", get_dwts(df_yearly_dwt, dwt_path), th=None)


rch_2000:2001
                Word     Value  n_2000  n_2001  gch_2000:2001  M_2000:2001  Std_2000:2001
0          V1_berika  3.112465      41      20       0.203531     0.109905       0.028681
1  N1_kulturberikare -1.504499      40      28       0.140368     0.164519       0.015306

rch_2001:2002
        Word     Value  n_2001  n_2002  gch_2001:2002  M_2001:2002  Std_2001:2002
0  V1_berika  2.056634      20      25       0.171617     0.117677       0.025007

rch_2002:2003
        Word     Value  n_2002  n_2003  gch_2002:2003  M_2002:2003  Std_2002:2003
0  V1_berika -2.531236      25      14       0.056936     0.112201       0.020817

rch_2003:2004
        Word     Value  n_2003  n_2004  gch_2003:2004  M_2003:2004  Std_2003:2004
0  V1_berika  3.490279      14      54       0.168842     0.099211       0.019022

rch_2004:2005
                Word     Value  n_2004  n_2005  gch_2004:2005  M_2004:2005  Std_2004:2005
0          V1_berika  0.202163      54      55       0.175218     0.17326


rch_2014:2015
                  Word     Value  n_2014  n_2015  gch_2014:2015  M_2014:2015  Std_2014:2015
0            V1_berika  1.363301    1001    1095       0.210943     0.201690       0.006471
1    N1_kulturberikare -3.496329     172     128       0.193388     0.246629       0.014519
2          N1_berikare -3.039826     174     198       0.215969     0.252438       0.011439
3      V1_kulturberika -1.415994     101     120       0.211916     0.222310       0.006999
4       N1_förortsgäng -1.436558      24      15       0.143899     0.171350       0.018220
5         N1_globalist -0.840169     722     676       0.205190     0.214200       0.010225
6      N1_återvandring -3.893727     157     136       0.138897     0.179320       0.009898
7      A1_globalistisk -3.165283     209     132       0.185394     0.221419       0.010852
8        V1_återvandra -3.578348      32      28       0.109443     0.171284       0.016478
9   V1_hjälpa_på_plats -0.002356     106     251       0.180740  

In [56]:
change_show(df_yearly_dwt, "gsim", get_dwts(df_yearly_dwt, dwt_path), th=None)


gsim_2000:2001
                Word     Value  n_2000  n_2001  gch_2000:2001  M_2000:2001  Std_2000:2001
0          V1_berika  0.802447      41      20       0.203531     0.109905       0.028681
1  N1_kulturberikare  0.904334      40      28       0.140368     0.164519       0.015306

gsim_2001:2002
        Word     Value  n_2001  n_2002  gch_2001:2002  M_2001:2002  Std_2001:2002
0  V1_berika  0.858144      20      25       0.171617     0.117677       0.025007

gsim_2002:2003
        Word     Value  n_2002  n_2003  gch_2002:2003  M_2002:2003  Std_2002:2003
0  V1_berika  0.984046      25      14       0.056936     0.112201       0.020817

gsim_2003:2004
        Word     Value  n_2003  n_2004  gch_2003:2004  M_2003:2004  Std_2003:2004
0  V1_berika  0.862588      14      54       0.168842     0.099211       0.019022

gsim_2004:2005
                Word     Value  n_2004  n_2005  gch_2004:2005  M_2004:2005  Std_2004:2005
0          V1_berika  0.852281      54      55       0.175218     0.


gsim_2014:2015
                  Word     Value  n_2014  n_2015  gch_2014:2015  M_2014:2015  Std_2014:2015
0            V1_berika  0.788336    1001    1095       0.210943     0.201690       0.006471
1    N1_kulturberikare  0.821051     172     128       0.193388     0.246629       0.014519
2          N1_berikare  0.778524     174     198       0.215969     0.252438       0.011439
3      V1_kulturberika  0.786451     101     120       0.211916     0.222310       0.006999
4       N1_förortsgäng  0.899544      24      15       0.143899     0.171350       0.018220
5         N1_globalist  0.799325     722     676       0.205190     0.214200       0.010225
6      N1_återvandring  0.906297     157     136       0.138897     0.179320       0.009898
7      A1_globalistisk  0.835128     209     132       0.185394     0.221419       0.010852
8        V1_återvandra  0.941473      32      28       0.109443     0.171284       0.016478
9   V1_hjälpa_på_plats  0.843080     106     251       0.180740 

#### NLI

#### STS (small)

In [57]:
_ = overview(df_bert_sts, "frq", get_dwts(df_yearly_dwt, dwt_path), th= None, transition = False, min_freq = None)
print(_)

FRQ
                      00    01    02    03    04    05     06     07      08      09      10      11      12      13      14      15      16      17      18      19      20      21      22
A1_globalistisk      0.0   0.0   1.0   0.0   0.0   3.0    9.0    9.0    39.0    59.0    82.0   153.0   269.0   217.0   267.0   171.0   336.0   526.0   782.0   839.0   715.0   591.0   748.0
N1_berikare         19.0   5.0   3.0   8.0   0.0   6.0   20.0  213.0   619.0   639.0   482.0   408.0   408.0   266.0   200.0   213.0   149.0   120.0   138.0   145.0    88.0    32.0    35.0
N1_förortsgäng       8.0   0.0   2.0   0.0   0.0   2.0    0.0    2.0     6.0    19.0     5.0     9.0     8.0    13.0    25.0    17.0    12.0    14.0    24.0    29.0    10.0    21.0    39.0
N1_globalist         0.0   0.0   1.0   1.0   3.0  32.0   57.0   27.0   174.0   140.0   200.0   253.0   536.0   519.0   589.0   572.0  1641.0  2339.0  3645.0  3340.0  3063.0  2645.0  4385.0
N1_kulturberikare   43.0  27.0  10.0   6.0  35.0  2

In [58]:
_ = overview(df_bert_sts, "gch", get_dwts(df_yearly_dwt, dwt_path), th= None, transition = True, min_freq = None, return_md=True)
print(_)

GCH
                    00:01  01:02  02:03  03:04  04:05  05:06  06:07  07:08  08:09  09:10  10:11  11:12  12:13  13:14  14:15  15:16  16:17  17:18  18:19  19:20  20:21  21:22
A1_globalistisk       NaN    NaN    NaN    NaN    NaN  0.200  0.213  0.139  0.077  0.084  0.054  0.046  0.040  0.043  0.042  0.051  0.033  0.036  0.027  0.031  0.033  0.037
N1_berikare         0.256  0.273  0.330    NaN    NaN  0.171  0.106  0.041  0.030  0.031  0.031  0.035  0.038  0.042  0.046  0.058  0.052  0.051  0.045  0.057  0.084  0.096
N1_förortsgäng        NaN    NaN    NaN    NaN    NaN    NaN    NaN  0.256  0.171  0.163  0.227  0.193  0.177  0.142  0.116  0.122  0.140  0.132  0.112  0.131  0.137  0.115
N1_globalist          NaN    NaN  0.384  0.320  0.289  0.100  0.106  0.122  0.077  0.050  0.048  0.055  0.035  0.032  0.040  0.039  0.023  0.028  0.020  0.029  0.024  0.040
N1_kulturberikare   0.156  0.180  0.211  0.182  0.099  0.096  0.069  0.032  0.026  0.026  0.025  0.030  0.032  0.036  0.041  0.053 

In [59]:
_ = overview(df_bert_sts, "rch", get_dwts(df_yearly_dwt, dwt_path), min_freq = 10, return_md=True)
print(_)

RCH
                    00:01  01:02  02:03  03:04  04:05  05:06  06:07  07:08  08:09  09:10  10:11  11:12  12:13  13:14  14:15  15:16  16:17  17:18  18:19  19:20  20:21  21:22
A1_globalistisk     False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False   True  False   True  False   True
N1_berikare         False  False  False  False  False  False   True   True  False  False  False  False  False  False  False  False  False  False  False  False  False  False
N1_förortsgäng      False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False
N1_globalist        False  False  False  False  False  False  False   True   True  False  False   True   True   True   True   True   True   True   True   True   True   True
N1_kulturberikare    True  False  False  False  False  False   True  False  False  False  False   True   True  False  False  False 

In [61]:
change_show(df_bert_sts, "rch", get_dwts(df_bert_sts, dwt_path))


rch_2000:2001
                Word     Value  n_2000  n_2001  gch_2000:2001  M_2000:2001  Std_2000:2001
0  N1_kulturberikare  8.141006      43      27        0.15589     0.082283       0.008621

rch_2003:2004
        Word     Value  n_2003  n_2004  gch_2003:2004  M_2003:2004  Std_2003:2004
0  V1_berika  9.377859      22      77        0.15421     0.083474       0.007192

rch_2006:2007
                Word      Value  n_2006  n_2007  gch_2006:2007  M_2006:2007  Std_2006:2007
0          V1_berika  17.937806     270     661       0.074011     0.029935       0.002343
1  N1_kulturberikare  15.641346      75     306       0.068502     0.038899       0.001805
2        N1_berikare  10.549533      20     213       0.105796     0.056945       0.004415
3    V1_kulturberika   6.850862      17     136       0.093213     0.064404       0.004009

rch_2007:2008
           Word      Value  n_2007  n_2008  gch_2007:2008  M_2007:2008  Std_2007:2008
0     V1_berika  10.640198     661    1408       0.0299

In [62]:
df_bert_sts.loc["V1_berika"]["gsim_2008:2009"]

0.9964507179938941

In [63]:
_ = overview(df_bert_sts, "anospr", get_dwts(df_yearly_dwt, dwt_path), th=None, transition = False, min_freq = None, return_md=True, rounder = 2)
print(_)

ANOSPR
                      02    03     04    05    06    07    08    09    10    11    12    13    14    15    16    17    18    19    20    21    22
A1_globalistisk      NaN   NaN    NaN   NaN   NaN  0.53  0.17 -0.13  0.35  0.39  0.46  0.09  1.00  1.16  0.95  1.12  1.26  1.31  1.27  1.16  0.99
N1_berikare        -2.21 -0.76    NaN  0.89  0.63  0.50  0.36  0.41  0.46  0.31  0.29  0.22  0.18  0.20  0.32  0.40  0.26  0.38  0.22  0.49  0.07
N1_förortsgäng       NaN   NaN    NaN  0.39   NaN  1.36 -0.07  0.34  0.26  0.55  0.26  0.70  0.04  0.42  0.76  1.01  0.45  0.37  0.20  0.24  0.91
N1_globalist         NaN   NaN    NaN   NaN  0.98  0.62  0.23  0.37  0.46  0.34 -0.44 -0.23  0.09  0.63  0.40  0.66  0.56  0.49  0.53  0.47  0.55
N1_kulturberikare   0.88  1.72  -0.71  2.50 -1.56  0.70  0.29  0.42 -0.10 -0.07 -0.40 -0.20 -0.36 -0.35  0.78  0.57 -0.79  0.11  0.24 -0.12  0.40
N1_återvandring      NaN   NaN    NaN   NaN   NaN   NaN -0.16  0.13  0.31 -0.10 -0.90  0.23  0.22  0.14 -0.09  1.55  

In [64]:
_ = overview(df_bert_sts, "anospr", get_dwts(df_yearly_dwt, dwt_path), th=-2, transition = False, min_freq = None, return_md=True, rounder = 2)
print(_)

ANOSPR
                       02     03     04     05     06     07    08     09    10     11    12    13    14    15    16    17    18    19    20    21    22
A1_globalistisk     False  False  False  False  False   True  True   True  True   True  True  True  True  True  True  True  True  True  True  True  True
N1_berikare         False   True  False   True   True   True  True   True  True   True  True  True  True  True  True  True  True  True  True  True  True
N1_förortsgäng      False  False  False   True  False   True  True   True  True   True  True  True  True  True  True  True  True  True  True  True  True
N1_globalist        False  False  False  False   True   True  True   True  True   True  True  True  True  True  True  True  True  True  True  True  True
N1_kulturberikare    True   True   True   True   True   True  True   True  True   True  True  True  True  True  True  True  True  True  True  True  True
N1_återvandring     False  False  False  False  False  False  True   True  

#### STS (big)

#### KB

In [65]:
_ = overview(df_bert_kb, "rch", get_dwts(df_yearly_dwt, dwt_path), return_md=True)
print(_)

RCH
                    00:01  01:02  02:03  03:04  04:05  05:06  06:07  07:08  08:09  09:10  10:11  11:12  12:13  13:14  14:15  15:16  16:17  17:18  18:19  19:20  20:21  21:22
A1_globalistisk     False  False  False  False  False  False  False  False  False  False  False  False  False  False  False   True  False   True  False   True   True   True
N1_berikare         False  False  False  False  False  False   True   True  False  False  False  False  False  False  False  False  False  False  False  False  False  False
N1_förortsgäng      False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False  False
N1_globalist        False  False  False  False  False  False  False   True   True  False  False   True  False  False  False   True   True   True   True   True   True   True
N1_kulturberikare    True  False  False  False  False  False   True   True  False   True   True  False  False  False  False  False 

### Time bin

In [66]:
#change_show(df_time_bin_dwt, "rch", get_dwts(df_time_bin_dwt, dwt_path))

NameError: name 'df_time_bin_dwt' is not defined

In [None]:
#change_show(df_time_bin_dwt, "gsim", get_dwts(df_time_bin_dwt, dwt_path), th=None)

In [68]:
# a=load_metric("/srv/data/gusbohom/root/corpora/fb_pol/time_bin/radical3/vocab/2015.txt")["V1_berika"]
# b=load_metric("/srv/data/gusbohom/root/corpora/fb_pol/time_bin/radical3/vocab/2019.txt")["V1_berika"]
# print(a, b)

In [69]:
change_show(df_tbn_sts, "rch", get_dwts(df_tbn_sts, dwt_path))


rch_2003:2007
                Word       Value  n_2003  n_2007  gch_2003:2007  M_2003:2007  Std_2003:2007
0          V1_berika  101.638235     455    5923       0.082196     0.010324       0.000674
1  N1_kulturberikare   41.972630     138    2762       0.044481     0.013360       0.000707
2       N1_globalist   13.071699      93     541       0.077816     0.036770       0.002994
3        N1_berikare   52.882744      34    1953       0.092718     0.018655       0.001335
4    V1_kulturberika   17.617830      30     902       0.079005     0.026139       0.002861
5    N1_återvandring   16.346126      22     277       0.093373     0.036198       0.003335
6           X_berika   25.111275      19     446       0.138702     0.042703       0.003645
7        X_globalist   13.460569      14      53       0.176886     0.095513       0.005764
8    A1_globalistisk   14.325943      12     189       0.132307     0.055879       0.005087

rch_2007:2011
                 Word      Value  n_2007  n_2011  

## Correlation

In [70]:
get_variables(df_yearly_dwt)

{'yr_prefix': {'doc', 'fpm', 'frq', 'tot'},
 'tr_prefix': {'diffpm',
  'diffrq',
  'gch',
  'gsim',
  'mccc',
  'mcsim',
  'rch',
  'rsim',
  'stdc',
  'stdsim'},
 'years': {'2000',
  '2001',
  '2002',
  '2003',
  '2004',
  '2005',
  '2006',
  '2007',
  '2008',
  '2009',
  '2010',
  '2011',
  '2012',
  '2013',
  '2014',
  '2015',
  '2016',
  '2017',
  '2018',
  '2019',
  '2020',
  '2021',
  '2022',
  'frq'},
 'transitions': {'2000:2001',
  '2001:2002',
  '2002:2003',
  '2003:2004',
  '2004:2005',
  '2005:2006',
  '2006:2007',
  '2007:2008',
  '2008:2009',
  '2009:2010',
  '2010:2011',
  '2011:2012',
  '2012:2013',
  '2013:2014',
  '2014:2015',
  '2015:2016',
  '2016:2017',
  '2017:2018',
  '2018:2019',
  '2019:2020',
  '2020:2021',
  '2021:2022'}}

### Yearly

#### All words (SGNS)

In [71]:
universal_correlation(
    df=df_yearly, 
    var1="gch", 
    var2="frq", 
    var1cut = None, 
    var2cut = -1, 
    norm1=None, 
    norm2=np.log, 
    metric="pearson")

Length:
X: 3342680
Y: 3342680


(0.3803103896395752, 0.0)

In [72]:
universal_correlation(
    df=df_yearly, 
    var1="gsim", 
    var2="frq", 
    var1cut = None, 
    var2cut = -1, 
    norm1=None, 
    norm2=np.log, 
    metric="pearson")

Length:
X: 3342680
Y: 3342680


(-0.36867454175092923, 0.0)

In [73]:
universal_correlation(
    df=df_yearly, 
    var1="rch", 
    var2="frq", 
    var1cut = None, 
    var2cut = -1, 
    norm1=None, 
    norm2=np.log, 
    metric="pearson")

Length:
X: 3342680
Y: 3342680


(0.06659562110360326, 0.0)

In [74]:
universal_correlation(
    df=df_yearly, 
    var1="rch", 
    var2="stdc", 
    var1cut = None, 
    var2cut = None, 
    norm1=None, 
    norm2=None, 
    metric="pearson")

Length:
X: 3342680
Y: 3342680


(0.054552105535179966, 0.0)

#### DWTs

###### SGNS

In [75]:
trend(df_yearly_dwt, var="frq", norm=None, transition = False, metric="pearson")

Unnamed: 0,Word,Trend,p,N
0,V1_berika,0.41,0.05178,23
1,N1_kulturberikare,0.079,0.72154,23
2,N1_berikare,0.077,0.7276,23
3,V1_kulturberika,0.15,0.49578,23
4,N1_förortsgäng,0.738,6e-05,23
5,N1_globalist,0.869,0.0,23
6,N1_återvandring,0.639,0.00104,23
7,A1_globalistisk,0.893,0.0,23
8,V1_återvandra,0.724,9e-05,23
9,V1_hjälpa_på_plats,0.467,0.02461,23


In [76]:
trend(df_yearly_dwt, var="gch", norm=None, transition = True, metric="pearson")

Unnamed: 0,Word,Trend,p,N
0,V1_berika,0.266,0.2313,22
1,N1_kulturberikare,-0.052,0.83392,19
2,N1_berikare,-0.866,0.00013,13
3,V1_kulturberika,-0.732,0.06159,7
4,N1_förortsgäng,,,0
5,N1_globalist,-1.0,1.0,2
6,N1_återvandring,,,0
7,A1_globalistisk,,,0
8,V1_återvandra,,,0
9,V1_hjälpa_på_plats,,,0


In [77]:
correlation(df=df_yearly_dwt, 
            var1="diffpm", 
            var2="rch", 
            mode=2, 
            metric = "pearson")

Unnamed: 0,Word,Correlation,p,N
0,V1_berika,-0.13,0.56,22
1,N1_kulturberikare,-0.39,0.1,19
2,N1_berikare,-0.43,0.1,16
3,V1_kulturberika,-0.53,0.04,16
4,N1_förortsgäng,0.02,0.98,6
5,N1_globalist,-0.19,0.47,17
6,N1_återvandring,-0.4,0.14,15
7,A1_globalistisk,-0.32,0.27,14
8,antiglobalister,-0.88,0.02,6
9,V1_återvandra,-0.56,0.03,15


In [80]:
correlation(df=df_yearly_dwt, var1="stdc", var2="rch", mode=2, metric = "pearson")

Unnamed: 0,Word,Correlation,p,N
0,V1_berika,0.21,0.35,22
1,N1_kulturberikare,-0.29,0.23,19
2,N1_berikare,-0.11,0.69,16
3,V1_kulturberika,0.04,0.88,16
4,N1_förortsgäng,0.36,0.49,6
5,N1_globalist,-0.13,0.63,17
6,N1_återvandring,-0.39,0.15,15
7,A1_globalistisk,0.26,0.36,14
8,antiglobalister,0.17,0.75,6
9,V1_återvandra,-0.01,0.97,15


In [81]:
correlation(df=df_yearly_dwt, var1="diffpm", var2="rch", mode=2, metric = "spearman")

Unnamed: 0,Word,Correlation,p,N
0,V1_berika,-0.03,0.9,22
1,N1_kulturberikare,-0.38,0.11,19
2,N1_berikare,-0.14,0.6,16
3,V1_kulturberika,-0.08,0.78,16
4,N1_förortsgäng,0.09,0.87,6
5,N1_globalist,-0.47,0.06,17
6,N1_återvandring,-0.21,0.44,15
7,A1_globalistisk,-0.3,0.3,14
8,antiglobalister,-0.6,0.21,6
9,V1_återvandra,-0.19,0.51,15


```
def universal_correlation(
    df, 
    var1, 
    var2, 
    var1cut = None, 
    var2cut = None, 
    norm1=None, 
    norm2=None, 
    metric="pearson")
```

In [None]:
##### Universal

In [82]:
universal_correlation(df_yearly, "fpm", "gch", -1, norm1=np.log)

Length:
X: 3342680
Y: 3342680


(0.3944829273050009, 0.0)

In [83]:
universal_correlation(df_yearly, "fpm", "gch", 0, norm1=np.log)

Length:
X: 3342680
Y: 3342680


(0.47518719624787575, 0.0)

In [84]:
universal_correlation(df_yearly_dwt.loc[[w for w in df_yearly_dwt.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      -1, 
                      norm1=np.log)

Length:
X: 242
Y: 242


(0.38421898291972817, 8.683424766812245e-07)

In [85]:
universal_correlation(df_yearly_dwt.loc[[w for w in df_yearly_dwt.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      0, 
                      norm1=np.log)

Length:
X: 242
Y: 242


(0.5454433760928777, 2.581384088189342e-13)

#### Towards Understanding the Mystery (What is wrong?)

In [86]:
term = "N1_kulturberikare"

In [87]:
df_bert_sts.loc[term, [f"gch_{year}:{year+1}" for year in range(2000, 2021)]]

gch_2000:2001    0.155890
gch_2001:2002    0.179747
gch_2002:2003    0.210698
gch_2003:2004    0.182223
gch_2004:2005    0.098843
gch_2005:2006    0.096017
gch_2006:2007    0.068502
gch_2007:2008    0.032416
gch_2008:2009    0.025822
gch_2009:2010    0.026369
gch_2010:2011    0.024593
gch_2011:2012    0.030090
gch_2012:2013    0.032451
gch_2013:2014    0.036250
gch_2014:2015    0.040695
gch_2015:2016    0.053390
gch_2016:2017    0.055503
gch_2017:2018    0.050042
gch_2018:2019    0.043174
gch_2019:2020    0.049453
gch_2020:2021    0.056498
Name: N1_kulturberikare, dtype: float64

In [88]:
df_yearly.loc[term, [f"gch_{year}:{year+1}" for year in range(2000, 2021)]]

gch_2000:2001    0.140368
gch_2001:2002         NaN
gch_2002:2003         NaN
gch_2003:2004         NaN
gch_2004:2005    0.125113
gch_2005:2006    0.217431
gch_2006:2007    0.278955
gch_2007:2008    0.236006
gch_2008:2009    0.243729
gch_2009:2010    0.289864
gch_2010:2011    0.245441
gch_2011:2012    0.236930
gch_2012:2013    0.250001
gch_2013:2014    0.220381
gch_2014:2015    0.193388
gch_2015:2016    0.237264
gch_2016:2017    0.203891
gch_2017:2018    0.248979
gch_2018:2019    0.195478
gch_2019:2020    0.173714
gch_2020:2021    0.186256
Name: N1_kulturberikare, dtype: float64

##### NLI

##### STS (small)

In [89]:
trend(df_bert_sts, var="frq", norm=None, transition = False, metric="pearson")

Unnamed: 0,Word,Trend,p,N
0,V1_berika,0.472,0.0229,23
1,N1_kulturberikare,0.081,0.712,23
2,N1_berikare,0.085,0.69995,23
3,N1_förortsgäng,0.795,1e-05,23
4,V1_kulturberika,0.173,0.42998,23
5,N1C_kulturberikarX,-0.056,0.80044,23
6,N1_kulturberikare;,0.216,0.32191,23
7,N1_återvandring,0.696,0.00023,23
8,A1_globalistisk,0.895,0.0,23
9,N1_globalist,0.854,0.0,23


In [90]:
trend(df_bert_sts, var="gch", norm=None, transition = True, metric="pearson")

Unnamed: 0,Word,Trend,p,N
0,V1_berika,-0.739,9e-05,22
1,N1_kulturberikare,-0.666,0.00072,22
2,N1_berikare,-0.659,0.00158,20
3,N1_förortsgäng,-0.795,0.00117,13
4,V1_kulturberika,-0.895,0.00019,11
5,N1C_kulturberikarX,-0.791,0.06075,6
6,N1_kulturberikare;,,,0
7,N1_återvandring,,,0
8,A1_globalistisk,,,1
9,N1_globalist,-0.934,0.06613,4


In [91]:
trend(df_bert_sts, var="spr", norm=None, transition = False, metric="pearson")

Unnamed: 0,Word,Trend,p,N
0,V1_berika,-0.094,0.67098,23
1,N1_kulturberikare,0.064,0.77321,23
2,N1_berikare,0.338,0.12383,22
3,N1_förortsgäng,0.566,0.01425,18
4,V1_kulturberika,0.053,0.83871,17
5,N1C_kulturberikarX,-0.32,0.33678,11
6,N1_kulturberikare;,,,0
7,N1_återvandring,0.326,0.59214,5
8,A1_globalistisk,,,0
9,N1_globalist,,,1


In [92]:
trend(df_bert_sts, var="anospr", norm=None, transition = False, metric="pearson")

Unnamed: 0,Word,Trend,p,N
0,V1_berika,0.412,0.06379,21
1,N1_kulturberikare,-0.255,0.26527,21
2,N1_berikare,0.359,0.12018,20
3,N1_förortsgäng,-0.099,0.71459,16
4,V1_kulturberika,-0.351,0.20007,15
5,N1C_kulturberikarX,0.09,0.81798,9
6,N1_kulturberikare;,,,0
7,N1_återvandring,0.99,0.08851,3
8,A1_globalistisk,,,0
9,N1_globalist,,,0


In [93]:
universal_correlation(df_bert_sts.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      -1, 
                      norm1=np.log,
                      min_freq = 10
                     )

Length:
X: 572
Y: 572


(-0.4848305321284184, 3.465294648044792e-13)

In [94]:
universal_correlation(df_bert_sts.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      0, 
                      norm1=np.log,
                      min_freq = 10
                     )

Length:
X: 572
Y: 572


(-0.48112023935444664, 5.545210782258646e-13)

##### STS (big)

In [95]:
universal_correlation(df_bert_big.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      -1, 
                      norm1=np.log,
                      min_freq = 10
                     )

Length:
X: 572
Y: 572


(-0.47044748734004177, 2.0783157327707672e-12)

In [96]:
universal_correlation(df_bert_big.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      0, 
                      norm1=np.log,
                      min_freq = 10
                     )

Length:
X: 572
Y: 572


(-0.462595786575757, 5.337378768368977e-12)

##### KB

In [97]:
universal_correlation(df_bert_kb.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      -1, 
                      norm1=np.log,
                      min_freq = 10
                     )

Length:
X: 572
Y: 572


(-0.42320995460506067, 4.280025597147215e-10)

In [98]:
universal_correlation(df_bert_big.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      0, 
                      norm1=np.log,
                      min_freq = 10
                     )

Length:
X: 572
Y: 572


(-0.462595786575757, 5.337378768368977e-12)