In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import re
from util import load_metric
from scipy.stats import spearmanr, pearsonr, zscore, rankdata
#from collections import Counter

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>div.output_area pre {white-space: pre;}</style>"))

In [3]:
pd.set_option('display.expand_frame_repr', False)

In [4]:
pd.set_option('display.max_columns', None)

## Functions

In [5]:
def jaccard(a, b):
    return len(a.intersection(b)) / len(a.union(b))

In [6]:
def q_show(df, var, k = 40, as_table = False, transition=True, min_freq=None, return_md=False):
    """
    Given a dataframe, shows the top k words for variable var each year.
    Note: as_table only sypported for transition variables!
    """
    
    COLUMNS = [col for col in sorted(df.columns) if col.startswith(var)]

    if min_freq == None:    
        WORDS = [df[col].sort_values(ascending=False)[:k].index for col in COLUMNS]
    else:
        TRANSITIONS = [tuple(col.split("_")[-1].split(":")) for col in COLUMNS]
        WORDS = [df[(df[f"frq_{trs[0]}"] >= min_freq) & (df[f"frq_{trs[1]}"] >= min_freq)][col].sort_values(ascending=False)[:k].index for col, trs in zip(COLUMNS, TRANSITIONS)]
    
    ser = [(c, d) for c, d in zip(COLUMNS, WORDS)]
    
    md = ""
    
    for i, (col, s) in enumerate(ser):
        if i > 0:
            jac = round(jaccard(set(s), ser[i-1][-1]), 2)
        else:
            jac = None
        
        print(); md += "\n"
        print(col, "jaccard =", jac); md += f"{col} jaccard = {jac}"

        if transition:
            if as_table == False:
                print(s)
            else:
                trans  = col.split("_")[-1]
                ti, tj = tuple(trans.split(":"))
                table  = [] 
                for word in s:
                    v    = df.loc[word][col]
                    f_ti = int(df.loc[word][f"frq_{ti}"])
                    f_tj = int(df.loc[word][f"frq_{tj}"])
                    gch  = df.loc[word][f"gch_{trans}"]
                    m    = df.loc[word][f"mccc_{trans}"]
                    std  = df.loc[word][f"stdc_{trans}"]
                    columns = ["Word", var.upper(), "n_i", "n_j", "GCH", "Mctrl", "Sctrl"]
                    table.append([word, v, f_ti, f_tj, gch, m, std])
                print(pd.DataFrame(table, columns=columns).dropna().round(3))
                md += pd.DataFrame(table, columns=columns).dropna().round(3).to_markdown()
        
        else:
            year = col.split("_")[-1]
            table = []
            for word in s:
                v    = df.loc[word][col]
                f = int(df.loc[word][f"frq_{year}"])
                columns = ["Word", var.upper(), "Freq"]
                table.append([word, v, f])
            print(pd.DataFrame(table, columns=columns).dropna().round(3))
            md += pd.DataFrame(table, columns=columns).dropna().round(3).to_markdown()
    if return_md:
        return md

In [7]:
def change_show(df, var, targets, th=4.781, return_md = False, min_freq=10):
    """
    Given a dataframe, shows the value for a variable of target 
    each transition/year.
    Provide threshold to only show the targets meeting the threshold
    """
    
    md = ""

    for col in sorted([col for col in df.columns if col.startswith(var)]):
        trans  = col.split("_")[-1]
        ti, tj = tuple(trans.split(":"))
        table  = [] 
        
        for word in targets:
            v    = df.loc[word][col]
            f_ti = int(df.loc[word][f"frq_{ti}"])
            f_tj = int(df.loc[word][f"frq_{tj}"])
            if min_freq != None:
                if f_ti < min_freq or f_tj < min_freq:
                    continue
            gch  = df.loc[word][f"gch_{trans}"]
            m    = df.loc[word][f"mccc_{trans}"]
            std  = df.loc[word][f"stdc_{trans}"]            
            
            if th != None:
                if v > th:
                    table.append([word, v, f_ti, f_tj, gch, m, std])
            else:
                table.append([word, v, f_ti, f_tj, gch, m, std])
        
        columns = ["Word", "Value", f"n_{ti}", f"n_{tj}", f"gch_{ti}:{tj}", f"M_{ti}:{tj}", f"Std_{ti}:{tj}"]
        if table != []:
            display = pd.DataFrame(table, columns=columns)
            print(); md += "\n" 
            print(col); md += f"{col}\n"
            print(display); md += display.to_markdown()
    
    if return_md:
        return md

In [8]:
def v_renamer(var_string):
    var_string = var_string.split("_")
    yi, yj     = tuple(var_string[-1].split(":"))
    yi         = yi[-2:]
    yj         = yj[-2:]
    return f"{yi}:{yj}"

In [9]:
def overview(
    df, 
    var, 
    targets, 
    prefixes = ("N", "A", "V"), 
    th=4.781, 
    transition = True, 
    min_freq = 10, 
    return_md = False,
    rounder = 3
):
    """ 
    Similar to change_show but:
    * Show data as one table
    * Only show variable (change_show display additional data)
    * Provide th to show True/False
    """
    
    cols = sorted([col for col in df.columns if col.startswith(var)])
    targets = [w for w in targets if w.startswith(prefixes)]
    targets.sort()
    if transition:
        renamer = {k: v_renamer(k) for k in cols}
    else:
        renamer = {k: k.split("_")[-1][-2:] for k in cols}
    
    md = ""
    
    if min_freq != None and transition:
        df = df.copy()
        transitions = find_transitions(df, "df", var)
        for trg in targets:
            for ti, tj in transitions:
                if df.loc[trg][f"frq_{ti}"] < min_freq or df.loc[trg][f"frq_{tj}"] < min_freq:
                    df.at[trg, f"{var}_{ti}:{tj}"] = 0

    if min_freq != None and transition == False:
        df = df.copy()
        years = sorted([int(col.split("_")[-1]) for col in cols])
        for trg in targets:
            for year in years:
                if df.loc[trg][f"frq_{year}"] < min_freq:
                    df.at[trg, f"{var}_{year}"] = 0        
                       
    
    if th != None:
        out = df.loc[targets, cols] > th
        out.rename(columns = renamer, inplace = True)
        print(var.upper()); md += var.upper() + "\n"
        print(out); md += out.to_markdown()
        print("SUM:", out.sum().sum()); md += f"\nSUM: {out.sum().sum()}\n"
    else:
        out = df.loc[targets, cols].round(rounder)
        out.rename(columns = renamer, inplace = True)
        print(var.upper()); md += var.upper() + "\n"
        print(out); md += out.to_markdown()
        print("SUM:", out.sum().sum()); md += f"\nSUM: {out.sum().sum()}\n"
    
    if return_md:
        return md

In [10]:
def read_csv(path):
    return pd.read_csv(path, sep=";", index_col=0)

In [11]:
def get_dwts(df, path):
    with open(Path(path), "r") as f:
        dwt_roots = [w.strip("\n") for w in f.readlines()]
    dwt_regex = re.compile(f"({'|'.join(dwt_roots)})")    
    dwts = [str(w) for w in df.index if re.search(dwt_regex, str(w)) != None]
    return dwts    

In [12]:
def get_variables(df):
    """
    Summeraises variables of a dataframe
    """
    
    yr_prefix = set()
    tr_prefix = set()
    years = set()
    transitions = set()
    
    for v in df.columns:
        prefix, suffix = tuple(v.split("_"))
        if ":" in suffix:
            tr_prefix.add(prefix)
            transitions.add(suffix)
        else:
            yr_prefix.add(prefix)
            years.add(suffix)
    
    return {
        "yr_prefix": yr_prefix, 
        "tr_prefix": tr_prefix,
        "years": years,
        "transitions": transitions
        }    

In [13]:
def checker(word, transition, controls_dir, n_ctrl=10, variable="cosine_change"):
    """
    Goes to original data, shows the control change/similarity of a word 
    at a transition.
    param word
    param transition    tupple of ti and tj 
    param controls_dir  where to find controls
    param variable      "cosine_change" or "cosine_sim"
    """
    
    ti, tj = transition
    basename = Path(controls_dir) / variable 
    
    filenames = [f"{ti}_{tj}_control{n}.txt" for n in range(1, n_ctrl+1)]
    
    values = []
    
    for file in filenames:
        data = load_metric(basename / file)
        value = data[word] if word in data else "NO MEASURE"
        values.append(value)
        print(file, value)
        
    return values   

In [14]:
def ncd(DATA, CORPUS, VAR, VAL): # No Change Detector

    corpus = Path(CORPUS)
    transitions = find_transitions(corpus / "vocab")    

    for yi, yj in transitions:
        print()
        print(f"{yi}:{yj}")
        
        A = list(DATA[DATA[f"{VAR}_{yi}:{yj}"] == VAL].index)
        print("No change (A):", len(A))
        B = list(DATA[DATA[f"{VAR}_{yi}:{yj}"] != VAL].index)
        print("Other (B):", len(B))
        print()
        print("{: <20} {}".format("A", "B"))
        print("{: <20} {}".format("---", "---"))
        for w1, w2 in zip(A[:100], B[:100]):
            print(f"{w1: <20} {w2}")

In [15]:
def find_transitions(source, mode="file", var = None):
    """
    List transitions. 
    For mode = "file", expected source: filepath
    For mode = "df", expected source: pandas DataFrame; provide varible var
    """
    if mode == "file":
        years = [int(file.strip(".txt")) for file in os.listdir(source)]
        years.sort()
        transitions = [(year, years[i]) for i, year in enumerate(years[:-1], start=1)]
    if mode == "df":
        cols = [col for col in source.columns if col.startswith(var)]
        cols.sort()
        transitions = [tuple(col.split("_")[-1].split(":")) for col in cols]
    return transitions

In [16]:
def w_overlap_checker(corpus, th_c):
    
    corpus = Path(corpus)
    transitions = find_transitions(corpus / "vocab")
    
    for yi, yj in transitions:
        print()
        print(f"{yi}:{yj}")
        
        voc_a = load_metric(corpus / f"vocab/{yi}.txt")
        voc_b = load_metric(corpus / f"vocab/{yj}.txt")

        voc_a = {w: c for w, c in voc_a.items() if c >= th_c}
        voc_b = {w: c for w, c in voc_b.items() if c >= th_c}
        print(f"{yi}:", len(voc_a))
        print(f"{yj}:", len(voc_b))
        print(f"{yi} and {yj}:", len([w for w in voc_a.keys() if w in voc_b.keys()]))
        print(f"{yi} or {yj}:", len(set(voc_a.keys()).union(set(voc_b.keys()))))
        print(f"{yi} - {yj}:", len(set(voc_a.keys()).difference(set(voc_b.keys()))))

In [51]:
def trend(df, var, norm=None, transition = True, metric="pearson"):
    
    if transition == 1:
        T = find_transitions(source=df, mode = "df", var=var)
    else:
        cols = [col for col in df.columns if col.startswith(var)]
        cols.sort()
        T = [int(col.split("_")[-1]) for col in cols]        
    
    table = []
    for w in df.index:
        valid = []
        if transition:
            X = df[[f"{var}_{ti}:{tj}" for ti, tj in T]].loc[w]
        else:
            X = df[[f"{var}_{t}" for t in T]].loc[w]
        
        valid = [(x, t) for x, t in zip(X, T) if not pd.isna(x)]

        N = len(valid)

        if N < 2:
            v = np.nan
            p = np.nan
        else:
            X, T = zip(*valid)
            if norm != None:
                X = norm(X)

            if metric == "pearson":
                R_data = pearsonr(X, T)

            if metric == "spearman":
                R_data = spearmanr(X, Y)
                
            v = R_data.statistic
            p = R_data.pvalue

        table.append([w, round(v, 2), round(p, 2), N])
    correlation = pd.DataFrame(table, columns=["Word", "Trend", "p", "N"])
        
    return correlation
    

In [19]:
def correlation(df, var1, var2, mode=1, norm1=None, norm2=None, metric="pearson"):    
    """
    
    ...
    param norm1  function to normalize/transform var1 with (default None); provide function 
                 e.g. zscore or np.log 
    """

    transitions = find_transitions(df, "df", var1) 
    # Assumes `var1` and `var2` are both transition variables
    # Consider implement `var1cut` parameter as in `universal_correlation`

    if mode == 1:
        correlation = df[[f"{var1}_{ti}:{tj}" for ti, tj in transitions]].corrwith(df[[f"{var2}_{ti}:{tj}" for ti, tj in transitions]], axis=1)
    
    if mode == 2:
        table = []
        for w in df.index:
            valid = []
            X = df[[f"{var1}_{ti}:{tj}" for ti, tj in transitions]].loc[w]
            Y = df[[f"{var2}_{ti}:{tj}" for ti, tj in transitions]].loc[w]
            for x, y in zip(X, Y):
                if pd.isna(x):
                    continue
                if pd.isna(y):
                    continue
                valid.append((x, y))
            
            N = len(valid)
            
            if N < 2:
                v = np.nan
                p = np.nan
            else:
                X, Y = zip(*valid)
                if norm1 != None:
                    X = norm1(X)
                if norm2 != None:
                    Y = norm2(Y)
                if metric == "pearson":
                    R_data = pearsonr(X, Y)
                    v = R_data.statistic
                    p = R_data.pvalue

                if metric == "spearman":
                    R_data = spearmanr(X, Y)
                    v = R_data.statistic
                    p = R_data.pvalue
                    
            table.append([w, round(v, 2), round(p, 2), N])
        correlation = pd.DataFrame(table, columns=["Word", "Correlation", "p", "N"])
        
    return correlation

In [20]:
def nonaninf(x, y):
    if pd.isna(x):
        return False
    if pd.isna(y):
        return False   
    if abs(x) == np.inf:
        return False
    if abs(y) == np.inf:
        return False
    return True

In [21]:
def nonalist(lst):
    for x in lst:
        if pd.isna(x):
            return False
    return True

In [22]:
def collect_var(df, var, varcut):
    X = []
    varcol = sorted([col for col in df.columns if col.startswith(var)])
    if varcut != None:
        del varcol[varcut]
    for col in varcol:
        X.extend(list(df[col]))
    return X    

In [23]:
def universal_correlation(
    df, 
    var1, 
    var2, 
    var1cut = None, 
    var2cut = None, 
    norm1=None, 
    norm2=None, 
    metric="pearson",
    min_freq = None
):
    # https://stackoverflow.com/questions/16031056/how-to-form-tuple-column-from-two-columns-in-pandas
    
    if min_freq != None:
        df = df.copy()
        
        t1 = find_transitions(df, "df", var1)
        t2 = find_transitions(df, "df", var2)

        if var1cut != None:
            transitions = [t for t in t2]

        for trg in [w for w in df.index]:
            for ti, tj in transitions:
                if df.loc[trg][f"frq_{ti}"] < min_freq or df.loc[trg][f"frq_{tj}"] < min_freq:
                    df.at[trg, f"{var2}_{ti}:{tj}"] = np.nan     
    
    X = collect_var(df, var1, var1cut)
    Y = collect_var(df, var2, var2cut)
            
    print("Length:")
    print("X:", len(X))
    print("Y:", len(Y))
    
    X, Y = zip(*[(x, y) for x, y in zip(X, Y) if nonaninf(x,y)])
        
    if norm1 != None:
        X = norm1(X)
    if norm2 != None:
        Y = norm2(Y)
    
    if metric == "pearson":
        R_data = pearsonr(X, Y)
    if metric == "spearman":
        R_data = spearmanr(X, Y)
        
    v = R_data.statistic
    p = R_data.pvalue
    
    return v, p

In [24]:
def model_comparison(
    dfs,
    mnames,
    var, 
    norm=None, 
    targets = Path("../data/utils/dwts.txt"), # =get_dwts(df_yearly_dwt, dwt_path)
    prefix = ("N", "A", "V"),
    mode="universal", # no other mode supported at the moment ...
    word=None, # not supported at the moment ... 
    metric="pearson",
):
    
    cols = [col for col in dfs[0].columns if col.startswith(var)] # based on first df
    trgs = [trg for trg in get_dwts(dfs[0], targets)]              # based on first df
    print("Targets:", ", ".join(trgs))
    trgs = [trg for trg in trgs if trg.startswith(prefix)] if prefix != None else trgs
    
    _XY = []
    if mode == "universal":    
        for model, mname in zip(dfs, mnames):
            this_model = []
            for col in cols:
                this_model.extend(model.loc[trgs, col])
            print("Length", mname, len(this_model))
            _XY.append(this_model)
    
    _XY = list(zip(*[xyz for xyz in zip(*_XY) if nonalist(xyz)])) # note: xyz is a variable for a tuple
    
    print("Length (no NaN):")
    for vector in _XY:
        print(len(vector))
    
    _XY = [norm(model) for model in _XY] if norm != None else _XY
    
    if metric == "pearson":
        R_data = np.corrcoef(_XY)
    if metric == "spearman":
        _XY = [rankdata(model) for model in _XY]
        R_data = np.corrcoef(_XY)
        
    out = pd.DataFrame(R_data, columns=mnames, index=mnames).round(3)

    return out

In [25]:
def visualize(df, var):
    pass

## Files

In [26]:
#file_path = Path("../../dw_results/fb_pol-yearly-radical3.csv")
#file_path = Path("fb_pol-yearly-radical3.csv")
results_dir = Path("../../dw_results")

In [33]:
files = sorted(os.listdir(results_dir))
_ = [print(file) for file in files]

bert-v0
fb_pol-time_bin-bert-fb_nli.csv
fb_pol-time_bin-bert-sentence-bert-swedish-cased.csv
fb_pol-time_bin-bert-sts_fbmodel.csv
fb_pol-time_bin-bert-sts_fbmodel_big_40epochs.csv
fb_pol-yearly-bert-sentence-bert-swedish-cased.csv
fb_pol-yearly-bert-sts_fbmodel.csv
fb_pol-yearly-bert-sts_fbmodel_big_40epochs.csv
fb_pol-yearly-radical3-full.csv
fb_pol-yearly-radical3-full.csv.bz2
fb_pol-yearly-radical3-restricted.csv


In [28]:
dwt_path = "../data/utils/dwts.txt"

In [29]:
#crp_tib = Path("/srv/data/gusbohom/root/corpora/toypol/time_bin/radical3/")
corpus = Path("/home/max/Corpora/flashback-pol-time/yearly/fb-pt-radical3")

In [30]:
df_yearly = read_csv(results_dir / "fb_pol-yearly-radical3-full.csv")

In [31]:
df_yearly_dwt = read_csv(results_dir / "fb_pol-yearly-radical3-restricted.csv")

In [None]:
df_bert_nli = read_csv(results_dir / "fb_pol-yearly-bert-fb_nli.csv")

In [32]:
df_bert_sts = read_csv(results_dir / "fb_pol-yearly-bert-sts_fbmodel.csv")

In [None]:
df_bert_big = read_csv(results_dir / "fb_pol-yearly-bert-sts_fbmodel_big_40epochs.csv")

In [None]:
df_bert_kb  = read_csv(results_dir / "fb_pol-yearly-bert-sentence-bert-swedish-cased.csv")

In [34]:
df_tbn_sts = read_csv(results_dir / "fb_pol-time_bin-bert-sts_fbmodel.csv")

In [None]:
#toypol = read_csv(Path("../../toypol-time_bin.csv"))

In [None]:
#df_time_bin = read_csv(results_dir / "fb_pol-time_bin-radical3-full.csv")

In [None]:
#df_time_bin_dwt = read_csv(results_dir / "fb_pol-time_bin-radical3-restricted.csv")

## Model comparison

In [None]:
for w in sorted([w for w in df_yearly_dwt.index if w.startswith(("N", "A", "V"))]):
    print(w)

In [None]:
for w in sorted([w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))]):
    print(w)

In [None]:
11 * 22


In [None]:
model_comparison(
    dfs=[df_yearly_dwt, df_bert_nli, df_bert_sts, df_bert_big, df_bert_kb],
    mnames=["SGNS", "NLI", "STS", "BIG", "KB"],
    var="gch", 
    norm=None, 
    targets = Path("../data/utils/dwts.txt"), # =get_dwts(df_yearly_dwt, dwt_path)
    prefix = ("N", "A", "V"),
    mode="universal", # no other mode supported at the moment ...
    word=None, # not supported at the moment ... 
    metric="pearson"
)

In [None]:
print(model_comparison(
    dfs=[df_yearly_dwt, df_bert_nli, df_bert_sts, df_bert_big, df_bert_kb],
    mnames=["SGNS", "NLI", "STS", "BIG", "KB"],
    var="rch", 
    norm=None, 
    targets = Path("../data/utils/dwts.txt"), # =get_dwts(df_yearly_dwt, dwt_path)
    prefix = ("N", "A", "V"),
    mode="universal", # no other mode supported at the moment ...
    word=None, # not supported at the moment ... 
    metric="pearson"
).to_markdown())

## Q-Show

### Check

### Yearly

#### SGNS

In [None]:
q_show(df_yearly, "rch", as_table = True)
# q_show("rsim")
# q_show("gch")
# q_show("gsim")

In [None]:
checker(
    word="gyllene", 
    transition=(2000,2001), 
    controls_dir="/home/max/Results/fb_pol-yearly-rad3", 
    n_ctrl=10, 
    variable="cosine_change")

In [None]:
q_show(df_yearly, "gch", as_table = True, min_freq=50)

#### NLI

In [None]:
q_show(df_bert_nli, "rch", as_table = True)

In [None]:
q_show(df_bert_nli, "gch", as_table = True)

In [None]:
q_show(df_bert_nli, "spr", as_table = True, transition=False)

In [None]:
q_show(df_bert_nli, "anospr", as_table = True, transition=False)

#### STS (small)

In [35]:
q_show(df_bert_sts, "rch", as_table = True)


rch_2000:2001 jaccard = None
                Word    RCH  n_i  n_j    GCH  Mctrl  Sctrl
0  N1_kulturberikare  8.141   45   28  0.156  0.082  0.009
1          V1_berika  4.429   43   17  0.182  0.110  0.016
2        N1_berikare  2.216   21    1  0.256  0.158  0.042
3    V1_kulturberika  0.385    7    4  0.235  0.223  0.028

rch_2001:2002 jaccard = 1.0
                Word    RCH  n_i  n_j    GCH  Mctrl  Sctrl
0  N1_kulturberikare  3.763   28   10  0.180  0.113  0.017
1          V1_berika  3.137   17   30  0.179  0.127  0.016
2    V1_kulturberika  0.788    4    2  0.283  0.271  0.015
3        N1_berikare  0.786    1    2  0.273  0.262  0.014

rch_2002:2003 jaccard = 1.0
                Word    RCH  n_i  n_j    GCH  Mctrl  Sctrl
0        N1_berikare  5.264    2    8  0.330  0.191  0.025
1          V1_berika  4.496   30   23  0.170  0.121  0.010
2  N1_kulturberikare  3.935   10    6  0.211  0.179  0.008
3       N1_globalist  0.905    1    0  0.384  0.384  0.000

rch_2003:2004 jaccard = 1.

                              Word    RCH   n_i   n_j    GCH  Mctrl  Sctrl
0                N1_kulturberikare  6.528   596   470  0.032  0.024  0.001
1                        V1_berika  6.340  1622  1363  0.026  0.016  0.001
2                     N1_globalist  6.114   524   538  0.035  0.025  0.001
3                  V1_kulturberika  4.265   236   188  0.047  0.037  0.002
4                      N1_berikare  3.153   425   276  0.038  0.033  0.002
5                   N1C_globalistX  2.866   299   350  0.041  0.032  0.003
6                    V1_återvandra  2.751    42    63  0.083  0.066  0.006
7                  N1_återvandring  2.702   116   131  0.057  0.043  0.005
8                  N2_återvandrare  2.693     3     9  0.207  0.175  0.011
9                   N1_förortsgäng  2.124     8    13  0.177  0.159  0.008
10                   N1C_berikareX  2.096    36    27  0.137  0.120  0.008
11              N1C_kulturberikarX  1.934    23    17  0.161  0.140  0.010
12              V1_hjälpa

                  Word     RCH   n_i   n_j    GCH  Mctrl  Sctrl
0         N1_globalist  25.439  3240  2982  0.029  0.011  0.001
1       N1C_globalistX   9.806   890   744  0.038  0.022  0.002
2            V1_berika   7.159  1073   839  0.036  0.019  0.002
3      A1_globalistisk   6.648   807   706  0.031  0.021  0.001
4        V1_återvandra   4.116   137   140  0.057  0.042  0.004
5      V1_kulturberika   3.225    59    67  0.076  0.063  0.004
6      N1_återvandring   2.876  1338  1215  0.019  0.014  0.001
7     N2C_återvandrarX   2.277     7     1  0.257  0.231  0.011
8   N1C_återvandringsX   1.461   362   263  0.033  0.029  0.003
9    N1_kulturberikare   1.400   197   100  0.049  0.046  0.003
10      N1_förortsgäng   0.635    29    10  0.131  0.122  0.013
11       N1C_berikareX   0.596     8     2  0.269  0.256  0.022
12     N2_återvandrare   0.410    10    11  0.154  0.148  0.014
13         N1_berikare  -0.046   141    88  0.057  0.058  0.006
14  V1_hjälpa_på_plats  -0.493    49    

In [None]:
q_show(df_bert_sts, "gch", as_table = True)

In [None]:
q_show(df_bert_sts, "spr", as_table = True, transition=False)

In [None]:
q_show(df_bert_sts, "anospr", as_table = True, transition=False)

#### STS (big)

#### KB

In [None]:
#q_show(toypol, "rch", as_table = True)

In [None]:
#q_show(toypol, "gsim", as_table = True)

### Time bin

In [None]:
#q_show(df_time_bin, "rch", as_table = True)

In [36]:
q_show(df_tbn_sts, "rch", as_table = True)


rch_2003:2007 jaccard = None
                              Word      RCH  n_i   n_j    GCH  Mctrl  Sctrl
0                        V1_berika  101.638  460  6014  0.082  0.010  0.001
1                   N1C_globalistX   67.424    7   457  0.283  0.038  0.003
2                      N1_berikare   52.883   34  2026  0.093  0.019  0.001
3                    N1C_berikareX   51.851    0   280  0.228  0.056  0.003
4                N1_kulturberikare   41.973  142  2793  0.044  0.013  0.001
5                  V1_kulturberika   17.618   30   906  0.079  0.026  0.003
6                  N1_återvandring   16.346   22   289  0.093  0.036  0.003
7                  A1_globalistisk   14.326   12   166  0.132  0.056  0.005
8                     N1_globalist   13.072   62   459  0.078  0.037  0.003
9               V1_hjälpa_på_plats   12.744    0    84  0.200  0.068  0.010
10              N1C_återvandringsX    9.298    9   198  0.087  0.045  0.004
11                  N1_förortsgäng    9.069    0    33  0.

## Change Show

```
def overview(
    df, 
    var, 
    targets, 
    prefixes = ("N", "A", "V"), 
    th=4.781, 
    transition = True, 
    min_freq = 10, 
    return_md = False
)
```

### Yearly

#### SGNS - only DWTS

In [None]:
_ = overview(df_yearly_dwt, "rch", get_dwts(df_yearly_dwt, dwt_path), return_md=True)
print(_)

In [None]:
# change_show(df, var, targets, th=4.781)
change_show(df_yearly_dwt, "rch", get_dwts(df_yearly_dwt, dwt_path))

In [None]:
change_show(df_yearly_dwt, "rch", get_dwts(df_yearly_dwt, dwt_path), th=None)

In [None]:
change_show(df_yearly_dwt, "gsim", get_dwts(df_yearly_dwt, dwt_path), th=None)

#### NLI

In [None]:
overview(df_bert_nli, "anospr", get_dwts(df_yearly_dwt, dwt_path), th=None, transition=False)

In [None]:
overview(df_bert_nli, "difspr", get_dwts(df_yearly_dwt, dwt_path), th=None, transition=False)

#### STS (small)

In [None]:
_ = overview(df_bert_sts, "frq", get_dwts(df_yearly_dwt, dwt_path), th= None, transition = False, min_freq = None)
print(_)

In [None]:
_ = overview(df_bert_sts, "gch", get_dwts(df_yearly_dwt, dwt_path), th= None, transition = True, min_freq = None, return_md=True)
print(_)

In [None]:
_ = overview(df_bert_sts, "rch", get_dwts(df_yearly_dwt, dwt_path), min_freq = 10, return_md=True)
print(_)

In [None]:
change_show(df_bert_sts, "rch", get_dwts(df_yearly_dwt, dwt_path))

In [None]:
df_bert_sts.loc["V1_berika"]["gsim_2008:2009"]

In [None]:
_ = overview(df_bert_sts, "anospr", get_dwts(df_yearly_dwt, dwt_path), th=None, transition = False, min_freq = None, return_md=True, rounder = 2)
print(_)

In [None]:
_ = overview(df_bert_sts, "anospr", get_dwts(df_yearly_dwt, dwt_path), th=-2, transition = False, min_freq = None, return_md=True, rounder = 2)
print(_)

#### STS (big)

#### KB

In [None]:
_ = overview(df_bert_kb, "rch", get_dwts(df_yearly_dwt, dwt_path), return_md=True)
print(_)

### Time bin

In [None]:
change_show(df_time_bin_dwt, "rch", get_dwts(df_time_bin_dwt, dwt_path))

In [None]:
change_show(df_time_bin_dwt, "gsim", get_dwts(df_time_bin_dwt, dwt_path), th=None)

In [None]:
a=load_metric("/srv/data/gusbohom/root/corpora/fb_pol/time_bin/radical3/vocab/2015.txt")["V1_berika"]
b=load_metric("/srv/data/gusbohom/root/corpora/fb_pol/time_bin/radical3/vocab/2019.txt")["V1_berika"]
print(a, b)

In [38]:
change_show(df_tbn_sts, "rch", get_dwts(df_tbn_sts, dwt_path))


rch_2003:2007
                Word       Value  n_2003  n_2007  gch_2003:2007  M_2003:2007  Std_2003:2007
0          V1_berika  101.638235     460    6014       0.082196     0.010324       0.000674
1  N1_kulturberikare   41.972630     142    2793       0.044481     0.013360       0.000707
2       N1_globalist   13.071699      62     459       0.077816     0.036770       0.002994
3        N1_berikare   52.882744      34    2026       0.092718     0.018655       0.001335
4    V1_kulturberika   17.617830      30     906       0.079005     0.026139       0.002861
5    N1_återvandring   16.346126      22     289       0.093373     0.036198       0.003335
6    A1_globalistisk   14.325943      12     166       0.132307     0.055879       0.005087

rch_2007:2011
                 Word      Value  n_2007  n_2011  gch_2007:2011  M_2007:2011  Std_2007:2011
0           V1_berika  10.745189    6014    6137       0.017253     0.007617       0.000855
1   N1_kulturberikare  10.902428    2793    1885  

## Correlation

In [None]:
get_variables(df_yearly_dwt)

### Yearly

#### All words (SGNS)

In [None]:
universal_correlation(
    df=df_yearly, 
    var1="gch", 
    var2="frq", 
    var1cut = None, 
    var2cut = -1, 
    norm1=None, 
    norm2=np.log, 
    metric="pearson")

In [None]:
universal_correlation(
    df=df_yearly, 
    var1="gsim", 
    var2="frq", 
    var1cut = None, 
    var2cut = -1, 
    norm1=None, 
    norm2=np.log, 
    metric="pearson")

In [None]:
universal_correlation(
    df=df_yearly, 
    var1="rch", 
    var2="frq", 
    var1cut = None, 
    var2cut = -1, 
    norm1=None, 
    norm2=np.log, 
    metric="pearson")

In [None]:
universal_correlation(
    df=df_yearly, 
    var1="rch", 
    var2="stdc", 
    var1cut = None, 
    var2cut = None, 
    norm1=None, 
    norm2=None, 
    metric="pearson")

#### DWTs

###### SGNS

In [None]:
correlation(df=df_yearly_dwt, 
            var1="diffpm", 
            var2="rch", 
            mode=2, 
            metric = "pearson")

In [None]:
correlation(df=df_yearly_dwt, var1="stdc", var2="rch", corpus=corpus, mode=2, metric = "pearson")

In [None]:
correlation(df=df_yearly_dwt, var1="diffpm", var2="rch", corpus=corpus, mode=2, metric = "spearman")

```
def universal_correlation(
    df, 
    var1, 
    var2, 
    var1cut = None, 
    var2cut = None, 
    norm1=None, 
    norm2=None, 
    metric="pearson")
```

In [None]:
##### Universal

In [None]:
universal_correlation(df_yearly, "fpm", "gch", -1, norm1=np.log)

In [None]:
universal_correlation(df_yearly, "fpm", "gch", 0, norm1=np.log)

In [None]:
universal_correlation(df_yearly_dwt.loc[[w for w in df_yearly_dwt.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      -1, 
                      norm1=np.log)

In [None]:
universal_correlation(df_yearly_dwt.loc[[w for w in df_yearly_dwt.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      0, 
                      norm1=np.log)

#### XXXXX MYSTERY --- WHAT IS WRONG????

In [None]:
term = "N1_kulturberikare"

In [None]:
df_bert_sts.loc[term, [f"gch_{year}:{year+1}" for year in range(2000, 2021)]]

In [None]:
df_yearly.loc[term, [f"gch_{year}:{year+1}" for year in range(2000, 2021)]]

##### NLI

In [None]:
universal_correlation(df_bert_nli.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      -1, 
                      norm1=np.log,
                      min_freq = 10 
                     )

In [None]:
universal_correlation(df_bert_nli.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      0, 
                      norm1=np.log,
                      min_freq = 10
                     )

##### STS (small)

In [48]:
trend(df_bert_sts, var="frq", norm=None, transition = False, metric="pearson")

Unnamed: 0,Word,Trend,p,N
0,N1_kulturberikare,0.08,0.71,23
1,V1_berika,0.47,0.02,23
2,N1_berikare,0.08,0.72,23
3,N1_förortsgäng,0.80,0.00,23
4,V1_kulturberika,0.17,0.43,23
...,...,...,...,...
127,återvandrig,0.57,0.00,23
128,vänsterglobalisternas,0.68,0.00,23
129,pkglobalist,0.53,0.01,23
130,vänsterglobalistisk,0.45,0.03,23


In [52]:
trend(df_bert_sts, var="gch", norm=None, transition = True, metric="pearson")

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U32'), dtype('<U32')) -> dtype('<U32')

In [None]:
universal_correlation(df_bert_sts.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      -1, 
                      norm1=np.log,
                      min_freq = 10
                     )

In [None]:
universal_correlation(df_bert_sts.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      0, 
                      norm1=np.log,
                      min_freq = 10
                     )

##### STS (big)

In [None]:
universal_correlation(df_bert_big.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      -1, 
                      norm1=np.log,
                      min_freq = 10
                     )

In [None]:
universal_correlation(df_bert_big.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      0, 
                      norm1=np.log,
                      min_freq = 10
                     )

##### KB

In [None]:
universal_correlation(df_bert_kb.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      -1, 
                      norm1=np.log,
                      min_freq = 10
                     )

In [None]:
universal_correlation(df_bert_big.loc[[w for w in df_bert_sts.index if w.startswith(("N", "A", "V"))],:], 
                      "fpm", 
                      "gch", 
                      0, 
                      norm1=np.log,
                      min_freq = 10
                     )