In [70]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import re
from util import load_metric
from scipy.stats import spearmanr, pearsonr, zscore, rankdata
#from collections import Counter

In [2]:
# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject CSS so that text output is not wrapped (wide tables stay on one line).
display(HTML("<style>div.output_area pre {white-space: pre;}</style>"))

In [3]:
# Print wide DataFrames on a single line instead of wrapping column blocks.
pd.options.display.expand_frame_repr = False

In [4]:
# Show every column when a DataFrame is displayed (no "..." truncation).
pd.options.display.max_columns = None

## Functions

In [5]:
def jaccard(a, b):
    """
    Return the Jaccard similarity of two collections: the size of their
    intersection divided by the size of their union.

    param a  iterable of hashable items (a set, list, pandas Index, ...)
    param b  iterable of hashable items
    return   float in [0, 1]; 1.0 when both collections are empty, since two
             empty collections are identical
    """
    a, b = set(a), set(b)
    union = a | b
    if not union:
        # Avoid ZeroDivisionError for two empty inputs.
        return 1.0
    return len(a & b) / len(union)

In [6]:
def q_show(df, var, k = 40, as_table = False, transition=True, min_freq=None):
    """
    Given a dataframe, print the top k words (index labels) for every column
    whose name starts with `var`, together with the Jaccard similarity between
    each column's top words and those of the previous column.

    param df          DataFrame indexed by word; columns are named
                      `<variable>_<suffix>` where the suffix is `ti:tj` for
                      transition variables or a single period otherwise
    param var         column-name prefix selecting the variable to show
    param k           number of top-ranked words per column
    param as_table    for transition variables: print a table with the value,
                      the frequencies (`frq_<ti>`, `frq_<tj>`) and the
                      `gch_`, `mccc_` and `stdc_` columns of the transition
                      instead of the bare word index.
                      Note: as_table is only supported for transition
                      variables; non-transition variables are always shown
                      as a table.
    param transition  True if `var` is a transition variable (suffix `ti:tj`)
    param min_freq    if given, only rank words whose `frq_<period>` is at
                      least min_freq for every period in the column suffix
    """

    var_columns = [col for col in sorted(df.columns) if col.startswith(var)]

    top_words = []
    for col in var_columns:
        candidates = df
        if min_freq is not None:
            # A suffix is either "ti:tj" (two periods) or a single period;
            # the frequency threshold must hold for each of them.
            keep = None
            for period in col.split("_")[-1].split(":"):
                frequent = df[f"frq_{period}"] >= min_freq
                keep = frequent if keep is None else keep & frequent
            candidates = df[keep]
        top_words.append(candidates[col].sort_values(ascending=False)[:k].index)

    # Defined once up front so that an empty word list still prints an
    # (empty) table instead of failing on an undefined name.
    if transition:
        headers = ["Word", var.upper(), "n_i", "n_j", "GCH", "Mctrl", "Sctrl"]
    else:
        headers = ["Word", var.upper(), "Freq"]

    previous = None
    for col, words in zip(var_columns, top_words):
        if previous is None:
            jac = None
        else:
            jac = round(jaccard(set(words), previous), 2)
        previous = words

        print()
        print(col, "jaccard =", jac)

        if transition and not as_table:
            print(words)
            continue

        suffix = col.split("_")[-1]
        rows = []
        for word in words:
            record = df.loc[word]
            if transition:
                ti, tj = suffix.split(":")
                rows.append([
                    word,
                    record[col],
                    int(record[f"frq_{ti}"]),
                    int(record[f"frq_{tj}"]),
                    record[f"gch_{suffix}"],
                    record[f"mccc_{suffix}"],
                    record[f"stdc_{suffix}"],
                ])
            else:
                rows.append([word, record[col], int(record[f"frq_{suffix}"])])

        # Rows with a missing value in any shown column are dropped.
        print(pd.DataFrame(rows, columns=headers).dropna().round(3))

In [7]:
def change_show(df, var, targets, th=4.781):
    """
    For every column of `df` whose name starts with `var` (taken in sorted
    order), print a table with the value of each target word together with
    its frequencies at both ends of the transition.

    param df       DataFrame indexed by word, with `<var>_<ti>:<tj>` columns
                   and matching `frq_<ti>` / `frq_<tj>` columns
    param var      column-name prefix of the variable to show
    param targets  iterable of words (index labels) to look up
    param th       only words whose value is strictly greater than th are
                   listed; pass None to list every target
    Columns for which no target qualifies print nothing.
    """

    matching = sorted(col for col in df.columns if col.startswith(var))

    for col in matching:
        trans = col.split("_")[-1]
        ti, tj = tuple(trans.split(":"))
        header = ["Word", "Value", f"n_{ti}", f"n_{tj}"]

        rows = []
        for word in targets:
            value = df.loc[word][col]
            n_i = int(df.loc[word][f"frq_{ti}"])
            n_j = int(df.loc[word][f"frq_{tj}"])

            if th is None or value > th:
                rows.append([word, value, n_i, n_j])

        if rows:
            frame = pd.DataFrame(rows, columns=header)
            print()
            print(col)
            print(frame)

In [8]:
def v_renamer(var_string):
    """
    Shorten a transition column name to a compact label: take the part after
    the last underscore (`<yi>:<yj>`) and keep only the last two characters
    of each side, e.g. "rch_2000:2001" -> "00:01".
    """
    start, end = var_string.split("_")[-1].split(":")
    return f"{start[-2:]}:{end[-2:]}"

In [9]:
def overview(df, var, targets, prefixes = ("N", "A", "V"), th=4.781, transition = True):
    """
    Similar to change_show, but:
    * shows the data as one table (targets as rows, periods as columns)
    * shows only the variable itself (change_show displays additional data)
    * with a threshold, shows True/False for "value > th" instead of the
      value; with th=None the values are shown rounded to 3 decimals

    param df          DataFrame indexed by word
    param var         column-name prefix of the variable to show
    param targets     iterable of words; only those starting with one of
                      `prefixes` are kept, in sorted order
    param prefixes    tuple of accepted word prefixes
    param th          threshold, or None to print the values
    param transition  True if the column suffix is `yi:yj` (labels become
                      e.g. "00:01"), False if it is a single period (labels
                      become its last two characters)
    """

    selected_cols = [col for col in df.columns if col.startswith(var)]
    selected_cols.sort()
    selected_words = sorted(w for w in targets if w.startswith(prefixes))

    labels = {}
    for col in selected_cols:
        if transition:
            labels[col] = v_renamer(col)
        else:
            labels[col] = col.split("_")[-1][-2:]

    values = df.loc[selected_words, selected_cols]
    out = (values > th) if th is not None else values.round(3)
    out = out.rename(columns = labels)

    print(var.upper())
    print(out)

In [10]:
def read_csv(path):
    """
    Load a semicolon-separated results file into a DataFrame, using the
    first column as the index.
    """
    csv_options = {"sep": ";", "index_col": 0}
    return pd.read_csv(path, **csv_options)

In [11]:
def get_dwts(df, path):
    """
    Return the words in `df.index` that match any root listed in a file.

    param df    DataFrame whose index holds the candidate words
    param path  text file with one root per line; each line is used as a
                regular-expression fragment (roots are NOT escaped) and the
                fragments are joined into one alternation
    return      list of matching index labels (as str), in index order;
                empty if the file contains no roots

    Blank lines are ignored: an empty alternative in the pattern would match
    every word.
    """
    with open(Path(path), "r", encoding="utf-8") as f:
        dwt_roots = [line.strip() for line in f]
    dwt_roots = [root for root in dwt_roots if root]

    if not dwt_roots:
        return []

    dwt_regex = re.compile(f"({'|'.join(dwt_roots)})")
    return [str(w) for w in df.index if dwt_regex.search(str(w)) is not None]

In [12]:
def get_variables(df):
    """
    Summarises the variables of a dataframe.

    Column names are read as `<prefix>_<suffix>`, split at the LAST
    underscore (the same convention the other helpers in this notebook use).
    A suffix containing ":" marks a transition variable; any other suffix is
    treated as a year. Columns without an underscore are skipped.

    return  dict with four sets:
            "yr_prefix"    prefixes of yearly variables
            "tr_prefix"    prefixes of transition variables
            "years"        suffixes of yearly variables
            "transitions"  suffixes of transition variables
    """

    yr_prefix = set()
    tr_prefix = set()
    years = set()
    transitions = set()

    for v in df.columns:
        prefix, separator, suffix = str(v).rpartition("_")
        if not separator:
            # Not of the form <prefix>_<suffix>; nothing to classify.
            continue
        if ":" in suffix:
            tr_prefix.add(prefix)
            transitions.add(suffix)
        else:
            yr_prefix.add(prefix)
            years.add(suffix)

    return {
        "yr_prefix": yr_prefix,
        "tr_prefix": tr_prefix,
        "years": years,
        "transitions": transitions
        }

In [13]:
def checker(word, transition, controls_dir, n_ctrl=10, variable="cosine_change"):
    """
    Goes back to the original control files and shows the value recorded for
    a word at a transition in each control.

    param word          word to look up
    param transition    tuple (ti, tj)
    param controls_dir  directory holding one sub-directory per variable
    param n_ctrl        number of control files; they are expected to be
                        named `<ti>_<tj>_control<n>.txt` for n = 1..n_ctrl
    param variable      sub-directory to read, "cosine_change" or "cosine_sim"
    return              list with one entry per control file: the value for
                        the word, or "NO MEASURE" if the word is absent
    Each file name and value is also printed.
    """

    ti, tj = transition
    variable_dir = Path(controls_dir) / variable

    values = []
    for n in range(1, n_ctrl + 1):
        file = f"{ti}_{tj}_control{n}.txt"
        data = load_metric(variable_dir / file)
        if word in data:
            value = data[word]
        else:
            value = "NO MEASURE"
        values.append(value)
        print(file, value)

    return values

In [14]:
def ncd(DATA, CORPUS, VAR, VAL): # No Change Detector
    """
    For each transition found in `<CORPUS>/vocab`, split the words of DATA
    into those whose `<VAR>_<yi>:<yj>` value equals VAL (group A) and all
    others (group B), then print the group sizes and the first 100 words of
    each group side by side.
    """

    vocab_dir = Path(CORPUS) / "vocab"

    for yi, yj in find_transitions(vocab_dir):
        column = f"{VAR}_{yi}:{yj}"

        print()
        print(f"{yi}:{yj}")

        A = list(DATA.index[DATA[column] == VAL])
        print("No change (A):", len(A))
        B = list(DATA.index[DATA[column] != VAL])
        print("Other (B):", len(B))
        print()

        # Two-column listing; zip stops at the shorter of the two groups.
        for left, right in [("A", "B"), ("---", "---")]:
            print("{: <20} {}".format(left, right))
        for w1, w2 in zip(A[:100], B[:100]):
            print(f"{w1: <20} {w2}")

In [15]:
def find_transitions(source, mode="file", var = None):
    """
    List transitions as (ti, tj) tuples.

    For mode = "file", expected source: path of a directory whose entries
    are named `<year>.txt`. Consecutive years (after sorting) are paired;
    entries whose name is not a plain integer are ignored. The tuples
    contain ints.

    For mode = "df", expected source: pandas DataFrame; provide variable
    `var`. Each column starting with `var` contributes the `ti:tj` suffix
    after its last underscore. The tuples contain strings.

    raises ValueError for an unknown mode, or if mode = "df" and var is None
    """
    if mode == "file":
        years = []
        for name in os.listdir(source):
            # Remove the ".txt" suffix only; str.strip(".txt") would strip
            # any of the characters ".", "t", "x" from both ends.
            stem = name[:-len(".txt")] if name.endswith(".txt") else name
            if stem.isdigit():
                years.append(int(stem))
        years.sort()
        return list(zip(years, years[1:]))

    if mode == "df":
        if var is None:
            raise ValueError('mode="df" requires `var`')
        # Read the columns from `source`, not from a global dataframe.
        cols = sorted(col for col in source.columns if col.startswith(var))
        return [tuple(col.split("_")[-1].split(":")) for col in cols]

    raise ValueError(f'unknown mode {mode!r}; expected "file" or "df"')

In [16]:
def w_overlap_checker(corpus, th_c):
    """
    For each transition found in `<corpus>/vocab`, print how many words reach
    a count of at least `th_c` in each of the two vocabularies, and the size
    of their intersection, union and difference (first minus second).
    """

    corpus = Path(corpus)

    for yi, yj in find_transitions(corpus / "vocab"):
        print()
        print(f"{yi}:{yj}")

        counts_a = load_metric(corpus / f"vocab/{yi}.txt")
        counts_b = load_metric(corpus / f"vocab/{yj}.txt")

        # Only the words themselves matter once the threshold is applied.
        words_a = {w for w, c in counts_a.items() if c >= th_c}
        words_b = {w for w, c in counts_b.items() if c >= th_c}

        print(f"{yi}:", len(words_a))
        print(f"{yj}:", len(words_b))
        print(f"{yi} and {yj}:", len(words_a & words_b))
        print(f"{yi} or {yj}:", len(words_a | words_b))
        print(f"{yi} - {yj}:", len(words_a - words_b))

In [17]:
def correlation(df, var1, var2, mode=1, norm1=None, norm2=None, metric="pearson"):
    """
    Per-word correlation between two transition variables across transitions.

    The transitions are taken from the columns of `df` starting with `var1`
    (suffix after the last underscore); `var2` must have a column for each
    of them.
    # Assumes `var1` and `var2` are both transition variables
    # Consider implementing a `var1cut` parameter as in `universal_correlation`

    param df      DataFrame indexed by word, with `<var>_<ti>:<tj>` columns
    param var1    name of the first variable
    param var2    name of the second variable
    param mode    1: vectorised, returns a Series of coefficients indexed
                     like `df` (norm1/norm2 are not applied in this mode)
                  2: word-by-word, returns a DataFrame with the columns
                     "Word", "Correlation", "p", "N", where N is the number
                     of transitions at which both values are present;
                     coefficient and p-value are rounded to 2 decimals and
                     are NaN when N < 2
    param norm1   function to normalize/transform var1 with (default None);
                  provide function e.g. zscore or np.log
    param norm2   same, for var2
    param metric  "pearson" or "spearman"
    raises ValueError for an unknown mode or metric
    """
    if mode not in (1, 2):
        raise ValueError(f"unknown mode {mode!r}; expected 1 or 2")
    if metric not in ("pearson", "spearman"):
        raise ValueError(f'unknown metric {metric!r}; expected "pearson" or "spearman"')

    suffixes = [col.split("_")[-1] for col in sorted(df.columns) if col.startswith(var1)]
    left = df[[f"{var1}_{suffix}" for suffix in suffixes]]
    right = df[[f"{var2}_{suffix}" for suffix in suffixes]]

    if mode == 1:
        # corrwith aligns both frames on their column labels before
        # correlating rows, so both sides must be labelled by transition;
        # with the original `<var>_...` names nothing lines up and every
        # coefficient comes out NaN.
        left = left.copy()
        right = right.copy()
        left.columns = suffixes
        right.columns = suffixes
        return left.corrwith(right, axis=1, method=metric)

    measure = pearsonr if metric == "pearson" else spearmanr

    table = []
    for w, xs, ys in zip(df.index, left.to_numpy(), right.to_numpy()):
        # Keep only the transitions where both variables have a value.
        valid = [(x, y) for x, y in zip(xs, ys) if not (pd.isna(x) or pd.isna(y))]
        N = len(valid)

        if N < 2:
            v = np.nan
            p = np.nan
        else:
            X, Y = zip(*valid)
            if norm1 is not None:
                X = norm1(X)
            if norm2 is not None:
                Y = norm2(Y)
            v, p = measure(X, Y)

        table.append([w, round(v, 2), round(p, 2), N])

    return pd.DataFrame(table, columns=["Word", "Correlation", "p", "N"])

In [27]:
def nonaninf(x, y):
    """
    Return True only if neither x nor y is missing (NaN/None/NA) and neither
    is positive or negative infinity.
    """
    if pd.isna(x) or pd.isna(y):
        return False
    return not (abs(x) == np.inf or abs(y) == np.inf)

In [53]:
def nonalist(lst):
    """
    Return True if no element of `lst` is missing (NaN/None/NA); an empty
    collection yields True.
    """
    return not any(pd.isna(item) for item in lst)

In [31]:
def collect_var(df, var, varcut):
    """
    Concatenate the values of every column whose name starts with `var` into
    one flat list, taking the columns in sorted name order.

    param varcut  index (or slice) of the sorted column list to leave out,
                  e.g. -1 to drop the last column; None keeps all columns
    """
    selected = [col for col in df.columns if col.startswith(var)]
    selected.sort()
    if varcut is not None:
        del selected[varcut]
    return [value for col in selected for value in list(df[col])]

In [40]:
def universal_correlation(
    df, 
    var1, 
    var2, 
    var1cut = None, 
    var2cut = None, 
    norm1=None, 
    norm2=None, 
    metric="pearson"):
    """
    Correlate two variables over all words and all periods at once.

    The columns of each variable are flattened into one long vector (see
    `collect_var`) and the two vectors are paired position by position, so
    both variables must contribute the same number of columns. Use
    `var1cut` / `var2cut` to drop one column of a variable, e.g.
    var2cut=-1 to pair a yearly variable with a transition variable.
    Pairs containing a missing or infinite value are discarded.

    param norm1   function to transform the var1 vector with, e.g. np.log
    param norm2   same, for var2
    param metric  "pearson" or "spearman"
    return        (coefficient, p-value); (nan, nan) if fewer than two
                  valid pairs remain
    raises ValueError for an unknown metric or if the two flattened vectors
                      differ in length (they would be paired up wrongly)
    """
    # https://stackoverflow.com/questions/16031056/how-to-form-tuple-column-from-two-columns-in-pandas

    if metric not in ("pearson", "spearman"):
        raise ValueError(f'unknown metric {metric!r}; expected "pearson" or "spearman"')

    X = collect_var(df, var1, var1cut)
    Y = collect_var(df, var2, var2cut)

    print("Length:")
    print("X:", len(X))
    print("Y:", len(Y))

    if len(X) != len(Y):
        # zip() would silently truncate and pair values of different periods.
        raise ValueError(
            f"{var1} and {var2} have different lengths ({len(X)} vs {len(Y)}); "
            "adjust var1cut/var2cut so that both cover the same periods"
        )

    pairs = [(x, y) for x, y in zip(X, Y) if nonaninf(x, y)]
    if len(pairs) < 2:
        return np.nan, np.nan

    X, Y = zip(*pairs)

    if norm1 is not None:
        X = norm1(X)
    if norm2 is not None:
        Y = norm2(Y)

    measure = pearsonr if metric == "pearson" else spearmanr
    v, p = measure(X, Y)

    return v, p

In [74]:
def model_comparison(
    dfs,
    mnames,
    var, 
    norm=None, 
    targets = Path("../data/utils/dwts.txt"), # =get_dwts(df_yearly_dwt, dwt_path)
    prefix = ("N", "A", "V"),
    mode="universal", # no other mode supported at the moment ...
    word=None, # not supported at the moment ... 
    metric="pearson"
):
    """
    Correlate the values of one variable between several models.

    For each model, the values of every `var` column for the target words
    are flattened into one vector; observations that are missing in any
    model are dropped from all of them, and the vectors are then correlated
    pairwise.

    param dfs      list of DataFrames, one per model, indexed by word; the
                   columns and target words are determined from dfs[0] and
                   must exist in every other DataFrame
    param mnames   list of model names, used to label the result
    param var      column-name prefix of the variable to compare
    param norm     optional function applied to each model's vector
    param targets  path to the file of target roots (see `get_dwts`)
    param prefix   tuple of word prefixes to keep, or None to keep all
    param mode     only "universal" is supported
    param word     not supported at the moment (ignored)
    param metric   "pearson" or "spearman" (Pearson correlation of ranks)
    return         DataFrame of pairwise correlations, rounded to 3 decimals
    raises ValueError for an unsupported mode or metric, or if no
                      observation is present in all models
    """
    if mode != "universal":
        raise ValueError(f'unsupported mode {mode!r}; only "universal" is available')
    if metric not in ("pearson", "spearman"):
        raise ValueError(f'unknown metric {metric!r}; expected "pearson" or "spearman"')

    cols = [col for col in dfs[0].columns if col.startswith(var)] # based on first df
    trgs = get_dwts(dfs[0], targets)                               # based on first df
    if prefix is not None:
        trgs = [trg for trg in trgs if trg.startswith(prefix)]

    per_model = []
    for model in dfs:
        this_model = []
        for col in cols:
            this_model.extend(model.loc[trgs, col])
        per_model.append(this_model)

    # Keep an observation only if every model has a value for it.
    complete = [obs for obs in zip(*per_model) if nonalist(obs)]
    if not complete:
        raise ValueError("no observation is available in all models")
    _XY = list(zip(*complete))

    print("Length:")
    for vector in _XY:
        print(len(vector))

    if norm is not None:
        _XY = [norm(model) for model in _XY]

    if metric == "spearman":
        # Spearman correlation is the Pearson correlation of the ranks.
        _XY = [rankdata(model) for model in _XY]
    R_data = np.atleast_2d(np.corrcoef(_XY))

    out = pd.DataFrame(R_data, columns=mnames, index=mnames).round(3)

    return out

In [21]:
def visualize(df, var):
    """Placeholder: not implemented yet; does nothing and returns None."""

## Files

In [22]:
#file_path = Path("../../dw_results/fb_pol-yearly-radical3.csv")
#file_path = Path("fb_pol-yearly-radical3.csv")
results_dir = Path("../../dw_results")

In [23]:
# List the result files that are available for loading.
files = os.listdir(results_dir)
for file in files:
    print(file)

fb_pol-yearly-radical3-restricted.csv
fb_pol-yearly-bert-sentence-bert-swedish-cased.csv
fb_pol-yearly-bert-sts_fbmodel_big_40epochs.csv
fb_pol-yearly-bert-fb_nli.csv
fb_pol-yearly-radical3-full.csv
fb_pol-yearly-bert-sts_fbmodel.csv


In [24]:
dwt_path = "../data/utils/dwts.txt"

In [25]:
#crp_tib = Path("/srv/data/gusbohom/root/corpora/toypol/time_bin/radical3/")
corpus = Path("/home/max/Corpora/flashback-pol-time/yearly/fb-pt-radical3")

In [89]:
df_yearly = read_csv(results_dir / "fb_pol-yearly-radical3-full.csv")

In [26]:
df_yearly_dwt = read_csv(results_dir / "fb_pol-yearly-radical3-restricted.csv")

In [45]:
df_bert_nli = read_csv(results_dir / "fb_pol-yearly-bert-fb_nli.csv")

In [76]:
df_bert_sts = read_csv(results_dir / "fb_pol-yearly-bert-sts_fbmodel.csv")

In [90]:
df_bert_big = read_csv(results_dir / "fb_pol-yearly-bert-sts_fbmodel_big_40epochs.csv")

In [91]:
df_bert_kb  = read_csv(results_dir / "fb_pol-yearly-bert-sentence-bert-swedish-cased.csv")

In [None]:
#toypol = read_csv(Path("../../toypol-time_bin.csv"))

In [None]:
#df_time_bin = read_csv(results_dir / "fb_pol-time_bin-radical3-full.csv")

In [None]:
#df_time_bin_dwt = read_csv(results_dir / "fb_pol-time_bin-radical3-restricted.csv")

## Q-Show

### Check

### Yearly

#### SGNS

In [None]:
q_show(df_yearly, "rch", as_table = True)
# q_show("rsim")
# q_show("gch")
# q_show("gsim")

In [None]:
checker(
    word="gyllene", 
    transition=(2000,2001), 
    controls_dir="/home/max/Results/fb_pol-yearly-rad3", 
    n_ctrl=10, 
    variable="cosine_change")

In [None]:
q_show(df_yearly, "rch", as_table = True, min_freq=50)

#### NLI

In [None]:
q_show(df_bert_nli, "rch", as_table = True)

In [None]:
q_show(df_bert_nli, "gch", as_table = True)

In [None]:
q_show(df_bert_nli, "spr", as_table = True, transition=False)

In [88]:
q_show(df_bert_nli, "anospr", as_table = True, transition=False)


anospr_2002 jaccard = None
                Word  ANOSPR  Freq
0    V1_kulturberika   0.936     2
1          V1_berika  -1.569    25
2  N1_kulturberikare  -4.568     9

anospr_2003 jaccard = 1.0
                Word  ANOSPR  Freq
0  N1_kulturberikare   2.437     5
1          V1_berika   1.582    14
2        N1_berikare  -0.171     8

anospr_2004 jaccard = 1.0
                Word  ANOSPR  Freq
0          V1_berika  -0.705    54
1  N1_kulturberikare  -3.209    20
2    V1_kulturberika  -4.016     2

anospr_2005 jaccard = 1.0
                Word  ANOSPR  Freq
0    V1_kulturberika  -0.167     4
1          V1_berika  -0.851    55
2        N1_berikare  -0.982     4
3  N1_kulturberikare  -1.127    14

anospr_2006 jaccard = 1.0
                Word  ANOSPR  Freq
0  N1_kulturberikare   0.781    51
1    V1_kulturberika  -0.264     9
2          V1_berika  -0.436   162
3        N1_berikare  -1.097    14
4       N1_globalist  -1.133    26

anospr_2007 jaccard = 1.0
                Word  ANOSPR  Fr

                  Word  ANOSPR  Freq
0      V1_kulturberika   0.600    52
1        V1_återvandra   0.543    96
2        N1C_berikareX   0.426     2
3       N1_förortsgäng   0.075     8
4         N1_globalist  -0.076  2726
5      N2_återvandrare  -0.083     3
6          N1_berikare  -0.283    72
7    N1_kulturberikare  -0.347    79
8   V1_hjälpa_på_plats  -0.437    42
9      A1_globalistisk  -0.447   624
10           V1_berika  -0.468   709
11      N1C_globalistX  -0.514   672
12     N1_återvandring  -0.998   873
13  N1C_återvandringsX  -1.271   202

anospr_2021 jaccard = 1.0
                              Word  ANOSPR  Freq
0                    V1_återvandra   0.214   130
1                N1_kulturberikare   0.174   101
2                  N1_återvandring   0.043   901
3                  V1_kulturberika  -0.023    32
4                     N1_globalist  -0.093  2021
5                  A1_globalistisk  -0.125   430
6                   N1_förortsgäng  -0.170    13
7               V1_hjälpa_

#### STS (small)

#### STS (big)

#### KB

In [None]:
#q_show(toypol, "rch", as_table = True)

In [None]:
#q_show(toypol, "gsim", as_table = True)

### Time bin

In [None]:
# Disabled: `df_time_bin` is not loaded (its read_csv cell under "Files" is
# commented out), so running this cell would raise NameError.
#q_show(df_time_bin, "rch", as_table = True)

## Change Show

### Yearly

#### SGNS - only DWTS

In [None]:
overview(df_yearly_dwt, "rch", get_dwts(df_yearly_dwt, dwt_path))

In [None]:
# change_show(df, var, targets, th=4.781)
change_show(df_yearly_dwt, "rch", get_dwts(df_yearly_dwt, dwt_path))

In [None]:
change_show(df_yearly_dwt, "rch", get_dwts(df_yearly_dwt, dwt_path), th=None)

In [None]:
change_show(df_yearly_dwt, "gsim", get_dwts(df_yearly_dwt, dwt_path), th=None)

#### NLI

In [None]:
overview(df_bert_nli, "anospr", get_dwts(df_yearly_dwt, dwt_path), th=None, transition=False)

In [None]:
overview(df_bert_nli, "difspr", get_dwts(df_yearly_dwt, dwt_path), th=None, transition=False)

#### STS (small)

#### STS (big)

#### KB

### Time bin

In [None]:
# Disabled: `df_time_bin_dwt` is not loaded (its read_csv cell under "Files"
# is commented out), so running this cell would raise NameError.
#change_show(df_time_bin_dwt, "rch", get_dwts(df_time_bin_dwt, dwt_path))

In [None]:
# Disabled: `df_time_bin_dwt` is not loaded (its read_csv cell under "Files"
# is commented out), so running this cell would raise NameError.
#change_show(df_time_bin_dwt, "gsim", get_dwts(df_time_bin_dwt, dwt_path), th=None)

In [None]:
a=load_metric("/srv/data/gusbohom/root/corpora/fb_pol/time_bin/radical3/vocab/2015.txt")["V1_berika"]
b=load_metric("/srv/data/gusbohom/root/corpora/fb_pol/time_bin/radical3/vocab/2019.txt")["V1_berika"]
print(a, b)

## Correlation

In [None]:
get_variables(df_yearly_dwt)

### Yearly

#### All words (SGNS)

In [96]:
universal_correlation(
    df=df_yearly, 
    var1="gch", 
    var2="frq", 
    var1cut = None, 
    var2cut = -1, 
    norm1=None, 
    norm2=np.log, 
    metric="pearson")

Length:
X: 3342680
Y: 3342680


(0.3803103896395752, 0.0)

In [97]:
universal_correlation(
    df=df_yearly, 
    var1="gsim", 
    var2="frq", 
    var1cut = None, 
    var2cut = -1, 
    norm1=None, 
    norm2=np.log, 
    metric="pearson")

Length:
X: 3342680
Y: 3342680


(-0.36867454175092923, 0.0)

In [100]:
universal_correlation(
    df=df_yearly, 
    var1="rch", 
    var2="frq", 
    var1cut = None, 
    var2cut = -1, 
    norm1=None, 
    norm2=np.log, 
    metric="pearson")

Length:
X: 3342680
Y: 3342680


(0.06659562110360326, 0.0)

In [102]:
universal_correlation(
    df=df_yearly, 
    var1="rch", 
    var2="stdc", 
    var1cut = None, 
    var2cut = None, 
    norm1=None, 
    norm2=None, 
    metric="pearson")

Length:
X: 3342680
Y: 3342680


(0.054552105535179966, 0.0)

#### DWTs

##### SGNS

In [94]:
# `correlation` has no `corpus` parameter (transitions are read from the
# dataframe columns), so that argument is not passed.
correlation(df=df_yearly_dwt, 
            var1="diffpm", 
            var2="rch", 
            mode=2, 
            metric = "pearson")

Unnamed: 0,Word,Correlation,p,N
0,V1_berika,-0.13,0.56,22
1,N1_kulturberikare,-0.39,0.1,19
2,N1_berikare,-0.43,0.1,16
3,V1_kulturberika,-0.53,0.04,16
4,N1_förortsgäng,0.02,0.98,6
5,N1_globalist,-0.19,0.47,17
6,N1_återvandring,-0.4,0.14,15
7,A1_globalistisk,-0.32,0.27,14
8,antiglobalister,-0.88,0.02,6
9,V1_återvandra,-0.56,0.03,15


In [95]:
# `correlation` has no `corpus` parameter, so that argument is not passed.
correlation(df=df_yearly_dwt, var1="stdc", var2="rch", mode=2, metric = "pearson")

Unnamed: 0,Word,Correlation,p,N
0,V1_berika,0.21,0.35,22
1,N1_kulturberikare,-0.29,0.23,19
2,N1_berikare,-0.11,0.69,16
3,V1_kulturberika,0.04,0.88,16
4,N1_förortsgäng,0.36,0.49,6
5,N1_globalist,-0.13,0.63,17
6,N1_återvandring,-0.39,0.15,15
7,A1_globalistisk,0.26,0.36,14
8,antiglobalister,0.17,0.75,6
9,V1_återvandra,-0.01,0.97,15


In [None]:
# `correlation` has no `corpus` parameter, so that argument is not passed.
correlation(df=df_yearly_dwt, var1="diffpm", var2="rch", mode=2, metric = "spearman")

##### NLI

##### STS (small)

##### STS (big)

##### KB

## Model comparison

In [93]:
model_comparison(
    dfs=[df_yearly_dwt, df_bert_nli, df_bert_sts, df_bert_big, df_bert_kb],
    mnames=["SGNS", "NLI", "STS", "BIG", "KB"],
    var="gch", 
    norm=None, 
    targets = Path("../data/utils/dwts.txt"), # =get_dwts(df_yearly_dwt, dwt_path)
    prefix = ("N", "A", "V"),
    mode="universal", # no other mode supported at the moment ...
    word=None, # not supported at the moment ... 
    metric="pearson"
)

Length:
154
154
154
154
154


Unnamed: 0,SGNS,NLI,STS,BIG,KB
SGNS,1.0,-0.251,-0.328,-0.304,-0.266
NLI,-0.251,1.0,0.959,0.967,0.946
STS,-0.328,0.959,1.0,0.99,0.946
BIG,-0.304,0.967,0.99,1.0,0.957
KB,-0.266,0.946,0.946,0.957,1.0
