In [35]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import re
from util import load_metric
#from collections import Counter

## Functions

In [2]:
def jaccard(a, b):
    return len(a.intersection(b)) / len(a.union(b))

In [31]:
def q_show(df, var, k = 40, as_table = False):
    # Note: as_table only for transition variables!
    
    ser = [(col, set(df[col].sort_values(ascending=False)[:k].index)) for col in df.columns if col.startswith(var)]
    
    for i, (col, s) in enumerate(ser):
        if i > 0:
            jac = round(jaccard(s, ser[i-1][-1]), 2)
        else:
            jac = None
        
        print()
        print(col, "jaccard =", jac)
        
        if as_table == False:
            print(s)
        else:
            trans  = col.split("_")[-1]
            ti, tj = tuple(trans.split(":"))
            table  = [] 
            for word in s:
                v    = df.loc[word][col]
                f_ti = int(df.loc[word][f"frq_{ti}"])
                f_tj = int(df.loc[word][f"frq_{tj}"])
                gch  = df.loc[word][f"gch_{trans}"]
                m    = df.loc[word][f"mccc_{trans}"]
                std  = df.loc[word][f"stdc_{trans}"]
                columns = ["Word", "RCH", "n_i", "n_j", "GCH", "Mctrl", "Sctrl"]
                table.append([word, v, f_ti, f_tj, gch, m, std])
            print(pd.DataFrame(table, columns=columns))
            
    
    

In [5]:
def change_show(df, var, targets, th=4.781):
    for w in targets:
        for col in [col for col in df.columns if col.startswith(var)]:
            change = df.loc[w][col]
            if change > th:
                print(col, w, round(change, 3))

In [6]:
def read_csv(path):
    return pd.read_csv(path, sep=";", index_col=0)

In [7]:
def get_dwts(df, path):
    with open(Path(path), "r") as f:
        dwt_roots = [w.strip("\n") for w in f.readlines()]
    dwt_regex = re.compile(f"({'|'.join(dwt_roots)})")    
    dwts = [str(w) for w in df.index if re.search(dwt_regex, str(w)) != None]
    return dwts    

In [8]:
def get_variables(df):
    
    yr_prefix = set()
    tr_prefix = set()
    years = set()
    transitions = set()
    
    for v in df.columns:
        prefix, suffix = tuple(v.split("_"))
        if ":" in suffix:
            tr_prefix.add(prefix)
            transitions.add(suffix)
        else:
            yr_prefix.add(prefix)
            years.add(suffix)
    
    return {"yr_prefix": yr_prefix, 
            "tr_prefix": tr_prefix,
            "years": years,
            "transitions": transitions
           }
            

In [40]:
def checker(word, transition, controls_dir, n_ctrl=10, variable="cosine_change"):
    """
    param word
    param transition    tupple of ti and tj 
    param controls_dir  where to find controls
    param variable      "cosine_change" or "cosine_sim"
    """
    
    ti, tj = transition
    basename = Path(controls_dir) / variable 
    
    filenames = [f"{ti}_{tj}_control{n}.txt" for n in range(1, n_ctrl+1)]
    
    values = []
    
    for file in filenames:
        data = load_metric(basename / file)
        value = data[word]
        values.append(value)
        print(file, value)
        
    return values   

In [42]:
mammamu = checker("datum", (2004, 2005), "/home/max/Results/toy_pol-output")
np.mean(mammamu)

2004_2005_control1.txt 0.25342339277267456
2004_2005_control2.txt 0.00015542474284302443
2004_2005_control3.txt 0.24431970715522766
2004_2005_control4.txt 0.26190248131752014
2004_2005_control5.txt 0.0
2004_2005_control6.txt 0.25231683254241943
2004_2005_control7.txt 0.0
2004_2005_control8.txt 0.0
2004_2005_control9.txt 0.0
2004_2005_control10.txt 0.0


0.10121178385306848

## Files

In [9]:
#file_path = Path("../../dw_results/fb_pol-yearly-radical3.csv")
#file_path = Path("fb_pol-yearly-radical3.csv")
results_dir = Path("../../dw_results")

In [10]:
files = os.listdir(results_dir)
[print(file) for file in files]

fb_pol-yearly-radical3-restricted.csv
fb_pol-yearly-radical3-restricted.xlsx
fb_pol-time_bin-radical3-restricted.csv
fb_pol-time_bin-radical3-full.csv
fb_pol-yearly-radical3-full.csv


[None, None, None, None, None]

In [11]:
dwt_path = "../data/utils/dwts.txt"

## Q-Show

In [12]:
df_yearly = read_csv(results_dir / "fb_pol-yearly-radical3-full.csv")
df_time_bin = read_csv(results_dir / "fb_pol-time_bin-radical3-full.csv")

In [32]:
q_show(df_yearly, "rch",as_table = True)
# q_show("rsim")
# q_show("gch")
# q_show("gsim")


rch_2000:2001 jaccard = None
          Word          RCH    n_i    n_j       GCH     Mctrl     Sctrl
0        datum  3634.660172     44    411  0.217145  0.000027  0.000057
1           få  5229.331046   2245   1480  0.190622  0.000011  0.000035
2           is          inf    552    580  0.087356  0.000000  0.000000
3     tidigare  4079.207533    486    277  0.242873  0.000066  0.000057
4          här  6984.946473   3029   2176  0.254615  0.000011  0.000035
5           ja  4007.948193   1224   1200  0.259292  0.000038  0.000062
6          bli          inf   1579   1104  0.172226  0.000000  0.000000
7    stockholm  3809.149768    198    159  0.227568  0.000027  0.000057
8         veta          inf    372    204  0.206615  0.000000  0.000000
9      snarare          inf    306    147  0.149441  0.000000  0.000000
10    historia  4400.960394    307    113  0.160427  0.000011  0.000035
11      ganska          inf    589    334  0.167017  0.000000  0.000000
12        mina  3968.665046    660

          Word          RCH   n_i    n_j       GCH     Mctrl     Sctrl
0        ordet  7137.109441   123    560  0.260161  0.000011  0.000035
1       längre  7384.851548   233   1142  0.269191  0.000011  0.000035
2        såsom  7510.854122    94    290  0.273784  0.000011  0.000035
3      kristen  6634.205377    46    197  0.241830  0.000011  0.000035
4         över          inf   931   4014  0.210165  0.000000  0.000000
5          läs  6428.178781   115    694  0.234320  0.000011  0.000035
6     politisk  5615.750565    87    475  0.289499  0.000016  0.000049
7     särskilt          inf   160    968  0.231163  0.000000  0.000000
8         bort          inf   333   1265  0.000110  0.000000  0.000000
9        annan  6448.794114   552   2367  0.235072  0.000011  0.000035
10  medborgare  6320.424545   104    466  0.230393  0.000011  0.000035
11        tror          inf  1324   5951  0.227840  0.000000  0.000000
12    givetvis  7294.062813   211    795  0.265882  0.000011  0.000035
13    

            Word          RCH    n_i    n_j       GCH     Mctrl     Sctrl
0     invandrare          inf   2838   9391  0.000155  0.000000  0.000000
1     frivilliga          inf    186    305  0.000110  0.000000  0.000000
2      frankrike          inf    785   1102  0.216135  0.000000  0.000000
3        sprider  7544.918490    264    470  0.388945  0.000016  0.000049
4       huruvida          inf    570    798  0.000155  0.000000  0.000000
5            hos  9111.588830   2857   4131  0.332131  0.000011  0.000035
6         kommit          inf   1474   2155  0.230026  0.000000  0.000000
7          slags  7444.503886   1120   1586  0.271366  0.000011  0.000035
8     feminister  8824.293803   1112   1179  0.321659  0.000011  0.000035
9   fullständigt          inf   1013   1219  0.257175  0.000000  0.000000
10   möjligheter          inf    706    865  0.237981  0.000000  0.000000
11          ökar          inf    829   1363  0.000110  0.000000  0.000000
12    medborgare          inf   1070  

            Word  RCH    n_i    n_j       GCH  Mctrl  Sctrl
0   invandringen  inf   5314  10830  0.156224    0.0    0.0
1      allmänhet  inf   1974   2782  0.229950    0.0    0.0
2         förmån  inf    478    638  0.226608    0.0    0.0
3           egen  inf  10065  15201  0.156115    0.0    0.0
4        tråkigt  inf   1297   1826  0.235048    0.0    0.0
5           fram  inf  14504  21828  0.000155    0.0    0.0
6         tankar  inf   1987   2747  0.224742    0.0    0.0
7       fungerar  inf   6264   8925  0.170573    0.0    0.0
8         mellan  inf  15164  21623  0.000110    0.0    0.0
9          rösta  inf   6650  12548  0.155303    0.0    0.0
10        kostar  inf   2683   4083  0.231779    0.0    0.0
11    konkurrens  inf   1013   1448  0.000110    0.0    0.0
12      resultat  inf   2714   3767  0.186733    0.0    0.0
13  konspiration  inf    476    751  0.257250    0.0    0.0
14        familj  inf   3561   5708  0.180950    0.0    0.0
15         hatar  inf   4366   7041  0.0

            Word  RCH    n_i    n_j       GCH  Mctrl  Sctrl
0           viss  inf   7606   7005  0.000110    0.0    0.0
1        moderna  inf   2009   1858  0.000155    0.0    0.0
2             on  inf   4259   4820  0.161557    0.0    0.0
3          tycks  inf   3854   4108  0.217512    0.0    0.0
4             ca  inf   1545   1798  0.214723    0.0    0.0
5        judiska  inf   4230   3241  0.164050    0.0    0.0
6       avsevärt  inf    626    704  0.248529    0.0    0.0
7          ryska  inf   2013  11748  0.214054    0.0    0.0
8          beror  inf   9190   8363  0.166898    0.0    0.0
9          neger  inf   2514   1812  0.000155    0.0    0.0
10      åtgärder  inf   1534   1511  0.195482    0.0    0.0
11       största  inf   9448  10470  0.157735    0.0    0.0
12        status  inf   2987   2502  0.192676    0.0    0.0
13       florida  inf     54     65  0.173605    0.0    0.0
14     majoritet  inf   5253   5544  0.154271    0.0    0.0
15       sociala  inf   5009   4796  0.0

            Word  RCH     n_i      n_j       GCH  Mctrl  Sctrl
0        utifrån  inf    3350     3898  0.187495    0.0    0.0
1   lagstiftning  inf     897     1149  0.219291    0.0    0.0
2       polisens  inf    1290      779  0.204836    0.0    0.0
3         knappt  inf    2990     3535  0.202450    0.0    0.0
4           risk  inf    2298     2682  0.165018    0.0    0.0
5       snabbare  inf     972     1084  0.195501    0.0    0.0
6        utsätts  inf     702      744  0.225118    0.0    0.0
7           mina  inf    6949     8754  0.151494    0.0    0.0
8          skett  inf    2302     2542  0.181162    0.0    0.0
9        reagera  inf     788      784  0.000110    0.0    0.0
10          sina  inf   34538    40145  0.000155    0.0    0.0
11         gamla  inf    7279     8294  0.157447    0.0    0.0
12        syrien  inf    8956     9602  0.149933    0.0    0.0
13          utav  inf    2855     3406  0.246327    0.0    0.0
14           att  inf  990685  1155557  0.000110    0.0

           Word  RCH     n_i     n_j       GCH  Mctrl  Sctrl
0        lägger  inf    3859    6239  0.171626    0.0    0.0
1       skjuter  inf    1403    5145  0.199266    0.0    0.0
2       extrema  inf    1548    2018  0.213463    0.0    0.0
3       omkring  inf    2171    3013  0.188028    0.0    0.0
4       mentala  inf     569     781  0.218366    0.0    0.0
5             +  inf    2503    2959  0.260525    0.0    0.0
6    definition  inf    1785    1855  0.220878    0.0    0.0
7       korrekt  inf    3577    4635  0.203626    0.0    0.0
8          valt  inf    2942    3741  0.199090    0.0    0.0
9          igår  inf    1951    4492  0.214947    0.0    0.0
10        öppet  inf    2991    4368  0.226222    0.0    0.0
11          ena  inf    4086    5492  0.180936    0.0    0.0
12       hunter  inf     557    1841  0.000110    0.0    0.0
13      rörelse  inf    1111    1055  0.214040    0.0    0.0
14     övertyga  inf     888    1101  0.254146    0.0    0.0
15        heter  inf    

In [None]:
q_show(df_time_bin, "rch")

## Change Show

In [None]:
df_yearly_dwt = read_csv(results_dir / "fb_pol-yearly-radical3-restricted.csv")
df_time_bin_dwt = read_csv(results_dir / "fb_pol-time_bin-radical3-restricted.csv")

In [None]:
# change_show(df, var, targets, th=4.781)
change_show(df_yearly_dwt, "rch", get_dwts(df_yearly_dwt, dwt_path))

In [None]:
change_show(df_time_bin_dwt, "rch", get_dwts(df_time_bin_dwt, dwt_path))

In [None]:
#change_show(df_time_bin_dwt, "gsim", get_dwts(df_time_bin_dwt, dwt_path))

## Correlation

In [None]:
get_variables(df_yearly_dwt)

In [None]:
ts = ["2002:2003", "2003:2004", "2004:2005"]
df_yearly_dwt[[f"diffpm_{t}" for t in ts]].corrwith(df_yearly_dwt[[f"rch_{t}" for t in ts]], axis=1)
#does not share columns ...

In [None]:
for word in ["V1_berika", "N1_kulturberikare"]:
    # df.loc[w][col]
    a = df_yearly_dwt.loc[word][[f"diffpm_{t}" for t in ts]]
    b = df_yearly_dwt.loc[word][[f"rch_{t}" for t in ts]]
    print(word)
    print(a)
    print(b)
    print(np.corrcoef(a,b))

## Appendix

In [None]:
df.index

In [None]:
dwt_regex

In [None]:
dwts

In [None]:
dwt_df = df.loc[dwts]

In [None]:
dwt_df

In [None]:
change_show("rch", dwts)

In [None]:
change_show("rsim", dwts, th=0.5)

In [None]:
change_show("gch", dwts)

In [None]:
change_show("gsim", dwts)

In [None]:
df.describe()

In [None]:
list(df.columns)