In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import re
from util import load_metric
#from collections import Counter

## Functions

In [2]:
def jaccard(a, b):
    """
    Jaccard similarity of two sets: size of the intersection over size of the union.

    param a   first set
    param b   second set
    return    float between 0 and 1; 1.0 when both sets are empty
    """
    union = a.union(b)
    if not union:
        # Two empty sets are identical; also avoids dividing 0 by 0.
        return 1.0
    return len(a.intersection(b)) / len(union)

In [3]:
def q_show(df, var, k = 40, as_table = False):
    """
    Print the k highest-valued words of every column whose name starts with var.

    The matching columns are visited in column order. Next to each column name
    the Jaccard overlap between its top-k set and the top-k set of the previous
    matching column is printed (None for the first column).

    param df        DataFrame indexed by word
    param var       column-name prefix selecting the columns to show
    param k         number of top-ranked words per column
    param as_table  False: print the top-k words as a set.
                    True: print one row per word, ranked by value, with the
                    value, the frequencies at ti and tj and the gch, mccc and
                    stdc values of the transition. Only for transition
                    variables (column suffix "ti:tj"); df must then also hold
                    the frq_{ti}, frq_{tj}, gch_{trans}, mccc_{trans} and
                    stdc_{trans} columns.
    """
    # Note: as_table only for transition variables!

    # Defined once up front so an empty top-k set still prints an (empty) table.
    columns = ["Word", var.upper(), "n_i", "n_j", "GCH", "Mctrl", "Sctrl"]

    # Keep the ranked order of the words; dict.fromkeys drops repeated labels.
    ranked = [
        (col, list(dict.fromkeys(df[col].sort_values(ascending=False).iloc[:k].index)))
        for col in df.columns if col.startswith(var)
    ]

    previous = None
    for col, words in ranked:
        current = set(words)
        jac = None if previous is None else round(jaccard(current, previous), 2)
        previous = current

        print()
        print(col, "jaccard =", jac)

        if not as_table:
            print(current)
            continue

        trans  = col.split("_")[-1]
        ti, tj = trans.split(":")
        table  = []
        for word in words:
            row = df.loc[word]
            table.append([
                word,
                row[col],
                int(row[f"frq_{ti}"]),
                int(row[f"frq_{tj}"]),
                row[f"gch_{trans}"],
                row[f"mccc_{trans}"],
                row[f"stdc_{trans}"],
            ])
        print(pd.DataFrame(table, columns=columns))
            
    
    

In [4]:
def change_show(df, var, targets, th=4.781):
    """
    Print, for every column starting with var, the target words and their values.

    param df       DataFrame indexed by word; for a column "<var>_ti:tj" the
                   columns "frq_ti" and "frq_tj" must exist as well
    param var      column-name prefix selecting the transition columns
    param targets  words to look up in df
    param th       only words whose value is greater than th are listed;
                   None lists every target
    """
    header = ["Word", "Value", "n_i", "n_j"]
    selected = (name for name in df.columns if name.startswith(var))

    for col in selected:
        # The part after the last underscore is the transition "ti:tj".
        ti, tj = col.split("_")[-1].split(":")

        rows = []
        for word in targets:
            entry = df.loc[word]
            record = [word, entry[col], int(entry[f"frq_{ti}"]), int(entry[f"frq_{tj}"])]
            if th is None or record[1] > th:
                rows.append(record)

        # Columns without any listed word are skipped silently.
        if rows:
            print()
            print(col)
            print(pd.DataFrame(rows, columns=header))
                

In [5]:
def read_csv(path):
    """
    Load a semicolon-separated file, using its first column as the index.

    param path   passed straight to pandas.read_csv
    return       DataFrame indexed by the first column
    """
    options = {"sep": ";", "index_col": 0}
    return pd.read_csv(path, **options)

In [6]:
def get_dwts(df, path):
    """
    Return the index entries of df that contain any of the roots listed in a file.

    param df     DataFrame whose index holds the candidate words
    param path   text file with one root per line; each root is used as an
                 alternative of one regular expression (roots are not escaped)
    return       list of str in index order; empty when the file holds no roots
    """
    # Explicit encoding so roots with non-ASCII letters load the same on every platform.
    with open(Path(path), "r", encoding="utf-8") as f:
        dwt_roots = [line.rstrip("\r\n") for line in f]

    # Drop blank lines: an empty alternative would match every word.
    dwt_roots = [root for root in dwt_roots if root.strip()]
    if not dwt_roots:
        return []

    dwt_regex = re.compile(f"({'|'.join(dwt_roots)})")
    # str(w) because the index may hold non-string labels.
    return [str(w) for w in df.index if dwt_regex.search(str(w)) is not None]

In [7]:
def get_variables(df):
    """
    Collect the variable prefixes, years and transitions found in df's column names.

    Every column name is read as "<prefix>_<suffix>", split at the LAST
    underscore (the same rule q_show and change_show use to find the
    transition), so prefixes may themselves contain underscores. A suffix
    containing ":" is a transition ("ti:tj"); any other suffix is a year.
    Column names without an underscore are ignored.

    param df   DataFrame whose columns are named like "frq_2007" or "gsim_2007:2011"
    return     dict with the sets "yr_prefix", "tr_prefix", "years" and "transitions"
    """
    yr_prefix = set()
    tr_prefix = set()
    years = set()
    transitions = set()

    for v in df.columns:
        prefix, sep, suffix = v.rpartition("_")
        if not sep:
            # No underscore: not a "<prefix>_<suffix>" variable column.
            continue
        if ":" in suffix:
            tr_prefix.add(prefix)
            transitions.add(suffix)
        else:
            yr_prefix.add(prefix)
            years.add(suffix)

    return {"yr_prefix": yr_prefix,
            "tr_prefix": tr_prefix,
            "years": years,
            "transitions": transitions
           }
            

In [8]:
def checker(word, transition, controls_dir, n_ctrl=10, variable="cosine_change"):
    """
    Print and collect the value of a word in every control file of a transition.

    param word          word to look up in each control file
    param transition    tuple of ti and tj
    param controls_dir  where to find controls
    param n_ctrl        number of controls; the files for controls 1..n_ctrl are read
    param variable      "cosine_change" or "cosine_sim"; sub-directory of controls_dir
    return              list with one value per control, in control order
    """
    start, end = transition
    variable_dir = Path(controls_dir) / variable

    collected = []
    for n in range(1, n_ctrl + 1):
        name = f"{start}_{end}_control{n}.txt"
        metric = load_metric(variable_dir / name)
        collected.append(metric[word])
        print(name, collected[-1])

    return collected

## Files

In [None]:
# Directory the result CSV files are read from (relative to the working directory).
# Earlier single-file paths, kept for reference:
#file_path = Path("../../dw_results/fb_pol-yearly-radical3.csv")
#file_path = Path("fb_pol-yearly-radical3.csv")
results_dir = Path("../../dw_results")

In [None]:
# Show which files are available in results_dir.
files = os.listdir(results_dir)
# Plain loop: a list comprehension used only for printing would leave a list
# of None values behind as the cell output.
for file in files:
    print(file)

In [None]:
# File handed to get_dwts below: one root pattern per line.
dwt_path = "../data/utils/dwts.txt"

## Q-Show

In [None]:
# The "-full" fb_pol result tables, one per time mode (yearly and time_bin).
df_yearly = read_csv(results_dir / "fb_pol-yearly-radical3-full.csv")
df_time_bin = read_csv(results_dir / "fb_pol-time_bin-radical3-full.csv")

In [9]:
# Note the time mode: this is the time_bin export of toypol; the cells below
# read its "gsim_2007:2011" transition column.
toypol = read_csv(Path("../../toypol-time_bin.csv"))

In [10]:
toypol.shape

(11859, 16)

In [None]:
toypol

In [None]:
# Scratch check: NaN never compares equal to anything (np.nan == np.nan is False),
# so missing values have to be selected with .isna(), not with ==.
np.nan

In [11]:
VAR = "gsim"
VAL = 1.0
# A: words whose value is exactly VAL. B: every other word. NaN != VAL is True,
# so B also contains the words whose value is undefined.
A = list(toypol[toypol[f"{VAR}_2007:2011"] == VAL].index)
print("No change (A):", len(A))
B = list(toypol[toypol[f"{VAR}_2007:2011"] != VAL].index)
print("Other (B):", len(B))
# Undefined values must be found with isna(); `== np.nan` never matches
# because NaN is not equal to itself.
U = list(toypol[toypol[f"{VAR}_2007:2011"].isna()].index)
print("Undefined (U):", len(U))
print()
# First 100 words of each group, side by side.
print("{: <20} {}".format("A", "B"))
print("{: <20} {}".format("---", "---"))
for w1, w2 in zip(A[:100], B[:100]):
    print(f"{w1: <20} {w2}")

No change (A): 2372
Other (B): 9487

A                    B
---                  ---
att                  det
är                   och
inte                 som
jag                  i
har                  en
så                   på
ett                  för
men                  om
sig                  med
eller                du
ju                   de
vara                 av
då                   man
från                 till
detta                den
finns                kan
än                   vi
nu                   vad
andra                var
hade                 skulle
mig                  alla
mot                  ska
dig                  kommer
ut                   bara
blir                 hur
även                 när
göra                 han
folk                 vill
varför               här
dem                  där
lite                 får
tycker               sverige
ta                   mer
vet                  dom
måste                ha
ser                  något
samma    

In [12]:
# Vocabulary files (word -> count) of the two toypol time bins.
toypol_vocab_dir = "/srv/data/gusbohom/root/corpora/toypol/time_bin/radical3/vocab"
voc_a = load_metric(f"{toypol_vocab_dir}/2007.txt")
voc_b = load_metric(f"{toypol_vocab_dir}/2011.txt")

In [13]:
# Keep only the words that occur at least 100 times in the respective vocabulary.
voc_a_100 = dict((word, count) for word, count in voc_a.items() if count >= 100)
voc_b_100 = dict((word, count) for word, count in voc_b.items() if count >= 100)

In [14]:
# Sizes of the two thresholded vocabularies and of their overlap, union and difference.
words_a = set(voc_a_100.keys())
words_b = set(voc_b_100.keys())
print("A:", len(voc_a_100))
print("B:", len(voc_b_100))
print("A and B:", len(words_a & words_b))
print("A or B:", len(words_a | words_b))
print("A - B:", len(words_a - words_b))

A: 7038
B: 7178
A and B: 6419
A or B: 7797
A - B: 619


In [15]:
# Words in toypol's index that reach the 100-count threshold in neither vocabulary.
dd = set(voc_a_100) | set(voc_b_100)
mystery = [w for w in toypol.index if not (w in dd)]
len(mystery)

4062

In [None]:
# Scratch check: NaN is an ordinary Python float.
type(np.nan)

In [17]:
# One line per "mystery" word: running number, word, count in voc_a, count in voc_b, gsim value.
for i, w in enumerate(mystery, 1):
    print(f"{i:<5}{w: <25} {voc_a[w]: <5} {voc_b[w]: <5} {toypol.loc[w]['gsim_2007:2011']}")

1    oundvikligt               99.0  87.0  nan
2    grabbarna                 99.0  80.0  nan
3    betänk                    99.0  89.0  nan
4    identiteten               99.0  99.0  nan
5    direktdemokrati           99.0  92.0  nan
6    year                      99.0  91.0  nan
7    funderade                 99.0  60.0  nan
8    ana                       99.0  77.0  nan
9    förlita                   99.0  78.0  nan
10   handeln                   99.0  77.0  nan
11   zog                       99.0  57.0  nan
12   relationerna              99.0  70.0  nan
13   ägd                       99.0  64.0  nan
14   emellanåt                 99.0  93.0  nan
15   drogen                    99.0  70.0  nan
16   idiotisk                  99.0  92.0  nan
17   örebro                    99.0  74.0  nan
18   hänvisade                 99.0  92.0  nan
19   kärnkraften               99.0  82.0  nan
20   lyckligtvis               99.0  78.0  nan
21   grannländerna             99.0  89.0  nan
22   krigade 

1659 redogöra                  69.0  76.0  nan
1660 pysslar                   69.0  53.0  nan
1661 misslyckat                69.0  67.0  nan
1662 parallell                 69.0  75.0  nan
1663 påven                     69.0  60.0  nan
1664 uran                      69.0  47.0  nan
1665 nationerna                69.0  59.0  nan
1666 värdigt                   69.0  90.0  nan
1667 nyttig                    69.0  89.0  nan
1668 cool                      69.0  70.0  nan
1669 mammans                   69.0  59.0  nan
1670 pålästa                   69.0  68.0  nan
1671 gripen                    69.0  85.0  nan
1672 småland                   69.0  44.0  nan
1673 analysen                  69.0  86.0  nan
1674 others                    69.0  58.0  nan
1675 hösten                    69.0  67.0  nan
1676 smyg                      69.0  71.0  nan
1677 ingrepp                   69.0  65.0  nan
1678 idéerna                   69.0  80.0  nan
1679 införs                    69.0  64.0  nan
1680 kunniga 

3379 spåret                    51.0  76.0  nan
3380 omfördelning              51.0  52.0  nan
3381 tillår                    51.0  58.0  nan
3382 släppts                   51.0  50.0  nan
3383 drös                      51.0  65.0  nan
3384 automatvapen              51.0  53.0  nan
3385 sko                       51.0  52.0  nan
3386 spåren                    51.0  50.0  nan
3387 verkat                    51.0  74.0  nan
3388 borgerligheten            51.0  65.0  nan
3389 batikhäxor                51.0  75.0  nan
3390 independent               51.0  51.0  nan
3391 riktningen                51.0  83.0  nan
3392 föreslagit                51.0  53.0  nan
3393 platserna                 51.0  64.0  nan
3394 familjemedlemmar          51.0  64.0  nan
3395 funtad                    51.0  64.0  nan
3396 science                   51.0  55.0  nan
3397 återinföra                51.0  52.0  nan
3398 rättar                    51.0  60.0  nan
3399 mätta                     51.0  56.0  nan
3400 repressa

In [16]:
def inspector(my_list):
    """
    Print both vocabulary counts and the gsim_2007:2011 value of every word.

    Reads the notebook globals voc_a and voc_b (word -> count) and toypol.
    A word missing from a vocabulary is shown with count 0.

    param my_list   iterable of words that are present in toypol's index
    return          (proper, total): the number of words whose count is above
                    100 in both vocabularies, and the number of words inspected
    """
    proper = 0
    total = 0  # stays 0 for empty input instead of leaving the loop counter undefined
    for total, w in enumerate(my_list, 1):
        n_a = int(voc_a[w]) if w in voc_a else 0
        n_b = int(voc_b[w]) if w in voc_b else 0
        # NOTE(review): strictly greater than 100 here, while voc_a_100 / voc_b_100
        # keep counts >= 100 - confirm which threshold is intended.
        if n_a > 100 and n_b > 100:
            proper += 1
        print("{:<5}{: <25} {: <10} {: <10} {}".format(total, w, n_a, n_b, toypol.loc[w, "gsim_2007:2011"]))
    return proper, total

In [18]:
inspector(A)

1    att                       394185     410383     1.0
2    är                        314035     326643     1.0
3    inte                      207114     206420     1.0
4    jag                       157629     147035     1.0
5    har                       129761     131254     1.0
6    så                        119409     123145     1.0
7    ett                       77300      79278      1.0
8    men                       76572      75396      1.0
9    sig                       54352      58194      1.0
10   eller                     52733      53635      1.0
11   ju                        44793      43044      1.0
12   vara                      37574      40365      1.0
13   då                        35693      35921      1.0
14   från                      32305      33132      1.0
15   detta                     31650      33931      1.0
16   finns                     29957      30700      1.0
17   än                        29212      29297      1.0
18   nu                        

1595 soldaterna                187        127        1.0
1596 hemlighet                 187        197        1.0
1597 polacker                  187        206        1.0
1598 känslan                   187        248        1.0
1599 rättsväsendet             186        161        1.0
1600 brinner                   186        168        1.0
1601 kinas                     186        200        1.0
1602 republikanerna            186        154        1.0
1603 sköt                      185        262        1.0
1604 åstadkomma                185        236        1.0
1605 brudar                    185        183        1.0
1606 aktioner                  185        168        1.0
1607 sur                       185        134        1.0
1608 back                      185        186        1.0
1609 åh                        185        183        1.0
1610 borgerlig                 185        104        1.0
1611 trådstarten               185        295        1.0
1612 röker                     

(2356, 2372)

In [19]:
inspector(B)

1    det                       335986     346903     0.7081381678581238
2    och                       285058     303013     0.6960806250572205
3    som                       246094     261374     0.9999999403953552
4    i                         225572     235339     0.7215818166732788
5    en                        169871     176312     0.7928011417388916
6    på                        162597     170172     0.9999998807907104
7    för                       145582     154998     0.649419903755188
8    om                        129063     131881     0.6993564367294312
9    med                       119060     124090     0.58650803565979
10   du                        118198     115708     0.9999999403953552
11   de                        115732     121272     0.7328811287879944
12   av                        100700     103782     0.6166704893112183
13   man                       92510      96367      0.9999998807907104
14   till                      90981      93491      0.596774816513

234  tråd                      3183       2674       0.9999999403953552
235  nån                       3179       2464       0.7046223282814026
236  kvar                      3168       3492       0.728459894657135
237  brukar                    3150       3115       0.9999999403953552
238  tack                      3132       2933       0.7294309139251709
239  skrev                     3127       3178       0.8070677518844604
240  enbart                    3126       3511       0.748158872127533
241  kosovo                    3099       484        0.9999998807907104
242  parti                     3076       4600       0.7725061774253845
243  kvinna                    3070       4787       0.9999998807907104
244  innebär                   3063       2976       0.9999998807907104
245  gick                      3038       3084       0.7781503200531006
246  invandring                3034       4683       0.9999998807907104
247  först                     3030       3003       0.99999994039

1708 löst                      337        349        0.6560671329498291
1709 beskriva                  337        389        0.8392239809036255
1710 kommentera                337        374        0.8680617809295654
1711 lämnade                   336        335        0.7956057190895081
1712 offentligt                336        466        0.9999999403953552
1713 mccain                    336        37         nan
1714 elever                    336        393        0.7197173833847046
1715 tillfälle                 335        344        0.7106336951255798
1716 svenskarnas               335        788        0.5325796604156494
1717 farliga                   335        356        0.7621057033538818
1718 rådande                   335        359        0.9999998807907104
1719 stött                     334        321        0.6902251243591309
1720 trygghet                  334        396        0.7980430126190186
1721 tolkning                  334        368        0.9999998807907104
1722 rö

1911 killarna                  291        267        0.7927168607711792
1912 grejer                    291        270        0.6938362121582031
1913 stjäla                    290        258        0.8068769574165344
1914 argumenten                290        258        0.8228517174720764
1915 process                   290        272        0.7813464999198914
1916 personal                  290        331        0.8021590709686279
1917 personerna                290        329        0.7045192718505859
1918 händelsen                 290        321        0.7709972262382507
1919 låna                      289        271        0.7970414757728577
1920 bedriva                   289        312        0.77772057056427
1921 öht                       289        281        0.7729797959327698
1922 röven                     289        277        0.8281356692314148
1923 decennier                 289        352        0.9999998807907104
1924 ironi                     289        201        0.91077184677

3427 stilla                    143        139        0.7082855105400085
3428 nassarna                  143        59         nan
3429 tito                      143        41         nan
3430 österrike                 143        102        0.8899235129356384
3431 förrädare                 143        171        0.6802704930305481
3432 brutit                    143        125        0.9999998807907104
3433 laga                      143        179        0.798454999923706
3434 uppvuxen                  143        132        0.8327207565307617
3435 fabriker                  143        70         nan
3436 konvertera                143        128        0.9999998807907104
3437 mladic                    143        37         nan
3438 castro                    143        30         nan
3439 missnöjda                 143        181        0.7752858996391296
3440 totalitär                 143        106        0.7802076935768127
3441 frihandel                 143        52         nan
3442 repres

3631 styras                    134        194        0.7422088980674744
3632 ärenden                   134        139        0.6765041947364807
3633 moder                     134        138        0.6499508023262024
3634 konsumerar                134        96         nan
3635 inträffat                 134        109        0.7286567687988281
3636 skänker                   134        129        0.8165353536605835
3637 väpnad                    134        103        0.8280121088027954
3638 islamism                  134        154        0.7838078141212463
3639 skickat                   134        201        0.9999999403953552
3640 nysvenskar                134        104        0.9999999403953552
3641 valen                     134        171        0.7109339237213135
3642 blint                     133        126        0.7605453133583069
3643 onödig                    133        114        0.7339332103729248
3644 anmäler                   133        128        0.9999999403953552
3645 pa

5148 beviset                   91         88         nan
5149 djupa                     91         94         nan
5150 iårs                      91         92         nan
5151 förövare                  91         123        nan
5152 control                   91         120        nan
5153 anmälda                   91         98         nan
5154 segregationen             91         104        nan
5155 häftigt                   91         71         nan
5156 kooperativ                91         57         nan
5157 opinionsundersökningar    91         138        nan
5158 motsättning               91         67         nan
5159 profil                    91         119        nan
5160 evolution                 91         137        nan
5161 kristdemokraterna         91         79         nan
5162 bedöms                    91         78         nan
5163 niqab                     91         99         nan
5164 psykologi                 91         104        nan
5165 maktens                   

5354 genomslag                 88         109        nan
5355 folkeparti                88         85         nan
5356 taskig                    88         66         nan
5357 nagra                     88         58         nan
5358 islamisk                  88         97         nan
5359 leverera                  88         125        nan
5360 arbetsläger               88         73         nan
5361 civilbefolkningen         88         88         nan
5362 ångest                    88         92         nan
5363 pucko                     88         55         nan
5364 ud                        88         48         nan
5365 luta                      88         104        nan
5366 lager                     87         87         nan
5367 inkompetens               87         91         nan
5368 endaste                   87         75         nan
5369 göteborgs                 87         82         nan
5370 ~                         87         109        nan
5371 klagade                   

6892 ikke                      68         177        nan
6893 behandlat                 68         71         nan
6894 tony                      68         55         nan
6895 ösa                       68         101        nan
6896 kortfattat                68         72         nan
6897 promille                  68         91         nan
6898 mobiltelefon              68         45         nan
6899 samhällssystem            68         43         nan
6900 behåll                    68         66         nan
6901 förav                     68         58         nan
6902 högerextremister          68         98         nan
6903 privatliv                 68         62         nan
6904 importerat                68         80         nan
6905 valutan                   68         59         nan
6906 kontor                    68         94         nan
6907 skamligt                  68         80         nan
6908 bortförklara              68         63         nan
6909 obligatoriska             

7097 svenskheten               66         126        nan
7098 fester                    66         54         nan
7099 mörkar                    66         61         nan
7100 diskrimineringen          66         48         nan
7101 folkslagen                66         56         nan
7102 beskyller                 66         84         nan
7103 allvarligare              66         65         nan
7104 instans                   66         37         nan
7105 särklass                  66         49         nan
7106 khat                      66         77         nan
7107 fortgå                    66         91         nan
7108 saudierna                 66         63         nan
7109 iranian                   66         43         nan
7110 principerna               66         50         nan
7111 tummen                    66         70         nan
7112 mötas                     66         68         nan
7113 jakten                    66         58         nan
7114 tolkat                    

8637 förser                    51         52         nan
8638 hysteriska                51         76         nan
8639 gisslan                   51         98         nan
8640 mellanoch                 51         51         nan
8641 förstörs                  51         60         nan
8642 odemokratisk              51         56         nan
8643 mångkulturalister         51         143        nan
8644 hassan                    51         74         nan
8645 professorer               51         53         nan
8646 struntprat                51         52         nan
8647 isolering                 51         62         nan
8648 förenkla                  51         57         nan
8649 förfallet                 51         74         nan
8650 illdåd                    51         53         nan
8651 relaterade                51         63         nan
8652 opinionsbildning          51         53         nan
8653 manipulation              51         60         nan
8654 kallats                   

8843 jönköping                 49         80         nan
8844 undervisa                 49         53         nan
8845 stängd                    49         70         nan
8846 styrelser                 49         67         nan
8847 symboliserar              49         57         nan
8848 läsarna                   49         54         nan
8849 official                  49         76         nan
8850 fittja                    49         64         nan
8851 cb                        49         69         nan
8852 kontinuerligt             49         69         nan
8853 stolen                    49         51         nan
8854 sedvanliga                49         58         nan
8855 plåga                     49         82         nan
8856 kvinnlighet               49         67         nan
8857 rasistkortet              49         74         nan
8858 greklands                 49         98         nan
8859 figurer                   49         94         nan
8860 olämplig                  

(4022, 9487)

In [48]:
# The 100 most frequent words of voc_a: count in voc_a, count in voc_b, difference, gsim.
top_2007 = sorted(voc_a.items(), key=lambda item: item[1], reverse=True)[:100]
for w, c in top_2007:
    c = int(c)
    c2 = int(voc_b[w])

    print(f"{w:<20}{c:<7}{c2:<7}{c2-c:>7}   {toypol.loc[w]['gsim_2007:2011']:.2f}")

att                 394185 410383   16198   1.00
det                 335986 346903   10917   0.71
är                  314035 326643   12608   1.00
och                 285058 303013   17955   0.70
som                 246094 261374   15280   1.00
i                   225572 235339    9767   0.72
inte                207114 206420    -694   1.00
en                  169871 176312    6441   0.79
på                  162597 170172    7575   1.00
jag                 157629 147035  -10594   1.00
för                 145582 154998    9416   0.65
har                 129761 131254    1493   1.00
om                  129063 131881    2818   0.70
så                  119409 123145    3736   1.00
med                 119060 124090    5030   0.59
du                  118198 115708   -2490   1.00
de                  115732 121272    5540   0.73
av                  100700 103782    3082   0.62
man                 92510  96367     3857   1.00
till                90981  93491     2510   0.60
den                 

In [None]:
toypol.loc["idiotisk"]

In [None]:
# Mean gsim of every consecutive-year transition from 2004:2005 to 2013:2014.
# NOTE(review): toypol was loaded from the time_bin export above - confirm that
# these yearly columns exist in it.
for i in range(2004, 2014):
    j = i + 1
    print(toypol[f"gsim_{i}:{j}"].mean())

In [None]:
toypol[[f"gsim_{i}:{i+1}" for i in range(2004,2014)]].head(50)

### Yearly

In [None]:
q_show(df_yearly, "rch", as_table = True)
# Other variables to look at (q_show takes the frame as its first argument):
# q_show(df_yearly, "rsim")
# q_show(df_yearly, "gch")
# q_show(df_yearly, "gsim")

In [None]:
q_show(toypol, "rch", as_table = True)

In [None]:
q_show(toypol, "gsim", as_table = True)

In [None]:
MODE="time_bin"
checker("att", 
        (2007, 2011), 
        f"/srv/data/gusbohom/root/experiment/toypol-{MODE}-radical3/", 
        n_ctrl=5, 
        variable="cosine_change")

### Time bin

In [None]:
q_show(df_time_bin, "rch", as_table = True)

## Change Show

In [None]:
# The "-restricted" fb_pol result tables, one per time mode; the Change Show
# and Correlation cells below work on these.
df_yearly_dwt = read_csv(results_dir / "fb_pol-yearly-radical3-restricted.csv")
df_time_bin_dwt = read_csv(results_dir / "fb_pol-time_bin-radical3-restricted.csv")

### Yearly

In [None]:
# change_show(df, var, targets, th=4.781)
change_show(df_yearly_dwt, "rch", get_dwts(df_yearly_dwt, dwt_path))

In [None]:
# Targets come from the same (yearly) frame that is shown; a word found only in
# the time_bin frame would raise a KeyError in the yearly lookup.
change_show(df_yearly_dwt, "gsim", get_dwts(df_yearly_dwt, dwt_path), th=None)

### Time bin

In [None]:
change_show(df_time_bin_dwt, "rch", get_dwts(df_time_bin_dwt, dwt_path))

In [None]:
change_show(df_time_bin_dwt, "gsim", get_dwts(df_time_bin_dwt, dwt_path), th=None)

In [None]:
a=load_metric("/srv/data/gusbohom/root/corpora/fb_pol/time_bin/radical3/vocab/2015.txt")["V1_berika"]
b=load_metric("/srv/data/gusbohom/root/corpora/fb_pol/time_bin/radical3/vocab/2019.txt")["V1_berika"]
print(a, b)

## Correlation

In [None]:
get_variables(df_yearly_dwt)

In [None]:
ts = ["2002:2003", "2003:2004", "2004:2005"]
# corrwith aligns both frames on their labels before correlating. Under their
# original diffpm_* / rch_* names the two frames share no column, so both sides
# are relabelled to the bare transition first.
diffpm_ts = df_yearly_dwt[[f"diffpm_{t}" for t in ts]].rename(columns=lambda c: c.split("_")[-1])
rch_ts = df_yearly_dwt[[f"rch_{t}" for t in ts]].rename(columns=lambda c: c.split("_")[-1])
# One correlation per word (row), computed across the transitions in ts.
diffpm_ts.corrwith(rch_ts, axis=1)

In [None]:
# Per word: the diffpm and rch values over the transitions in ts, and their correlation matrix.
diffpm_cols = [f"diffpm_{t}" for t in ts]
rch_cols = [f"rch_{t}" for t in ts]
for word in ["V1_berika", "N1_kulturberikare"]:
    word_row = df_yearly_dwt.loc[word]
    a = word_row[diffpm_cols]
    b = word_row[rch_cols]
    print(word, a, b, sep="\n")
    print(np.corrcoef(a,b))

## Appendix

In [None]:
# NOTE(review): no cell shown above assigns `df`; presumably left over from an
# earlier kernel session - assign it before running the appendix.
df.index

In [None]:
# NOTE(review): in the cells shown above dwt_regex only exists as a local
# variable inside get_dwts; at top level this presumably relies on an earlier
# kernel session.
dwt_regex

In [None]:
dwts

In [None]:
# Rows of df for the words in dwts.
# NOTE(review): neither df nor dwts is assigned in the cells shown above.
dwt_df = df.loc[dwts]

In [None]:
dwt_df

In [None]:
# change_show takes the frame as its first argument.
# NOTE(review): df and dwts are not assigned in the cells shown above.
change_show(df, "rch", dwts)

In [None]:
# change_show takes the frame as its first argument.
# NOTE(review): df and dwts are not assigned in the cells shown above.
change_show(df, "rsim", dwts, th=0.5)

In [None]:
# change_show takes the frame as its first argument.
# NOTE(review): df and dwts are not assigned in the cells shown above.
change_show(df, "gch", dwts)

In [None]:
# change_show takes the frame as its first argument.
# NOTE(review): df and dwts are not assigned in the cells shown above.
change_show(df, "gsim", dwts)

In [None]:
df.describe()

In [None]:
list(df.columns)