In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr

## Data

In [2]:
# Years covered by the analysis: 2000 through 2021 inclusive.
years = list(range(2000, 2022))

In [None]:
# Echo the list as the cell output for a quick visual check.
years

In [3]:
# Column names to pull from the result tables: one "fpm_<year>" per year,
# followed by "gch_<year>:<year+1>" and then "rch_<year>:<year+1>" for every
# pair of consecutive years (so the last year has no gch/rch entry).
variables = (
    [f"fpm_{y}" for y in years]
    + [f"{prefix}_{y}:{y+1}" for prefix in ("gch", "rch") for y in years[:-1]]
)

In [None]:
# Echo the generated column names as the cell output.
variables

In [4]:
# Row labels (terms) to keep from the result tables; passed to get_df, which
# selects them with .loc in exactly this order.
# NOTE(review): the N1_/V1_/A1_ prefixes look like part-of-speech tags
# (noun/verb/adjective) -- confirm against the data source.
terms = [
'N1_kulturberikare',
'V1_berika',
'N1_berikare',
'N1_globalist',
'V1_kulturberika',
'N1_återvandring',
'V1_återvandra',
'A1_globalistisk',
'N1_förortsgäng',
]

In [5]:
def get_df(path, variables, terms):
    """Load a semicolon-separated CSV and keep only the requested rows/columns.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; its first column becomes the index.
    variables : list of str
        Column labels to keep, in this order.
    terms : list of str
        Index labels (rows) to keep, in this order.

    Returns
    -------
    pandas.DataFrame
        The table restricted to ``terms`` (rows) and ``variables`` (columns).

    Raises
    ------
    KeyError
        If a requested column or row label is not present in the file.
    """
    table = pd.read_csv(Path(path), sep=";", index_col=0)
    # Select rows and columns in one label-based lookup.
    return table.loc[terms, variables]

### Change and FPM

In [6]:
# Load the three result tables (restricted to `variables` and `terms` by
# get_df) from their CSV files.
# NOTE(review): these are absolute, machine-specific paths; consider a
# configurable data directory so the notebook runs elsewhere.
sgns_lsc, kb_lsc, sts_lsc = (
    get_df(csv_path, variables, terms)
    for csv_path in (
        "/home/max/Documents/mlt/thesis/dw_results/fb_pol-yearly-radical3-restricted.csv",
        "/home/max/Documents/mlt/thesis/dw_results/fb_pol-yearly-bert-sentence-bert-swedish-cased.csv",
        "/home/max/Documents/mlt/thesis/dw_results/fb_pol-yearly-bert-sts_fbmodel.csv",
    )
)

### Ingroup--Outgroup Dimension

In [16]:
# Load the three per-model result tables; the first CSV column is used as the
# row index.
# NOTE(review): absolute, machine-specific paths -- see if these can be made
# relative to a configurable results directory.
sgns_iod, kb_iod, sts_iod = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "/home/max/Results/replacements/results/sgns_results.csv",
        "/home/max/Results/replacements/results/kb_results.csv",
        "/home/max/Results/replacements/results/sts_results.csv",
    )
)

In [135]:
#sgns_iod

## Correlation

In [144]:
def correlation(
    iod_data,
    iod_var,
    lsc_data,
    lsc_var,
    method=pearsonr,
    only_sign=True,
    years=list(range(2000, 2022)),
):
    """Correlate each selected row of ``iod_data`` with a series from ``lsc_data``.

    Parameters
    ----------
    iod_data : pandas.DataFrame
        One row per configuration. Must contain a ``"DWE"`` column (the term
        used to look up the matching row in ``lsc_data``) and one column per
        year, named by the year as a string (``"2000"``, ``"2001"``, ...).
    iod_var : dict or iterable of (column, value) pairs
        Equality filters applied to ``iod_data`` before correlating; an empty
        dict keeps every row.
    lsc_data : pandas.DataFrame
        Indexed by term. Every column whose name starts with ``lsc_var`` is
        used, in the order it appears in the frame.
    lsc_var : str
        Prefix of the ``lsc_data`` columns to use. For ``"fpm"`` all ``years``
        are used; for anything else the last year is dropped, because those
        columns are named per pair of consecutive years.
    method : callable, default ``scipy.stats.pearsonr``
        Called as ``method(x, y)``; element 0 of the result is taken as the
        coefficient and element 1 as the p-value.
    only_sign : bool, default True
        If True, only rows with ``p < 0.05`` are returned.
    years : list of int
        Years to take from ``iod_data``. The default list is shared between
        calls but is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by the labels of the retained ``iod_data`` rows. Holds every
        ``iod_data`` column whose name contains no digit, followed by
        ``Corr``, ``p`` and ``N`` (number of non-missing pairs).

    Notes
    -----
    The two series are paired by position, not by year label, so the
    ``lsc_var`` columns of ``lsc_data`` must be in chronological order and
    cover the same span as ``years``.

    Rows with fewer than two non-missing pairs cannot be correlated. They get
    ``NaN`` for ``Corr`` and ``p``, and are therefore dropped when
    ``only_sign`` is True.
    """
    if isinstance(iod_var, dict):
        iod_var = iod_var.items()

    # Boolean filtering returns new frames, so iod_data itself is never modified.
    selected = iod_data
    for var, val in iod_var:
        selected = selected[selected[var] == val]

    year_span = years if lsc_var == "fpm" else years[:-1]
    selected = selected[[str(y) for y in year_span]]

    col = [v for v in lsc_data.columns if v.startswith(lsc_var)]

    idx = []
    results = []
    for i in selected.index:
        # `i` is an index label, so it must be resolved with .loc; .iloc would
        # only be right when the index happens to be 0..n-1.
        dwe = iod_data.loc[i, "DWE"]
        x = selected.loc[i]
        y = lsc_data.loc[dwe, col]

        pairs = [
            (a, b) for a, b in zip(x, y) if not (pd.isna(a) or pd.isna(b))
        ]
        n_pairs = len(pairs)

        if n_pairs >= 2:
            xs, ys = zip(*pairs)
            stat = method(xs, ys)
            corr, p_value = stat[0], stat[1]
        else:
            # Nothing to correlate (all-NaN row or a single overlapping point).
            corr, p_value = float("nan"), float("nan")

        # NaN < 0.05 is False, so uncorrelatable rows never count as significant.
        if only_sign and not (p_value < 0.05):
            continue

        idx.append(i)
        results.append({"Corr": corr, "p": p_value, "N": n_pairs})

    # Explicit columns keep Corr/p/N present even when no row is retained.
    stats = pd.DataFrame(results, index=idx, columns=["Corr", "p", "N"])
    # Year columns contain digits; everything else is descriptive metadata.
    headings = [h for h in iod_data.columns if not any(ch.isdigit() for ch in h)]
    return pd.concat([iod_data.loc[idx, headings], stats], axis=1)
    

In [141]:
# Scratch check: is every character of the string "mu" a digit? (It is not.)
all(i.isdigit() for i in "mu")

False

In [132]:
# sgns tables: rows with Method == "I-cnt" and B-strategy == "lazy" against the
# "fpm" columns; only rows with p < 0.05 are shown (only_sign defaults to True).
correlation(sgns_iod, {"Method": "I-cnt", "B-strategy": "lazy"}, sgns_lsc, "fpm", method=pearsonr)

Unnamed: 0,DWE,A-Strategy,B-strategy,Method,Corr,p,N
0,V1_berika,top3,lazy,I-cnt,-0.517095,0.013725,22
32,V1_berika,ms1,lazy,I-cnt,-0.517095,0.013725,22
128,V1_kulturberika,top3,lazy,I-cnt,-0.613875,0.011422,16
160,V1_kulturberika,ms1,lazy,I-cnt,-0.613875,0.011422,16
256,N1_globalist,top3,lazy,I-cnt,0.847436,1.8e-05,17
288,N1_globalist,ms1,lazy,I-cnt,0.74371,0.000621,17
320,A1_globalistisk,top3,lazy,I-cnt,0.593908,0.02513,14
352,A1_globalistisk,ms1,lazy,I-cnt,0.620838,0.017819,14
384,N1_återvandring,top3,lazy,I-cnt,0.638818,0.010362,15
448,V1_återvandra,top3,lazy,I-cnt,0.619754,0.013728,15


In [137]:
# sgns tables: no row filter ({}), correlated against the "rch" columns.
correlation(sgns_iod, {}, sgns_lsc, "rch", method=pearsonr)

Unnamed: 0,DWE,A-Strategy,B-strategy,Method,Corr,p,N
64,N1_berikare,top3,lazy,I-cnt,0.527904,0.043109,15
72,N1_berikare,top3,greedy,I-cnt,0.56893,0.026874,15
73,N1_berikare,top3,greedy,O-cnt,0.547765,0.034533,15
76,N1_berikare,top3,greedy,I-pwn,0.57611,0.024593,15
77,N1_berikare,top3,greedy,O-pwn,0.569324,0.026745,15
96,N1_berikare,ms1,lazy,I-cnt,0.527904,0.043109,15
97,N1_berikare,ms1,lazy,O-cnt,0.586735,0.021492,15
101,N1_berikare,ms1,lazy,O-pwn,0.601691,0.017644,15
104,N1_berikare,ms1,greedy,I-cnt,0.56893,0.026874,15
105,N1_berikare,ms1,greedy,O-cnt,0.666797,0.006627,15


In [145]:
# sts tables: no row filter ({}), correlated against the "rch" columns.
correlation(sts_iod, {}, sts_lsc, "rch", method=pearsonr)

Unnamed: 0,DWE,A-Strategy,Method,Corr,p,N
216,A1_globalistisk,top1,I-cnt,0.723524,0.001535,16
220,A1_globalistisk,top1,I-pwn,0.723597,0.001532,16
224,A1_globalistisk,top3,I-cnt,0.708538,0.002124,16
228,A1_globalistisk,top3,I-pwn,0.708616,0.002121,16


In [147]:
# sts tables: rows with Method == "cnt-ssc", correlated against the "gch" columns.
correlation(sts_iod, {"Method":"cnt-ssc"}, sts_lsc, "gch", method=pearsonr)

Unnamed: 0,DWE,A-Strategy,Method,Corr,p,N
2,V1_berika,rn,cnt-ssc,-0.484998,0.025851,21
10,V1_berika,nno,cnt-ssc,-0.446033,0.042698,21
34,V1_berika,ms1,cnt-ssc,-0.472781,0.030434,21
42,N1_berikare,rn,cnt-ssc,0.797626,4.3e-05,19
50,N1_berikare,nno,cnt-ssc,0.873251,1e-06,19
58,N1_berikare,top1,cnt-ssc,0.79175,5.4e-05,19
66,N1_berikare,top3,cnt-ssc,0.837304,8e-06,19
74,N1_berikare,ms1,cnt-ssc,0.839808,7e-06,19
122,N1_kulturberikare,rn,cnt-ssc,0.697751,0.000437,21
130,N1_kulturberikare,nno,cnt-ssc,0.726828,0.00019,21


In [150]:
# kb tables: rows with Method == "cnt-ssc", correlated against the "gch" columns.
correlation(kb_iod, {"Method":"cnt-ssc"}, kb_lsc, "gch", method=pearsonr)

Unnamed: 0,DWE,A-Strategy,Method,Corr,p,N
42,N1_berikare,rn,cnt-ssc,0.933014,5.767117e-09,19
50,N1_berikare,nno,cnt-ssc,0.93587,4.021647e-09,19
58,N1_berikare,top1,cnt-ssc,0.916352,3.596636e-08,19
66,N1_berikare,top3,cnt-ssc,0.916719,3.469166e-08,19
74,N1_berikare,ms1,cnt-ssc,0.904201,1.092027e-07,19
82,V1_kulturberika,rn,cnt-ssc,0.479941,0.03756286,19
90,V1_kulturberika,nno,cnt-ssc,0.584876,0.008530735,19
98,V1_kulturberika,top1,cnt-ssc,0.511466,0.02520115,19
106,V1_kulturberika,top3,cnt-ssc,0.546098,0.01556656,19
114,V1_kulturberika,ms1,cnt-ssc,0.491667,0.0325134,19


### FPM

In [68]:
# The "fpm*" columns of the V1_berika row in the sgns table.
fpm = sgns_lsc.loc[
    "V1_berika",
    [name for name in sgns_lsc.columns if name.startswith("fpm")],
]
fpm

fpm_2000    25.589115
fpm_2001    19.195817
fpm_2002    18.193032
fpm_2003    19.445548
fpm_2004    15.528739
fpm_2005    11.799032
fpm_2006    18.679661
fpm_2007    39.095494
fpm_2008    53.596328
fpm_2009    59.836667
fpm_2010    41.482074
fpm_2011    40.041816
fpm_2012    35.842475
fpm_2013    31.257997
fpm_2014    24.913670
fpm_2015    26.625442
fpm_2016    32.075819
fpm_2017    24.641511
fpm_2018    23.351810
fpm_2019    29.408475
fpm_2020    22.908498
fpm_2021    15.674893
Name: V1_berika, dtype: float64

In [67]:
# Year columns of every sgns_iod row whose DWE is V1_berika.
X = sgns_iod.loc[sgns_iod["DWE"] == "V1_berika", [str(y) for y in years]]
X

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,0.986395,0.715155,0.818471,0.853153,0.696961,0.619241,0.427914,0.349003,0.355434,0.295341,...,0.321971,0.353835,0.316448,0.357412,0.414762,0.378341,0.367202,0.375845,0.374658,0.390267
1,0.969148,0.686442,0.790991,0.829854,0.693168,0.605649,0.452419,0.412641,0.405081,0.323259,...,0.336184,0.354648,0.311637,0.361755,0.339383,0.280309,0.291818,0.348174,0.330762,0.399016
2,0.504410,0.510243,0.508537,0.506922,0.501364,0.505548,0.486082,0.458223,0.467360,0.477434,...,0.489202,0.499426,0.503830,0.496980,0.549976,0.574419,0.557194,0.519109,0.531113,0.494457
3,0.504312,0.507178,0.506870,0.505824,0.500948,0.503398,0.493874,0.484096,0.487591,0.493021,...,0.496447,0.499797,0.501203,0.498914,0.518836,0.524488,0.518837,0.506917,0.510972,0.497813
4,0.982027,0.698244,0.772756,0.791636,0.594525,0.472354,0.302618,0.245109,0.252204,0.219858,...,0.241374,0.260455,0.229991,0.263792,0.296300,0.270373,0.269239,0.269341,0.263698,0.280599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,0.498179,0.502378,0.515546,0.534403,0.537785,0.510797,0.531057,0.545525,0.513797,0.506560,...,0.505320,0.510196,0.497261,0.508479,0.516992,0.512371,0.517137,0.527621,0.518596,0.504488
60,0.969328,0.697416,0.820151,0.808358,0.619142,0.473482,0.302054,0.270741,0.273357,0.249301,...,0.238122,0.241107,0.230127,0.224102,0.271527,0.241601,0.228371,0.257468,0.248336,0.247833
61,0.989268,0.685298,0.745411,0.710614,0.528711,0.505197,0.255842,0.187067,0.258090,0.242648,...,0.244640,0.236412,0.257510,0.219633,0.247222,0.233902,0.207583,0.221150,0.219889,0.265804
62,0.494910,0.504382,0.523870,0.532174,0.539392,0.483797,0.541417,0.591385,0.514364,0.506762,...,0.493250,0.504916,0.471923,0.505035,0.523426,0.508095,0.523843,0.537940,0.530378,0.482507


In [79]:
# Pearson correlation of each row of X with the fpm series; prints the
# coefficient and the p-value, one row per line.
# Rows are iterated directly instead of via range(X.shape[0]) + X.loc[i]:
# that form treats positions as index labels and only works when the labels
# of X happen to be 0..n-1.
for _, row in X.iterrows():
    stat = pearsonr(row, fpm)

    print(stat[0], stat[1])

-0.5170946416407137 0.013724848312816694
-0.441195970597395 0.03983836325547259
-0.39794499110501846 0.06662788240929739
-0.43978550499855473 0.04055163197557247
-0.46178879606686263 0.03050027116643225
-0.40875601887253005 0.0589195510896363
-0.430311644913694 0.045604682100826625
-0.4221058230849073 0.05036517936175232
-0.4444217841183224 0.038243964022533675
-0.3968308931801581 0.06746376684515451
-0.08455836012661605 0.7083046830938842
-0.359746481316216 0.10008142641134263
-0.4355297980443208 0.04276435546030111
-0.3827518903259052 0.07872521159221053
-0.4136158532053096 0.055685605498095245
-0.6729860197793893 0.00059863951351767
-0.45343919391597765 0.034051236632840556
-0.41728860812433033 0.05333342839185407
-0.11898278219889327 0.5979281677945458
-0.3043706498210686 0.16844148828218128
-0.43261486520731923 0.044333468745491524
-0.3940927741843964 0.06955191749899067
-0.3580141665601996 0.10184640651276367
-0.4816407219667284 0.02322832259634425
-0.45012808363052564 0.03554644

### Change

In [106]:
# Columns of the sgns table whose name starts with "rch", and their values
# for V1_berika. `col` is reused by the loop further down.
# NOTE(review): the series is stored under the name `gch` although it holds
# the "rch" columns -- confirm which measure was intended before renaming.
col = [name for name in sgns_lsc.columns if name.startswith("rch")]
print(len(col))
gch = sgns_lsc.loc["V1_berika", col]
gch

21


rch_2000:2001    3.112465
rch_2001:2002    2.056634
rch_2002:2003   -2.531236
rch_2003:2004    3.490279
rch_2004:2005    0.202163
rch_2005:2006    0.222672
rch_2006:2007    1.606794
rch_2007:2008    1.556598
rch_2008:2009    1.028856
rch_2009:2010   -0.252072
rch_2010:2011   -0.696753
rch_2011:2012   -0.885508
rch_2012:2013    1.428116
rch_2013:2014   -0.400075
rch_2014:2015    1.363301
rch_2015:2016   -0.781521
rch_2016:2017    2.582885
rch_2017:2018    1.324375
rch_2018:2019    0.281219
rch_2019:2020    1.654064
rch_2020:2021    0.405180
Name: V1_berika, dtype: float64

In [108]:
# Year columns (last year excluded) of every sgns_iod row with Method cnt-ssc.
# Note: this rebinds X, which earlier held the V1_berika rows.
X = sgns_iod.loc[sgns_iod["Method"] == "cnt-ssc", [str(y) for y in years[:-1]]]
X

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
2,0.504410,0.510243,0.508537,0.506922,0.501364,0.505548,0.486082,0.458223,0.467360,0.477434,...,0.494854,0.489202,0.499426,0.503830,0.496980,0.549976,0.574419,0.557194,0.519109,0.531113
10,0.505728,0.505801,0.521616,0.520157,0.538088,0.529435,0.526892,0.535009,0.518534,0.522555,...,0.543950,0.542520,0.501678,0.531061,0.533565,0.543292,0.558045,0.544465,0.535889,0.578603
18,0.501469,0.503864,0.512775,0.513595,0.506599,0.529899,0.501450,0.494067,0.501677,0.509128,...,0.524844,0.511131,0.502665,0.516583,0.507913,0.539417,0.544915,0.534551,0.505920,0.547239
26,0.501264,0.501004,0.507079,0.517960,0.509020,0.525125,0.499824,0.487585,0.484847,0.493591,...,0.498029,0.493168,0.481544,0.487774,0.499249,0.535136,0.552964,0.529077,0.525935,0.532934
34,0.497294,0.533454,0.521429,0.518552,0.558604,0.487818,0.533888,0.535225,0.482861,0.438861,...,0.487841,0.527954,0.528144,0.490931,0.534214,0.563337,0.552470,0.607251,0.573772,0.612252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538,,,,,,,,,,0.615427,...,,,,0.459011,0.490735,0.494640,0.502202,0.547411,0.527627,
546,,,,,,,,,,0.489666,...,,,,0.479628,0.479006,0.514852,0.523588,0.499786,0.513244,
554,,,,,,,,,,0.427137,...,,,,0.449542,0.424690,0.466682,0.485237,0.462857,0.486358,
562,,,,,,,,,,0.475539,...,,,,0.456439,0.465208,0.490731,0.515836,0.464963,0.484956,


In [109]:
# For every row of X, correlate its yearly values with the `col` series of
# the matching term in sgns_lsc and print the rows with p < 0.05.
for i in X.index:
    x = X.loc[i]

    # `i` is an index label taken from X, so the metadata row must be looked
    # up with .loc; .iloc would only agree when the index is exactly 0..n-1.
    meta = sgns_iod.loc[i]
    dwe = meta["DWE"]
    A = meta["A-Strategy"]
    B = meta["B-strategy"]

    y = sgns_lsc.loc[dwe][col]

    # Keep only the positions where both series have a value.
    pairs = [(a, b) for a, b in zip(x, y) if not pd.isna(a) and not pd.isna(b)]
    if len(pairs) < 2:
        # Nothing to correlate; unpacking an empty zip would raise ValueError.
        continue
    x, y = zip(*pairs)

    stat = pearsonr(x, y)

    if stat[1] < 0.05:

        print(dwe, A, B, stat[0], stat[1])

V1_kulturberika top3 greedy -0.5635218953576699 0.02869519603306043
N1_kulturberikare top3 lazy -0.6211977261415077 0.005930839525901088
