In [1]:
import pandas as pd
import numpy as np
import time
import os
from pathlib import Path
from util import load_metric
#from collections import Counter

For each word **w**:
* **count**(w)_t0, ... t.max
    * For each transition *ti*, where *ti* is an element of {t0:t1, ..., t.max-1:t.max}
        * **dif_count**(w, ti)
        * **genuine_change**(w, ti)
        * control_change(w, ti)1, ..., n, where *n* is the sample size for controls
        * **mean_contol_change**(w, ti)
        * **rectified_change**(w, ti)
        * **cos_sim**(w, ti)
        * control_cos_sim(w, ti)1, ..., n, where *n* is the sample size for controls
        * **mean_control_cos_sim**(w, ti)
        * **rectified_cos_sim**(w, ti)

## Setup

In [2]:
corpus   = Path("/home/max/Corpora/toy_diapol-sample")
measures = Path("/home/max/Results/toy_diapol-output")

In [14]:
file_path = Path("dwtch_results.csv")

In [3]:
years = [int(file.strip(".txt")) for file in os.listdir(corpus/"yearly")]
years.sort()
first_year = min(years)
last_year  = max(years)
c_numbers = set(int(n) for n in ["".join([ch for ch in file.strip(".txt").split("_")[-1] if ch.isdigit()]) for file in os.listdir(measures / "cosine_change") if "control" in file])
c_span = min(c_numbers), max(c_numbers)

In [None]:
#years

In [None]:
#c_span

In [4]:
df = pd.DataFrame()

## Add Word Frequencies

In [5]:
for year in years:
    freqs = {w: c for w, c in load_metric(corpus / f"vocab/{year}.txt").items() if c >= 5}
    df[f"frq{year}"] = pd.Series(freqs)

In [None]:
df

## Add Difference in Frequencies

In [6]:
for ti in years[:-1]:
    tj = ti + 1
    df[f"diff_{ti}:{tj}"] = df[f"frq{ti}"] - df[f"frq{tj}"]

In [None]:
df

## Add Genuine Change

In [7]:
for file in os.listdir(measures / "cosine_change"):
    if file.strip(".txt").endswith("genuine"):
        c_name = file.strip("_genuine.txt").replace("_", ":")
        c_name = "gch_" + c_name # Genuine Cosine Change
        df[c_name] = pd.Series(load_metric(measures / f"cosine_change/{file}"))

In [None]:
df

## Add Mean and Std. of Change Controls

In [8]:
start, end = c_span

for ti in years[:-1]:
    tj = ti + 1
    control = []
    for i in range(start, end + 1):
        s = pd.Series(load_metric(measures / f"cosine_change/{ti}_{tj}_control{i}.txt"))
        control.append(s)

    control = pd.concat(control, axis=1)
    df[f"mccc_{ti}:{tj}"] = control.mean(axis=1) # Mean Cosine Change Controle
    df[f"stdc_{ti}:{tj}"] = control.std(axis=1, ddof=1)

In [None]:
df

## Add Rectified Change

In [9]:
for ti in years[:-1]:
    tj = ti + 1
    df[f"rch_{ti}:{tj}"] = (df[f"gch_{ti}:{tj}"] - df[f"mccc_{ti}:{tj}"]) / (df[f"stdc_{ti}:{tj}"] * np.sqrt(1 + 1/end))

In [None]:
df

## Add Genuine Similarity

In [10]:
for file in os.listdir(measures / "cosine_sim"):
    if file.strip(".txt").endswith("genuine"):
        c_name = file.strip("_genuine.txt").replace("_", ":")
        c_name = "gsim_" + c_name # Genuine Cosine Similarity
        df[c_name] = pd.Series(load_metric(measures / f"cosine_sim/{file}"))

In [None]:
df

## Add Mean and Std. of Similarity Controls

In [11]:
for ti in years[:-1]:
    tj = ti + 1
    control = []
    for i in range(start, end + 1):
        s = pd.Series(load_metric(measures / f"cosine_sim/{ti}_{tj}_control{i}.txt"))
        control.append(s)

    control = pd.concat(control, axis=1)
    df[f"mcsim_{ti}:{tj}"] = control.mean(axis=1) # Mean Cosine Similarity Controle
    df[f"stdsim_{ti}:{tj}"] = control.std(axis=1, ddof=1)

In [None]:
df

## Add Rectified Similarity

In [12]:
for ti in years[:-1]:
    tj = ti + 1
    df[f"rsim_{ti}:{tj}"] = (df[f"gsim_{ti}:{tj}"] - df[f"mcsim_{ti}:{tj}"]) / (df[f"stdsim_{ti}:{tj}"] * np.sqrt(1 + 1/end))

In [13]:
df

Unnamed: 0,frq2004,frq2005,frq2006,frq2007,frq2008,frq2009,frq2010,frq2011,frq2012,frq2013,...,rsim_2004:2005,rsim_2005:2006,rsim_2006:2007,rsim_2007:2008,rsim_2008:2009,rsim_2009:2010,rsim_2010:2011,rsim_2011:2012,rsim_2012:2013,rsim_2013:2014
att,97400.0,99526.0,98762.0,97083.0,98499.0,97225.0,101378.0,100638.0,103579.0,103715.0,...,0.592157,0.703526,0.862924,0.301511,0.545455,0.301511,,0.738549,0.592157,0.545455
det,79026.0,85296.0,82918.0,81550.0,83997.0,83708.0,86731.0,86465.0,87003.0,87504.0,...,-1.971088,-0.541083,-2.843776,-0.192308,-0.183872,-1.714984,-1.529039,0.444682,-1.680547,-0.111873
är,70956.0,75735.0,75740.0,76980.0,78461.0,77925.0,80669.0,81784.0,83019.0,82689.0,...,0.452267,0.738549,0.452267,0.452267,0.738549,0.301511,0.592157,0.301511,0.301511,0.904534
och,65916.0,68760.0,69664.0,70406.0,70639.0,70337.0,73676.0,74699.0,76127.0,76901.0,...,-0.914725,0.646312,-2.035064,-0.568679,-1.840084,-3.231827,0.883636,1.530147,0.395339,-0.674248
som,60568.0,60950.0,60856.0,60847.0,60890.0,61336.0,63021.0,64125.0,65773.0,66585.0,...,0.452267,,0.560968,0.818182,0.301511,0.452267,0.545455,0.423793,0.452267,0.423793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
lif,5.0,,,,,,,,,,...,,,,,,,,,,
brommaskolorna,5.0,,,,,,,,,,...,,,,,,,,,,
språktestet,5.0,,,,,,,,,,...,,,,,,,,,,
lovdagar,5.0,,,,,,,,,,...,,,,,,,,,,


## Save

In [41]:
df.to_csv(path_or_buf=file_path, sep=';')