In [1]:
import pandas as pd
import numpy as np
import time
import os
from pathlib import Path
from util import load_metric
#from collections import Counter

For each word **w**:
* **count**(w)_t0, ... t.max
    * For each transition *ti*, where *ti* is an element of {t0:t1, ..., t.max-1:t.max}
        * **dif_count**(w, ti)
        * **genuine_change**(w, ti)
        * control_change(w, ti)1, ..., n, where *n* is the sample size for controls
        * **mean_control_change**(w, ti)
        * **rectified_change**(w, ti)
        * **cos_sim**(w, ti)
        * control_cos_sim(w, ti)1, ..., n, where *n* is the sample size for controls
        * **mean_control_cos_sim**(w, ti)
        * **rectified_cos_sim**(w, ti)

## Setup

In [5]:
# Input locations: `corpus` holds the yearly text files ("files/") and vocabularies
# ("vocab/"); `measures` holds the computed metrics ("cosine_change/", "cosine_sim/").
# NOTE(review): both are absolute, machine-specific paths — adjust before running elsewhere.
corpus   = Path("/home/max/Corpora/toy_pol-sample")
measures = Path("/home/max/Results/toy_pol-output")

In [6]:
# Destination of the final CSV export (written by df.to_csv in the Save section).
# The path is relative, so it resolves against the notebook's working directory.
file_path = Path("../../dwtch_results.csv")

In [8]:
# Years covered by the corpus, taken from the file names "<year>.txt" in corpus/files.
# Path(...).stem removes only the extension; str.strip(".txt") strips a *set of
# characters* from both ends and merely happens to work for digit-only names.
# Entries that are not .txt files are ignored instead of crashing int().
years = sorted(
    int(Path(file).stem)
    for file in os.listdir(corpus / "files")
    if file.endswith(".txt")
)
first_year = min(years)
last_year = max(years)

# Control-run numbers, taken from file names such as "<ti>_<tj>_control<N>.txt":
# use the last "_"-separated part of the stem and keep only its digits.
c_numbers = {
    int("".join(ch for ch in Path(file).stem.split("_")[-1] if ch.isdigit()))
    for file in os.listdir(measures / "cosine_change")
    if "control" in file
}
# (lowest, highest) control number; later cells iterate over this inclusive range.
c_span = min(c_numbers), max(c_numbers)

In [None]:
#years

In [None]:
#c_span

In [9]:
# Result table: starts empty; the cells below add one column per metric and year/transition.
df = pd.DataFrame()

In [24]:
def check():
    """Print the labels in ``df.index`` that contain a term from ``dwts.txt``.

    Reads one term per line from ``../data/utils/dwts.txt`` and prints, as a
    comma-separated list, every label of the notebook-level ``df`` that has at
    least one of those terms as a substring. Called after each step to see
    which of these labels are present in the table.

    Returns nothing; the result is only printed.
    """
    with open("../data/utils/dwts.txt", "r") as f:
        # Skip blank lines: the empty string is a substring of every word, so a
        # single empty line would turn every index label into a hit.
        dwts = [term for term in (line.strip("\n") for line in f) if term]

    # any() stops at the first matching term for each label.
    hits = {w for w in df.index if any(dw in w for dw in dwts)}

    # Sorted so the printed list is stable between runs (set order is arbitrary).
    print(", ".join(sorted(hits)))

## Add Word Frequencies

In [10]:
# One frequency column per year ("frq<year>"), keeping only words whose count in
# that year's vocab file is at least 5.
# All columns are assembled in a single DataFrame constructor so that the row index
# is the union of the yearly vocabularies. Assigning the Series one by one to an
# empty frame fixes the index to the first year's words, and every word that only
# reaches the threshold in a later year is silently dropped by index alignment.
df = pd.DataFrame(
    {
        f"frq{year}": pd.Series(
            {w: c for w, c in load_metric(corpus / f"vocab/{year}.txt").items() if c >= 5}
        )
        for year in years
    }
)

In [25]:
# Sanity check after adding frequencies: print the df labels matching a term from dwts.txt.
check()

V1_berika, N1_kulturberikare


## Add Difference in Frequencies

In [18]:
# Per-transition frequency difference, stored as "diff_<ti>:<tj>": the earlier
# year's count minus the following year's count.
for ti in years[:-1]:
    tj = ti + 1
    earlier = df[f"frq{ti}"]
    later = df[f"frq{tj}"]
    df[f"diff_{ti}:{tj}"] = earlier.sub(later)

In [19]:
# Sanity check after adding differences: print the df labels matching a term from dwts.txt.
check()

V1_berika, N1_kulturberikare


## Add Genuine Change

In [None]:
# Load every "<ti>_<tj>_genuine.txt" file in measures/cosine_change into a column
# named "gch_<ti>:<tj>". The rectified-change cell reads these columns, so this
# cell must be run before it.
suffix = "_genuine"
for file in os.listdir(measures / "cosine_change"):
    stem = Path(file).stem
    if stem.endswith(suffix):
        # Remove the suffix by slicing. str.strip("_genuine.txt") strips a *set of
        # characters* from both ends and only works by accident for digit-only names.
        c_name = stem[: -len(suffix)].replace("_", ":")
        c_name = "gch_" + c_name  # Genuine Cosine Change
        df[c_name] = pd.Series(load_metric(measures / f"cosine_change/{file}"))

In [20]:
# Sanity check after adding genuine change: print the df labels matching a term from dwts.txt.
check()

V1_berika, N1_kulturberikare


## Add Mean and Std. of Change Controls

In [21]:
start, end = c_span

# For every transition, read all control runs (numbered start..end), put them side
# by side and store their row-wise mean and sample standard deviation.
for ti in years[:-1]:
    tj = ti + 1
    control = pd.concat(
        [
            pd.Series(load_metric(measures / f"cosine_change/{ti}_{tj}_control{i}.txt"))
            for i in range(start, end + 1)
        ],
        axis=1,
    )
    df[f"mccc_{ti}:{tj}"] = control.mean(axis=1)  # Mean Cosine Change Control
    df[f"stdc_{ti}:{tj}"] = control.std(axis=1, ddof=1)  # sample std (n - 1)

In [22]:
# Sanity check after adding control statistics: print the df labels matching a term from dwts.txt.
check()

V1_berika, N1_kulturberikare


## Add Rectified Change

In [23]:
# Number of control runs behind each mean/std. Runs are numbered start..end, so the
# sample size is end - start + 1; using `end` alone is only right when numbering
# starts at 1 and divides by zero when end == 0.
n_controls = end - start + 1

# Requires the "gch_", "mccc_" and "stdc_" columns from the cells above.
for ti in years[:-1]:
    tj = ti + 1
    # Distance of the genuine change from the control mean, scaled by
    # std * sqrt(1 + 1/n), where n is the control sample size.
    df[f"rch_{ti}:{tj}"] = (df[f"gch_{ti}:{tj}"] - df[f"mccc_{ti}:{tj}"]) / (
        df[f"stdc_{ti}:{tj}"] * np.sqrt(1 + 1 / n_controls)
    )

KeyError: 'gch_2004:2005'

In [None]:
# Show the table after the change columns have been added.
df

## Add Genuine Similarity

In [None]:
# Load every "<ti>_<tj>_genuine.txt" file in measures/cosine_sim into a column
# named "gsim_<ti>:<tj>". The rectified-similarity cell reads these columns.
suffix = "_genuine"
for file in os.listdir(measures / "cosine_sim"):
    stem = Path(file).stem
    if stem.endswith(suffix):
        # Remove the suffix by slicing. str.strip("_genuine.txt") strips a *set of
        # characters* from both ends and only works by accident for digit-only names.
        c_name = stem[: -len(suffix)].replace("_", ":")
        c_name = "gsim_" + c_name  # Genuine Cosine Similarity
        df[c_name] = pd.Series(load_metric(measures / f"cosine_sim/{file}"))

In [None]:
# Show the table after the genuine similarity columns have been added.
df

## Add Mean and Std. of Similarity Controls

In [None]:
# For every transition, gather the similarity control runs (numbered start..end) as
# columns of one frame, then record their row-wise mean and sample standard deviation.
for ti in years[:-1]:
    tj = ti + 1
    runs = [
        pd.Series(load_metric(measures / f"cosine_sim/{ti}_{tj}_control{i}.txt"))
        for i in range(start, end + 1)
    ]
    control = pd.concat(runs, axis=1)
    df[f"mcsim_{ti}:{tj}"] = control.mean(axis=1)  # Mean Cosine Similarity Control
    df[f"stdsim_{ti}:{tj}"] = control.std(axis=1, ddof=1)  # sample std (n - 1)

In [None]:
# Show the table after the similarity control statistics have been added.
df

## Add Rectified Similarity

In [None]:
# Number of control runs behind each mean/std. Runs are numbered start..end, so the
# sample size is end - start + 1; using `end` alone is only right when numbering
# starts at 1 and divides by zero when end == 0.
n_controls = end - start + 1

# Requires the "gsim_", "mcsim_" and "stdsim_" columns from the cells above.
for ti in years[:-1]:
    tj = ti + 1
    # Distance of the genuine similarity from the control mean, scaled by
    # std * sqrt(1 + 1/n), where n is the control sample size.
    df[f"rsim_{ti}:{tj}"] = (df[f"gsim_{ti}:{tj}"] - df[f"mcsim_{ti}:{tj}"]) / (
        df[f"stdsim_{ti}:{tj}"] * np.sqrt(1 + 1 / n_controls)
    )

In [None]:
# Show the finished table before it is written to disk.
df

## Save

In [None]:
# Write the table to file_path as a semicolon-separated CSV. The row index is
# written as the first column (pandas default, index=True).
df.to_csv(path_or_buf=file_path, sep=';')