# Imports and Inputs

In [1]:
%load_ext autoreload
%autoreload 2
import re
import os
import pandas as pd
from os import makedirs

In [2]:
# Target word: the lemma and the one-letter POS tag it is looked up with
target_lemma = 'air'
target_pos = 'N'

# Corpus settings: period label, corpus folder, and the regexes used for
# the corpus file names and for the tab-separated token lines
period = '1750-1819'
corpus_path = '/home/volt/bach/pilot_data/RSC/1750-1819_che_half_decades'

# File names end in "..._text_<id>_<year>.txt"; both numbers are named groups
fname_pattern = re.compile(r'Royal_Society_Corpus_open_v6\.0_text_(?P<id>\d+)_(?P<year>\d+)\.txt$')

# Six tab-separated fields, one capture group per field
corpus_pattern = re.compile(r'\t'.join([
    r'([^\t]+)',       # word form
    r'([^\t]+)',       # lemma
    r'([^\t])[^\t]*',  # POS (UPOS or XPOS): only the first character is captured
    r'([^\t]+)',       # ID
    r'([^\t]+)',       # HEAD
    r'([^\t]+)',       # DEPREL
]))

# Input and output folders, all named after "<lemma>-<pos>-<period>"
input_SCD = f'/home/volt/bach/SynFlow/input/SCD/{target_lemma}-{target_pos}-{period}'
output_folder = f'/home/volt/bach/SynFlow/output/{target_lemma}-{target_pos}-{period}'
output_explorer = f'{output_folder}/Explorer'
output_embedding = f'{output_folder}/Embedding'

# Create the folders up front; exist_ok=True keeps this cell safe to re-run
makedirs(output_explorer, exist_ok=True)
makedirs(output_embedding, exist_ok=True)
makedirs(input_SCD, exist_ok=True)

# The path to the slot count JSON file.
# Built from the settings above instead of a hard-coded literal so that it
# follows any change of target lemma, POS or period.
# NOTE(review): the "<lemma>_<pos>_spaths.json" naming is inferred from the
# existing file "air_N_spaths.json" -- confirm it holds for other targets.
slot_json_path = f'{output_explorer}/{target_lemma}_{target_pos}_spaths.json'

# Get the slot fillers df of all slots

In [5]:
from SynFlow.SCD import freq_all_slots_by_period_relative

# Relative frequency of every slot type in every period, read from the slot count JSON
rel_freq_all_slots_by_period_df = freq_all_slots_by_period_relative(
    json_path=slot_json_path,
)
rel_freq_all_slots_by_period_df

Unnamed: 0,Period,Slot Type,Frequency
0,1750,chi_det,0.231206
1,1755,chi_det,0.200000
2,1760,chi_det,0.262032
3,1765,chi_det,0.163133
4,1770,chi_det,0.149857
...,...,...,...
793,1795,chi_flat,0.000000
794,1800,chi_flat,0.000000
795,1805,chi_flat,0.000000
796,1810,chi_flat,0.000000


In [12]:
from SynFlow.Explorer import build_sfiller_df

# Template listing every distinct slot label, each wrapped in square brackets
slot_labels = rel_freq_all_slots_by_period_df['Slot Type'].unique()
all_slots = ''.join(f'[{label}]' for label in slot_labels)

# Build the slot filler df for the target across the whole corpus folder.
# Optional frequency filtering is currently switched off:
#   freq_path='/home/volt/bach/pilot_data/RSC/lemma_pos_init_freq.txt'
#       (be sure that the freq_path matches that of the filler format)
#   freq_min=1
#   freq_max=100_000_000
df_slots = build_sfiller_df(
    corpus_folder=corpus_path,
    template=all_slots,
    target_lemma=target_lemma,
    target_pos=target_pos,
    pattern=corpus_pattern,
    filtered_pos=[],
    filler_format='lemma/pos',  # 'lemma/deprel' or 'lemma/pos'
    output_folder=output_embedding,
)

Wrote slot‐fillers to /home/volt/bach/SynFlow/output/air-N-1750-1819/Embedding/air_samples_sfillerdf_all.csv (13459 rows), dropped 0 tokens.


In [8]:
# Note that it is NECESSARY to manually check and correct for the spellings in the CSV file
from SynFlow.Explorer import replace_in_sfiller_df_column

# CSV with the slot fillers of all slots. Built from the output folder and the
# target lemma instead of a hard-coded literal so it follows the settings cell;
# for the current settings this is the same file as before.
all_sfillers_csv_path = f'{output_embedding}/{target_lemma}_samples_sfillerdf_all.csv'

# Column to clean and the spelling corrections (wrong filler -> corrected filler)
# NOTE(review): several keys are substrings of other keys (e.g. 'gisticate/V' is
# contained in 'phlogisticate/V'); if the helper does substring rather than
# whole-value replacement, the order of these entries matters -- confirm.
replace_column = 'chi_amod'
replacements = {
    'dephlogisticate/V': 'dephlogisticated/A',
    'phlogisticate/V': 'phlogisticated/A',
    'plllogisticate/V': 'phlogisticated/A',
    'deplllogisticate/V': 'dephlogisticated/A',
    'gisticate/V': 'phlogisticated/A',
    'Open/A': 'open/A',
    'atmospheric/A': 'atmospherical/A',
}

# NOTE(review): no return value is used, so this presumably rewrites the CSV on disk -- confirm.
replace_in_sfiller_df_column(all_sfillers_csv_path, replace_column, replacements)

# Calculate weighted slot divergences of all slots

In [21]:
from SynFlow.SCD import total_divergence_slots

# Weighted total divergence per slot type, computed from the slot count JSON
# and the (spelling-corrected) slot filler CSV
total_divergence_df = total_divergence_slots(
    slot_json_path=slot_json_path,
    all_sfillers_csv_path=all_sfillers_csv_path,
    min_freq=14,
)
total_divergence_df

Unnamed: 0,Slot Type,Weighted Total Divergence
0,pa_nmod,0.048899
1,chi_amod,0.040993
2,pa_obl,0.02798
3,pa_obj,0.018005
4,pa_nsubj,0.01684
5,chi_nmod,0.014933
6,chi_case,0.010658
7,pa_nsubj:pass,0.009703
8,chi_acl,0.008092
9,chi_acl:relcl,0.007718


In [22]:
import numpy as np
import pandas as pd

def elbow_by_line(df, value_col="Weighted Total Divergence", name_col="Slot Type"):
    """Keep the top-ranked rows of ``df`` up to the elbow of the sorted value curve.

    Rows are sorted by ``value_col`` in descending order. The curve is formed by
    the rank scaled to (0, 1] on the x axis and the value divided by the largest
    value on the y axis. The elbow is the point with the greatest perpendicular
    distance from the straight line joining the first and last points.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``name_col`` and ``value_col``; rows with a missing value
        in either column are ignored.
    value_col : str
        Column holding the scores to rank by.
    name_col : str
        Column holding the label of each row.

    Returns
    -------
    tuple of (pandas.DataFrame, float, int)
        The selected rows (sorted, index reset), the value at the elbow, and the
        number of selected rows. If there is nothing to rank (no rows, or the
        largest value is not positive) the result is ``(empty frame, 0.0, 0)``.
    """
    ranked = (
        df[[name_col, value_col]]
        .dropna()
        .sort_values(value_col, ascending=False)
        .reset_index(drop=True)
    )
    if ranked.empty or ranked[value_col].max() <= 0:
        return ranked.iloc[:0], 0.0, 0

    s = ranked[value_col].to_numpy(float)
    n = len(s)
    if n == 1:
        # A single point is both ends of the reference line, so the distance
        # below would be 0/0 (NaN plus a RuntimeWarning); return the row directly.
        return ranked.copy(), float(s[0]), 1

    x = np.arange(1, n + 1) / n
    y = s / s[0]  # normalize to 1 at top

    # Line A*x + B*y + C = 0 through (x1, y1) = (x[0], 1) and (xN, yN) = (x[-1], y[-1])
    x1, y1, xN, yN = x[0], 1.0, x[-1], y[-1]
    A = yN - y1
    B = x1 - xN
    C = xN * y1 - x1 * yN

    # Perpendicular distance of every point from that line; B != 0 here because n > 1
    dist = np.abs(A * x + B * y + C) / np.sqrt(A * A + B * B)
    elbow_idx = int(np.argmax(dist))  # 0-based position in the sorted frame
    cutoff = s[elbow_idx]
    return ranked.iloc[:elbow_idx + 1].copy(), float(cutoff), elbow_idx + 1

# Keep the slot types ranked at or above the elbow of the divergence curve;
# cutoff is the divergence at the elbow and K the number of slots kept
selected, cutoff, K = elbow_by_line(df=total_divergence_df)
selected

Unnamed: 0,Slot Type,Weighted Total Divergence
0,pa_nmod,0.048899
1,chi_amod,0.040993
2,pa_obl,0.02798
3,pa_obj,0.018005
4,pa_nsubj,0.01684
5,chi_nmod,0.014933
6,chi_case,0.010658
7,pa_nsubj:pass,0.009703
8,chi_acl,0.008092
9,chi_acl:relcl,0.007718


# Test with one slot column

In [None]:
# Get all the slot cols
all_slot_df = pd.read_csv(all_slot_csv_path, encoding="utf-8")
# exception = ['id', 'subfolder', 'target']
# cols = [c for c in all_slot_df.columns if c not in exception]
# print(cols)

In [None]:
# Test with 1 col

# col = 'pa_acl' # replace later by for col in cols
# df_temp = all_slot_df[['subfolder', col]].copy()
# import ast
# df_temp[col] = df_temp[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
# df_temp = df_temp.explode(col, ignore_index=True).dropna(subset=[col]).reset_index(drop=True)
# df_temp

In [None]:
# from SynFlow.SCD.jsd import consecutive_jsd

# consecutive_jsd_df = consecutive_jsd(df_temp, period_col='subfolder', slot_col=col)
# consecutive_jsd_df

In [None]:
# freq_col_slot_by_period_df = rel_freq_all_slots_by_period_df[rel_freq_all_slots_by_period_df["Slot Type"] == col].reset_index(drop=True)
# freq_col_slot_by_period_df

In [None]:
from SynFlow.SCD.jsd import pairs_with_pos_freq

pairs_with_pos_freq(freq_col_slot_by_period_df)


In [None]:
from SynFlow.SCD.jsd import weighted_total_divergence_col

weighted_total_divergence_col(consecutive_jsd_df, freq_col_slot_by_period_df)