# Import and Inputs

In [3]:
# Import
%load_ext autoreload
%autoreload 2
import re
import os
import pandas as pd
from pathlib import Path
import sys

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Specify required input
# Everything a user is expected to edit lives in this cell; the paths at the
# bottom are derived from the target and period so they cannot drift apart.

# Add SynFlow to path in order to import modules
repo_root = "/home/volt/bach/SynFlow"
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

# Input target
target_lemma = 'air'
target_pos = 'N'
keyword_string = f'{target_lemma}\t{target_pos}' # Or you can use the full POS for precision (e.g., {target_lemma}\tNOUN)

# Period (defined once; reused in every path below)
period = '1750-1819'

# Pattern of the corpus file names: exposes the named groups `id` and `year`
fname_pattern = re.compile(
    r'Royal_Society_Corpus_open_v6\.0_text_(?P<id>\d+)_(?P<year>\d+)\.txt$'
)

# Regex for one token line of the CONLLU file: six tab-separated fields.
# Note that the POS group captures only the FIRST character of the tag,
# which is why target_pos above is the single letter 'N'.
corpus_pattern = re.compile(
    r'([^\t]+)\t'      # word form
    r'([^\t]+)\t'      # lemma
    r'([^\t])[^\t]*\t' # POS (UPOS or XPOS)
    r'([^\t]+)\t'      # ID
    r'([^\t]+)\t'      # HEAD
    r'([^\t]+)'        # DEPREL
)

# Specify corpus and output folders
corpus_folder = f'/home/volt/bach/pilot_data/RSC/{period}_che_half_decades'
output_folder = Path('/home/volt/bach/SynFlow/case_studies/RSC_air_water_acid/')

# The path to the slot count JSON file (kept as a str, as before)
slot_json_path = str(
    output_folder
    / 'output'
    / f'{target_lemma}-{target_pos}-{period}'
    / 'Explorer'
    / f'{target_lemma}_{target_pos}_spaths.json'
)

In [5]:
# Dont change below this line
# Derived locations: one sub-tree per lemma/POS/period combination.
_run_tag = f'{target_lemma}-{target_pos}-{period}'

output_folder_lemma = output_folder / 'output' / _run_tag
output_explorer = f'{output_folder_lemma}/Explorer'
output_embedding = f'{output_folder_lemma}/Embedding'
input_SCD = output_folder / 'input' / 'SCD' / _run_tag

# Create the folders up front; existing folders are left untouched.
for _folder in (output_explorer, output_embedding, input_SCD):
    os.makedirs(_folder, exist_ok=True)

# Slot Frequencies

In [6]:
# Token counts for normalisation
# The printed result is a dict mapping each period label to the keyword's count.
from SynFlow.SCD import count_keyword_tokens_by_period

token_counts = count_keyword_tokens_by_period(
    corpus_folder,
    keyword_string,
    fname_pattern=fname_pattern,
)
print(token_counts)

{'1750': 241, '1755': 327, '1760': 124, '1765': 977, '1770': 1578, '1775': 1773, '1780': 2928, '1785': 2430, '1790': 1498, '1795': 340, '1800': 654, '1805': 373, '1810': 279, '1815': 451}


In [11]:
from SynFlow.SCD import plot_freq_top_union_slots_by_period

# Plot the top-10 slots per period with normalisation and relative scaling
# both switched off; token_counts is still passed through to the plotter.
_plot_options = dict(top_n=10, normalized=False, relative=False)

plot_freq_top_union_slots_by_period(
    json_path=slot_json_path,
    token_counts=token_counts,
    **_plot_options,
)

In [12]:
from SynFlow.SCD import freq_all_slots_by_period

# Transpose the returned frame; in the displayed result the slots are the
# rows and the periods are the columns.
_slot_freq_by_period = freq_all_slots_by_period(json_path=slot_json_path)
slot_raw_freq_df = _slot_freq_by_period.T
slot_raw_freq_df.head(30)

Unnamed: 0,1750,1755,1760,1765,1770,1775,1780,1785,1790,1795,1800,1805,1810,1815
chi_det,163.0,164.0,98.0,477.0,682.0,902.0,1187.0,919.0,606.0,127.0,297.0,120.0,130.0,133.0
chi_case,133.0,183.0,82.0,578.0,953.0,1047.0,1663.0,1455.0,887.0,188.0,450.0,232.0,185.0,281.0
pa_obl,65.0,72.0,31.0,206.0,303.0,338.0,478.0,382.0,236.0,75.0,171.0,86.0,81.0,83.0
pa_nmod,56.0,100.0,45.0,350.0,618.0,671.0,1126.0,1011.0,621.0,104.0,252.0,143.0,100.0,187.0
chi_amod,51.0,73.0,26.0,471.0,667.0,788.0,1889.0,1517.0,1009.0,125.0,164.0,137.0,81.0,105.0
chi_punct,48.0,33.0,14.0,91.0,168.0,157.0,250.0,193.0,143.0,30.0,76.0,35.0,24.0,42.0
pa_nsubj,26.0,31.0,20.0,152.0,171.0,238.0,342.0,256.0,141.0,36.0,35.0,27.0,16.0,22.0
chi_nmod,23.0,13.0,8.0,75.0,77.0,156.0,215.0,172.0,97.0,20.0,58.0,15.0,32.0,24.0
pa_compound,19.0,30.0,6.0,25.0,47.0,59.0,79.0,66.0,51.0,22.0,33.0,30.0,27.0,32.0
pa_nsubj:pass,17.0,16.0,2.0,68.0,145.0,139.0,295.0,206.0,123.0,21.0,22.0,15.0,12.0,16.0


# Get the slot fillers df of all slots

In [13]:
# Get all slots in the correct format
# The printed value is a single string of bracketed slot names, e.g. "[chi_det][chi_case]...";
# it is passed on as the `template` when the slot-filler table is built.
from SynFlow.Explorer.sfiller_df import get_all_slots

all_slots = get_all_slots(slot_raw_freq_df)
print(all_slots)

[chi_det][chi_case][pa_obl][pa_nmod][chi_amod][chi_punct][pa_nsubj][chi_nmod][pa_compound][pa_nsubj:pass][pa_obj][chi_acl:relcl][pa_conj][pa_appos][chi_advmod][chi_cc][chi_acl][chi_conj][chi_compound][pa_obl:agent][chi_mark][chi_discourse][chi_appos][pa_list][chi_obl][chi_nsubj][chi_cop][chi_aux][pa_parataxis][pa_nmod:poss][chi_det:predet][chi_parataxis][chi_nmod:unmarked][chi_nummod][pa_advcl][pa_obl:unmarked][chi_nmod:poss][chi_dep][pa_xcomp][chi_advcl][chi_cc:preconj][pa_acl:relcl][pa_ccomp][chi_obl:unmarked][chi_list][chi_orphan][pa_csubj][pa_iobj][pa_nsubj:outer][pa_dislocated][pa_vocative][pa_acl][chi_nsubj:outer][pa_advcl:relcl][pa_csubj:pass][pa_dep][chi_flat]


In [15]:
# Building a slot filler df
from SynFlow.Explorer import build_sfiller_df

# Collect the arguments first so they are easy to tweak in one place.
_sfiller_options = {
    'corpus_folder': corpus_folder,
    'template': all_slots,
    'target_lemma': target_lemma,
    'target_pos': target_pos,
    'pattern': corpus_pattern,
    # Optional frequency filter (disabled). Be sure that the freq_path matches
    # that of the filter format:
    # 'freq_path': '/home/volt/bach/pilot_data/RSC/lemma_pos_init_freq.txt',
    # 'freq_min': 1,
    # 'freq_max': 100_000_000,
    'filtered_pos': [],
    'filler_format': 'lemma/pos',  # lemma/deprel or 'lemma/pos'
    'output_folder': output_explorer,
}

df_slots = build_sfiller_df(**_sfiller_options)

Wrote slot‐fillers to /home/volt/bach/SynFlow/case_studies/RSC_air_water_acid/output/air-N-1750-1819/Explorer/air_samples_sfillerdf_all.csv (13459 rows), dropped 0 tokens.


In [16]:
# Path of the slot-filler CSV reported by the build step. Derived from the
# Explorer output folder and the target lemma instead of being hard-coded,
# so it follows any change to the inputs.
# NOTE(review): the "<lemma>_samples_sfillerdf_all.csv" name is taken from the
# message printed by build_sfiller_df — confirm it is stable across versions.
all_sfillers_csv_path = f'{output_explorer}/{target_lemma}_samples_sfillerdf_all.csv'

In [17]:
# Note that it is NECESSARY to manually check and correct for the spellings in the CSV file
from SynFlow.Explorer import replace_in_sfiller_df_column

# This is for the chi_amod of air
replace_column = 'chi_amod'

# Mapping of observed filler -> corrected filler (OCR/lemmatiser slips and
# spelling variants found by manual inspection).
replacements = dict([
    ('dephlogisticate/V', 'dephlogisticated/A'),
    ('phlogisticate/V', 'phlogisticated/A'),
    ('plllogisticate/V', 'phlogisticated/A'),
    ('deplllogisticate/V', 'dephlogisticated/A'),
    ('gisticate/V', 'phlogisticated/A'),
    ('Open/A', 'open/A'),
    ('atmospheric/A', 'atmospherical/A'),
])

# NOTE(review): presumably rewrites the CSV at this path in place — verify before re-running.
replace_in_sfiller_df_column(
    all_sfillers_csv_path,
    replace_column,
    replacements,
)

# Calculate divergences of all slots

In [21]:
from SynFlow.SCD import consecutive_JSD_dict

# Reuse the slot-filler CSV path defined earlier rather than repeating the
# absolute path a second time; both names refer to the same file.
sfiller_df_path = all_sfillers_csv_path

In [22]:
# Divergence between consecutive periods for every slot; the displayed result
# is a nested dict of the form {slot: {'1750-1755': value, ...}}.
consecutive_JSD_dictionary = consecutive_JSD_dict(
    all_sfillers_csv_path=sfiller_df_path,
    min_freq=5,
    mode='all',  # or 'data_only' if you want to skip the empty periods
)

In [23]:
# Display the nested {slot: {period-pair: divergence}} dict computed above.
consecutive_JSD_dictionary

{'chi_det': {'1750-1755': 0.012166863241787743,
  '1755-1760': 0.03576176166510985,
  '1760-1765': 0.05877591355220801,
  '1765-1770': 0.013234489493126991,
  '1770-1775': 0.004180215283842363,
  '1775-1780': 0.027442656287276195,
  '1780-1785': 0.011386057157154692,
  '1785-1790': 0.017514666329030263,
  '1790-1795': 0.023850154019291216,
  '1795-1800': 0.017412334229138376,
  '1800-1805': 0.014997615813066988,
  '1805-1810': 0.010400795562520673,
  '1810-1815': 0.015203546045016593},
 'chi_case': {'1750-1755': 0.09291533203924009,
  '1755-1760': 0.05876343839185554,
  '1760-1765': 0.07900086912011152,
  '1765-1770': 0.01933563638038423,
  '1770-1775': 0.024775238216374163,
  '1775-1780': 0.03318840070774642,
  '1780-1785': 0.012137967397030342,
  '1785-1790': 0.019078851984438094,
  '1790-1795': 0.0864752876785844,
  '1795-1800': 0.04671068240523323,
  '1800-1805': 0.051274156001820025,
  '1805-1810': 0.09445090783692306,
  '1810-1815': 0.04424307828337718},
 'pa_obl': {'1750-1755': 