# Input

In [None]:
# Imports
%load_ext autoreload
%autoreload 2

import re
import os
from pathlib import Path
import sys

In [None]:
# Required input: everything a user is expected to edit lives in this cell.

# Add SynFlow to path in order to import modules
repo_root = "/home/volt/bach/SynFlow"
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

# Regex that extracts six tab-separated fields from one token line of the corpus.
# NOTE(review): the field order below (form, lemma, POS, ID, HEAD, DEPREL) is not the
# standard CoNLL-U column order -- presumably the corpus files are pre-processed; confirm.
corpus_pattern = re.compile(
    r'([^\t]+)\t'      # word form
    r'([^\t]+)\t'      # lemma
    r'([^\t])[^\t]*\t' # POS (UPOS or XPOS): only the FIRST character is captured
    r'([^\t]+)\t'      # ID
    r'([^\t]+)\t'      # HEAD
    r'([^\t]+)'        # DEPREL
)

# Specify target lemma and part of speech.
# target_pos is a single letter, matching the one-character POS group in corpus_pattern.
target_lemma = 'air'
target_pos = 'N'

# Specify corpus and output folders.
# The corpus folder name starts with the period, so it is interpolated here: changing
# `period` now retargets the corpus as well as the output labels, instead of silently
# labelling results from the old corpus with a new period.
period = '1750-1819'
corpus_folder = f'/home/volt/bach/pilot_data/RSC/{period}_che_half_decades'
output_folder = Path('/home/volt/bach/SynFlow/case_studies/RSC_air_water_acid/')

In [None]:
# Derived paths -- don't change anything below this line.
# Every run is identified by "<lemma>-<pos>-<period>".
run_tag = f'{target_lemma}-{target_pos}-{period}'

output_folder_lemma = output_folder / 'output' / run_tag
output_explorer = f'{output_folder_lemma}/Explorer'    # kept as str
output_embedding = f'{output_folder_lemma}/Embedding'  # kept as str
input_SCD = output_folder / 'input' / 'SCD' / run_tag

# Create all working directories up front; existing ones are left untouched.
for required_dir in (output_explorer, output_embedding, input_SCD):
    os.makedirs(required_dir, exist_ok=True)

# Explore the vocabulary frequencies

In [None]:
from SynFlow.Explorer.vocab_freq import gen_lemma_freq, analyze_corpus_vocab

In [None]:
# Generate the vocab/frequency files for the corpus.
# The vocab folder is derived from `corpus_folder` ("<corpus_folder>_vocab") rather than
# hard-coded, so retargeting the corpus in the configuration cell is sufficient.
vocab_folder = f'{corpus_folder}_vocab'  # output folder for vocab files
gen_lemma_freq(
    corpus_folder,
    vocab_folder,
    file_ext='.txt',
    mode='lemma_pos_init',  # modes: 'lemma_pos', 'lemma_pos_init', or 'lemma_deprel'
)

In [None]:
# Analyse the vocab files in "<corpus_folder>_vocab" -- the same folder the
# gen_lemma_freq step writes to. Derived from `corpus_folder` instead of hard-coded.
analyze_corpus_vocab(f'{corpus_folder}_vocab')

# Explore the distribution of different syntactic slots from the corpus

## Slot-path Explorer

In [None]:
from SynFlow.Explorer import spath_explorer

# Slot-path exploration for the configured target; results go to the Explorer folder.
spath_options = dict(
    corpus_folder=corpus_folder,
    pattern=corpus_pattern,
    target_lemma=target_lemma,
    target_pos=target_pos,
    max_length=1,
    top_n=50,
    output_folder=output_explorer,
)
dist = spath_explorer(**spath_options)

## Unique Slot-path Combination Explorer

In [None]:
# One unique full-pattern string per token
from SynFlow.Explorer import spath_comb_explorer

# Inputs first, then the target, then the search/report limits.
ctr = spath_comb_explorer(
    corpus_folder=corpus_folder,
    pattern=corpus_pattern,
    output_folder=output_explorer,
    target_lemma=target_lemma,
    target_pos=target_pos,
    max_length=1,
    top_n=30,
)

## Rel Explorer

In [None]:
from SynFlow.Explorer import rel_explorer

# Collect the hits for a single relation of the target lemma.
rel_query = dict(
    corpus_folder=corpus_folder,
    pattern=corpus_pattern,  # or leave None to use default
    target_lemma=target_lemma,
    target_pos=target_pos,
    rel="pa_nsubj",
)
rel_explorer_results = rel_explorer(**rel_query)

In [None]:
import csv

# Preview: print at most the first 10,000 hits, one line per hit.
for fname, sent, filler_nodes, path in rel_explorer_results[:10000]:
    # filler_nodes is a list of "lemma/pos" strings; join them with ' > ' for readability
    joined_fillers = " > ".join(filler_nodes)
    print(f"{fname:15} | {path:10} | {joined_fillers:15} | {sent}")

# Save every hit to a tab-separated file
out_path = f'{output_explorer}/rel_explorer.tsv'
with open(out_path, "w", encoding="utf-8", newline="") as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t')
    tsv_writer.writerow(["file", "path", "sfillers", "sentence"])
    tsv_writer.writerows(
        [fname, path, " > ".join(filler_nodes), sent]
        for fname, sent, filler_nodes, path in rel_explorer_results
    )

## Full Rel Explorer

In [None]:
from SynFlow.Explorer import full_rel_explorer

# Query for tokens matching a combination of relations (joined with '&').
full_rel_query = dict(
    corpus_folder=corpus_folder,
    pattern=corpus_pattern,  # or leave None to use default
    target_lemma=target_lemma,
    target_pos=target_pos,
    rel="chi_case & chi_det & pa_obl",
    # rel="chi_aux & chi_nsubj & chi_obj & chi_punct",
    # rel="chi_discourse > chi_punct & chi_punct",
    mode='close',  # 'open', 'close', 'closeh'
)
full_rel_explorer_results = full_rel_explorer(**full_rel_query)

In [None]:
import csv

# Inspect the first ten sentences; sentence numbers start at 1.
for sent_num, (fname, sent, paths_details_list) in enumerate(full_rel_explorer_results[:10], start=1):
    # paths_details_list is a list of (sfillers, path) tuples
    for filler_nodes, path in paths_details_list:
        # each filler_nodes entry is a list of "lemma/pos" strings; join with ' > ' to print
        joined_fillers = " > ".join(filler_nodes)
        print(f"{sent_num} | {fname:15} | {path:10} | {joined_fillers:15} | {sent}")

# Save all results to a tab-separated file, one row per (sentence, path) pair
out_path = f'{output_explorer}/full_rel_explorer.tsv'
with open(out_path, "w", encoding="utf-8", newline="") as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t')
    tsv_writer.writerow(["sent_num", "file", "path", "ctx_nodes", "sentence"])
    for sent_num, (fname, sent, paths_details_list) in enumerate(full_rel_explorer_results, start=1):
        tsv_writer.writerows(
            [sent_num, fname, path, " > ".join(filler_nodes), sent]
            for filler_nodes, path in paths_details_list
        )

## Trimming

In [None]:
import pandas as pd

# Load the slot-path combination table from the Explorer output folder.
# The path is built from the configuration values instead of a hard-coded absolute path,
# so switching target lemma / POS / period loads the matching file rather than always
# the 'air' one.
# NOTE(review): assumes the file is named "<lemma>_<pos>_spath_combs_1_hops.csv"
# (presumably written by spath_comb_explorer with max_length=1) -- confirm.
spath_combs_csv = f'{output_explorer}/{target_lemma}_{target_pos}_spath_combs_1_hops.csv'
df = pd.read_csv(spath_combs_csv, sep='&')
df.head(10)

In [None]:
from SynFlow.Explorer import trim_and_merge

# Trim and merge relations.
# The input path is built from the configuration values instead of a hard-coded absolute
# path, so it follows the configured target lemma / POS / period.
# NOTE(review): assumes the file is named "<lemma>_<pos>_spath_combs_1_hops.csv" -- confirm.
spath_df = f'{output_explorer}/{target_lemma}_{target_pos}_spath_combs_1_hops.csv'
trimmed_rels = ['chi_case']  # relations to trim
trim_and_merge(spath_df=spath_df, trimmed_rels=trimmed_rels)

## Specialisations Grouping

In [None]:
from SynFlow.Explorer.trimming import spe_group

# Group specialisations from the slot-path combination table.
# The input path is built from the configuration values instead of a hard-coded absolute
# path, so it follows the configured target lemma / POS / period.
# NOTE(review): assumes the file is named "<lemma>_<pos>_spath_combs_1_hops.csv" -- confirm.
spath_df = f'{output_explorer}/{target_lemma}_{target_pos}_spath_combs_1_hops.csv'
tree = spe_group(spath_df, output_folder=output_explorer, target_lemma=target_lemma)

# Build the slot-filler DataFrame

In [None]:
from SynFlow.Explorer import build_sfiller_df

# Build the slot-filler table for one slot template; output goes to the Embedding folder.
sfiller_options = dict(
    corpus_folder=corpus_folder,
    pattern=corpus_pattern,
    target_lemma=target_lemma,
    target_pos=target_pos,
    template='[pa_conj]',  # Example: '[chi_nsubj][chi_obj][chi_obl > chi_case]'
    # Optional frequency filter (disabled):
    # freq_path='/home/volt/bach/pilot_data/RSC/lemma_pos_init_freq.txt', # Be sure that the freq_path matches that of the filter format
    # freq_min=1,
    # freq_max=100_000_000,
    filtered_pos=[],
    filler_format='lemma/pos',  # 'lemma/deprel' or 'lemma/pos'
    output_folder=output_embedding,
)
sfiller_df = build_sfiller_df(**sfiller_options)

In [None]:
# Sampling from the general slots DataFrame
from SynFlow.Explorer import sample_sfiller_df
n = 5
sfiller_sample_df = sample_sfiller_df(
    input_csv=f"{output_embedding}/{target_lemma}_samples_sfillerdf_all.csv",
    output_csv=f"{output_embedding}/{target_lemma}_samples_sfillerdf_{n}.csv",
    n=n,
    seed=42,
    mode= 'NA'
)