# Test with Stanza parsing

In [None]:
import stanza

# One-time setup on a fresh machine (downloads the English model):
#   stanza.download('en')

# English neural pipeline, restricted to the processors needed for the
# dependency output printed further down.
nlp = stanza.Pipeline(
    'en',
    processors='tokenize,mwt,pos,lemma,depparse',
)

In [None]:
# Sentences handed to the Stanza pipeline. Only the uncommented entries are
# parsed; the commented-out groups are alternative probes kept for reference.
sents = [
    # 'he sleeps.',
    # 'he sleeps his opponent in the tournament.',  # Develop new slots

    # Multiple similar slots of one target word:
    # 'he is a big bad wolf',
    # 'he runs the company and the firm',

    # Changing POS from N -> ProN:
    # 'This is Microsoft Windows operating system.',
    # 'This is the doors and the windows',
    # 'Windows cannot compete against MacOS',

    'This is a gay day.',
    'I am a gay!',
    'You are such a gay mofo!!!',
]

In [None]:
# Annotate every test sentence and print one tab-separated line per word:
# text, lemma, POS, id, head id, head text and dependency relation.
for sent in sents:
    doc = nlp(sent)  # run annotation over a sentence
    print('sentence:', sent)
    # Debug aids: print(doc) / print(doc.entities)

    # `parsed` is one Stanza sentence of the document; it is named differently
    # from the raw input string `sent` so the two are not confused.
    # A head id of 0 has no head word to look up and is shown as "root";
    # otherwise the head is fetched at index head-1 of the sentence's words.
    lines = [
        f'word: {tok.text}\tlemma: {tok.lemma}\tpos: {tok.pos}\tid: {tok.id}\thead id: {tok.head}\thead: {parsed.words[tok.head-1].text if tok.head > 0 else "root"}\tdeprel: {tok.deprel}'
        for parsed in doc.sentences
        for tok in parsed.words
    ]
    print('\n'.join(lines))
    print('end')

# Input

In [None]:
%load_ext autoreload
%autoreload 2

import re
import os

# Regex for one token line made of six tab-separated fields. Each fragment
# below contributes one field; the fragments are concatenated unchanged.
pattern = re.compile("".join((
    r'([^\t]+)\t',       # group 1: word form
    r'([^\t]+)\t',       # group 2: lemma
    r'([^\t])[^\t]*\t',  # group 3: first character of the POS tag only (UPOS or XPOS)
    r'([^\t]+)\t',       # group 4: ID
    r'([^\t]+)\t',       # group 5: HEAD
    r'([^\t]+)',         # group 6: DEPREL
)))

# --- Run configuration -------------------------------------------------------
# Word under study. The POS is given as a single letter here
# (presumably matched against the one-character POS group of `pattern` - confirm).
target_lemma = 'air'
target_pos = 'N'

# All
period = '1750-1799'
# NOTE(review): the period is repeated by hand inside the corpus folder name;
# when `period` changes, this path has to be updated as well.
corpus_folder = '/home/volt/bach/pilot_data/RSC/1750-1799_che_half_decades'
output_folder = f'/home/volt/bach/SynFlow/output/{target_lemma}-{target_pos}-{period}'
output_explorer = f'{output_folder}/Explorer'
output_embedding = f'{output_folder}/Embedding'

# exist_ok=True creates missing folders and is a no-op when they already exist,
# so the cell can be re-run safely without a separate existence check.
# Unlike the earlier exists()-guard, it raises if the path exists but is not a
# directory instead of silently continuing.
os.makedirs(output_explorer, exist_ok=True)
os.makedirs(output_embedding, exist_ok=True)


# Explore the distribution of different syntactic relationships from the corpus

## Arguments Explorer

In [None]:
from SynFlow.Explorer import arg_explorer

# First pass with max_length=1 and the top 30 entries (top_n).
# Presumably max_length is the path length in hops - confirm in SynFlow.Explorer.
dist = arg_explorer(
    # where to read and how to parse
    corpus_folder=corpus_folder,
    pattern=pattern,
    # what to look for
    target_lemma=target_lemma,
    target_pos=target_pos,
    # how much to report, and where
    max_length=1,
    top_n=30,
    output_folder=output_explorer,
)

In [None]:
from SynFlow.Explorer import arg_explorer

# Second pass: identical settings except max_length=2.
# Note that `dist` is reassigned here, replacing any earlier value.
dist = arg_explorer(
    # where to read and how to parse
    corpus_folder=corpus_folder,
    pattern=pattern,
    # what to look for
    target_lemma=target_lemma,
    target_pos=target_pos,
    # how much to report, and where
    max_length=2,
    top_n=30,
    output_folder=output_explorer,
)

## Unique Argument Combination Explorer

In [None]:
from SynFlow.Explorer import arg_comb_explorer

# Each token is represented by exactly one unique full-pattern string.
# Run with max_length=1 and the top 30 entries; results go to output_explorer.
ctr = arg_comb_explorer(
    corpus_folder=corpus_folder,
    pattern=pattern,
    target_lemma=target_lemma,
    target_pos=target_pos,
    max_length=1,
    top_n=30,
    output_folder=output_explorer,
)

## Rel Explorer

In [None]:
from SynFlow.Explorer import rel_explorer

# Collect the hits for a single relation ("chi_amod") around the target word.
# The result is iterated below as (file name, sentence, context list, path) tuples.
rel_explorer_results = rel_explorer(
    corpus_folder=corpus_folder,
    target_lemma=target_lemma,
    target_pos=target_pos,
    rel="chi_amod",
    pattern=pattern,  # or leave None to use default
)

# Preview at most the first 1000 hits, one aligned row per hit.
for fname, sent, ctx_list, path in rel_explorer_results[:1000]:
    # ctx_list is a list of "lemma/pos" strings; join them with ' > ' so the
    # chain is easy to read when printed
    chain = " > ".join(ctx_list)
    print(f"{fname:20} | {path:15} | {chain:20} | {sent}")

# Persist every rel_explorer hit as a tab-separated file (despite the .csv name).
import csv

out_path = f'{output_explorer}/rel_explorer.csv'
with open(out_path, "w", encoding="utf-8", newline="") as handle:
    writer = csv.writer(handle, delimiter='\t')
    writer.writerow(["file", "path", "ctx_nodes", "sentence"])
    # One row per hit; the context nodes are flattened into a ' > ' chain.
    writer.writerows(
        [fname, path, " > ".join(ctx_nodes), sent]
        for fname, sent, ctx_nodes, path in rel_explorer_results
    )

## Full Rel Explorer

In [None]:
from SynFlow.Explorer import full_rel_explorer

# Query a combination of relations at once. Alternative queries tried before:
#   rel="chi_aux & chi_nsubj & chi_obj & chi_punct"
#   rel="chi_discourse > chi_punct & chi_punct"
triples = full_rel_explorer(
    corpus_folder=corpus_folder,
    target_lemma=target_lemma,
    target_pos=target_pos,
    rel="chi_case & chi_det & pa_obl",
    mode='close',     # 'open', 'close', 'closeh'
    pattern=pattern,  # or leave None to use default
)

# Number of matching entries returned.
print(len(triples))

# Preview the first 10 entries. Each entry carries a list of
# (ctx_nodes, path_str) tuples, so it is flattened to one row per found path.
preview_rows = (
    (fname, sent, ctx_nodes, path_str)
    for fname, sent, found_paths in triples[:10]
    for ctx_nodes, path_str in found_paths
)
for fname, sent, ctx_nodes, path_str in preview_rows:
    # ctx_nodes is a list of "lemma/pos"; join with ' > ' for printing
    ctx_chain = " > ".join(ctx_nodes)
    print(f"{fname:20} | {path_str:15} | {ctx_chain:20} | {sent}")

# Save the full_rel_explorer results as a tab-separated file.
# Fix: this export used to iterate `rel_explorer_results` (copied from the
# Rel Explorer cell), so full_rel_explorer.csv contained the wrong data.
# It now writes `triples`, whose entries are
# (file name, sentence, [(ctx_nodes, path_str), ...]) as unpacked in the
# preview loop of this cell; one row is written per found path.
import csv

out_path = f'{output_explorer}/full_rel_explorer.csv'
with open(out_path, "w", encoding="utf-8", newline="") as f:
    w = csv.writer(f, delimiter='\t')
    w.writerow(["file", "path", "ctx_nodes", "sentence"])
    for fname, sent, found_paths_details_list in triples:
        for ctx_nodes, path_str in found_paths_details_list:
            # ctx_nodes is a list of "lemma/pos"; flatten it into a ' > ' chain
            ctx_chain = " > ".join(ctx_nodes)
            w.writerow([fname, path_str, ctx_chain, sent])

## Trimming

In [None]:
import pandas as pd

# Build the path from the run configuration instead of hard-coding it, so the
# cell follows changes to target_lemma / target_pos / period. With the current
# configuration this resolves to the same file as the previous literal path.
# Assumes the arg-comb export is named `{lemma}_{pos}_arg_comb_1_hops.csv`
# inside the Explorer folder, as the previous literal path suggests.
arg_comb_csv = f'{output_explorer}/{target_lemma}_{target_pos}_arg_comb_1_hops.csv'

# The file is read with '&' as the column separator (kept as before).
df = pd.read_csv(arg_comb_csv, sep='&')
df.head(10)

In [None]:

from SynFlow.Explorer import trim_and_merge

# Path derived from the run configuration rather than hard-coded; with the
# current configuration it is the same file as before
# (.../air-N-1750-1799/Explorer/air_N_arg_comb_1_hops.csv).
df_file = f'{output_explorer}/{target_lemma}_{target_pos}_arg_comb_1_hops.csv'

# Relations passed to trim_and_merge as `trimmed_rels`.
trimmed_rels = ['chi_punct', 'chi_det', 'pa_parataxis', 'chi_discourse']
trim_and_merge(df_file=df_file, trimmed_rels=trimmed_rels)

## Specialisations Grouping

In [None]:
from SynFlow.Explorer.trimming import spe_group

# Input is the "_trimmed" variant of the arg-comb export. The path is derived
# from the run configuration rather than hard-coded; with the current
# configuration it is the same file as before
# (.../air-N-1750-1799/Explorer/air_N_arg_comb_1_hops_trimmed.csv).
df_path = f'{output_explorer}/{target_lemma}_{target_pos}_arg_comb_1_hops_trimmed.csv'
tree = spe_group(df_path, output_folder=output_explorer, target_lemma=target_lemma)

# Get Slot df

In [5]:
from SynFlow.Explorer import build_slot_df

# Build the slot DataFrame for the target word; output goes to output_embedding.
df_slots = build_slot_df(
    # corpus and parsing
    corpus_folder=corpus_folder,
    pattern=pattern,
    # target and slot template
    target_lemma=target_lemma,
    target_pos=target_pos,
    template='[chi_amod]',  # Example: '[chi_nsubj][chi_obj][chi_obl > chi_case]'
    # filler representation and filtering
    filler_format='lemma/pos',  # lemma/deprel or 'lemma/pos'
    # Be sure that the freq_path matches that of the filter format
    # (presumably the `filler_format` above - confirm)
    freq_path='/home/volt/bach/pilot_data/RSC/lemma_pos_init_freq.txt',
    freq_min=1,
    freq_max=100_000_000,
    filtered_pos=[],
    # destination
    output_folder=output_embedding,
)

Wrote slot‐fillers to /home/volt/bach/SynFlow/output/air-N-1750-1799/Embedding/air_samples_slotdf_all.csv (6156 rows), dropped 5695 tokens.


In [None]:
from SynFlow.Explorer import sample_slot_df

# Draw a sample from the general slots DataFrame written by the previous step.
# `n` is both the sample-size argument and part of the output file name.
n = 5
df_sample = sample_slot_df(
    n=n,
    seed=42,
    mode='NA',
    input_csv=f"{output_embedding}/{target_lemma}_samples_slotdf_all.csv",
    output_csv=f"{output_embedding}/{target_lemma}_samples_slotdf_{n}.csv",
)

In [None]:
# template='[chi_nsubj][chi_obj][chi_obl > chi_case]'
# slots     = template.strip("[]").split("][")
# print(slots)