# Test with Stanza parsing

In [None]:
import stanza
# One-off setup: uncomment the next line to fetch the English models.
# stanza.download('en') # download English model
# Build the pipeline once and reuse it for every sentence.  The processor
# list ends with `depparse`, which is what the `head` / `deprel` fields
# printed in the dump cell below rely on.
nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse') # initialize English neural pipeline

In [None]:
# Input for the dump cell below.  Only the first entry is active; the "run"
# examples stay commented out so they can be switched back on quickly.
sents = [
    'hell yeah!',
    # "The boss runs the company.",
    # "The company is run by the boss",
    # "The company is run in the dark.",
    # "He runs in the jungle.",
    # "The roads run through the city.",
    # "He runs his finger through his hair.",
    # "The computer runs fast.",
    # "The car runs really fast.",
]

# Alternative single-sentence input:
# sents = ["MISS NORMAN : Will you do me the honour to meet me at the bridgehead at half-past nine practically at once ?"]
target = "miss"

In [None]:
# Parse each input string and print one tab-separated line per word:
# text, lemma, pos, id, head id, head text ("root" when the head id is 0)
# and dependency relation.
for sent in sents:
    doc = nlp(sent)  # run annotation over a sentence
    print('sentence:', sent)
    # print(doc)
    # print(doc.entities)
    rows = []
    for parsed in doc.sentences:
        for word in parsed.words:
            # head ids are 1-based positions in `parsed.words`; 0 means no head
            head_text = parsed.words[word.head - 1].text if word.head > 0 else "root"
            rows.append(
                f'word: {word.text}\tlemma: {word.lemma}\tpos: {word.pos}'
                f'\tid: {word.id}\thead id: {word.head}\thead: {head_text}'
                f'\tdeprel: {word.deprel}'
            )
    # a single joined print also emits the blank line when there are no words
    print('\n'.join(rows))
    print('end')

In [None]:
import stanza
from collections import defaultdict

# Eight example sentences using the verb "run" in different frames, plus the
# lemma whose direct dependents are collected below.
sents = [
    "The boss runs the company.",
    "The company is run by the boss",
    "The company is run in the dark.",
    "He runs in the jungle.",
    "The roads run through the city.",
    "He runs his finger through his hair.",
    "The computer runs fast.",
    "The car runs really fast.",
]
target = "run"

# 1. load the English pipeline (tokeniser-POS-lemma-dependency)
# This rebinds the notebook-wide `nlp`.  Unlike the pipeline built in the
# first cell, it leaves out the `mwt` processor and sets
# tokenize_no_ssplit=True; the loop below relies on that when it takes
# `doc.sentences[0]` as the whole input string.
nlp = stanza.Pipeline(
        'en', processors='tokenize,pos,lemma,depparse',
        tokenize_no_ssplit=True,  # treat each string as a single sentence
        verbose=False)

# For every sentence, record the direct dependents of each word whose lemma
# equals `target`, in sentence order.
results = []               # (sent_id, dep_lemma, deprel)
for sent_id, text in enumerate(sents, 1):
    parsed = nlp(text).sentences[0]       # exactly one sentence per string
    target_ids = [tok.id for tok in parsed.words if tok.lemma == target]
    for governor_id in target_ids:
        # immediate dependents only: words whose head is this occurrence
        results.extend(
            (sent_id, dep.lemma, dep.deprel)
            for dep in parsed.words
            if dep.head == governor_id
        )

# pretty-print: one line per (sentence, dependent lemma, relation)
for row in results:
    sent_id, lem, rel = row
    print(f'S{sent_id}: {lem:<10}  {rel}')


In [None]:
# sents = ['The boss runs the company.',
#          'The company is run by the boss', 
#          'The company is run in the dark.', 
#          'He runs in the jungle.',
#          'The roads run through the city.',
#          'He runs his finger through his hair.',
#          'The computer runs fast.',
#          'The car runs really fast.'
#          ]

# sents = [
#     "Freedom is priceless.",
#     "She fought for freedom during the revolution.",
#     "The court finally granted him the freedom to speak openly.",
#     "Within the classroom, freedom of thought nurtures creativity.",
#     "The towering bronze sculpture, Freedom, dominates the plaza.",
#     "After the last exam, the students burst outside in pure freedom.",
#     "Digital tracking can quietly erode freedom online.",
#     "We debated whether freedom or security mattered more.",
#     "Without self-control, freedom often collapses into chaos.",
#     "He inhaled deeply, freedom flooding his lungs at the prison gates."
# ]

# sents = [
#     "The table shook during the earthquake.",
#     "She carved her initials into the wooden table.",
#     "After dinner, they sat around the table and talked for hours.",
#     "The architect presented a glass table as the room's centerpiece.",
#     "Please table the motion until next week’s meeting.",
#     "We sorted the data into a table for easier comparison.",
#     "The cat leapt onto the table, knocking over a vase.",
#     "Negotiators agreed to table further discussion until sunrise.",
#     "Beneath the table, a hidden drawer contained old photographs.",
#     "A picnic table stood alone under the oak tree."
# ]

# Active example set: ten sentences that each contain the word form
# "interesting" in a different position.  The commented-out lists above are
# alternative inputs for other targets.
sents = [
    "This article is interesting.",
    "An interesting twist changed the plot completely.",
    "He found the lecture interesting despite the late hour.",
    "Someone interesting moved into the apartment next door.",
    "The most interesting of the artifacts was the jade mask.",
    "Keep your questions interesting and concise.",
    "They made the workshop interesting by adding hands-on demos.",
    "What I find interesting is how quickly trends shift.",
    "Do you have anything interesting to read on the train?",
    "Interesting, she thought, how silence can speak louder than words."
]


# Lemma that collect_connected() starts its traversal from.
TARGET = 'interesting' 

# Maximum number of dependency edges to follow away from the target.
# Must be a single int: collect_connected() compares it directly against its
# integer depth counter, so tuples such as (1,) or (1, 2, 3) are not supported.
MAX_DEPTH  = 2


In [None]:
import stanza
from collections import defaultdict, deque

# English pipeline for the traversal demo below.  tokenize_no_ssplit=True is
# set so that each input string can be read back as `.sentences[0]`;
# verbose=False is passed to keep Stanza's start-up output quiet.
nlp = stanza.Pipeline(
        "en",
        processors="tokenize,pos,lemma,depparse",
        tokenize_no_ssplit=True,
        verbose=False)

# ------------------------------------------------------------------ #
def collect_connected(sent, target_lemma, max_depth):
    """
    Collect the words reachable from every occurrence of a target lemma.

    The dependency tree of `sent` is treated as an undirected graph and
    explored breadth-first, starting from each word whose lemma equals
    `target_lemma` (case-sensitive comparison).  Each start word gets its
    own traversal; hits from all start words are merged into one result.

    Args:
        sent: object with a `.words` sequence; every word must expose
            `.id`, `.head` (0 for the root), `.lemma` and `.deprel`.
        target_lemma: lemma of the word(s) to start from.
        max_depth: int, maximum number of edges to follow.  Zero or a
            negative value yields an empty result.

    Returns:
        defaultdict(list) mapping depth -> [(lemma, path)], where 'path' is
        a string like '↓obj' or '↑nsubj:pass > ↓obl' showing the route from
        the target to the node ('↑' = step to the head, '↓' = step to a
        dependent).  Depths with no hits have no key.
    """
    id2word = {w.id: w for w in sent.words}
    neighbours = defaultdict(list)                  # id -> [(word, label)]

    # build bidirectional edges
    for w in sent.words:
        if w.head == 0:                             # ROOT has no parent
            continue
        head = id2word[w.head]
        neighbours[w.id].append((head, f"↑{w.deprel}"))   # child -> parent
        neighbours[head.id].append((w, f"↓{w.deprel}"))   # parent -> child

    result = defaultdict(list)                      # depth -> [(lemma, path)]
    for start in sent.words:
        if start.lemma != target_lemma:
            continue                                # other lemmas not our start
        queue = deque([(start, 0, [])])             # node, depth, path so far
        visited = {start.id}
        while queue:
            node, depth, path = queue.popleft()
            # `>=` rather than `==`: with an equality test a negative limit
            # is never reached and the whole tree would be traversed.
            if depth >= max_depth:
                continue
            for nb, rel in neighbours[node.id]:
                if nb.id in visited:
                    continue
                visited.add(nb.id)
                next_path = path + [rel]
                result[depth + 1].append((nb.lemma, " > ".join(next_path)))
                queue.append((nb, depth + 1, next_path))
    return result
# ------------------------------------------------------------------ #

# Run the traversal on every sentence and file the hits by sentence number
# (1-based) and depth.  Sentences without any hit never get an entry.
all_hits = defaultdict(lambda: defaultdict(list))   # sent_id -> depth -> items
for sid, text in enumerate(sents, 1):
    parsed = nlp(text).sentences[0]
    per_depth = collect_connected(parsed, TARGET, MAX_DEPTH)
    for depth in per_depth:
        all_hits[sid][depth] += per_depth[depth]

# --- demo print ---------------------------------------------------- #
# Sentences in ascending id order, depths in ascending order within each.
for sid in sorted(all_hits):
    by_depth = all_hits[sid]
    print(f"\nSentence {sid}: {sents[sid-1]}")
    for depth in sorted(by_depth):
        print(f"  depth {depth}:")
        for lemma, route in by_depth[depth]:
            print(f"      {lemma:<10}  {route}")


# Input

In [None]:
%load_ext autoreload
%autoreload 2

import re
import os

# Regex for one tab-separated token line.  Each entry below is one column;
# joining them with a literal `\t` yields the six capture groups in order.
pattern = re.compile(r'\t'.join((
    r'([^\t]+)',        # word form
    r'([^\t]+)',        # lemma
    r'([^\t])[^\t]*',   # POS (UPOS or XPOS): only its first character is captured
    r'([^\t]+)',        # ID
    r'([^\t]+)',        # HEAD
    r'([^\t]+)',        # DEPREL
)))

# Target of the study.
# NOTE(review): target_pos is a single letter, presumably matched against the
# one-character POS group captured by `pattern` — confirm in SynFlow.
target_lemma = 'air'
target_pos = 'N'

# "All": the full 1750-1799 span.  Alternative period settings are kept in
# the commented-out blocks below.
period = '1750-1799'
corpus_folder = '/home/volt/bach/pilot_data/RSC/1750-1799_che'
output_folder = '/home/volt/bach/SynFlow/output/' + '-'.join((target_lemma, target_pos, period))
output_explorer = output_folder + '/Explorer'
output_embedding = output_folder + '/Embedding'

# Decades
# period = '1790'
# corpus_folder = f'/home/volt/bach/pilot_data/RSC/1750-1799_che_decades/{period}'
# output_folder = f'/home/volt/bach/SynFlow/output/{target_lemma}-{target_pos}-{period}'
# output_explorer = f'{output_folder}/Explorer'
# output_embedding = f'{output_folder}/Embedding'
# visualisation_folder = f'/home/volt/bach/SynFlow/visualisation/{target_lemma}-{target_pos}-{period}'

# Half decades
# period = '1770-1774'
# corpus_folder = f'/home/volt/bach/pilot_data/RSC/1750-1799_che_half_decades/{period}'
# output_folder = f'/home/volt/bach/SynFlow/output/{target_lemma}-{target_pos}-{period}'
# output_explorer = f'{output_folder}/Explorer'
# output_embedding = f'{output_folder}/Embedding'

# Make sure both output folders exist before any explorer writes to them.
# exist_ok=True makes the call a no-op for an existing directory and removes
# the gap between the separate "exists?" check and the creation.
for folder in (output_explorer, output_embedding):
    os.makedirs(folder, exist_ok=True)


# Explore the distribution of different syntactic relationships from the corpus

## Arguments Explorer

In [None]:
# Run SynFlow's arg_explorer for the configured target lemma/POS over the
# corpus folder, reusing the token-line regex compiled above.  The return
# value is kept in `dist` and `output_explorer` is passed as output folder.
# NOTE(review): the meaning of max_length / top_n and the structure of `dist`
# are defined inside SynFlow.Explorer, not in this notebook — confirm there.
from SynFlow.Explorer import arg_explorer
dist = arg_explorer(
    corpus_folder=corpus_folder,
    target_lemma=target_lemma,
    target_pos=target_pos,
    max_length=1,
    top_n=30,
    pattern=pattern,
    output_folder=output_explorer
)

## Unique Argument Combination Explorer

In [None]:
# 1 unique full-pattern string for 1 token
# Called with the same corpus, target and regex settings as the
# arg_explorer cell; the return value is kept in `ctr`.
# NOTE(review): the name suggests a Counter-like object — confirm in
# SynFlow.Explorer before relying on it.
from SynFlow.Explorer import arg_comb_explorer

ctr = arg_comb_explorer(
    corpus_folder=corpus_folder,
    target_lemma=target_lemma,
    target_pos=target_pos,
    max_length=1,
    top_n=30,
    output_folder=output_explorer,
    pattern=pattern
)

## Rel Explorer

In [None]:
from SynFlow.Explorer import rel_explorer

# Look up occurrences of the target that take part in the "chi_nmod" relation.
triples = rel_explorer(
    corpus_folder=corpus_folder,
    target_lemma=target_lemma,
    target_pos=target_pos,
    pattern=pattern,            # or leave None to use default
    rel="chi_nmod",
)

# Inspect the first 100 hits.  Each hit unpacks into four values:
# file name, sentence, context list and path.
for hit in triples[:100]:
    fname, sentence, ctx_list, path = hit
    # ctx_list is a list of "lemma/pos" strings; join with ' > ' for readability
    ctx_chain = " > ".join(ctx_list)
    print(f"{fname:20} | {path:15} | {ctx_chain:20} | {sentence}")

## Full Rel Explorer

In [None]:
from SynFlow.Explorer import full_rel_explorer

# Same lookup through full_rel_explorer, which also takes a `mode`.
# Other `rel` expressions tried so far are kept below for reference.
triples = full_rel_explorer(
    corpus_folder=corpus_folder,
    target_lemma=target_lemma,
    target_pos=target_pos,
    pattern=pattern,            # or leave None to use default
    rel="chi_case",
    # rel="chi_aux & chi_nsubj & chi_obj & chi_punct",
    # rel="chi_discourse > chi_punct & chi_punct",
    mode='open',                # 'open', 'close', 'closeh'
)

print(len(triples))

# Inspect the first 10 hits.  Each hit carries a list of
# (ctx_nodes, path_str) pairs; one line is printed per pair.
for fname, sentence, found_paths in triples[:10]:
    for ctx_nodes, path_str in found_paths:
        # ctx_nodes is a list of "lemma/pos" strings
        ctx_chain = " > ".join(ctx_nodes)
        print(f"{fname:20} | {path_str:15} | {ctx_chain:20} | {sentence}")

## Trimming

In [None]:
import pandas as pd
# Read the argument-combination table for the configured target and period;
# the path is derived from the configuration cell rather than hard-coded to
# one period.  The file uses '&' as its column separator.
# NOTE(review): assumes arg_comb_explorer (max_length=1) names its output
# "<target_lemma>_arg_comb_1_hops.csv" inside output_explorer — confirm.
df = pd.read_csv(f'{output_explorer}/{target_lemma}_arg_comb_1_hops.csv', sep='&')

In [None]:

from SynFlow.Explorer import trim_and_merge
df_file = '/home/volt/bach/SynFlow/output/air-N-1770-1774/Explorer/air_arg_comb_1_hops.csv'
trimmed_rels = ['chi_punct', 'chi_det', 'pa_parataxis','chi_discourse']
trim_and_merge(df_file=df_file, trimmed_rels=trimmed_rels)

## Specialisations Grouping

In [None]:
from SynFlow.Explorer.trimming import spe_group

df_path = '/home/volt/bach/SynFlow/output/air-N-1770-1774/Explorer/air_arg_comb_1_hops_trimmed.csv'
tree = spe_group(df_path, output_folder=output_explorer, target_lemma=target_lemma)

# Get Slot df

In [None]:
# Build the slot table for the template '[chi_compound]' around the
# configured target; the result is kept in `df_slots` and `output_embedding`
# is passed as the output folder.
# freq_min=1 / freq_max=100_000_000 and the empty filtered_pos list are the
# values passed here; NOTE(review): they look like "no filtering" settings,
# but the filtering rules live in SynFlow.Explorer — confirm there.
from SynFlow.Explorer import build_slot_df

df_slots = build_slot_df(
    corpus_folder=corpus_folder,
    template='[chi_compound]',
    target_lemma=target_lemma,
    target_pos=target_pos,
    pattern=pattern,
    freq_path='/home/volt/bach/pilot_data/RSC/lemma_pos_init_freq.txt',
    freq_min=1,
    freq_max=100_000_000,
    filtered_pos=[],
    filler_format='lemma/pos', # lemma/deprel or 'lemma/pos'
    output_folder= output_embedding
)

In [None]:
# Sampling from the general slots DataFrame
from SynFlow.Explorer import sample_slot_df

# NOTE(review): `n` was used below without being assigned anywhere in this
# notebook, so the cell raised NameError on a fresh kernel.  100 is only a
# placeholder sample size — set it to the value you actually want.
n = 100

# Reads the "all slots" CSV from output_embedding and writes the sample to a
# sibling file whose name carries `n`; seed=42 is passed as the sampling seed.
df_sample = sample_slot_df(
    input_csv=f"{output_embedding}/{target_lemma}_samples_all_slots.csv",
    output_csv=f"{output_embedding}/{target_lemma}_samples_{n}_slots.csv",
    n=n,
    seed=42,
    mode= 'NA'
)

In [None]:
# template='[chi_nsubj][chi_obj][chi_obl > chi_case]'
# slots     = template.strip("[]").split("][")
# print(slots)