# Test with Stanza parsing

In [None]:
import stanza
# stanza.download('en') # download English model
# Build the English Stanza pipeline used by the parsing cell below. The processor
# list enables tokenisation, multi-word-token expansion, POS tagging,
# lemmatisation and dependency parsing.
nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse') # initialize English neural pipeline

In [None]:
# Sentences fed to the parsing loop in the next cell. Only the first entry is
# active; the commented-out lines are alternative "run" examples that can be
# switched on individually.
sents = [
        # The trailing comma matters: without it, uncommenting the next line
        # would make Python concatenate the two adjacent string literals into
        # a single sentence instead of adding a second list element.
        "hell yeah!",
        #  'The boss runs the company.',
        #  'The company is run by the boss',
        #  'The company is run in the dark.',
        #  'He runs in the jungle.',
        #  'The roads run through the city.',
        #  'He runs his finger through his hair.',
        #  'The computer runs fast.',
        #  'The car runs really fast.'
         ]

# sents = ['MISS NORMAN : Will you do me the honour to meet me at the bridgehead at half-past nine practically at once ?']
# NOTE(review): `target` is not read by the parsing loop that follows and is
# reassigned in a later cell - confirm it is still needed here.
target = 'miss'

In [None]:
# Parse every input string and dump one tab-separated line per word:
# surface form, lemma, POS, word id, head id, head form and dependency label.
for sent in sents:
    doc = nlp(sent) # run annotation over a sentence
    print('sentence:', sent)
    # print(doc)
    # print(doc.entities)
    rows = []
    for parsed in doc.sentences:
        for word in parsed.words:
            # Word ids are 1-based, so the head sits at index head-1;
            # a head id of 0 marks the root of the sentence.
            head_text = parsed.words[word.head-1].text if word.head > 0 else "root"
            rows.append(
                f'word: {word.text}\tlemma: {word.lemma}\tpos: {word.pos}'
                f'\tid: {word.id}\thead id: {word.head}\thead: {head_text}'
                f'\tdeprel: {word.deprel}'
            )
    # One print call for the whole block, exactly like unpacking with sep='\n'.
    print('\n'.join(rows))
    print('end')

In [None]:
import stanza
from collections import defaultdict

# Example sentences that all contain a form of the verb "run".
sents = [
    'The boss runs the company.',
    'The company is run by the boss',
    'The company is run in the dark.',
    'He runs in the jungle.',
    'The roads run through the city.',
    'He runs his finger through his hair.',
    'The computer runs fast.',
    'The car runs really fast.',
]
target = 'run'

# English pipeline: tokenise, POS-tag, lemmatise and dependency-parse.
nlp = stanza.Pipeline(
    'en',
    processors='tokenize,pos,lemma,depparse',
    tokenize_no_ssplit=True,  # treat each string as a single sentence
    verbose=False,
)

# Collect (sentence number, dependent lemma, dependency label) for every word
# whose head is an occurrence of the target lemma.
results = []
for sent_id, text in enumerate(sents, 1):
    doc = nlp(text)
    sent = doc.sentences[0]  # only the first parsed sentence is inspected
    target_ids = [w.id for w in sent.words if w.lemma == target]
    for head_id in target_ids:
        # immediate dependents only: words pointing straight at this head
        results.extend(
            (sent_id, d.lemma, d.deprel)
            for d in sent.words
            if d.head == head_id
        )

# Aligned listing: sentence tag, left-padded lemma, relation.
for sent_id, lem, rel in results:
    print(f'S{sent_id}: {lem:<10}  {rel}')


In [None]:
# sents = ['The boss runs the company.',
#          'The company is run by the boss', 
#          'The company is run in the dark.', 
#          'He runs in the jungle.',
#          'The roads run through the city.',
#          'He runs his finger through his hair.',
#          'The computer runs fast.',
#          'The car runs really fast.'
#          ]

# sents = [
#     "Freedom is priceless.",
#     "She fought for freedom during the revolution.",
#     "The court finally granted him the freedom to speak openly.",
#     "Within the classroom, freedom of thought nurtures creativity.",
#     "The towering bronze sculpture, Freedom, dominates the plaza.",
#     "After the last exam, the students burst outside in pure freedom.",
#     "Digital tracking can quietly erode freedom online.",
#     "We debated whether freedom or security mattered more.",
#     "Without self-control, freedom often collapses into chaos.",
#     "He inhaled deeply, freedom flooding his lungs at the prison gates."
# ]

# sents = [
#     "The table shook during the earthquake.",
#     "She carved her initials into the wooden table.",
#     "After dinner, they sat around the table and talked for hours.",
#     "The architect presented a glass table as the room's centerpiece.",
#     "Please table the motion until next week’s meeting.",
#     "We sorted the data into a table for easier comparison.",
#     "The cat leapt onto the table, knocking over a vase.",
#     "Negotiators agreed to table further discussion until sunrise.",
#     "Beneath the table, a hidden drawer contained old photographs.",
#     "A picnic table stood alone under the oak tree."
# ]

# Active example set: ten sentences using "interesting" in different
# syntactic positions.
sents = [
    "This article is interesting.",
    "An interesting twist changed the plot completely.",
    "He found the lecture interesting despite the late hour.",
    "Someone interesting moved into the apartment next door.",
    "The most interesting of the artifacts was the jade mask.",
    "Keep your questions interesting and concise.",
    "They made the workshop interesting by adding hands-on demos.",
    "What I find interesting is how quickly trends shift.",
    "Do you have anything interesting to read on the train?",
    "Interesting, she thought, how silence can speak louder than words."
]


# Lemma whose dependency neighbourhood is collected in the next cell.
TARGET = 'interesting' 

# Maximum number of dependency edges to walk away from the target.
# NOTE(review): collect_connected compares this value with `==` against an
# integer depth, so it has to be a single int; a tuple such as (1, 2, 3) would
# never match and the traversal would not be cut off.
MAX_DEPTH  = 2


In [None]:
import stanza
from collections import defaultdict, deque

# English Stanza pipeline for the traversal below: tokenise, POS-tag, lemmatise
# and dependency-parse. Sentence splitting is switched off and logging silenced.
nlp = stanza.Pipeline(
        "en",
        processors="tokenize,pos,lemma,depparse",
        tokenize_no_ssplit=True,
        verbose=False)

# ------------------------------------------------------------------ #
def collect_connected(sent, target_lemma, max_depth):
    """
    Return {depth: [(lemma, path)]} where 'path' is a string like
    '↓obj' or '↑nsubj:pass > ↓obl' showing the route from the target
    to the node.  Traversal is undirected, up to max_depth edges.
    """
    id2word   = {w.id: w for w in sent.words}
    neighbours = defaultdict(list)                  # id -> [(word, label)]

    # build bidirectional edges
    for w in sent.words:
        if w.head == 0:                             # ROOT has no parent
            continue
        head = id2word[w.head]
        neighbours[w.id].append((head, f"↑{w.deprel}"))   # child -> parent
        neighbours[head.id].append((w, f"↓{w.deprel}"))   # parent -> child

    result = defaultdict(list)                      # depth -> [(lemma, path)]
    for w in sent.words:
        if w.lemma != target_lemma:
            continue                                # other lemmas not our start
        q = deque([(w, 0, [])])                     # node, depth, path so far
        visited = {w.id}
        while q:
            node, d, path = q.popleft()
            if d == max_depth:                      # stop expanding beyond limit
                continue
            for nb, rel in neighbours[node.id]:
                if nb.id in visited:
                    continue
                nd     = d + 1
                npath  = path + [rel]
                result[nd].append((nb.lemma, " > ".join(npath)))
                visited.add(nb.id)
                q.append((nb, nd, npath))
    return result
# ------------------------------------------------------------------ #

# Run collect_connected on the first parsed sentence of every input string and
# group the hits by sentence number (1-based) and depth.
all_hits = defaultdict(lambda: defaultdict(list))   # sent_id -> depth -> items
for sid, text in enumerate(sents, 1):
    sent = nlp(text).sentences[0]
    dep_map = collect_connected(sent, TARGET, MAX_DEPTH)
    # all_hits[sid] is only created when dep_map has entries, so sentences
    # without any hit are absent from the printout below.
    for d, items in dep_map.items():
        all_hits[sid][d].extend(items)

# --- demo print ---------------------------------------------------- #
# For each sentence with hits: the sentence text, then lemma/path pairs per depth.
for sid in sorted(all_hits):
    print(f"\nSentence {sid}: {sents[sid-1]}")
    for d in sorted(all_hits[sid]):
        print(f"  depth {d}:")
        for lem, rel_path in all_hits[sid][d]:
            print(f"      {lem:<10}  {rel_path}")


# Input

In [None]:
%load_ext autoreload
%autoreload 2

import re
import os

# Regex for one tab-separated token line with six fields. Every field is
# captured whole except POS: its group keeps only the first character and the
# rest of that column is matched but discarded.
pattern = re.compile(
    r'([^\t]+)\t'      # word form
    r'([^\t]+)\t'      # lemma
    r'([^\t])[^\t]*\t' # POS (UPOS or XPOS) - first character only is captured
    r'([^\t]+)\t'      # ID
    r'([^\t]+)\t'      # HEAD
    r'([^\t]+)'        # DEPREL
)

# Target of the study: lemma plus the one-letter POS tag used in folder names.
target_lemma = 'air'
target_pos = 'N'

# Whole-period run ("All"). The commented-out variants below switch to
# per-decade or per-half-decade corpora.
period = '1750-1799'
# Plain string: this path has no placeholders, so it needs no f-prefix.
corpus_folder = '/home/volt/bach/pilot_data/RSC/1750-1799_che'
# Everything produced for this lemma/POS/period goes under one output folder,
# split into an Explorer and an Embedding subfolder.
output_folder = f'/home/volt/bach/SynFlow/output/{target_lemma}-{target_pos}-{period}'
output_explorer = f'{output_folder}/Explorer'
output_embedding = f'{output_folder}/Embedding'

# Decades
# period = '1790'
# corpus_folder = f'/home/volt/bach/pilot_data/RSC/1750-1799_che_decades/{period}'
# output_folder = f'/home/volt/bach/SynFlow/output/{target_lemma}-{target_pos}-{period}'
# visualisation_folder = f'/home/volt/bach/SynFlow/visualisation/{target_lemma}-{target_pos}-{period}'

# Half decades
# period = '1770-1774'
# corpus_folder = f'/home/volt/bach/pilot_data/RSC/1750-1799_che_half_decades/{period}'
# output_folder = f'/home/volt/bach/SynFlow/output/{target_lemma}-{target_pos}-{period}'
# output_explorer = f'{output_folder}/Explorer'
# output_embedding = f'{output_folder}/Embedding'

# Create both output directories together with any missing parents.
# exist_ok=True turns the call into a no-op when the directory is already
# there, which replaces the separate existence check and cannot fail if the
# folder is created between the check and the call. A path that exists but is
# not a directory now raises FileExistsError instead of being silently skipped.
os.makedirs(output_explorer, exist_ok=True)
os.makedirs(output_embedding, exist_ok=True)


# Get Embeddings

In [None]:
import pandas as pd
from SynFlow.Embedding.get_embeddings import build_embeddings

# Build embeddings with SynFlow's build_embeddings: the slot table is read from
# the Embedding output folder and passed in together with the path of the
# type-embedding CSV.
# NOTE(review): `n` appears in both file names below but is not assigned in any
# cell visible in this notebook - it must be defined before this cell runs.
# NOTE(review): the meaning of the 'mult' modes is defined inside SynFlow and
# is not visible here.
slot_mode = 'mult'
tok_mode='mult'
df_emb = build_embeddings(
    df_templates=pd.read_csv(f'{output_embedding}/{target_lemma}_samples_{n}_slots.csv', index_col=0), # df_slots,
    type_embedding_path='/home/volt/bach/SynFlow/input/type_embedding/coha_10_20_w2v.csv',
    dims=300,
    slot_mode=slot_mode,
    tok_mode=tok_mode,
    out_embedding=f'{output_embedding}/{target_lemma}_samples_{n}'
)

In [None]:
import pandas as pd
# Load the type-embedding table (first column as index) and show its first rows.
# NOTE(review): this path has no `input/` segment, unlike the
# `type_embedding_path` passed to build_embeddings - confirm which location is
# the right one.
df = pd.read_csv('/home/volt/bach/SynFlow/type_embedding/coha_10_20_w2v.csv', index_col=0)
df.head()

# Calculate Cosine Similarity

In [None]:
import pandas as pd
from SynFlow.Embedding.compute_dist import compute_cosine_distmtx

# Example usage:
# Read the embedding CSV whose name encodes lemma, `n`, slot mode and token
# mode, compute the cosine distance matrix with SynFlow's helper, and write it
# next to the embeddings.
# NOTE(review): relies on `n`, `slot_mode` and `tok_mode` being set by earlier cells.
df_emb = pd.read_csv(f"{output_embedding}/{target_lemma}_samples_{n}_{slot_mode}_{tok_mode}_embedding.csv", index_col=0)
dist_df = compute_cosine_distmtx(df_emb)
dist_df.to_csv(f"{output_embedding}/{target_lemma}_samples_{n}_distance_matrix.csv")

In [None]:
# Play around with the cosine similarity
import numpy as np

def cosine_similarity_np(vec1, vec2):
    """
    Cosine similarity of two vectors, computed with NumPy.

    Args:
        vec1 (numpy.ndarray or list): first vector.
        vec2 (numpy.ndarray or list): second vector.

    Returns:
        float: dot(vec1, vec2) / (|vec1| * |vec2|), or 0.0 when either
               vector has zero magnitude.
    """
    a, b = np.array(vec1), np.array(vec2)

    # The dot product comes first so that incompatible shapes raise before
    # the zero-magnitude shortcut can hide them.
    numerator = np.dot(a, b)
    magnitude_a = np.linalg.norm(a)
    magnitude_b = np.linalg.norm(b)

    if magnitude_a != 0 and magnitude_b != 0:
        return numerator / (magnitude_a * magnitude_b)
    # Direction is undefined for a zero vector; report "no similarity".
    return 0.0

# Example Usage:
# vector_b is 2*vector_a + 1 element-wise.
vector_a = [1, 2, 3, 4, 5, 6, 7, 8]
vector_b = [3, 5, 7, 9, 11, 13, 15, 17]

# vector_c / vector_d hold the same values as vector_a / vector_b with the two
# halves swapped, so both prints below should show the same similarity.
vector_c = [5, 6, 7, 8, 1, 2, 3, 4]
vector_d = [11, 13, 15, 17, 3, 5, 7, 9]

# vector_a = [1, 1, 1, 1, 2, 2, 2, 2]
# vector_b = [1, 1, 1, 1, 2, 2, 2, 2]

similarity = cosine_similarity_np(vector_a, vector_b)
print(f"Cosine Similarity (NumPy): {similarity}")

# `similarity` is reused for the second pair.
similarity = cosine_similarity_np(vector_c, vector_d)
print(f"Cosine Similarity (NumPy): {similarity}")

# Get Context

In [None]:
import importlib
from SynFlow.Explorer.get_contexts import get_contexts
import re
# Regex for one tab-separated token line with six fields, handed to
# get_contexts in the next cell. The POS group keeps only the first character
# of that column; the remainder is matched but not captured.
pattern = re.compile(
    r'([^\t]+)\t'      # FORM
    r'([^\t]+)\t'      # LEMMA
    r'([^\t])[^\t]*\t' # POS - first character only is captured
    r'([^\t]+)\t'      # ID
    r'([^\t]+)\t'      # HEAD
    r'([^\t]+)'        # DEPREL
)

In [None]:
import pandas as pd

# Load the slot table for the current lemma and sample size `n`.
slots_df = pd.read_csv(f"{output_embedding}/{target_lemma}_samples_{n}_slots.csv", index_col=0)

# Now attach contexts:
# get_contexts receives the slot table, the parsed corpus location, the
# token-line regex and the CSV path for its output.
# NOTE(review): this corpus path points at COHA while `corpus_folder` in the
# input cell points at RSC - confirm the intended corpus.
context_df = get_contexts(
    slots_df=slots_df,
    corpus_path="/home/volt/bach/pilot_data/COHA/10_20_parsed_1_SPOS",
    pattern=pattern,
    output_path=f"{output_embedding}/{target_lemma}_samples_{n}_contexts.csv"
)

# Get Clustering

In [None]:
import pandas as pd
import hdbscan

def hdbscan_clustering(dist_df: pd.DataFrame,
                          min_cluster_size: int = 5,
                          min_samples: int = None,
                          cluster_selection_epsilon: float = 0.0,
                          cluster_selection_method: str = 'eom',
                         ) -> pd.DataFrame:
    """
    Cluster tokens with HDBSCAN from a precomputed distance matrix.

    Parameters
    ----------
    dist_df : pd.DataFrame
        Square distance matrix (n × n) whose index and columns are the same
        token IDs. Values must be numeric.
    min_cluster_size : int, default=5
        The minimum size of clusters; see HDBSCAN docs.
    min_samples : int or None, default=None
        Controls how conservative the clustering is; if None, HDBSCAN falls
        back to min_cluster_size.
    cluster_selection_epsilon : float, default=0.0
        A distance threshold; per the HDBSCAN docs, clusters below this
        distance are merged rather than kept separate.
    cluster_selection_method : {'eom','leaf'}, default='eom'
        How to select clusters from the condensed tree.

    Returns
    -------
    pd.DataFrame
        One row per token, index 0..n-1, with two columns:
          • 'id'       : the token ID (taken from dist_df.index)
          • 'clusters' : the HDBSCAN label prefixed with 'c', e.g. 'c0', 'c1';
                         noise points (label -1) appear as 'c-1'
    """
    # The precomputed-metric path of hdbscan works on double-precision arrays;
    # casting here keeps integer or float32 matrices from failing inside fit.
    D = dist_df.to_numpy(dtype='float64')

    # Initialize HDBSCAN so that D is read as pairwise distances, not features
    clusterer = hdbscan.HDBSCAN(
        metric='precomputed',
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method
    )
    clusterer.fit(D)
    labels = clusterer.labels_  # array of length n, -1 means noise

    # String labels ('c0', 'c-1', ...) so downstream tools treat them as
    # categories rather than numbers.
    clusters_prefixed = [f"c{lab}" for lab in labels]

    result = pd.DataFrame({
        'id': dist_df.index,
        'clusters': clusters_prefixed
    }).reset_index(drop=True)

    return result


In [None]:
import pandas as pd

# Suppose you already computed dist_df (square DataFrame with token IDs as index & columns)
# NOTE(review): `n` must be defined by an earlier cell; it is part of every file name below.
dist_df = pd.read_csv(fr'{output_embedding}/{target_lemma}_samples_{n}_distance_matrix.csv', index_col=0)

# Cluster with HDBSCAN
cluster_df = hdbscan_clustering(
    dist_df,
    min_cluster_size=10,
    min_samples=10
)

# Save to CSV if desired
cluster_df.to_csv(fr'{output_embedding}/{target_lemma}_samples_{n}_clusters.csv', index=False)

# Merge to context
# The context table's index is matched against the 'id' column of cluster_df
# (pandas' default inner join), so only tokens present in both tables are kept.
context_df = pd.read_csv(fr'{output_embedding}/{target_lemma}_samples_{n}_contexts.csv', index_col=0)
cluster_context_df = context_df.merge(cluster_df, left_index=True, right_on='id')

# Save to CSV if desired
cluster_context_df.to_csv(fr'{output_embedding}/{target_lemma}_samples_{n}_clusters_contexts.csv', index=False)


# Get Coordinates with tsne, umap, mds

In [None]:
import pandas as pd
import importlib
import SynFlow.Embedding.get_coordinates
importlib.reload(SynFlow.Embedding.get_coordinates)
from SynFlow.Embedding.get_coordinates import get_token_coordinates

In [None]:
# Example usage:
# Reload the distance matrix and ask SynFlow's get_token_coordinates for t-SNE
# and UMAP coordinates, with the Embedding output folder as output_path.
dist_df = pd.read_csv(f'{output_embedding}/{target_lemma}_samples_{n}_distance_matrix.csv', index_col=0)
coord_tsne = get_token_coordinates(lemma=f'{target_lemma}_samples', dist_df=dist_df, method='tsne', perplexity=30, output_path=output_embedding, n = n)
# NOTE(review): the MDS variant below refers to `visualisation_folder`, which is
# only assigned in a commented-out configuration - define it before re-enabling.
# coord_mds  = get_token_coordinates(lemma=f'{target_lemma}_samples', dist_df=dist_df, method='mds', max_iter=300, output_path=visualisation_folder, n = n)
coord_umap = get_token_coordinates(lemma=f'{target_lemma}_samples', dist_df=dist_df, method='umap', n_neighbors=30, min_dist=0.1, output_path=output_embedding, n = n)

# Visualise with plotly

In [None]:
import importlib
import SynFlow.Embedding.visualisation
importlib.reload(SynFlow.Embedding.visualisation)
from SynFlow.Embedding.visualisation import get_token_ids

In [None]:
# Paths of the t-SNE coordinate file and the merged cluster/context table
# for the current lemma and `n`.
input_coords = fr'{output_embedding}/{target_lemma}_samples_{n}_tsne.csv'
input_ctxs = fr'{output_embedding}/{target_lemma}_samples_{n}_clusters_contexts.csv'

# Hand both files to SynFlow's visualisation helper.
get_token_ids(input_coords, input_ctxs)