# Dependency Tree Depth Distribution

In [None]:
import os
import re
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Token lines carry six tab-separated fields. The third group keeps only the
# first character of its field; groups four and five are read below as the
# token index and the head index.
pattern = re.compile(r'([^\t]+)\t([^\t]+)\t([^\t])[^\t]*\t([^\t]+)\t([^\t]+)\t([^\t]+)')

def build_dependency_tree(tokens):
    """Group token indices by the index of their head.

    Args:
        tokens: Iterable of raw token lines. Lines that ``pattern`` does not
            match are ignored.

    Returns:
        A ``(tree, roots)`` pair. ``tree`` maps a head index (string) to the
        list of its dependents' indices; ``roots`` lists the indices of the
        tokens whose head field is ``"0"``.
    """
    children_of = defaultdict(list)
    root_ids = []
    for hit in map(pattern.match, tokens):
        if hit is None:
            continue
        token_id, head_id = hit.group(4, 5)
        # A head of "0" marks a root; every other token hangs under its head.
        bucket = root_ids if head_id == "0" else children_of[head_id]
        bucket.append(token_id)
    return children_of, root_ids

def get_max_depth(tree, root):
    """Return the deepest level reachable from ``root``, counting ``root`` as 1.

    Walks ``tree`` (a mapping from a node to the list of its children)
    depth-first with an explicit stack. Each node is expanded at most once,
    so input that contains a cycle still terminates.
    """
    deepest = 1
    seen = set()
    pending = [(root, 1)]

    while pending:
        current, level = pending.pop()
        if current not in seen:
            seen.add(current)
            if level > deepest:
                deepest = level
            pending.extend((kid, level + 1) for kid in tree.get(current, []))

    return deepest

def analyze_max_depth_distribution(corpus_dir):
    """Collect the maximum dependency depth of every tree in a parsed corpus.

    Every ``.txt`` file directly inside ``corpus_dir`` is scanned. A line
    starting with ``<s`` opens a new sentence, a line starting with ``</s>``
    closes it, and any other non-blank line is buffered as a token line.

    Returns:
        A list holding one depth per root found (a sentence with several
        roots contributes several entries).
    """
    collected = []
    txt_names = [name for name in os.listdir(corpus_dir) if name.endswith(".txt")]

    for txt_name in tqdm(txt_names, desc="Analyzing syntactic depths"):
        with open(os.path.join(corpus_dir, txt_name), encoding='utf-8') as handle:
            raw_lines = handle.readlines()

        token_buffer = []
        for raw in raw_lines:
            text = raw.strip()
            if not text:
                continue
            if text.startswith("<s"):
                token_buffer = []
            elif text.startswith("</s>"):
                tree, roots = build_dependency_tree(token_buffer)
                collected.extend(get_max_depth(tree, r) for r in roots)
            else:
                token_buffer.append(text)
    return collected

def plot_depth_distribution(depths, save_path=None, min_count=1):
    """Draw a histogram of depth values and print summary statistics.

    Args:
        depths: Sequence of integer depth values.
        save_path: When given, the figure is written to this path; otherwise
            it is shown.
        min_count: Depth values occurring fewer than this many times in
            ``depths`` are left out of both the plot and the statistics.

    Returns:
        None. When nothing survives the frequency filter a notice is printed
        and no figure is created.
    """
    frequency = Counter(depths)
    filtered = [value for value in depths if frequency[value] >= min_count]
    if not filtered:
        print(f"[!] No depths with frequency ≥ {min_count}")
        return

    lowest, highest = min(filtered), max(filtered)

    # One unit-wide bin per depth value, from 1 up to the largest kept depth.
    bin_edges = range(1, highest + 2)
    plt.figure(figsize=(10, 6))
    plt.hist(filtered, bins=bin_edges, edgecolor='black')
    plt.title(f"Distribution of Max Syntactic Depth (min_count ≥ {min_count})")
    plt.xlabel("Max Depth")
    plt.ylabel("Number of Sentences")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
        print(f"[✓] Plot saved to {save_path}")

    summary = [
        "\nSummary Stats:",
        f"  Min: {lowest}",
        f"  Max: {highest}",
        f"  Mean: {np.mean(filtered):.2f}",
        f"  Median: {np.median(filtered):.2f}",
    ]
    for row in summary:
        print(row)

In [None]:
# Collect one max-depth value per root across the corpus directory, then save
# the histogram. min_count=10 leaves out depth values seen fewer than 10 times.
depths = analyze_max_depth_distribution("/home/volt/bach/pilot_data/COHA/10_20_parsed_1_SPOS")
plot_depth_distribution(depths, save_path="./depth_distribution.png", min_count=10)

In [None]:
def print_files_with_depth_threshold(corpus_dir, target_depth):
    """Print the ``.txt`` files in ``corpus_dir`` that contain a deep enough tree.

    A file is listed when at least one of its sentences has a dependency tree
    whose maximum depth (root counted as level 1) is ``>= target_depth``.
    Reading of a file stops at its first qualifying sentence.

    Args:
        corpus_dir: Directory holding the parsed corpus files.
        target_depth: Minimum depth a tree must reach for its file to be listed.

    Returns:
        None. The matching file names are printed to stdout in sorted order.
    """
    import os
    import re
    from collections import defaultdict

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; keep working when tqdm is absent.
        def tqdm(iterable, **_kwargs):
            return iterable

    pattern = re.compile(r'([^\t]+)\t([^\t]+)\t([^\t])[^\t]*\t([^\t]+)\t([^\t]+)\t([^\t]+)')

    def build_dependency_tree(tokens):
        """Map each head index to its dependents and collect the root indices."""
        tree = defaultdict(list)
        roots = []
        for tok in tokens:
            m = pattern.match(tok)
            if m:
                _, _, _, idx, head, _ = m.groups()
                if head == "0":
                    roots.append(idx)
                else:
                    tree[head].append(idx)
        return tree, roots

    def get_max_depth(tree, root):
        """Return the deepest level reachable from ``root`` (root is level 1)."""
        visited = set()
        stack = [(root, 1)]
        max_depth = 1
        while stack:
            node, depth = stack.pop()
            if node in visited:
                continue
            visited.add(node)
            max_depth = max(max_depth, depth)
            for child in tree.get(node, []):
                stack.append((child, depth + 1))
        return max_depth

    def file_reaches_depth(path):
        """Return True as soon as one sentence in ``path`` reaches the target depth."""
        sentence = []
        with open(path, encoding='utf-8') as f:
            for raw in f:
                line = raw.strip()
                if line.startswith("<s"):
                    sentence = []
                elif line.startswith("</s>"):
                    tree, roots = build_dependency_tree(sentence)
                    if any(get_max_depth(tree, root) >= target_depth for root in roots):
                        # One hit is enough to list the file; skip the rest of it.
                        return True
                elif line:
                    sentence.append(line)
        return False

    files = [f for f in os.listdir(corpus_dir) if f.endswith(".txt")]
    matching_files = [
        fname
        for fname in tqdm(files, desc=f"Searching for depth ≥ {target_depth}")
        if file_reaches_depth(os.path.join(corpus_dir, fname))
    ]

    print(f"\n✅ Files with syntactic depth ≥ {target_depth}:")
    for f in sorted(matching_files):
        print(f"  - {f}")

# List the corpus files that contain at least one tree of depth 10 or more.
print_files_with_depth_threshold(
    corpus_dir="/home/volt/bach/pilot_data/COHA/10_20_parsed_1_SPOS",
    target_depth=10
)


# Dependency Collocation Matrix

## Import

In [3]:
import importlib
import dep_colloc.dep_colloc
importlib.reload(dep_colloc.dep_colloc)
from dep_colloc.dep_colloc import generate_path_colloc_df, generate_syn_colloc_df

import re
import os

# Active configuration: the small test corpus and its output directory.
corpus_dir = "/home/volt/bach/pilot_data/test/"
output_folder = '/home/volt/bach/Embeddings/type_embeddings/w2vf/dep_w2v/test/'
# Alternative paths for the COHA 10_20 data; swap with the two lines above to use them.
# corpus_dir = '/home/volt/bach/pilot_data/COHA/10_20_parsed_1_SPOS/'
# output_folder = '/home/volt/bach/Embeddings/type_embeddings/w2vf/dep_w2v/COHA1020/'
# Token-line regex (six tab-separated fields; the third group keeps only the
# first character of its field). Passed as `pattern=` to the collocation generators.
pattern = re.compile(r'([^\t]+)\t([^\t]+)\t([^\t])[^\t]*\t([^\t]+)\t([^\t]+)\t([^\t]+)')


## Max Depth Filtering

In [None]:
# Build the path collocation DataFrame with max_depth=1 and save it to CSV
# (df.to_pickle would work as well).
# NOTE(review): `pattern` is re-declared here with the same regex as in the import cell.
pattern = re.compile(r'([^\t]+)\t([^\t]+)\t([^\t])[^\t]*\t([^\t]+)\t([^\t]+)\t([^\t]+)')
df = generate_path_colloc_df(corpus_dir=corpus_dir, max_depth=1, pattern=pattern)
print(df.shape)
df.to_csv('./test_path.csv')

## Max Depth filtering + Dep Rel

### Generate frequency for the lemma and lemma_rel

In [None]:
import importlib
import dep_colloc.freq
importlib.reload(dep_colloc.freq)
from dep_colloc.freq import gen_lemma_freq

In [None]:
# modes: 'lemma_pos', 'lemma_pos_init', or 'lemma_deprel'
# The corpus location is held in `corpus_dir` (set in the import cell); no
# `corpus_folder` variable is defined in this notebook.
result_file = gen_lemma_freq(corpus_dir, output_folder,
                                file_ext='.txt', mode='lemma_pos_init')
print(f"Saved frequencies to {result_file}")

In [None]:
# Calculate the number of tokens
def count_total_value(filepath):
    """
    Sum the counts stored in a frequency file.

    Each non-blank line is split at its last ':'. The digit characters found
    after that colon are joined and read as an integer count. Lines without a
    colon, or with no usable digits after it, are reported and skipped.

    Args:
        filepath (str): The path to the input text file.

    Returns:
        int: The sum of all parsed counts. 0 if the file cannot be read or
        contains no valid counts.
    """
    running_sum = 0
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            for raw in handle:
                line = raw.strip()
                if line == "":
                    continue

                # Split at the LAST colon so colons inside the key are preserved.
                _, colon, tail = line.rpartition(':')
                if not colon:
                    print(f"Warning: Unexpected line format: '{line}'. Skipping.")
                    continue

                # Keep only digit characters; everything else after the colon is dropped.
                digits = ''.join(ch for ch in tail if ch.isdigit())
                try:
                    running_sum += int(digits)
                except ValueError:
                    print(f"Warning: Could not parse count from line: '{line}'. Skipping.")
    except FileNotFoundError:
        print(f"Error: File not found at '{filepath}'")
        return 0
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return 0

    return running_sum

# --- How to use the function ---

# Call the function with your file path
file_path = "/home/volt/bach/SynFlow/COHA_10_20.nfreq"
total_sum = count_total_value(file_path)

# count_total_value returns 0 when the file could not be read or nothing was
# counted, so the total is only reported when it is positive.
if total_sum > 0:
    print(f"\nTotal sum of all counts: {total_sum}")

### Compute the colloc matrix of lemma and lemma_rel

In [4]:
# Generate the syntactic collocation data for `corpus_dir` with max_depth=1,
# passing `output_folder` as the output directory.
df = generate_syn_colloc_df(corpus_dir=corpus_dir, output_dir=output_folder, max_depth=1, pattern=pattern)
# Optional inspection / export, kept disabled:
# print(df.shape)
# print(df.iloc[:5, :5])   # peek at top‐left corner
# df.to_csv(f'{output_folder}/COHA1020_syn.csv')

Syn files: 100%|██████████| 10/10 [00:00<00:00, 3675.35it/s]

Writing syn counts to: /home/volt/bach/Embeddings/type_embeddings/w2vf/dep_w2v/test/syn_colloc_counts.txt





In [5]:
# Generate (vocab, context) pairs from countdictionary for w2vf
def expand_counts(input_path: str, output_path: str) -> None:
    """
    Expand a count file into one output line per occurrence.

    Each input line has the form "<key><TAB><count>": the text before the
    first tab is the key (for example a "word1 word2" pair) and the text
    after it is an integer count. The key is written to ``output_path``
    ``count`` times, one per line. Blank lines, lines without a tab and lines
    whose count is not an integer are skipped; counts of zero or less
    produce no output.

    Args:
        input_path: Path of the tab-separated count file.
        output_path: Path of the expanded file to write (overwritten).
    """
    from itertools import repeat

    # Parse the whole input before opening the output, so the two paths may
    # refer to the same file. Only (key, count) pairs are held in memory,
    # never the count-expanded lines.
    entries = []
    with open(input_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            line = line.strip()
            if not line:
                continue
            try:
                key, count_str = line.split('\t', 1)
                count = int(count_str.strip())
            except ValueError:
                # Malformed line: no tab separator or a non-integer count.
                continue
            entries.append((key.strip(), count))

    with open(output_path, 'w', encoding='utf-8') as outfile:
        for key, count in entries:
            # repeat() yields nothing for zero or negative counts.
            outfile.writelines(repeat(key + '\n', count))

# Expand the syn collocation counts of the test corpus into one pair per line.
expand_counts('/home/volt/bach/Embeddings/type_embeddings/w2vf/dep_w2v/test/syn_colloc_counts.txt',
              '/home/volt/bach/Embeddings/type_embeddings/w2vf/dep_w2v/test/syn_colloc_expanded.txt')

In [6]:
import sys
from collections import Counter

def read_lines(path):
    """Read a file and return a Counter of its non-blank lines.

    Only the trailing newline is removed from each line; other leading or
    trailing whitespace is kept. Whitespace-only lines are dropped.
    """
    tally = Counter()
    with open(path, encoding='utf-8') as handle:
        for raw in handle:
            if raw.strip():
                tally[raw.rstrip('\n')] += 1
    return tally

def compare_files(path1, path2):
    """
    Compare two files by their sets of distinct non-blank lines.

    Prints the lines common to both files, the lines found only in ``path1``
    and the lines found only in ``path2`` (each group sorted), followed by a
    verdict on whether the two sets are identical. Line order and repetition
    counts are ignored.
    """
    lines1 = set(read_lines(path1))
    lines2 = set(read_lines(path2))

    common = sorted(lines1 & lines2)
    only1 = sorted(lines1 - lines2)
    only2 = sorted(lines2 - lines1)

    sections = (
        (f"\n✔️  Common lines ({len(common)}):", common),
        (f"\n❌ Only in {path1} ({len(only1)}):", only1),
        (f"\n❌ Only in {path2} ({len(only2)}):", only2),
    )
    for header, members in sections:
        print(header)
        for line in members:
            print("  ", line)

    # The messages below say: "same set of lines (order ignored)" / "line contents differ".
    if only1 or only2:
        print("\n⚠️  Hai file KHÔNG giống nhau về nội dung dòng.")
    else:
        print("\n🎉 Hai file có cùng tập các dòng (không tính thứ tự).")

# Compare dep.contexts against the pairs expanded from the syn collocation counts.
compare_files('/home/volt/bach/Embeddings/type_embeddings/w2vf/dep_w2v/test/dep.contexts',
              '/home/volt/bach/Embeddings/type_embeddings/w2vf/dep_w2v/test/syn_colloc_expanded.txt')



✔️  Common lines (25):
   april/n primary/nmod
   carry/v carry/conj
   carry/v carry/root
   committee/n chairman/nmod
   comply/v support/conj
   day/n day/appos
   day/n day/nmod
   freight/n ear/compound
   gain/v retort/parataxis
   gun-toters/n look/advcl
   have/v bring/conj
   hope/v see/parataxis
   labor/n demand/nmod
   napoleon/n empire/nmod
   old/j fisher/nmod
   only/r long/advmod
   overpower/v keep/advcl
   part/n unwilling/xcomp
   propose/v come/ccomp
   right/j retort/parataxis
   seat/n population/nmod
   so/r long/advmod
   state/n department/nmod
   very/r much/advmod
   wish/v convince/ccomp

❌ Only in /home/volt/bach/Embeddings/type_embeddings/w2vf/dep_w2v/test/dep.contexts (1750):
   "/" carry/conj
   "/" come/ccomp
   "/" deal/appos
   "/" depend/root
   "/" lie/root
   "/" look/root
   "/" quittin/ccomp
   "/" request/parataxis
   '/" em/nmod
   '/g a-leavin/root
   '/g deal/appos
   '/g quittin/ccomp
   '/g think/conj
   's/g clerk/nmod:poss
   's/g presid

In [None]:
# Generate (vocab, context) pairs from dataframe for w2vf
import pandas as pd

def generate_dep_contexts(csv_path, output_path):
    """Write one "vocab context" line per co-occurrence in a count matrix.

    The CSV is read with its first column as the row index (vocab items) and
    its header row as the context labels. Each cell is rounded to the nearest
    integer ``n`` and the line "<vocab> <context>" is written ``n`` times.
    Empty (NaN) cells and non-positive counts produce no output.

    Args:
        csv_path: Path of the CSV count matrix.
        output_path: Path of the pairs file to write (overwritten).
    """
    # First column holds the vocab labels, first row the context labels.
    df = pd.read_csv(csv_path, index_col=0)

    with open(output_path, 'w', encoding='utf8') as out:
        for vocab, row in df.iterrows():
            for context, count in row.items():
                if pd.isna(count):
                    # An empty cell is read as NaN, which int() cannot convert;
                    # treat it as "pair never observed".
                    continue
                # Counts may have been stored as floats; normalise to an integer.
                count = int(round(float(count)))
                if count > 0:
                    out.write(f"{vocab} {context}\n" * count)

# Example usage
generate_dep_contexts("/home/volt/bach/pilot_data/COHA/lemma_emb/dep_colloc/test_syn.csv", "./dep.contexts")


## Convert to PPMI

In [None]:
import importlib
import dep_colloc.ppmi
importlib.reload(dep_colloc.ppmi)
from dep_colloc.ppmi import PPMI_colloc_df

In [None]:
# Convert the collocation matrix to PPMI in 'lemma_pos' mode and save the result.
# NOTE(review): the input is test_syn.csv but the output is named
# test_ppmi_path.csv — confirm the intended input/output pairing.
ppmi_df = PPMI_colloc_df(
    dep_colloc_path="/home/volt/bach/pilot_data/COHA/lemma_emb/dep_colloc/test_syn.csv",
    lemma_pos_freq_path="/home/volt/bach/pilot_data/COHA/lemma_emb/dep_colloc/lemma_pos_freq.txt",
    # lemma_pos_deprel_freq_path="/home/volt/bach/pilot_data/COHA/lemma_emb/dep_colloc/lemma_deprel_freq.txt",
    min_count=1,
    mode='lemma_pos'
)
print(ppmi_df.shape)
# ppmi_df.to_csv('./test_ppmi_deprel.csv')
ppmi_df.to_csv('./test_ppmi_path.csv')

## Save DataFrame to .pac

In [None]:
import os
import sys
from scipy.sparse import coo_matrix
import pandas as pd

# Add these only if you're not installing nephosem via pip
sys.path.append("/home/volt/bach/KUL/nephosem")
sys.path.append("/home/volt/bach/KUL/semasioFlow")
from nephosem import TypeTokenMatrix

def save_df_to_pac(df_path, output_path):
    """Convert a CSV matrix into a nephosem ``.pac`` file.

    Args:
        df_path: CSV file whose first column holds the row labels and whose
            header row holds the column labels.
        output_path: Target location. A path ending in "/" is treated as a
            directory and receives the file name ``colloc_matrix``; the
            ``.pac`` extension is appended when missing. Missing parent
            directories are created.
    """
    if output_path.endswith("/"):
        output_path = os.path.join(output_path, "colloc_matrix")
    if not output_path.endswith(".pac"):
        output_path += ".pac"

    # A bare file name has an empty directory part, and os.makedirs("") raises
    # FileNotFoundError, so only create the parent when there is one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Load the DataFrame, using the first column as the index (row labels)
    df = pd.read_csv(df_path, index_col=0)

    # Build a COO matrix from the dense values and convert it to CSR
    sparse_csr = coo_matrix(df.values).tocsr()

    pac = TypeTokenMatrix(
        matrix=sparse_csr,
        row_items=df.index.tolist(),
        col_items=df.columns.tolist()
    )
    pac.save(output_path)
    print(f"[DONE] Saved .pac to: {output_path}")

In [None]:
# output_path ends with "/", so the file is written as ./colloc_matrix.pac
save_df_to_pac(df_path='./test_syn.csv', output_path='./')