In [1]:
import os
import spacy_udpipe
from spacy_udpipe import download, load
import pandas as pd
import spacy
from collections import Counter

In [2]:
# download and load Slovak and Turkish models

download("sk")
slovak_model = load("sk")
model_dir = os.path.join(os.path.dirname(spacy_udpipe.__file__), 'models')
for filename in os.listdir(model_dir):
    if 'slovak' in filename.lower() and filename.endswith('.udpipe'):
        print(f"Slovak model: {filename}")
        break

download("tr")
turkish_model = load("tr")
model_dir = os.path.join(os.path.dirname(spacy_udpipe.__file__), 'models')
for filename in os.listdir(model_dir):
    if 'turkish' in filename.lower() and filename.endswith('.udpipe'):
        print(f"Turkish model: {filename}")
        break

Downloaded pre-trained UDPipe model for 'sk' language
Slovak model: slovak-snk-ud-2.5-191206.udpipe
Already downloaded a model for the 'tr' language
Turkish model: turkish-imst-ud-2.5-191206.udpipe


In [3]:
slovak_test_sentence = "Toto je testovacia veta."
parsed_doc = slovak_model(slovak_test_sentence)
for token in parsed_doc:
    print(f"{token.text}\t{token.lemma_}\t{token.pos_}\t{token.dep_}\t{token.head.text}")
print()
turkish_test_sentence = "Bu bir test cümlesidir."
parsed_doc = turkish_model(turkish_test_sentence)
for token in parsed_doc:
    print(f"{token.text}\t{token.lemma_}\t{token.pos_}\t{token.dep_}\t{token.head.text}")

Toto	toto	DET	nsubj	testovacia
je	byť	AUX	cop	testovacia
testovacia	testovacia	NOUN	ROOT	testovacia
veta	veta	NOUN	nmod	testovacia
.	.	PUNCT	punct	testovacia

Bu	bu	DET	det	test
bir	bir	NUM	det	test
test	test	NOUN	nmod:poss	cümlesi
cümlesi	cümle	NOUN	ROOT	cümlesi
dir	i	AUX	cop	cümlesi
.	.	PUNCT	punct	cümlesi


In [4]:
with open("data/slk_newscrawl_2016_1M/slk_newscrawl_2016_1M-sentences.txt", "r", encoding="utf-8") as f:
    slk_df = pd.DataFrame(
        [line.strip().split("\t")[1] for line in f.readlines()], columns=["sentence"]
    )

with open("data/tur_news_2024_1M/tur_news_2024_1M-sentences.txt", "r", encoding="utf-8") as f:
    tur_df = pd.DataFrame(
        [line.strip().split("\t")[1] for line in f.readlines()], columns=["sentence"]
    )

In [5]:
# this takes around 5 mins for each (still not the full datasets)

slk_sample_df = slk_df.head(10000).copy()
slk_parsed_docs = list(slovak_model.pipe(slk_sample_df['sentence']))
slk_sample_df['parsed_doc'] = slk_parsed_docs

tur_sample_df = tur_df.head(10000).copy()
tur_parsed_docs = list(turkish_model.pipe(tur_sample_df['sentence']))
tur_sample_df['parsed_doc'] = tur_parsed_docs

In [6]:
def dependency_info(parsed_doc):
    print(f"{'Text':<15} {'POS':<10} {'Dependency':<15} {'Head Text':<15}")
    print("-" * 55)
    for token in parsed_doc:
        print(f"{token.text:<15} {token.pos_:<10} {token.dep_:<15} {token.head.text:<15}")

def sentence_depth(parsed_doc):
    if not parsed_doc:
        return 0
    def token_depth(token):
        if token.head == token:
            return 0
        return 1 + token_depth(token.head)
    return max(token_depth(token) for token in parsed_doc) if len(parsed_doc) > 0 else 0

def leaf_node_pos(parsed_doc):
    leaf_pos_tags = []
    for token in parsed_doc:
        if len(list(token.children)) == 0:
            leaf_pos_tags.append(token.pos_)
    return leaf_pos_tags

#  a token's degree is the number of its direct children
def corpus_degree_distribution(parsed_docs):
    corpus_degree_counts = Counter()
    for doc in parsed_docs:
        for token in doc:
            degree = len(list(token.children))
            corpus_degree_counts[degree] += 1  
    return corpus_degree_counts

In [7]:
slk_sample_df['tree_depth'] = slk_sample_df['parsed_doc'].apply(sentence_depth)
slk_sample_df['leaf_nodes'] = slk_sample_df['parsed_doc'].apply(leaf_node_pos)

tur_sample_df['tree_depth'] = tur_sample_df['parsed_doc'].apply(sentence_depth)
tur_sample_df['leaf_nodes'] = tur_sample_df['parsed_doc'].apply(leaf_node_pos)

In [8]:
# slovak

# average tree depth
avg_depth = slk_sample_df['tree_depth'].mean()
print(f"average tree depth: {avg_depth:.2f}")

# most common leaf node categories
all_leaf_nodes = [pos for sublist in slk_sample_df['leaf_nodes'] for pos in sublist]
leaf_node_distribution = Counter(all_leaf_nodes)
print("\ntop 10 most common leaf node categories:")
for pos, count in leaf_node_distribution.most_common(10):
    print(f"{pos}: {count}")

# degree distribution
print('\ndegree distribution for Slovak corpus:')
for degree, count in sorted(corpus_degree_distribution(slk_parsed_docs).items()):
    print(f"degree {degree}: {count} tokens")


average tree depth: 4.21

top 10 most common leaf node categories:
PUNCT: 28531
ADP: 15212
ADJ: 13137
CCONJ: 8250
NOUN: 6875
PRON: 6751
DET: 6243
ADV: 5691
PART: 5602
SCONJ: 4988

degree distribution for Slovak corpus:
degree 0: 114225 tokens
degree 1: 28555 tokens
degree 2: 17346 tokens
degree 3: 9650 tokens
degree 4: 7286 tokens
degree 5: 4813 tokens
degree 6: 2384 tokens
degree 7: 1048 tokens
degree 8: 458 tokens
degree 9: 185 tokens
degree 10: 79 tokens
degree 11: 41 tokens
degree 12: 18 tokens
degree 13: 9 tokens
degree 14: 4 tokens
degree 15: 2 tokens
degree 16: 2 tokens
degree 18: 1 tokens


In [9]:
# turkish

# average tree depth
avg_depth = tur_sample_df['tree_depth'].mean()
print(f"average tree depth: {avg_depth:.2f}")

# most common leaf node categories
all_leaf_nodes = [pos for sublist in tur_sample_df['leaf_nodes'] for pos in sublist]
leaf_node_distribution = Counter(all_leaf_nodes)
print("\ntop 10 most common leaf node categories:")
for pos, count in leaf_node_distribution.most_common(10):
    print(f"{pos}: {count}")

# degree distribution
print('\ndegree distribution for Turkish corpus:')
for degree, count in sorted(corpus_degree_distribution(tur_parsed_docs).items()):
    print(f"degree {degree}: {count} tokens")

average tree depth: 5.79

top 10 most common leaf node categories:
PUNCT: 24040
NOUN: 21256
ADJ: 9398
PROPN: 9212
NUM: 7789
ADP: 7768
CCONJ: 5839
VERB: 5579
ADV: 2514
DET: 1651

degree distribution for Turkish corpus:
degree 0: 97245 tokens
degree 1: 46380 tokens
degree 2: 27459 tokens
degree 3: 11319 tokens
degree 4: 5012 tokens
degree 5: 2433 tokens
degree 6: 1092 tokens
degree 7: 510 tokens
degree 8: 229 tokens
degree 9: 116 tokens
degree 10: 47 tokens
degree 11: 22 tokens
degree 12: 7 tokens
degree 13: 5 tokens
degree 14: 1 tokens
degree 15: 2 tokens
degree 16: 2 tokens


In [10]:
def token_distance_to_root(token):
    """Calculates the number of head links to reach the root."""
    if token.head == token:
        return 0
    # Use len(list(token.ancestors)) for a simpler implementation
    return len(list(token.ancestors))

def corpus_pos_distance_to_root(parsed_docs, target_pos_list):
    """Calculates the average distance to root for specified POS tags."""
    pos_distances = {pos: [] for pos in target_pos_list}
    
    for doc in parsed_docs:
        for token in doc:
            if token.pos_ in target_pos_list:
                distance = token_distance_to_root(token)
                pos_distances[token.pos_].append(distance)
                
    # Calculate the average for each POS
    avg_distances = {}
    for pos, distances in pos_distances.items():
        if distances:
            avg_distances[pos] = sum(distances) / len(distances)
        else:
            avg_distances[pos] = 0
            
    return avg_distances

In [12]:
def corpus_ancestor_patterns(parsed_docs, target_pos, n=10):
    """Finds the most common POS of the head for a target POS."""
    ancestor_pos_counts = Counter()
    for doc in parsed_docs:
        for token in doc:
            if token.pos_ == target_pos and token.head != token:
                # The pattern is: (Dependency Label, Head POS)
                ancestor_pos_counts[(token.dep_, token.head.pos_)] += 1
    
    print(f"\nTop {n} Ancestor Patterns for {target_pos}:")
    for pattern, count in ancestor_pos_counts.most_common(n):
        print(f"  {pattern[0]:<5} (governed by {pattern[1]}): {count}")


def corpus_descendant_patterns(parsed_docs, target_pos, n=10):
    """Finds the most common dependency labels of children for a target POS."""
    descendant_dep_counts = Counter()
    for doc in parsed_docs:
        for token in doc:
            if token.pos_ == target_pos:
                for child in token.children:
                    descendant_dep_counts[child.dep_] += 1
    
    print(f"\nTop {n} Descendant Relations for {target_pos}:")
    for dep, count in descendant_dep_counts.most_common(n):
        print(f"  {dep:<10}: {count}")

In [17]:
#Average Distance to Root

TARGET_POS = ["NOUN", "VERB", "ADJ", "DET", "CCONJ", "ADP", "ADV", "PUNCT", "PRON", "PART", "SCONJ", "PROPN", "NUM"]

# Slovak
slk_avg_dist = corpus_pos_distance_to_root(slk_parsed_docs, TARGET_POS)
print("\nAverage distance to root for Slovak POS:")
for pos, avg_dist in slk_avg_dist.items():
    print(f"{pos:<7}: {avg_dist:.2f}")

# Turkish
tur_avg_dist = corpus_pos_distance_to_root(tur_parsed_docs, TARGET_POS)
print("\nAverage distance to root for Turkish POS:")
for pos, avg_dist in tur_avg_dist.items():
    print(f"{pos:<7}: {avg_dist:.2f}")


Average distance to root for Slovak POS:
NOUN   : 2.41
VERB   : 1.10
ADJ    : 3.19
DET    : 2.89
CCONJ  : 2.46
ADP    : 3.50
ADV    : 2.25
PUNCT  : 2.31
PRON   : 2.12
PART   : 2.26
SCONJ  : 2.44
PROPN  : 2.42
NUM    : 2.65

Average distance to root for Turkish POS:
NOUN   : 3.43
VERB   : 2.14
ADJ    : 3.74
DET    : 3.92
CCONJ  : 4.42
ADP    : 4.33
ADV    : 3.22
PUNCT  : 2.31
PRON   : 2.83
PART   : 0.00
SCONJ  : 0.00
PROPN  : 3.87
NUM    : 3.80
