In [20]:
import os
import spacy_udpipe
from spacy_udpipe import download, load
import pandas as pd
import spacy
from collections import Counter

In [None]:
# Download and load the Slovak and Turkish models, then print the name of
# the first matching '.udpipe' file found in the package's models directory.

def _find_udpipe_model(directory, keyword):
    """Return the first '.udpipe' file name in `directory` whose lowercased
    name contains `keyword`, or None when no file matches."""
    for candidate in os.listdir(directory):
        if candidate.endswith('.udpipe') and keyword in candidate.lower():
            return candidate
    return None

download("sk")
slovak_model = load("sk")
model_dir = os.path.join(os.path.dirname(spacy_udpipe.__file__), 'models')
slovak_model_file = _find_udpipe_model(model_dir, 'slovak')
if slovak_model_file is not None:
    print(f"Slovak model: {slovak_model_file}")

download("tr")
turkish_model = load("tr")
turkish_model_file = _find_udpipe_model(model_dir, 'turkish')
if turkish_model_file is not None:
    print(f"Turkish model: {turkish_model_file}")

Already downloaded a model for the 'sk' language
Slovak model: slovak-snk-ud-2.5-191206.udpipe
Downloaded pre-trained UDPipe model for 'tr' language
Turkish model: turkish-imst-ud-2.5-191206.udpipe


In [69]:
def _print_token_rows(doc):
    """Print one tab-separated row per token: text, lemma, POS, dependency, head text."""
    for tok in doc:
        print("\t".join((tok.text, tok.lemma_, tok.pos_, tok.dep_, tok.head.text)))

# Parse one short sentence per language as a quick sanity check of each model.
slovak_test_sentence = "Toto je testovacia veta."
parsed_doc = slovak_model(slovak_test_sentence)
_print_token_rows(parsed_doc)
print()
turkish_test_sentence = "Bu bir test cümlesidir."
parsed_doc = turkish_model(turkish_test_sentence)
_print_token_rows(parsed_doc)

Toto	toto	DET	nsubj	testovacia
je	byť	AUX	cop	testovacia
testovacia	testovacia	NOUN	ROOT	testovacia
veta	veta	NOUN	nmod	testovacia
.	.	PUNCT	punct	testovacia

Bu	bu	DET	det	test
bir	bir	NUM	det	test
test	test	NOUN	nmod:poss	cümlesi
cümlesi	cümle	NOUN	ROOT	cümlesi
dir	i	AUX	cop	cümlesi
.	.	PUNCT	punct	cümlesi


In [75]:
def _read_sentences(path):
    """Read a tab-separated corpus file into a one-column DataFrame.

    Each line is stripped and split on tabs; the second field is kept as
    the sentence. Lines that have no second field (for example blank lines)
    are skipped instead of raising IndexError.

    Args:
        path: path of the UTF-8 encoded text file to read.

    Returns:
        pandas.DataFrame with a single "sentence" column.
    """
    sentences = []
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the file object directly: readlines() would first build a
        # list of every line before the sentences are extracted.
        for line in f:
            fields = line.strip().split("\t")
            if len(fields) > 1:
                sentences.append(fields[1])
    return pd.DataFrame(sentences, columns=["sentence"])

slk_df = _read_sentences("data/slk_newscrawl_2016_1M/slk_newscrawl_2016_1M-sentences.txt")
tur_df = _read_sentences("data/tur_news_2024_1M/tur_news_2024_1M-sentences.txt")

In [None]:
# this takes around 5 mins for each (still not the full datasets)

def _parse_sample(corpus_df, model, n_sentences=10000):
    """Copy the first `n_sentences` rows of `corpus_df` and parse their sentences.

    Returns a (sample_df, parsed_docs) pair; sample_df holds the parsed
    documents in a 'parsed_doc' column.
    """
    sample_df = corpus_df.head(n_sentences).copy()
    parsed_docs = list(model.pipe(sample_df['sentence']))
    sample_df['parsed_doc'] = parsed_docs
    return sample_df, parsed_docs

slk_sample_df, slk_parsed_docs = _parse_sample(slk_df, slovak_model)
tur_sample_df, tur_parsed_docs = _parse_sample(tur_df, turkish_model)

In [77]:
def dependency_info(parsed_doc):
    """Print a left-aligned table with one row per token in ``parsed_doc``.

    The columns are the token text, its POS tag, its dependency label and
    the text of its head token.
    """
    row_template = "{:<15} {:<10} {:<15} {:<15}"
    print(row_template.format('Text', 'POS', 'Dependency', 'Head Text'))
    print("-" * 55)
    for tok in parsed_doc:
        print(row_template.format(tok.text, tok.pos_, tok.dep_, tok.head.text))

def sentence_depth(parsed_doc):
    """Return the maximum token depth in ``parsed_doc``.

    A token's depth is the number of ``head`` links followed until reaching
    a token that is its own head; such a token has depth 0.

    Args:
        parsed_doc: iterable of tokens exposing a ``head`` attribute.
            ``None`` and empty inputs are accepted.

    Returns:
        int: the largest token depth, or 0 when there are no tokens.
    """
    if not parsed_doc:
        return 0

    def token_depth(token):
        # Walk up iteratively: a recursive walk raises RecursionError once a
        # head chain is longer than the interpreter's recursion limit.
        depth = 0
        while token.head != token:
            token = token.head
            depth += 1
        return depth

    # default=0 covers inputs that are truthy but yield no tokens.
    return max((token_depth(token) for token in parsed_doc), default=0)

def leaf_node_pos(parsed_doc):
    """Return the POS tags of the tokens in ``parsed_doc`` that have no children.

    Tags are returned in token order, one entry per childless token.
    """
    return [
        tok.pos_
        for tok in parsed_doc
        if not any(True for _ in tok.children)
    ]

# A token's degree is the number of its direct children.
def corpus_degree_distribution(parsed_docs):
    """Count, over every token of every document, how many tokens have each degree.

    Returns a Counter mapping degree (number of direct children) to the
    number of tokens with that degree.
    """
    return Counter(
        sum(1 for _ in tok.children)
        for doc in parsed_docs
        for tok in doc
    )

In [78]:
# Add per-sentence tree depth and leaf POS tags to both language samples.
for sample_df in (slk_sample_df, tur_sample_df):
    docs = sample_df['parsed_doc']
    sample_df['tree_depth'] = docs.apply(sentence_depth)
    sample_df['leaf_nodes'] = docs.apply(leaf_node_pos)

In [82]:
# Slovak sample: summary statistics

# Mean tree depth over the sampled sentences.
avg_depth = slk_sample_df['tree_depth'].mean()
print(f"average tree depth: {avg_depth:.2f}")

# Pool the leaf POS tags of all sentences and rank them by frequency.
all_leaf_nodes = []
for sentence_leaves in slk_sample_df['leaf_nodes']:
    all_leaf_nodes.extend(sentence_leaves)
leaf_node_distribution = Counter(all_leaf_nodes)
print("\ntop 10 most common leaf node categories:")
for pos, count in leaf_node_distribution.most_common(10):
    print(f"{pos}: {count}")

# Number of tokens per degree, listed in ascending degree order.
print('\ndegree distribution for Slovak corpus:')
degree_counts = corpus_degree_distribution(slk_parsed_docs)
for degree in sorted(degree_counts):
    count = degree_counts[degree]
    print(f"degree {degree}: {count} tokens")


average tree depth: 4.21

top 10 most common leaf node categories:
PUNCT: 28531
ADP: 15212
ADJ: 13137
CCONJ: 8250
NOUN: 6875
PRON: 6751
DET: 6243
ADV: 5691
PART: 5602
SCONJ: 4988

degree distribution for Slovak corpus:
degree 0: 114225 tokens
degree 1: 28555 tokens
degree 2: 17346 tokens
degree 3: 9650 tokens
degree 4: 7286 tokens
degree 5: 4813 tokens
degree 6: 2384 tokens
degree 7: 1048 tokens
degree 8: 458 tokens
degree 9: 185 tokens
degree 10: 79 tokens
degree 11: 41 tokens
degree 12: 18 tokens
degree 13: 9 tokens
degree 14: 4 tokens
degree 15: 2 tokens
degree 16: 2 tokens
degree 18: 1 tokens


In [83]:
# Turkish sample: summary statistics

# Mean tree depth over the sampled sentences.
avg_depth = tur_sample_df['tree_depth'].mean()
print(f"average tree depth: {avg_depth:.2f}")

# Pool the leaf POS tags of all sentences and rank them by frequency.
all_leaf_nodes = []
for sentence_leaves in tur_sample_df['leaf_nodes']:
    all_leaf_nodes.extend(sentence_leaves)
leaf_node_distribution = Counter(all_leaf_nodes)
print("\ntop 10 most common leaf node categories:")
for pos, count in leaf_node_distribution.most_common(10):
    print(f"{pos}: {count}")

# Number of tokens per degree, listed in ascending degree order.
print('\ndegree distribution for Turkish corpus:')
degree_counts = corpus_degree_distribution(tur_parsed_docs)
for degree in sorted(degree_counts):
    count = degree_counts[degree]
    print(f"degree {degree}: {count} tokens")

average tree depth: 5.79

top 10 most common leaf node categories:
PUNCT: 24040
NOUN: 21256
ADJ: 9398
PROPN: 9212
NUM: 7789
ADP: 7768
CCONJ: 5839
VERB: 5579
ADV: 2514
DET: 1651

degree distribution for Turkish corpus:
degree 0: 97245 tokens
degree 1: 46380 tokens
degree 2: 27459 tokens
degree 3: 11319 tokens
degree 4: 5012 tokens
degree 5: 2433 tokens
degree 6: 1092 tokens
degree 7: 510 tokens
degree 8: 229 tokens
degree 9: 116 tokens
degree 10: 47 tokens
degree 11: 22 tokens
degree 12: 7 tokens
degree 13: 5 tokens
degree 14: 1 tokens
degree 15: 2 tokens
degree 16: 2 tokens
