In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from transformers import AutoTokenizer
import os
from sklearn.model_selection import train_test_split

# Model configurations
MODEL_CONFIGS = {
    "gpt2": {
        "model_name": "gpt2",
        "tokenizer_name": "gpt2",
    },
    "pythia1.4b": {
        "model_name": "EleutherAI/pythia-1.4b",
        "tokenizer_name": "EleutherAI/pythia-1.4b",
    },
    "gemma2b": {
        "model_name": "google/gemma-2-2b",
        "tokenizer_name": "google/gemma-2-2b",
    },
    "qwen2": {
        "model_name": "Qwen/Qwen2.5-1.5B",
        "tokenizer_name": "Qwen/Qwen2.5-1.5B",
    },
    "bert-base-uncased": {
        "model_name": "bert-base-uncased",
        "tokenizer_name": "bert-base-uncased",
    },
    "bert-large-uncased": {
        "model_name": "bert-large-uncased",
        "tokenizer_name": "bert-large-uncased",
    },
    "deberta-v3-large": {
        "model_name": "microsoft/deberta-v3-large",
        "tokenizer_name": "microsoft/deberta-v3-large",
    },
}

# Create figures directory if it doesn't exist
os.makedirs('figures/', exist_ok=True)

# Load the dataset
df = pd.read_csv("../data/ud_gum_dataset.csv")

# Basic dataset statistics
print("=== BASIC DATASET STATISTICS ===")
total_points = len(df)
unique_sentences = df['Sentence'].nunique()
unique_lemmas = df['Lemma'].nunique()
unique_forms = df['Word Form'].nunique()
avg_tokens_per_sentence = df.groupby('Sentence').size().mean().round(1)

print(f"Total data points: {total_points}")
print(f"Unique sentences: {unique_sentences}")
print(f"Unique lemmas: {unique_lemmas}")
print(f"Unique word forms: {unique_forms}")
print(f"Average tokens per sentence: {avg_tokens_per_sentence}")

# Create a more detailed table for the paper
dataset_stats = pd.DataFrame({
    "Statistic": [
        "Total data points", 
        "Unique sentences", 
        "Unique lemmas", 
        "Unique word forms",
        "Average tokens per sentence"
    ],
    "Value": [
        total_points,
        unique_sentences,
        unique_lemmas,
        unique_forms,
        avg_tokens_per_sentence
    ]
})
print("\nDataset Statistics Table:")
print(dataset_stats)

=== BASIC DATASET STATISTICS ===
Total data points: 54816
Unique sentences: 8415
Unique lemmas: 7848
Unique word forms: 11720
Average tokens per sentence: 6.5

Dataset Statistics Table:
                     Statistic    Value
0            Total data points  54816.0
1             Unique sentences   8415.0
2                Unique lemmas   7848.0
3            Unique word forms  11720.0
4  Average tokens per sentence      6.5


In [29]:
# increase matplotlib font size
plt.rcParams.update({'font.size': 16})

# Distribution analysis and visualizations
print("\n=== CATEGORY AND INFLECTION DISTRIBUTION ANALYSIS ===")

# Distribution by category
category_counts = df['Category'].value_counts()
print("\nDistribution by Category:")
print(category_counts)
category_percentages = (100 * category_counts / len(df)).round(1)
print("\nCategory Percentages:")
print(category_percentages)

# Distribution by inflection label
inflection_counts = df['Inflection Label'].value_counts()
print("\nDistribution by Inflection Label:")
print(inflection_counts)
inflection_percentages = (100 * inflection_counts / len(df)).round(1)
print("\nInflection Label Percentages:")
print(inflection_percentages)

# Plot distribution of categories
plt.figure(figsize=(10, 5))
sns.barplot(x=category_counts.index, y=category_counts.values)
plt.title('Distribution of Word Categories')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('figures//category_distribution.png')
plt.close()

# Plot distribution of inflection labels
plt.figure(figsize=(12, 5))
sns.barplot(x=inflection_counts.index, y=inflection_counts.values)
plt.title('Distribution of Inflection Labels')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('figures//inflection_distribution.png')
plt.close()

# Check category-inflection combinations
category_inflection = df.groupby(['Category', 'Inflection Label']).size().unstack(fill_value=0)
print("\nCategory-Inflection Combinations:")
print(category_inflection)

# Create data splits statistics
# Create stratified splits by inflection label
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['Inflection Label'], random_state=42)
dev_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['Inflection Label'], random_state=42)

print("\nData split sizes:")
print(f"Train: {len(train_df)} examples ({len(train_df)/len(df):.1%})")
print(f"Dev: {len(dev_df)} examples ({len(dev_df)/len(df):.1%})")
print(f"Test: {len(test_df)} examples ({len(test_df)/len(df):.1%})")

# Check if stratification worked correctly
train_dist = train_df['Inflection Label'].value_counts(normalize=True)
dev_dist = dev_df['Inflection Label'].value_counts(normalize=True)
test_dist = test_df['Inflection Label'].value_counts(normalize=True)

# Combine into a dataframe for easy comparison
split_comparison = pd.DataFrame({
    'Train': train_dist,
    'Dev': dev_dist,
    'Test': test_dist
})
print("\nInflection label distribution across splits:")
print(split_comparison)


=== CATEGORY AND INFLECTION DISTRIBUTION ANALYSIS ===

Distribution by Category:
Category
Noun         27111
Verb         17093
Adjective    10612
Name: count, dtype: int64

Category Percentages:
Category
Noun         49.5
Verb         31.2
Adjective    19.4
Name: count, dtype: float64

Distribution by Inflection Label:
Inflection Label
singular       19830
base           10076
positive        9926
plural          7281
past            5604
3rd_pers        1413
comparative      403
superlative      283
Name: count, dtype: int64

Inflection Label Percentages:
Inflection Label
singular       36.2
base           18.4
positive       18.1
plural         13.3
past           10.2
3rd_pers        2.6
comparative     0.7
superlative     0.5
Name: count, dtype: float64

Category-Inflection Combinations:
Inflection Label  3rd_pers   base  comparative  past  plural  positive  \
Category                                                                 
Adjective                0      0          403 

In [None]:
# Tokenization analysis across all models
print("\n=== TOKENIZATION ANALYSIS ACROSS MODELS ===")

# Function to count tokens for a word
def count_tokens(tokenizer, word):
    try:
        return len(tokenizer.encode(word))
    except:
        return float('nan')  # In case of errors

# Create a sample of words to analyze (limit for speed)
sample_size = min(2000, len(df))
word_sample = df['Word Form'].sample(sample_size, random_state=42)

tokenizer_stats = {}
for model_name, config in MODEL_CONFIGS.items():
    print(f"Loading {model_name} tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'])
        
        # Count tokens for each word
        token_counts = [count_tokens(tokenizer, word) for word in word_sample]
        token_counts = [c for c in token_counts if not np.isnan(c)]  # Remove nan values
        
        # Compute statistics
        tokenizer_stats[model_name] = {
            "avg_tokens_per_word": np.mean(token_counts),
            "median_tokens_per_word": np.median(token_counts),
            "max_tokens_per_word": max(token_counts),
            "percent_multitoken": 100 * sum(count > 1 for count in token_counts) / len(token_counts)
        }
    except Exception as e:
        print(f"Error loading {model_name}: {e}")
        continue

# Display tokenization statistics
tokenizer_df = pd.DataFrame(tokenizer_stats).T
print("\nTokenization Statistics Across Models:")
print(tokenizer_df)

plt.figure(figsize=(12, 6))
tokenizer_df['percent_multitoken'].plot(kind='bar')
plt.title('Percentage of Words Split into Multiple Tokens')
plt.ylabel('Percentage')
plt.xlabel('Model')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('figures//multitoken_percentages.png')
plt.close()

plt.figure(figsize=(12, 6))
tokenizer_df['avg_tokens_per_word'].plot(kind='bar')
plt.title('Average Tokens per Word')
plt.ylabel('Average Tokens')
plt.xlabel('Model')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('figures//avg_tokens_per_word.png')
plt.close()


=== TOKENIZATION ANALYSIS ACROSS MODELS ===
Loading gpt2 tokenizer...
Loading pythia1.4b tokenizer...
Loading gemma2b tokenizer...
Loading qwen2 tokenizer...
Loading bert-base-uncased tokenizer...
Loading bert-large-uncased tokenizer...
Loading deberta-v3-large tokenizer...





Tokenization Statistics Across Models:
                    avg_tokens_per_word  median_tokens_per_word  \
gpt2                             0.0945                     0.0   
pythia1.4b                       0.0875                     0.0   
gemma2b                          0.1880                     0.0   
qwen2                            0.0670                     0.0   
bert-base-uncased                1.1060                     1.0   
bert-large-uncased               1.1060                     1.0   
deberta-v3-large                 1.0260                     1.0   

                    max_tokens_per_word  percent_multitoken  
gpt2                                3.0                0.90  
pythia1.4b                          3.0                0.90  
gemma2b                             3.0                1.90  
qwen2                               2.0                0.65  
bert-base-uncased                   6.0                6.95  
bert-large-uncased                  6.0            

In [31]:
# Sentence context and morphological richness analysis
print("\n=== SENTENCE CONTEXT AND MORPHOLOGICAL RICHNESS ANALYSIS ===")

# Analyze word forms per lemma - morphological richness
lemma_form_counts = df.groupby('Lemma')['Word Form'].nunique()
print(f"\nAverage number of word forms per lemma: {lemma_form_counts.mean():.2f}")
print(f"Median number of word forms per lemma: {lemma_form_counts.median()}")
print(f"Maximum number of word forms per lemma: {lemma_form_counts.max()}")

# Find the lemmas with the most word forms (morphologically rich)
rich_lemmas = lemma_form_counts.sort_values(ascending=False).head(10)
print("\nTop 10 lemmas by number of different word forms:")
print(rich_lemmas)

# For each of these rich lemmas, show the different forms
print("\nExamples of rich inflection paradigms:")
for lemma in rich_lemmas.index[:5]:  # Show just top 5 to keep output manageable
    forms = df[df['Lemma'] == lemma]['Word Form'].unique()
    forms_by_inflection = df[df['Lemma'] == lemma].groupby('Inflection Label')['Word Form'].unique()
    print(f"\n{lemma}:")
    for infl, word_forms in forms_by_inflection.items():
        print(f"  - {infl}: {', '.join(word_forms)}")

# Analyze sentence contexts
# Calculate sentence lengths
sentence_lengths = df.groupby('Sentence').size()
print(f"\nSentence length statistics:")
print(f"Average sentence length (in words): {sentence_lengths.mean():.2f}")
print(f"Median sentence length: {sentence_lengths.median()}")
print(f"Range of sentence lengths: {sentence_lengths.min()} to {sentence_lengths.max()}")

# Plot histogram of sentence lengths
plt.figure(figsize=(10, 5))
plt.hist(sentence_lengths, bins=30)
plt.xlabel('Number of Words')
plt.ylabel('Count of Sentences')
plt.title('Distribution of Sentence Lengths')
plt.savefig('figures//sentence_length_distribution.png')
plt.close()

# Analyze distribution of target positions within sentences
# Convert target index to relative position (0 to 1)
max_indices = df.groupby('Sentence')['Target Index'].transform('max')
df['relative_position'] = df['Target Index'] / np.maximum(max_indices, 1)  # Avoid division by zero

# Plot distribution of relative positions
plt.figure(figsize=(10, 5))
plt.hist(df['relative_position'], bins=20)
plt.xlabel('Relative Position in Sentence (0=start, 1=end)')
plt.ylabel('Count')
plt.title('Distribution of Target Word Positions within Sentences')
plt.savefig('figures//target_position_distribution.png')
plt.close()

print(f"Average relative position of target words: {df['relative_position'].mean():.2f}")


=== SENTENCE CONTEXT AND MORPHOLOGICAL RICHNESS ANALYSIS ===

Average number of word forms per lemma: 1.53
Median number of word forms per lemma: 1.0
Maximum number of word forms per lemma: 16

Top 10 lemmas by number of different word forms:
Lemma
be       16
get      11
go       10
do        9
light     9
make      9
take      8
open      8
try       8
see       7
Name: Word Form, dtype: int64

Examples of rich inflection paradigms:

be:
  - 3rd_pers: is, 's, ai, Is, s
  - base: are, be, being, Are, 'm, 're
  - past: was, were, been, Was, where

get:
  - 3rd_pers: gets, Gets
  - base: get, getting, Get, GET, got, Getting
  - past: got, Got, gotten, GOT

go:
  - 3rd_pers: goes
  - base: go, gon, going, Gon, Go, GO
  - past: went, gone
  - plural: go's
  - singular: go

do:
  - 3rd_pers: does
  - base: do, doing, Do, Doing, to
  - past: done, did
  - plural: Dos
  - singular: do

light:
  - 3rd_pers: lights
  - base: lighting, light
  - comparative: lighter
  - past: litten, lit, ligh

In [32]:
sentences = df['Sentence'].drop_duplicates().tolist()

avg_tokens_per_sentence = {}
for model_name, config in MODEL_CONFIGS.items():
    print(f"Loading {model_name} tokenizer for sentence stats…")
    tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'])
    # encode each sentence and count tokens
    token_counts = [len(tokenizer.encode(s)) for s in sentences]
    # store the mean (rounded to 1 decimal)
    avg_tokens_per_sentence[model_name] = np.mean(token_counts).round(1)

Loading gpt2 tokenizer for sentence stats…
Loading pythia1.4b tokenizer for sentence stats…
Loading gemma2b tokenizer for sentence stats…
Loading qwen2 tokenizer for sentence stats…
Loading bert-base-uncased tokenizer for sentence stats…
Loading bert-large-uncased tokenizer for sentence stats…
Loading deberta-v3-large tokenizer for sentence stats…




In [33]:
# display as a DataFrame
avg_sent_df = pd.DataFrame.from_dict(
    avg_tokens_per_sentence, orient='index', columns=['avg_tokens_per_sentence']
)
print("\nAverage subword tokens per sentence:")
print(avg_sent_df.describe())


Average subword tokens per sentence:
       avg_tokens_per_sentence
count                 7.000000
mean                 22.628571
std                   0.910521
min                  21.600000
25%                  21.800000
50%                  22.700000
75%                  23.400000
max                  23.700000


In [34]:
# Generate LaTeX tables for the paper
print("\n=== GENERATING LATEX TABLES ===")

def dataframe_to_latex(df, caption, label):
    """Convert a DataFrame to a LaTeX table with proper formatting"""
    if isinstance(df.index, pd.MultiIndex):
        # Handle MultiIndex case
        latex = "\\begin{table}\n  \\centering\n"
        latex += "  \\begin{tabular}{" + "l" * (len(df.index.levels) + 1) + "r" * (len(df.columns)) + "}\n"
        latex += "    \\hline\n"
        
        # Header
        header = "    " + " & ".join(["\\textbf{" + str(col) + "}" for col in [''] * len(df.index.levels) + list(df.columns)]) + " \\\\\n"
        latex += header
        
        latex += "    \\hline\n"
        
        # Rows
        current_level0 = None
        for idx, row in df.iterrows():
            if idx[0] != current_level0:
                current_level0 = idx[0]
                latex += "    \\multicolumn{" + str(len(df.index.levels) + len(df.columns)) + "}{l}{\\textbf{" + str(current_level0) + "}} \\\\\n"
            
            values = [str(idx[-1])] + [str(round(val, 2) if isinstance(val, float) else val) for val in row.values]
            latex += "    " + " & ".join(values) + " \\\\\n"
        
    else:
        # Standard DataFrame
        latex = "\\begin{table}\n  \\centering\n"
        latex += "  \\begin{tabular}{" + "l" + "r" * (len(df.columns)) + "}\n"
        latex += "    \\hline\n"
        
        # Header
        header = "    \\textbf{" + "} & \\textbf{".join([str(col) for col in [''] + list(df.columns)]) + "} \\\\\n"
        latex += header
        
        latex += "    \\hline\n"
        
        # Rows
        for idx, row in df.iterrows():
            values = [str(idx)] + [str(round(val, 2) if isinstance(val, float) else val) for val in row.values]
            latex += "    " + " & ".join(values) + " \\\\\n"
    
    latex += "    \\hline\n"
    latex += "  \\end{tabular}\n"
    latex += f"  \\caption{{{caption}}}\n"
    latex += f"  \\label{{{label}}}\n"
    latex += "\\end{table}"
    
    return latex

# Create main dataset statistics table
stats_df = dataset_stats.set_index('Statistic')
print(dataframe_to_latex(
    stats_df, 
    "Dataset statistics for the GUM corpus", 
    "tab:dataset"
))

# Create category distribution table
category_dist_df = pd.DataFrame({
    'Count': category_counts,
    'Percentage': category_percentages
})
print("\n" + dataframe_to_latex(
    category_dist_df, 
    "Distribution of word categories in the dataset", 
    "tab:category_distribution"
))

# Create inflection distribution table
inflection_df = pd.DataFrame({
    'Count': inflection_counts,
    'Percentage': inflection_percentages
})
print("\n" + dataframe_to_latex(
    inflection_df, 
    "Distribution of inflection categories in the dataset", 
    "tab:inflection_distribution"
))

# Create tokenization statistics table
print("\n" + dataframe_to_latex(
    tokenizer_df.round(2), 
    "Tokenization statistics across different models", 
    "tab:tokenization_stats"
))

# Create sentence statistics table
sentence_stats_df = pd.DataFrame({
    'Statistic': ['Average Words', 'Median Words', 'Minimum Words', 'Maximum Words'],
    'Value': [
        round(sentence_lengths.mean(), 1),
        int(sentence_lengths.median()),
        int(sentence_lengths.min()),
        int(sentence_lengths.max())
    ]
}).set_index('Statistic')
print("\n" + dataframe_to_latex(
    sentence_stats_df, 
    "Sentence length statistics", 
    "tab:sentence_stats"
))

# Create train/dev/test split table
split_sizes_df = pd.DataFrame({
    'Split': ['Train', 'Dev', 'Test'],
    'Examples': [len(train_df), len(dev_df), len(test_df)],
    'Percentage': [
        f"{len(train_df)/len(df):.1%}",
        f"{len(dev_df)/len(df):.1%}",
        f"{len(test_df)/len(df):.1%}"
    ]
}).set_index('Split')
print("\n" + dataframe_to_latex(
    split_sizes_df, 
    "Dataset splits", 
    "tab:dataset_splits"
))


=== GENERATING LATEX TABLES ===
\begin{table}
  \centering
  \begin{tabular}{lr}
    \hline
    \textbf{} & \textbf{Value} \\
    \hline
    Total data points & 54816.0 \\
    Unique sentences & 8415.0 \\
    Unique lemmas & 7848.0 \\
    Unique word forms & 11720.0 \\
    Average tokens per sentence & 6.5 \\
    \hline
  \end{tabular}
  \caption{Dataset statistics for the GUM corpus}
  \label{tab:dataset}
\end{table}

\begin{table}
  \centering
  \begin{tabular}{lrr}
    \hline
    \textbf{} & \textbf{Count} & \textbf{Percentage} \\
    \hline
    Noun & 27111.0 & 49.5 \\
    Verb & 17093.0 & 31.2 \\
    Adjective & 10612.0 & 19.4 \\
    \hline
  \end{tabular}
  \caption{Distribution of word categories in the dataset}
  \label{tab:category_distribution}
\end{table}

\begin{table}
  \centering
  \begin{tabular}{lrr}
    \hline
    \textbf{} & \textbf{Count} & \textbf{Percentage} \\
    \hline
    singular & 19830.0 & 36.2 \\
    base & 10076.0 & 18.4 \\
    positive & 9926.0 & 18.1 \\