In [1]:
!pip install conllu --quiet

In [2]:
import os
import requests
import re
import pandas as pd
from conllu import parse_incr

# URL to the UD English-GUM training data (CoNLL-U format)
ud_gum_url = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-GUM/master/en_gum-ud-train.conllu"
data_dir = "../data"
os.makedirs(data_dir, exist_ok=True)
file_path = os.path.join(data_dir, "en_gum-ud-train.conllu")

if os.path.exists(file_path):
    print(f"Using cached UD English-GUM train data from {file_path}")
else:
    print(f"Downloading UD English-GUM train data from {ud_gum_url} ...")
    response = requests.get(ud_gum_url)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f"Downloaded and saved the UD English-GUM dataset with {len(response.text.splitlines())} lines.")


Using cached UD English-GUM train data from ../data/en_gum-ud-train.conllu


In [None]:
# Allowed pattern: only alphabetic characters and apostrophes
allowed_pattern = re.compile(r"^[A-Za-z']+$")

def get_category_and_label_ud(token):
    """
    Map UD morphological features and UPOS to a (category, inflection label, dimension).
    This function assumes that the token is a word token with a non-empty "feats" dictionary.
    """
    upos = token.get("upostag")
    feats = token.get("feats") or {}
    form = token["form"]
    
    # Initialize variables.
    category, label, dimension = None, None, None
    
    if upos == "VERB":
        category = "Verb"
        dimension = "Tense/Aspect"
        # UD often has Tense and VerbForm features.
        tense = feats.get("Tense")
        person = feats.get("Person")
        number = feats.get("Number")
        verbform = feats.get("VerbForm")
        
        # Heuristics for inflection label.
        if tense == "Pres":
            if person == "3" and number == "Sing":
                label = "3rd_pers"
            else:
                label = "base"
        elif tense == "Past":
            label = "past"
        elif verbform == "Part":
            # Check if the form resembles a present participle (ending with 'ing').
            if form.lower().endswith("ing"):
                label = "present_participle"
            else:
                # In UD, this might sometimes be a past participle.
                label = "past_participle"
        else:
            label = "base"
    
    elif upos == "NOUN":
        category = "Noun"
        dimension = "Number"
        num = feats.get("Number")
        # UD uses "Sing" and "Plur"
        if num == "Plur":
            label = "plural"
        else:
            label = "singular"
    
    elif upos == "ADJ":
        category = "Adjective"
        dimension = "Degree"
        degree = feats.get("Degree")
        if degree == "Cmp":
            label = "comparative"
        elif degree == "Sup":
            label = "superlative"
        else:
            label = "positive"
    
    return category, label, dimension

In [4]:
dataset_rows = []

# Open and parse the conllu file.
with open(file_path, "r", encoding="utf-8") as f:
    # parse_incr returns one sentence (as a list of tokens) at a time.
    for tokenlist in parse_incr(f):
        # Get the full sentence text by joining the token forms.
        sentence_tokens = [token["form"] for token in tokenlist]
        sentence_text = " ".join(sentence_tokens)
        
        # Iterate over tokens and extract candidates.
        for idx, token in enumerate(tokenlist):
            # Ensure token is not a multiword token (ID as integer) and check allowed pattern.
            if not isinstance(token["id"], int):
                continue
            word_form = token["form"]
            if not allowed_pattern.fullmatch(word_form):
                continue
            # We need a lemma as well.
            lemma = token.get("lemma")
            if lemma is None or not allowed_pattern.fullmatch(lemma):
                continue
            
            # Compute category, inflection label, and dimension using UD features.
            category, inflection_label, dimension = get_category_and_label_ud(token)
            if category is None or inflection_label is None:
                continue

            # Build a row for the dataset.
            dataset_rows.append({
                "Sentence": sentence_text,
                "Target Index": idx,  # target word index within sentence_tokens
                "Lemma": lemma,
                "Category": category,
                "Inflection Label": inflection_label,
                "Word Form": word_form,
                "Dimension": dimension,
                "Source Type": "UD_GUM"
            })

print(f"Built dataset with {len(dataset_rows)} rows from the UD English-GUM data.")

# Create a pandas DataFrame.
df_ud = pd.DataFrame(dataset_rows)

# Save the dataset to a CSV file.
output_file = os.path.join(data_dir, "ud_gum_dataset.csv")
df_ud.to_csv(output_file, index=False)
print(f"\nUD GUM dataset saved to {output_file}")

Built dataset with 54816 rows from the UD English-GUM data.

UD GUM dataset saved to ../data/ud_gum_dataset.csv


In [6]:
# Show the first 5 rows.
print("First 5 rows of the dataset:")
print(df_ud.head())

# Provide summary statistics.
num_rows = len(df_ud)
unique_lemmas = df_ud["Lemma"].nunique()
print(f"\nTotal rows in dataset: {num_rows}")
print(f"Number of unique lemmas: {unique_lemmas}")

First 5 rows of the dataset:
                                   Sentence  Target Index         Lemma  \
0  Aesthetic Appreciation and Spanish Art :             0     aesthetic   
1  Aesthetic Appreciation and Spanish Art :             1  appreciation   
2  Aesthetic Appreciation and Spanish Art :             3       Spanish   
3  Aesthetic Appreciation and Spanish Art :             4           art   
4              Insights from Eye - Tracking             0       insight   

    Category Inflection Label     Word Form Dimension Source Type  
0  Adjective         positive     Aesthetic    Degree      UD_GUM  
1       Noun         singular  Appreciation    Number      UD_GUM  
2  Adjective         positive       Spanish    Degree      UD_GUM  
3       Noun         singular           Art    Number      UD_GUM  
4       Noun           plural      Insights    Number      UD_GUM  

Total rows in dataset: 54816
Number of unique lemmas: 7848


In [8]:
print("\nRandom sample of 5 rows:")
print(df_ud.sample(5, random_state=42))


Random sample of 5 rows:
                                                Sentence  Target Index  \
23002  It's It 's really strange occurrence , over th...             4   
37916  The old method was to use chemical sprays to k...            24   
47105  From the later 1800s to the 1920s Mérida enjoy...             9   
49525  Reality hurts when it intrudes ; as such , an ...             7   
20987  " To have a business allow that type of filth ...             2   

            Lemma   Category Inflection Label   Word Form     Dimension  \
23002     strange  Adjective         positive     strange        Degree   
37916   awareness       Noun         singular   awareness        Number   
47105  prosperous  Adjective         positive  prosperous        Degree   
49525        such  Adjective         positive        such        Degree   
20987        have       Verb             base        have  Tense/Aspect   

      Source Type  
23002      UD_GUM  
37916      UD_GUM  
47105      UD_GUM 

In [None]:
# Distribution by Category.
cat_counts = df_ud["Category"].value_counts()
print("\nDistribution by Category:")
print(cat_counts)

# Distribution by Inflection Label.
inflect_counts = df_ud["Inflection Label"].value_counts()
print("\nDistribution by Inflection Label:")
print(inflect_counts)


Distribution by Category:
Noun         27111
Verb         17093
Adjective    10612
Name: Category, dtype: int64

Distribution by Inflection Label:
singular       19830
base           10076
positive        9926
plural          7281
past            5604
3rd_pers        1413
comparative      403
superlative      283
Name: Inflection Label, dtype: int64


: 