## Step 1: Clone the GitHub repo and install libraries

In [None]:
# clone repo and install libraries
!git clone https://github.com/mgbilby/oga_src.git
!pip install conllu pandas

# import libraries
import os
import pandas as pd
from conllu import parse_incr
from collections import defaultdict
from textwrap import dedent

fatal: destination path 'oga_src' already exists and is not an empty directory.


## Step 2: Define the data directory, select authors by cts_urn prefix, and check inputs

In [None]:
# Each author is identified by the cts_urn prefix that starts their file names.
# NOTE(review): `data_dir` is not assigned anywhere in the visible notebook; it must
# already hold the path of the directory containing the .conllu files — confirm.
author_prefixes = {
    "Hippocrates": "tlg0627",
    "Galen": "tlg0057",
}

# Scan the directory once. os.listdir returns entries in arbitrary order, so sort
# them to make the file order (and everything derived from it) reproducible.
conllu_files = sorted(f for f in os.listdir(data_dir) if f.endswith(".conllu"))

# author name -> list of that author's .conllu file names
authors = {
    author: [f for f in conllu_files if f.startswith(prefix)]
    for author, prefix in author_prefixes.items()
}

# Check loaded files
for author, files in authors.items():
    if not files:
        print(f"Warning: No files found for {author}.")
    else:
        print(f"Found {len(files)} files for {author}:")
        print("\n".join(files))

Found 52 files for Hippocrates:
tlg0627.tlg016.1st1K-grc1_oga020_src.conllu
tlg0627.tlg047.1st1K-grc1_oga020_src.conllu
tlg0627.tlg042.1st1K-grc1_oga020_src.conllu
tlg0627.tlg033.1st1K-grc1_oga020_src.conllu
tlg0627.tlg037.1st1K-grc1_oga020_src.conllu
tlg0627.tlg051.1st1K-grc1_oga020_src.conllu
tlg0627.tlg032.1st1K-grc1_oga020_src.conllu
tlg0627.tlg008.1st1K-grc1_oga020_src.conllu
tlg0627.tlg048.1st1K-grc1_oga020_src.conllu
tlg0627.tlg046.1st1K-grc1_oga020_src.conllu
tlg0627.tlg030.1st1K-grc1_oga020_src.conllu
tlg0627.tlg018.1st1K-grc1_oga020_src.conllu
tlg0627.tlg049.1st1K-grc1_oga020_src.conllu
tlg0627.tlg036.1st1K-grc1_oga020_src.conllu
tlg0627.tlg021.1st1K-grc1_oga020_src.conllu
tlg0627.tlg025.1st1K-grc1_oga020_src.conllu
tlg0627.tlg053.1st1K-grc1_oga020_src.conllu
tlg0627.tlg003.1st1K-grc1_oga020_src.conllu
tlg0627.tlg019.1st1K-grc1_oga020_src.conllu
tlg0627.tlg043.1st1K-grc1_oga020_src.conllu
tlg0627.tlg013.1st1K-grc1_oga020_src.conllu
tlg0627.tlg012.1st1K-grc1_oga020_src.conllu


## Step 3: Create functions to extract the work title, parse CoNLL-U files, and convert them to a DataFrame

In [None]:
# Define the function to extract the work title
def extract_work_title(filepath):
    """
    Extracts the work title from the MISC column of a first-token line.

    The file is scanned for lines starting with "1\t" (token id 1). The first such
    line whose tenth column (MISC) contains "ref=" provides the title: the text
    after "ref=" up to the first underscore. If the ref entry has no underscore,
    the whole entry value is used. The search never runs past a "|" separator
    into a following MISC entry.

    Args:
        filepath: Path of the CoNLL-U file to read (opened as UTF-8).

    Returns:
        The extracted title string, or "Unknown Work" if no qualifying line exists.
    """
    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            if not line.startswith("1\t"):  # Only first-token lines are inspected
                continue
            columns = line.strip().split("\t")
            if len(columns) <= 9 or "ref=" not in columns[9]:
                continue
            misc = columns[9]
            title_start = misc.find("ref=") + 4
            # Restrict the search to the ref entry itself ("|" separates MISC entries).
            entry_end = misc.find("|", title_start)
            if entry_end == -1:
                entry_end = len(misc)
            # str.find returns -1 when there is no underscore; slicing with -1 would
            # silently chop the last character, so fall back to the end of the entry.
            title_end = misc.find("_", title_start, entry_end)
            if title_end == -1:
                title_end = entry_end
            return misc[title_start:title_end]
    return "Unknown Work"

# Define the function to parse CoNLL-U files
def parse_conllu_file(filepath, work_title):
    """
    Read a CoNLL-U file and flatten its tokens into a pandas DataFrame.

    Every token of every sentence becomes one row holding its form, lemma,
    upostag, xpostag, feats and deprel (missing optional fields default to "_"),
    plus the supplied work title.

    Args:
        filepath: Path of the CoNLL-U file (opened as UTF-8).
        work_title: Title stored in the "work_title" column of every row.

    Returns:
        A tuple (DataFrame with one row per token, number of tokens).
    """
    records = []
    with open(filepath, "r", encoding="utf-8") as handle:
        for sent in parse_incr(handle):
            records.extend(
                {
                    "form": tok["form"],  # Surface form
                    "lemma": tok.get("lemma", "_"),  # Lemma
                    "upostag": tok.get("upostag", "_"),  # Universal POS tag
                    "xpostag": tok.get("xpostag", "_"),  # Language-specific POS tag
                    "feats": tok.get("feats", "_"),  # Morphological features
                    "deprel": tok.get("deprel", "_"),  # Syntactic relation
                    "work_title": work_title,  # Title passed in by the caller
                }
                for tok in sent
            )
    # One record was appended per token, so the record count is the token count.
    return pd.DataFrame(records), len(records)

## Step 4: Load and parse files for selected authors into a dataframe

In [None]:
# Load and parse all files for both authors
data = {}  # author -> DataFrame with all of that author's tokens
word_counts_by_author = defaultdict(int)  # To store total word counts for each author

for author, files in authors.items():
    author_data = []
    print(f"Processing works by {author}...")
    for file in files:
        filepath = os.path.join(data_dir, file)
        print(f"  Parsing {file}...")

        # Extract the work title.
        # extract_work_title falls back to the placeholder "Unknown Work", so the
        # skip branch below only triggers when the extracted title is empty.
        work_title = extract_work_title(filepath)
        if work_title:
            print(f"    Work title identified: {work_title}")
        else:
            print(f"    Work title could not be identified for {file}. Skipping...")
            continue

        # Parse the file into a DataFrame and calculate word count
        df, word_count = parse_conllu_file(filepath, work_title)
        print(f"    Word token count is {word_count}")

        if not df.empty:
            df["source_file"] = file  # Add a column to indicate the source file
            df["author"] = author  # Add a column to indicate the author
            author_data.append(df)
            word_counts_by_author[author] += word_count  # Update total word count for the author

    if author_data:
        data[author] = pd.concat(author_data, ignore_index=True)

# Combine data into a single DataFrame for easy querying
if data:
    combined_df = pd.concat(data.values(), ignore_index=True)
    print("Data successfully parsed and combined.")
else:
    # Keep `combined_df` defined, with the columns the parsed frames would have,
    # so the sample print below does not raise NameError when nothing was parsed.
    combined_df = pd.DataFrame(
        columns=[
            "form", "lemma", "upostag", "xpostag", "feats",
            "deprel", "work_title", "source_file", "author",
        ]
    )
    print("No data parsed. Please check your files and code.")

# Output total word token count for each author
print("\nTotal word token counts by author:")
for author, total_count in word_counts_by_author.items():
    print(f"  {author}: {total_count} tokens")

# Sample output of combined data
print("\nSample of the parsed data:")
print(combined_df.head())

Processing works by Hippocrates...
  Parsing tlg0627.tlg016.1st1K-grc1_oga020_src.conllu...
    Work title identified: Prorrheticon I
    Word token count is 12993
  Parsing tlg0627.tlg047.1st1K-grc1_oga020_src.conllu...
    Work title identified: De visu
    Word token count is 995
  Parsing tlg0627.tlg042.1st1K-grc1_oga020_src.conllu...
    Work title identified: De glandulis
    Word token count is 2180
  Parsing tlg0627.tlg033.1st1K-grc1_oga020_src.conllu...
    Work title identified: De natura muliebri
    Word token count is 15242
  Parsing tlg0627.tlg037.1st1K-grc1_oga020_src.conllu...
    Work title identified: De virginum morbis
    Word token count is 547
  Parsing tlg0627.tlg051.1st1K-grc1_oga020_src.conllu...
    Work title identified: Praeceptiones
    Word token count is 1651
  Parsing tlg0627.tlg032.1st1K-grc1_oga020_src.conllu...
    Work title identified: De affectionibus interioribus
    Word token count is 19652
  Parsing tlg0627.tlg008.1st1K-grc1_oga020_src.conllu..

## Step 5: Define queries for lemma, morphological features, and syntactical relation frequencies

In [None]:
# Lemma Frequency Query
def query_lemma_frequency(lemma, df=None):
    """
    Compute, per author, the share of tokens whose lemma equals `lemma`.

    Args:
        lemma: Lemma string compared for exact equality with the "lemma" column.
        df: Optional DataFrame with "author" and "lemma" columns. When omitted,
            the notebook-level `combined_df` is used.

    Returns:
        Dict mapping each author (in order of first appearance) to the frequency
        as a percentage of that author's tokens (0 if the author has no tokens).
    """
    frame = combined_df if df is None else df

    results = {}
    for author in frame["author"].unique():
        author_df = frame[frame["author"] == author]
        total_tokens = len(author_df)
        lemma_count = int((author_df["lemma"] == lemma).sum())
        frequency = (lemma_count / total_tokens) * 100 if total_tokens > 0 else 0
        results[author] = frequency
    return results

# Morphological Feature Frequency Query
def query_morph_feature_frequency(feature_key, feature_value, df=None):
    """
    Compute, per author, the share of tokens carrying a morphological feature.

    A token counts when its "feats" cell is a dict whose `feature_key` entry
    equals `feature_value`; cells that are not dicts (e.g. None) never count.

    Args:
        feature_key: Key looked up in each token's feats dict (e.g. "Gender").
        feature_value: Value the key must have (e.g. "f").
        df: Optional DataFrame with "author" and "feats" columns. When omitted,
            the notebook-level `combined_df` is used.

    Returns:
        Dict mapping each author (in order of first appearance) to the frequency
        as a percentage of that author's tokens (0 if the author has no tokens).
    """
    frame = combined_df if df is None else df

    results = {}
    for author in frame["author"].unique():
        author_df = frame[frame["author"] == author]
        total_tokens = len(author_df)

        # Count occurrences of the feature key-value pair in the feats column
        feature_count = sum(
            1
            for feats in author_df["feats"]
            if isinstance(feats, dict) and feats.get(feature_key) == feature_value
        )

        # Calculate frequency as a percentage
        frequency = (feature_count / total_tokens) * 100 if total_tokens > 0 else 0
        results[author] = frequency

    return results

# Syntactical Feature Frequency Query
def query_syntactical_feature_frequency(deprel, df=None):
    """
    Compute, per author, the share of tokens with the given syntactic relation.

    Args:
        deprel: Relation label compared for exact equality with the "deprel"
            column (e.g. "ATR").
        df: Optional DataFrame with "author" and "deprel" columns. When omitted,
            the notebook-level `combined_df` is used.

    Returns:
        Dict mapping each author (in order of first appearance) to the frequency
        as a percentage of that author's tokens (0 if the author has no tokens).
    """
    frame = combined_df if df is None else df

    results = {}
    for author in frame["author"].unique():
        author_df = frame[frame["author"] == author]
        total_tokens = len(author_df)
        deprel_count = int((author_df["deprel"] == deprel).sum())
        frequency = (deprel_count / total_tokens) * 100 if total_tokens > 0 else 0
        results[author] = frequency
    return results

### Inspect data samples

In [None]:
# Inspect a sample of Hippocrates's data (last 10 token rows)
print("\n")
# Named after the author it actually holds, instead of reusing `galen_sample`.
hippocrates_sample = combined_df[combined_df["author"] == "Hippocrates"]
print("Sample data for Hippocrates:")
print(hippocrates_sample.tail(10))
print("\n")

# Inspect a sample of Galen's data (last 10 token rows)
galen_sample = combined_df[combined_df["author"] == "Galen"]
print("Sample data for Galen:")
print(galen_sample.tail(10))



Sample data for Hippocrates:
              form      lemma upostag    xpostag  \
423559  ἐπίστασθαι  ἐπίσταμαι       v  v--pne---   
423560          ἐκ         ἐκ       r  r--------   
423561         τῆς          ὁ       l  l-s---fg-   
423562      ἑωυτοῦ     ἑαυτοῦ       p  p-s---mg-   
423563      γνώμης      γνώμη       n  n-s---fg-   
423564          ἐν         ἐν       r  r--------   
423565        τῇσι          ὁ       l  l-p---fd-   
423566   νούσοισιν      νόσος       n  n-p---fd-   
423567  ὠφελέεσθαι     ὠφελέω       v  v--pne---   
423568           .          .       u  u--------   

                                              feats deprel         work_title  \
423559    {'Mood': 'n', 'Tense': 'p', 'Voice': 'e'}    SBJ  De salubri diaeta   
423560                                         None   AuxP  De salubri diaeta   
423561  {'Case': 'g', 'Gender': 'f', 'Number': 's'}    ATR  De salubri diaeta   
423562  {'Case': 'g', 'Gender': 'm', 'Number': 's'}    ATR  De salubri d

### Sample Lemma Query

In [None]:
# Sample query: which medical author is more concerned with embryos?
lemma_to_query = "ἔμβρυος"
lemma_frequencies = query_lemma_frequency(lemma_to_query)

# Header line, then one indented line per author (percentage with 3 decimals)
print(f"Lemma frequencies for '{lemma_to_query}':")
for author in lemma_frequencies:
    frequency = lemma_frequencies[author]
    print(f"  {author}: {frequency:.3f}%")

Lemma frequencies for 'ἔμβρυος':
  Hippocrates: 0.007%
  Galen: 0.001%


### Sample Morphological Feature Query

In [None]:
# Example: which medical author is more apt to use feminine words ('Gender': 'f')?
feature_key, feature_value = "Gender", "f"
morph_frequencies = query_morph_feature_frequency(feature_key, feature_value)

# Header line, then one indented line per author (percentage with 2 decimals)
print(f"\nMorphological feature frequencies for '{feature_key}={feature_value}':")
for author in morph_frequencies:
    frequency = morph_frequencies[author]
    print(f"  {author}: {frequency:.2f}%")



Morphological feature frequencies for 'Gender=f':
  Hippocrates: 12.95%
  Galen: 15.63%


### Sample Syntactical Relation Query

In [None]:
# Sample query: which medical author uses more attributive modifiers (deprel "ATR")?
syntactic_feature_to_query = "ATR"
syntactic_frequencies = query_syntactical_feature_frequency(syntactic_feature_to_query)

# Header line, then one indented line per author (percentage with 3 decimals)
print(f"\nSyntactical feature frequencies for '{syntactic_feature_to_query}':")
for author in syntactic_frequencies:
    frequency = syntactic_frequencies[author]
    print(f"  {author}: {frequency:.3f}%")


Syntactical feature frequencies for 'ATR':
  Hippocrates: 18.015%
  Galen: 23.530%
