In [1]:
# Upload the language-profile workbook (Language_Profile_Wordlist.xlsx) into the
# runtime's working directory; later cells re-open it by file name.
# NOTE: `google.colab` is only importable inside a Colab runtime, and
# `files.upload()` is interactive, so this cell waits until a file is chosen.
# `uploaded` is not referenced by the later cells visible in this notebook.
from google.colab import files
uploaded = files.upload()


Saving Language_Profile_Wordlist.xlsx to Language_Profile_Wordlist.xlsx


In [2]:
# Install the openpyxl library for working with Excel files in Python
!pip install openpyxl



In [3]:
#import the 'os' module to interact with the operating system (e.g., file paths, environment variables).
import os
# Import pandas for data manipulation
import pandas as pd
# Import re for regular expression operations
import re
# Import Path for file and directory operations
from pathlib import Path
# Import zipfile to create and manage ZIP archives
import zipfile

In [4]:
def preprocess_repeated_segments(word):
    """Hyphenate a word that begins with an immediately repeated unit.

    The word is stripped of surrounding whitespace first. Words that already
    contain a hyphen, or are shorter than six characters, are returned as-is
    (stripped). Otherwise the shortest leading unit of length >= 3 that is
    directly followed by a copy of itself is split with a hyphen, e.g.
    "gbagba" -> "gba-gba". Only a repetition at the start of the word is
    detected; the remainder of the word is kept unchanged.
    """
    text = word.strip()

    eligible = len(text) >= 6 and "-" not in text
    if not eligible:
        return text

    # Try candidate unit lengths from shortest to longest; the unit can be at
    # most half the word because it has to occur twice back to back.
    unit_len = 3
    longest_unit = len(text) // 2
    while unit_len <= longest_unit:
        unit = text[:unit_len]
        doubled = unit * 2
        if text.startswith(doubled):
            remainder = text[len(doubled):]
            return f"{unit}-{unit}{remainder}"
        unit_len += 1

    return text

In [5]:
def extract_inventory(pattern, text):
    """Extract a comma-separated phoneme list (vowels, consonants, nasals, ...).

    ``pattern`` is searched case-insensitively in ``text``; the content of its
    first capture group is split on commas.

    Args:
        pattern: Regular expression whose group 1 captures the comma-separated
            inventory.
        text: Free-text inventory description from the language profile.
            Non-string values (for example the NaN pandas yields for an empty
            cell, or None) are treated as "no inventory available".

    Returns:
        A list of stripped phoneme tokens with empty entries removed, or an
        empty list when ``text`` is not a string or the pattern does not match.
    """
    # An empty Excel cell arrives as float NaN; re.search would raise TypeError.
    if not isinstance(text, str):
        return []

    match = re.search(pattern, text, re.IGNORECASE)
    if match is None:
        return []

    # Drop empty tokens produced by trailing or doubled commas ("a, e,").
    stripped_items = (item.strip() for item in match.group(1).split(','))
    return [item for item in stripped_items if item]

def load_inventory_from_profile(profile_row):
    """Read the vowel, consonant and syllabic-nasal inventories of one language.

    The 'Speech and sound inventory' field of ``profile_row`` (looked up with
    ``.get`` and defaulting to an empty string) is scanned once per labelled
    section -- "Vowels (V):", "Consonants (C):" and "Syllabic nasals (N):" --
    using ``extract_inventory``. Each section ends at the first line break or
    period.

    Returns:
        A ``(vowels, consonants, nasals)`` tuple of lists.
    """
    description = profile_row.get("Speech and sound inventory", "")

    # One pattern per labelled section, in the order the results are returned.
    section_patterns = (
        r'Vowels?\s*\(V\):\s*([^\n\r\.]+)',
        r'Consonants?\s*\(C\):\s*([^\n\r\.]+)',
        r'Syllabic nasals?\s*\(N\):\s*([^\n\r\.]+)',
    )
    vowels, consonants, nasals = (
        extract_inventory(section_pattern, description)
        for section_pattern in section_patterns
    )
    return vowels, consonants, nasals


In [6]:
def count_vowel_nasal(word, vowels, nasals, consonants):
    """Count the tone-bearing units (vowels and syllabic nasals) in a word.

    The word is split on whitespace and hyphens. Each piece is first passed
    through ``preprocess_repeated_segments`` (which may insert further
    hyphens), then tokenised greedily against the phoneme inventories, always
    preferring the longest matching symbol.

    Counting rules:
      * every vowel counts;
      * a nasal counts only when it is the first recognised symbol of a
        segment (start of the word, or right after a space or hyphen); when a
        symbol is listed as both nasal and vowel it counts either way;
      * consonants and characters that match no inventory symbol do not count.
        Unmatched characters are skipped one at a time and do not consume the
        "first symbol" position.

    Args:
        word: Transcribed word (may contain spaces and hyphens).
        vowels: List of vowel symbols.
        nasals: List of syllabic-nasal symbols.
        consonants: List of consonant symbols.

    Returns:
        The number of vowels plus segment-initial nasals found in ``word``.
    """
    # Step 1: hyphenate reduplicated pieces. The capture group keeps the
    # delimiters at odd indices, so only even indices hold word material.
    parts = re.split(r'([\s\-])', word)
    for idx in range(0, len(parts), 2):
        parts[idx] = preprocess_repeated_segments(parts[idx])
    word = ''.join(parts)

    # Step 2: build the lookup structures once instead of re-sorting the whole
    # inventory at every character position. Longest symbols come first so that
    # multi-character phonemes win over their single-character prefixes; empty
    # symbols can never produce a usable match and are dropped.
    symbols = sorted(
        (symbol for symbol in [*vowels, *nasals, *consonants] if symbol),
        key=len,
        reverse=True,
    )
    vowel_set = set(vowels)
    nasal_set = set(nasals)

    tone_bearing = 0
    for segment in re.split(r'(\s+|\-)', word):
        # Delimiters (whitespace / hyphen) and empty pieces carry no phonemes.
        if segment.strip() in ('', '-'):
            continue

        pos = 0
        at_segment_start = True
        while pos < len(segment):
            match = next(
                (symbol for symbol in symbols if segment.startswith(symbol, pos)),
                None,
            )
            if match is None:
                # Unknown character: skip it without ending the "start" state.
                pos += 1
                continue

            if match in vowel_set or (at_segment_start and match in nasal_set):
                tone_bearing += 1
            at_segment_start = False
            pos += len(match)

    return tone_bearing

In [7]:
# Define the input Excel file containing the language profiles and wordlists
input_path = "Language_Profile_Wordlist.xlsx"

# Define the output directory for tone-labeled files
output_dir = Path("tone_labelling_output")
output_dir.mkdir(exist_ok=True)  # Create the output directory if it doesn't already exist

# Number of language wordlists to process (serial numbers 1..NUM_LANGUAGES)
NUM_LANGUAGES = 25

# Columns every wordlist sheet must provide to be processed
REQUIRED_COLUMNS = {"English word", "Transcribed word", "Tone pattern"}


def _cell_text(value):
    """Return a cell as a stripped string; missing cells (NaN/None) become ''.

    Without this, str(NaN) yields the text "nan", which would be counted as
    three tone letters or tokenised as the phonemes n-a-n.
    """
    return "" if pd.isna(value) else str(value).strip()


# Initialize a list to store summary statistics for each language
summary = []

# Open the workbook once and make sure the file handle is released afterwards
with pd.ExcelFile(input_path) as xls:
    # Parse the language profile sheet, which includes metadata for each language
    profile_df = xls.parse("Language_Profile")

    for sn in range(1, NUM_LANGUAGES + 1):
        # Sheets are addressed by position: index `sn` is used as the wordlist
        # of language `sn` (assumes the profile sheet sits at index 0 and the
        # wordlists follow in SN order -- TODO confirm against the workbook).
        profile_rows = profile_df[profile_df["SN"] == sn]
        if profile_rows.empty or sn >= len(xls.sheet_names):
            print(f"Skipping SN {sn}: no matching profile row or wordlist sheet")
            continue
        profile_row = profile_rows.iloc[0]
        sheet_name = xls.sheet_names[sn]

        # Load the wordlist sheet for the language
        df = xls.parse(sheet_name)

        # Skip this language if the required columns are not present
        if not REQUIRED_COLUMNS.issubset(df.columns):
            print(f"Skipping sheet '{sheet_name}': required columns are missing")
            continue

        # Load the vowel, consonant, and nasal inventories from the profile row
        vowels, consonants, nasals = load_inventory_from_profile(profile_row)

        vn_counts = []     # Number of vowels/syllabic nasals per word
        tone_counts = []   # Number of tone letters per word
        remarks = []       # Diagnostic message per word

        for word_cell, tone_cell in zip(df["Transcribed word"], df["Tone pattern"]):
            raw_word = _cell_text(word_cell)
            # Hyphenate a reduplicated prefix before counting
            transcribed = preprocess_repeated_segments(raw_word)
            tone_pattern = _cell_text(tone_cell)

            # Count the number of vowels and syllabic nasals in the transcribed word
            vn_count = count_vowel_nasal(transcribed, vowels, nasals, consonants)

            # Only alphabetic characters of the tone pattern count as tone symbols
            tone_count = len(re.findall(r'[a-zA-Z]', tone_pattern))

            # Compare counts and generate a remark
            if vn_count < tone_count:
                remark = "more tone labels exist"
            elif vn_count > tone_count:
                remark = "less tone labels exist"
            else:
                remark = "tone labels properly marked"

            vn_counts.append(vn_count)
            tone_counts.append(tone_count)
            remarks.append(remark)

        # Add new columns to the DataFrame with the results
        df["Vowel/Nasal Count"] = vn_counts
        df["Number of Tones"] = tone_counts
        df["Remarks"] = remarks

        # Save the processed DataFrame to an Excel file named after the language
        lang = profile_row["Language"]
        df.to_excel(output_dir / f"{lang}.xlsx", index=False)

        # Add language-level summary to the master summary list
        summary.append({
            "Language": lang,
            "Total transcribed words": len(df),
            "Total V/N": sum(vn_counts),
            "Correctly labelled tones": sum(1 for r in remarks if r == "tone labels properly marked")
        })

In [8]:
def _verify_tone_pattern(tone_cell, valid_tone_set):
    """Return the verification message for a single 'Tone pattern' cell."""
    if pd.isna(tone_cell):
        return "No tone label provided"
    tone_pattern = str(tone_cell).strip()
    if not tone_pattern or tone_pattern.lower() == "nan":
        return "No tone label provided"

    # Whitespace, hyphens and commas only separate tone letters (the tonal
    # system string is normalised the same way), so they are not tone labels.
    tone_chars = [ch for ch in tone_pattern if not ch.isspace() and ch not in "-,"]
    if not tone_chars:
        return "No tone label provided"

    unrecognized = {ch for ch in tone_chars if ch not in valid_tone_set}
    if not unrecognized:
        return "Tone label(s) recognized"
    return f"{', '.join(sorted(unrecognized))} tone(s) not recognized"


def verify_tone_labels_across_languages(language_profile_path, output_dir_path):
    """Check every tone label against the tonal system allowed for its language.

    For each ``<Language>.xlsx`` file in ``output_dir_path`` whose stem matches
    a 'Language' value in the 'Language_Profile' sheet, each character of the
    'Tone pattern' column is compared with the characters of that language's
    'Tonal system' entry (commas and spaces removed). The outcome is written to
    a 'Tone Verification' column, placed directly after 'Remarks' when that
    column exists, and the file is overwritten in place.

    Separator characters inside a tone pattern (whitespace, hyphen, comma) are
    ignored rather than reported as unrecognised tones. Languages without a
    'Tonal system' entry are skipped.

    Args:
        language_profile_path: Path of the workbook holding 'Language_Profile'.
        output_dir_path: Directory containing the per-language result files.

    Returns:
        None. A completion message is printed.
    """
    output_dir = Path(output_dir_path)
    profile_df = pd.read_excel(language_profile_path, sheet_name="Language_Profile")
    known_languages = set(profile_df["Language"].values)

    for file_path in sorted(output_dir.glob("*.xlsx")):
        lang = file_path.stem
        if lang not in known_languages:
            continue

        # Extract and normalize valid tones
        valid_tones = profile_df.loc[profile_df["Language"] == lang, "Tonal system"].values[0]
        if pd.isna(valid_tones):
            continue
        # str() guards against a numeric cell, which has no .replace method
        valid_tone_set = set(str(valid_tones).replace(",", "").replace(" ", ""))

        df = pd.read_excel(file_path)

        if "Tone pattern" in df.columns:
            tone_cells = df["Tone pattern"]
        else:
            # No tone column at all: every row is reported as unlabelled
            tone_cells = [""] * len(df)
        verification_results = [
            _verify_tone_pattern(tone_cell, valid_tone_set) for tone_cell in tone_cells
        ]

        # Replace a column left over from an earlier run so re-running is idempotent
        df = df.drop(columns=["Tone Verification"], errors="ignore")

        # Insert after 'Remarks' column
        if "Remarks" in df.columns:
            idx = df.columns.get_loc("Remarks") + 1
            df.insert(loc=idx, column="Tone Verification", value=verification_results)
        else:
            df["Tone Verification"] = verification_results

        df.to_excel(file_path, index=False)

    print("✅ Tone verification completed and saved to existing files.")
# Verify the tone labels of every per-language workbook in tone_labelling_output/
# against the tonal systems listed in the profile workbook; files are updated in place.
verify_tone_labels_across_languages("Language_Profile_Wordlist.xlsx", "tone_labelling_output")

✅ Tone verification completed and saved to existing files.


In [9]:
# 📦 Zip all output Excel files
zip_source_dir = "tone_labelling_output"
zip_path = "tone_labelling_output.zip"
with zipfile.ZipFile(zip_path, 'w') as zipf:
    # The loop variables are deliberately not called `files` or `_`: `files` is
    # the google.colab upload module imported in the first cell, and `_` is
    # IPython's last-output variable.
    for root, _dirnames, filenames in os.walk(zip_source_dir):
        for filename in sorted(filenames):
            file_path = os.path.join(root, filename)
            # Store entries relative to the output folder so the archive does
            # not contain a leading "tone_labelling_output/" directory.
            arcname = os.path.relpath(file_path, zip_source_dir)
            zipf.write(file_path, arcname)

print(f"✅ All tone-labelled files zipped at: {zip_path}")

✅ All tone-labelled files zipped at: tone_labelling_output.zip
