# 📘 Syllabify Nigerian Language Wordlists


In [2]:
# This function classifies a token as a vowel (V), consonant (C), or syllabic nasal (N)
def classify_token(token, V_set, C_set, N_set):
    """
    Label a single token according to the inventory set that contains it.

    Membership is tested in the order vowels, syllabic nasals, consonants, so a
    token listed in more than one set receives the first matching label.

    Returns 'V', 'N' or 'C', or '?' when the token is in none of the sets.
    """
    labelled_sets = (('V', V_set), ('N', N_set), ('C', C_set))
    for label, members in labelled_sets:
        if token in members:
            return label
    return '?'


In [3]:
#Import important Python libraries
# Import pandas for handling Excel files and dataframes
import pandas as pd

# Import pathlib for handling file paths in a platform-independent way
from pathlib import Path

# Import os for directory and file operations
import os

# Import re for regular expressions used in tokenization and pattern matching
import re

# Import unicodedata for handling Unicode characters (e.g., diacritics)
import unicodedata

# Import zipfile for creating compressed ZIP archives of output files
import zipfile

In [4]:
# This function extracts the inventory items (Vowels, Consonants, Nasals) from the text description
def extract_inventory(pattern, text):
    """
    Extracts items (e.g., Vowels, Consonants, Nasals) from the inventory description using regex.

    Parameters:
        pattern: regular expression whose first capture group holds the
            comma-separated item list; matched case-insensitively.
        text: the inventory description. Anything that is not a string
            (for example the NaN pandas yields for an empty cell) is treated
            as "no inventory" instead of raising inside re.search.

    Returns:
        A list of the items with surrounding whitespace removed, in the order
        they appear. Blank items produced by stray or trailing commas are
        dropped, because an empty string is not a usable sound unit. Returns
        [] when the text is not a string or the pattern does not match.
    """
    if not isinstance(text, str):
        return []

    match = re.search(pattern, text, re.IGNORECASE)
    if match is None:
        return []

    stripped_items = (item.strip() for item in match.group(1).split(','))
    return [item for item in stripped_items if item]


In [5]:
# This function determines the syllabic structure of a given sequence of tokens
def get_structure(tokens, V_set, C_set, N_set, at_start=False):
    """
    Build the structure string (one letter per token) for a token sequence.

    Each token maps to:
      'V' - it is in V_set
      'N' - it is in N_set, at_start is truthy and it is the first token
      'C' - it is in N_set in any other situation, or it is in C_set
      '?' - it is in none of the sets

    The sets are consulted in the order V_set, N_set, C_set.
    """
    def label_for(position, token):
        if token in V_set:
            return "V"
        if token in N_set:
            # A nasal only counts as syllabic when it opens the sequence.
            return "N" if (at_start and position == 0) else "C"
        if token in C_set:
            return "C"
        return "?"  # Unknown/invalid token

    return "".join(label_for(position, token) for position, token in enumerate(tokens))

In [6]:
# This function normalizes the syllable structure string into a list of valid patterns
def normalize_structures(raw_structure):
    """
    Turn the syllable-structure description into a list of pattern strings.

    The description is a comma-separated list in which entries may be wrapped
    in parentheses, either one by one ("(V), (CV)") or as a group
    ("(V, CV, CVC)"). The text is split only on commas that sit outside
    parentheses, so a parenthesised group stays in one piece and its members
    are then split out individually.

    Parameters:
        raw_structure: the cell value; it is converted with str() first.

    Returns:
        A list of patterns with surrounding whitespace removed. Blank entries
        (e.g. from a trailing comma) are omitted.
    """
    text = str(raw_structure)

    # Split on commas that are NOT inside a "(...)" group: the lookahead rejects
    # a comma when a ')' follows it with no '(' or ')' in between.
    top_level_pieces = re.split(r',(?![^()]*\))', text)

    structures = []
    for piece in top_level_pieces:
        piece = piece.strip()
        if '(' in piece and ')' in piece:
            # NOTE(review): only the text inside the parentheses is kept, so an
            # entry such as "(C)V" contributes just "C" -- confirm that
            # optional-segment notation never appears in the profile sheet.
            for group in re.findall(r'\((.*?)\)', piece):
                structures.extend(member.strip() for member in group.split(','))
        else:
            structures.append(piece)

    return [pattern for pattern in structures if pattern]


In [7]:
# This function splits a word into tokens based on the language's speech inventory
def tokenize(word, inventory):
    """
    Split a word into sound units using greedy longest-match against the inventory.

    At each position the 3-character slice is tried first, then 2, then 1
    (this is what lets multi-letter consonants like 'kp', 'gh', 'ny' come out
    as one token). If none of those slices is in the inventory, the single
    character at that position is emitted unchanged and scanning moves on.
    """
    pieces = []
    total = len(word)
    pos = 0
    while pos < total:
        for width in (3, 2, 1):
            end = pos + width
            if end <= total and word[pos:end] in inventory:
                break
        else:
            # Nothing in the inventory starts here: pass one raw character through.
            end = pos + 1
        pieces.append(word[pos:end])
        pos = end
    return pieces


In [8]:
# Colab-only step: open the browser upload dialog so the workbook is available
# in the runtime. The pipeline call further down reads it by the relative file
# name "Language_Profile_Wordlist.xlsx"; the `uploaded` return value itself is
# not used by the cells shown in this notebook.
from google.colab import files
uploaded = files.upload()  # Upload Language_Profile_Wordlist.xlsx


Saving Language_Profile_Wordlist.xlsx to Language_Profile_Wordlist.xlsx


In [9]:
# Install XlsxWriter: the export step opens pd.ExcelWriter with engine='xlsxwriter'.
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.5-py3-none-any.whl.metadata (2.7 kB)
Downloading xlsxwriter-3.2.5-py3-none-any.whl (172 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 172.3/172.3 kB 5.7 MB/s eta 0:00:00
Installing collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.5


In [10]:
#   This function inserts hyphens in words with repeated (identical) consonants:
#   - If allows_clusters=True, insert hyphen BEFORE the first identical consonant (e.g., dikke → di-kke)
#   - If allows_clusters=False, insert hyphen AFTER the first identical consonant (e.g., dikke → dik-ke)

def preprocess_double_consonants(word, consonants, allows_clusters=True):
    """
    Insert one hyphen at the first pair of identical adjacent consonant characters.

    The word is scanned left to right, character by character. At the first
    position where a character equals its right-hand neighbour and is in
    `consonants`, a hyphen is inserted:
      - before the pair when allows_clusters is truthy  (dikke -> di-kke)
      - between the pair otherwise                      (dikke -> dik-ke)

    Only that first pair is handled; a word without such a pair is returned
    unchanged.
    """
    for pos in range(len(word) - 1):
        letter = word[pos]
        if letter == word[pos + 1] and letter in consonants:
            cut = pos if allows_clusters else pos + 1
            return word[:cut] + '-' + word[cut:]
    return word


In [11]:
# Detect an immediately repeated substring of 3–6 characters and put a hyphen between the two copies (first match only, longest length tried first)
def insert_repetition_hyphenation(word):
    """
    Hyphenate the first immediately repeated substring of 3 to 6 characters.

    Lengths are tried from 6 down to 3, and for each length start positions
    are tried left to right. At the first place where a substring is followed
    directly by an identical copy, a hyphen is inserted between the two copies
    and the result is returned. Words with no such repetition come back
    unchanged.
    """
    for size in (6, 5, 4, 3):
        last_start = len(word) - 2 * size
        for start in range(last_start + 1):
            middle = start + size
            if word[start:middle] == word[middle:middle + size]:
                return word[:middle] + '-' + word[middle:]
    return word

In [12]:
# This function syllabifies a single word using the language-specific phonotactic rules
def syllabify_word(word, V_set, C_set, N_set, structures):
    """
    Insert syllable-boundary hyphens into a word.

    The word is first cut at every whitespace character or hyphen. Each
    resulting segment is tokenized against the combined inventory and
    syllabified on its own; the syllabified segments are finally joined with
    '-', so the original separators all come back as hyphens.

    Parameters:
        word: the transcribed word (may already contain spaces or hyphens).
        V_set, C_set, N_set: sets of vowel, consonant and syllabic-nasal units.
        structures: collection of allowed structure strings (e.g. 'V', 'CV');
            membership is tested with `in`.

    Returns:
        The word with hyphens between the syllables that were recognised.
        Tokens that fit no allowed structure are passed through without a
        hyphen being added after them.
    """
    segments = re.split(r'[\s\-]', word)
    syllabified_segments = []

    for segment in segments:
        tokens = tokenize(segment, V_set | C_set | N_set)
        result = []
        i = 0
        rule1_applied = False

        # Rule 1: a segment-initial vowel or syllabic nasal that is directly
        # followed by a consonant/nasal forms a syllable by itself; emit it with
        # a hyphen and continue with the remaining tokens.
        if len(tokens) >= 2:
            first = classify_token(tokens[0], V_set, C_set, N_set)
            if first in {'V', 'N'} and tokens[1] in (C_set | N_set):
                result.append(tokens[0])
                result.append('-')
                tokens = tokens[1:]
                i = 0  # i is still 0 at this point; the reset is a no-op
                rule1_applied = True
        # After Rule 1, treat all nasals as consonants: the structure lookups
        # below pass this merged set as the consonant set and an empty nasal set.
        full_C_set = C_set | N_set

        while i < len(tokens):
            applied = False
            remaining = tokens[i:]

            # Shortcut (only when Rule 1 did not fire): if everything that is
            # left already matches an allowed structure, keep it as one syllable.
            if not rule1_applied and get_structure(remaining, V_set, full_C_set, set()) in structures:
                result.extend(remaining)
                break

            # Otherwise try the longest chunk first (at most 5 tokens) and
            # shrink until one matches an allowed structure.
            for j in range(min(5, len(remaining)), 0, -1):
                chunk = remaining[:j]
                next_index = i + j
                if get_structure(chunk, V_set, full_C_set, set()) in structures:
                    # Only apply if followed by consonant or end; when a
                    # non-consonant follows, fall through and try a shorter chunk.
                    if next_index < len(tokens):
                        if classify_token(tokens[next_index], V_set, full_C_set, set()) == 'C':
                            result.extend(chunk + ['-'])
                            i = next_index
                            applied = True
                            break
                    else:
                        # Chunk reaches the end of the segment: no trailing hyphen.
                        result.extend(chunk)
                        i = next_index
                        applied = True
                        break

            # No chunk starting here was accepted: emit one token as-is and move on.
            if not applied:
                result.append(tokens[i])
                i += 1

        # Drop any hyphen left at either edge of the segment.
        syllabified_segments.append(''.join(result).strip('-'))

    return '-'.join(syllabified_segments)

In [14]:
# This function processes all language sheets in the Excel file and syllabifies the transcribed words
def syllabify_all_languages(input_excel_path, output_dir_path):
    """
    Syllabify the word list of every language sheet in the workbook and export the results.

    The "Language_Profile" sheet is indexed by its "SN" column. The sheets after
    the first one are paired with profile rows by position (the first of them
    with SN 1, the next with SN 2, ...). A sheet is skipped when it has no
    profile row or lacks any of the columns "English word", "Transcribed word",
    "Tone pattern".

    For each processed sheet three columns are added:
      - "Syllabified word": the transcribed word with syllable hyphens.
      - "NCV_Structure": the N/C/V/? label string of each syllable.
      - "Syllabification Validation": a message stating whether every unit of
        the transcribed word is in the language's inventory.

    and two files named after the profile's "Language" value are written:
      - structure_transformed_files/<Language>.xlsx  (all columns)
      - <output_dir_path>/<Language>.xlsx            (without "NCV_Structure")

    Parameters:
        input_excel_path: path of the workbook to read.
        output_dir_path: directory for the syllabified files; created if missing.

    Returns:
        None. A confirmation line is printed when all sheets are done.
    """
    required_columns = {"English word", "Transcribed word", "Tone pattern"}
    cv_output_dir = "structure_transformed_files"

    def classify_segment(segment, units, V, C, N, full_word, segment_start_index):
        """Label one syllable segment unit by unit; returns e.g. 'CV', 'N', 'CVC'."""
        labels = []
        i = 0
        while i < len(segment):
            rest = segment[i:]
            # `units` is sorted longest-first, so multi-letter units win over
            # their single-letter prefixes.
            unit = next((u for u in units if rest.startswith(u)), None)
            if unit is None:
                labels.append('?')
                i += 1
                continue

            absolute_index = segment_start_index + i
            # Context is read from the original word, because the syllabified
            # form contains hyphens that were not in the transcription.
            following = full_word[absolute_index + len(unit):]
            prev_char = full_word[absolute_index - 1:absolute_index] if absolute_index > 0 else ''

            if unit in N:
                # A nasal is syllabic only at the start of the word (or of a
                # space/hyphen-separated part) and directly before a consonant.
                # The whole remainder is checked, so multi-letter consonants
                # are recognised as well.
                is_initial = absolute_index == 0 or prev_char in {'-', ' '}
                if is_initial and any(following.startswith(c) for c in C):
                    labels.append('N')
                else:
                    labels.append('C')
            elif unit in V:
                labels.append('V')
            else:
                # Units come from V + C + N, so anything left is a consonant.
                labels.append('C')
            i += len(unit)
        return ''.join(labels)

    def generate_ncv_structure(transcribed, syllabified, units, V, C, N):
        """Return the hyphen-joined label strings for all syllables of one word."""
        if not isinstance(syllabified, str) or not syllabified.strip():
            return ""
        # Cells may hold NaN or numbers; use the same text form that was syllabified.
        transcribed = str(transcribed)
        cv_parts = []
        idx = 0
        for seg in syllabified.split('-'):
            # Step over separators that are part of the original transcription.
            while idx < len(transcribed) and transcribed[idx] in {'-', ' '}:
                idx += 1
            cv_parts.append(classify_segment(seg, units, V, C, N, transcribed, idx))
            idx += len(seg)
        return '-'.join(cv_parts)

    def validate_characters(word, inventory_set):
        """Report any unit of the word that is not in the language's inventory."""
        tokens = tokenize(word, inventory_set)
        tokens = [t for t in tokens if t not in {'-', ' '}]
        unrecognized = [t for t in tokens if t not in inventory_set]
        if unrecognized:
            return f"wrong syllabification detected: character(s) {', '.join(unrecognized)} not in the speech and sound inventory of this language"
        else:
            return "transcribed word correctly syllabified using defined syllable structure of this language"

    Path(output_dir_path).mkdir(parents=True, exist_ok=True)

    # The context manager closes the workbook handle even if a sheet fails.
    with pd.ExcelFile(input_excel_path) as xls:
        profile_df = xls.parse("Language_Profile")
        profile_rows = profile_df.set_index("SN").to_dict(orient="index")

        for sn, sheet_name in enumerate(xls.sheet_names[1:], start=1):  # Skip Language_Profile
            profile_row = profile_rows.get(sn)
            if not profile_row:
                continue

            df = xls.parse(sheet_name)
            if not required_columns.issubset(df.columns):
                continue

            inventory_text = profile_row["Speech and sound inventory"]
            if not isinstance(inventory_text, str):
                # An empty cell arrives as NaN; treat it as an empty inventory.
                inventory_text = ""
            structures = normalize_structures(profile_row["Syllable structure"])

            # Empty units are discarded: an empty string "matches" everywhere
            # and would stop the unit scan from advancing.
            V = [u for u in extract_inventory(r'Vowels?\s*\(V\):\s*([^\n\r\.]+)', inventory_text) if u]
            C = [u for u in extract_inventory(r'Consonants?\s*\(C\):\s*([^\n\r\.]+)', inventory_text) if u]
            N = [u for u in extract_inventory(r'Syllabic nasals?\s*\(N\):\s*([^\n\r\.]+)', inventory_text) if u]

            V_set, C_set, N_set = set(V), set(C), set(N)
            consonant_like_set = C_set | N_set
            inventory_set = V_set | C_set | N_set
            # Sorted once per sheet (longest first) instead of once per character.
            units = sorted(V + C + N, key=lambda x: -len(x))

            transcribed_words = df["Transcribed word"].astype(str)

            # Plain iteration keeps the three columns aligned row by row and
            # simply yields empty columns for a sheet without data rows.
            df["Syllabified word"] = [
                syllabify_word(insert_repetition_hyphenation(word), V_set, consonant_like_set, N_set, structures)
                for word in transcribed_words
            ]
            df["NCV_Structure"] = [
                generate_ncv_structure(transcribed, syllabified, units, V, C, N)
                for transcribed, syllabified in zip(transcribed_words, df["Syllabified word"])
            ]
            df["Syllabification Validation"] = [
                validate_characters(word, inventory_set) for word in transcribed_words
            ]

            lang_name = profile_row["Language"]
            output_file = os.path.join(output_dir_path, f"{lang_name}.xlsx")
            os.makedirs(cv_output_dir, exist_ok=True)
            cv_output_path = os.path.join(cv_output_dir, f"{lang_name}.xlsx")
            df.to_excel(cv_output_path, index=False)

            with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
                df.drop(columns=["NCV_Structure"], errors="ignore").to_excel(writer, sheet_name=lang_name, index=False)

    print(f"✓ All languages processed. Output saved to: {output_dir_path}")

# Run the pipeline on the uploaded workbook. Syllabified files go to
# ./syllabified_output; the function also writes the versions that include the
# NCV_Structure column to ./structure_transformed_files.
syllabify_all_languages("Language_Profile_Wordlist.xlsx", "syllabified_output")

✓ All languages processed. Output saved to: syllabified_output


## 📦 Download All Results as ZIP

In [15]:
def zip_directory(source_dir, zip_path):
    """
    Write every file found under `source_dir` into a new ZIP archive at `zip_path`.

    Files are stored under their path relative to `source_dir`, so the archive
    does not contain the directory name itself. An existing archive at
    `zip_path` is overwritten. If `source_dir` does not exist, os.walk yields
    nothing and an empty archive is written.

    Returns:
        The number of files added to the archive.
    """
    added = 0
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        # The loop variable is deliberately not called `files`: that name is
        # bound to the google.colab `files` module by the upload cell.
        for root, _, filenames in os.walk(source_dir):
            for filename in filenames:
                file_path = os.path.join(root, filename)
                zipf.write(file_path, os.path.relpath(file_path, source_dir))
                added += 1
    return added


# 📦 Zip all syllabified output files
output_zip_path = "syllabified_output.zip"
zip_directory("syllabified_output", output_zip_path)
print(f"✅ All syllabified files zipped at: {output_zip_path}")

# 📦 Zip all structure-transformed output files
structure_zip_path = "structure_transformed_files.zip"
zip_directory("structure_transformed_files", structure_zip_path)
print(f"✅ All structure-transformed files zipped at: {structure_zip_path}")

✅ All syllabified files zipped at: syllabified_output.zip
✅ All structure-transformed files zipped at: structure_transformed_files.zip
