In [1]:
import pandas as pd
import re

# Load the raw dataset; the relative path is resolved against the current working directory.
# The cleaning cell further down reads its 'hieroglyphs' and 'translation_en' columns.
df = pd.read_csv('bbaw_egyptian_w_hieroglyphs_translated.csv')

# Function to clean Gardiner Codes
def clean_hieroglyphs(text):
    r"""Normalise a whitespace-separated glyph string into bare alphanumeric codes.

    Each token has rotation markers (``\R<digits>``) and every
    non-alphanumeric character removed. Tokens that reduce to nothing, or to
    one of the annotation words ``var``, ``large`` or ``lb`` (case-insensitive,
    quoted or bare), are dropped.

    Args:
        text: Raw glyph string. Any non-``str`` value (e.g. ``None`` or NaN)
            is treated as missing.

    Returns:
        The surviving codes joined by single spaces, or ``""`` when ``text``
        is not a string or no token survives.
    """
    if not isinstance(text, str):
        return ""

    # Annotation words that are not sign codes.
    artifacts = {'var', 'large', 'lb'}

    clean_tokens = []
    for t in text.split():
        # Strip rotation/scaling info first (e.g. "N35\R180" -> "N35") so the
        # "R180" payload is not left behind looking like a sign code.
        t = re.sub(r'\\R\d+', '', t)

        # Keep only alphanumerics; this also removes quotes, backslashes and
        # bracket characters.
        t = re.sub(r'[^A-Za-z0-9]', '', t)

        # Compare *after* stripping so quoted and bare spellings are handled
        # alike (a bare "large" used to slip through as a bogus code).
        if t and t.lower() not in artifacts:
            clean_tokens.append(t)
    return ' '.join(clean_tokens)

In [2]:
def clean_english(text):
    """Lower-case and lightly tokenise an English translation string.

    Steps, in order: lower-case the text, pad ``? . ! ,`` with spaces so each
    becomes its own token, delete quote/bracket characters, then collapse
    runs of whitespace and trim the ends.

    Args:
        text: Raw translation; any non-``str`` value is treated as missing.

    Returns:
        The normalised string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    # Surround sentence punctuation with spaces so a whitespace split isolates it.
    spaced = re.sub(r"([?.!,])", r" \1 ", lowered)
    # Editorial marks (double quotes, round/square/angle brackets) are deleted outright.
    unbracketed = re.sub(r'["\(\)\[\]\<\>〈〉]', '', spaced)
    # The padding above can leave doubled or trailing spaces; squeeze them out.
    return re.sub(r'\s+', ' ', unbracketed).strip()

In [4]:
# Add the normalised source/target columns to the raw frame.
df['gardiner_clean'] = df['hieroglyphs'].apply(clean_hieroglyphs)
df['english_clean'] = df['translation_en'].apply(clean_english)

# Keep only usable, unique pairs: no NaN, no empty string on either side,
# and one row per distinct (gardiner_clean, english_clean) combination.
df_clean = (
    df
    .dropna(subset=['gardiner_clean', 'english_clean'])
    .loc[lambda frame: frame['gardiner_clean'].ne('') & frame['english_clean'].ne('')]
    .drop_duplicates(subset=['gardiner_clean', 'english_clean'])
)

# Project down to the two cleaned columns under their export names and
# write them out for the next step.
final_df = df_clean[['gardiner_clean', 'english_clean']].rename(
    columns={
        'gardiner_clean': 'gardiner_sequence',
        'english_clean': 'english_translation',
    }
)
final_df.to_csv('cleaned_hieroglyphs_data.csv', index=False)

print(f"Data cleaned. Final row count: {len(final_df)}")
print(final_df.head())

Data cleaned. Final row count: 30727
                                   gardiner_sequence  \
0  D21 Q3 D36 F4 D36 L2 X1 S19 S29 U23 T21 X1 G17...   
1  M17 A26 S34 Aa1 G43 A1 Z3 h N17 N23 A1 Z2B S29...   
2  W24 V31 V22 F34 N35 M23 X1 N35 G17 R8 O6 X1 O1...   
3  G35 F34 F34 F34 D2 Z1 Aa17 U6 D21 M17 M17 X1 N...   
4  M17 G43 D4 N35 M17 M40 O34 O1 Z1 G17 V28 W14 X...   

                                 english_translation  
0  hereditary noble and prince , royal seal-beare...  
1  o living ones , who are upon the earth , who s...  
2  i was a trusted one of the king in the temple ...  
3  a trusted one upon the landing place , great o...  
4  i built a tomb through the favour of the king ...  
