# 07. Practical Project: Building a Character Lookup Table

In this final notebook, we'll put everything together to build a complete character lookup table. This project demonstrates a real-world workflow for creating data structures useful for dictionaries and character composers.


In [None]:
import pandas as pd
import json
from pathlib import Path


## Project Goal

Build a comprehensive character lookup table that includes:
1. Character variants (simplified/traditional, shinjitai/kyujitai)
2. Basic statistics about variant relationships
3. Export in a format useful for dictionary/composer applications

## Step 1: Load Multiple Data Sources


In [None]:
# Load simplified/traditional Chinese variants.
# The file is read as comma-separated text without a header row; lines starting
# with '#' are treated as comments, and column names are supplied here.
df_simplified = pd.read_csv('../cjkvi-variants/cjkvi-simplified.txt',
                            sep=',',
                            comment='#',
                            names=['variant', 'type', 'target'],
                            encoding='utf-8')

# Load Japanese joyo variants (same reading options, different column names).
df_joyo = pd.read_csv('../cjkvi-variants/joyo-variants.txt',
                     sep=',',
                     comment='#',
                     names=['character', 'type', 'variant'],
                     encoding='utf-8')

# Load shinjitai data.
# NOTE(review): the loop below treats the JSON as {shinjitai: [kyujitai, ...]};
# confirm against the actual file, since a plain string value would be
# iterated character by character.
with open('../shinjitai-table/shinjitai.json', 'r', encoding='utf-8') as f:
    shinjitai_data = json.load(f)

# Flatten the mapping into one record per (shinjitai, kyujitai) pair,
# skipping entries whose value is empty or null.
shinjitai_list = [
    {'shinjitai': shinjitai, 'kyujitai': kyujitai}
    for shinjitai, kyujitai_list in shinjitai_data.items()
    if kyujitai_list
    for kyujitai in kyujitai_list
]

# Name the columns explicitly: with no records, pd.DataFrame([]) would have no
# columns at all and later cells that read df_shinjitai['shinjitai'] would
# fail with a KeyError.
df_shinjitai = pd.DataFrame(shinjitai_list, columns=['shinjitai', 'kyujitai'])

print("Data loaded:")
print(f"  Simplified variants: {len(df_simplified)} rows")
print(f"  Joyo variants: {len(df_joyo)} rows")
print(f"  Shinjitai mappings: {len(df_shinjitai)} rows")


## Step 2: Clean and Filter Data


In [None]:
# Split the variant table by relation type and give each half descriptive
# column names. Each result is an independent frame, not a view of
# df_simplified.
pair_columns = ['variant', 'target']

# Rows tagged 'cjkvi/simplified': variant -> simplified, target -> traditional
is_simplified = df_simplified['type'] == 'cjkvi/simplified'
df_simp = (
    df_simplified.loc[is_simplified, pair_columns]
    .rename(columns={'variant': 'simplified', 'target': 'traditional'})
)

# Rows tagged 'cjkvi/traditional': variant -> traditional, target -> simplified
is_traditional = df_simplified['type'] == 'cjkvi/traditional'
df_trad = (
    df_simplified.loc[is_traditional, pair_columns]
    .rename(columns={'variant': 'traditional', 'target': 'simplified'})
)

print("Filtered data:")
print(f"  Simplified → Traditional: {len(df_simp)} mappings")
print(f"  Traditional → Simplified: {len(df_trad)} mappings")


## Step 3: Merge Datasets


In [None]:
# Create a base character list from all sources.
# Missing values are dropped so that NaN never becomes a "character".
all_chars = set()
for source in (df_simp['simplified'],
               df_simp['traditional'],
               df_joyo['character'],
               df_shinjitai['shinjitai']):
    all_chars.update(source.dropna().unique())

# Sort before building the frame: the iteration order of a set of strings can
# change between kernel sessions, which would make the previews below and the
# exported files differ from run to run.
df_base = pd.DataFrame({'character': sorted(all_chars)})
print(f"Base character list: {len(df_base)} unique characters")

# Merge with simplified/traditional mappings (character is the simplified form).
# A left join keeps every base character; a character with several matches
# appears once per match.
df_merged = pd.merge(df_base, df_simp,
                     left_on='character',
                     right_on='simplified',
                     how='left')

# Also add reverse mapping (traditional → simplified).
# Both frames already have 'simplified' and 'traditional' columns, so the
# right-hand copies arrive as 'simplified_rev' and 'traditional_rev'.
df_merged = pd.merge(df_merged, df_trad,
                     left_on='character',
                     right_on='traditional',
                     how='left',
                     suffixes=('', '_rev'))

# Clean up columns: use the reverse mapping only where the forward one found
# nothing, then drop the helper columns.
df_merged['simplified'] = df_merged['simplified'].fillna(df_merged['simplified_rev'])
df_merged = df_merged.drop(columns=['simplified_rev', 'traditional_rev'])

print(f"\nAfter merging variant data: {len(df_merged)} rows")
df_merged.head(10)


## Step 4: Add Japanese Variant Information


In [None]:
# Collapse the joyo table to one row per character, gathering all of that
# character's variants into a single list-valued column.
df_joyo_variants = (
    df_joyo.groupby('character')['variant']
    .apply(list)
    .rename('joyo_variants')
    .reset_index()
)

# Attach the Japanese data with two left joins so that every row of df_merged
# is kept: first the shinjitai → kyujitai pairs, then the joyo variant lists.
df_final = (
    df_merged
    .merge(df_shinjitai, left_on='character', right_on='shinjitai', how='left')
    .merge(df_joyo_variants, on='character', how='left')
)

print(f"Final merged table: {len(df_final)} rows")
df_final.head(10)


## Step 5: Analyze with Basic Statistics


In [None]:
# Count characters with different types of variant information.
# df_final was built with left joins that repeat a character once per matching
# variant, so a plain row count would overstate these figures. Count distinct
# characters among the matching rows instead.
def count_characters(mask):
    """Return the number of distinct characters among the rows selected by ``mask``."""
    return df_final.loc[mask, 'character'].nunique()


has_simplified = df_final['simplified'].notna()
has_kyujitai = df_final['kyujitai'].notna()
has_joyo = df_final['joyo_variants'].notna()

print("Variant type distribution:")
print(f"  Characters with simplified/traditional mapping: {count_characters(has_simplified)}")
print(f"  Characters with shinjitai/kyujitai mapping: {count_characters(has_kyujitai)}")
print(f"  Characters with joyo variants: {count_characters(has_joyo)}")

# Count characters with multiple variant types.
# has_multiple stays a row-aligned boolean mask over df_final.
has_multiple = (has_simplified.astype(int) +
                has_kyujitai.astype(int) +
                has_joyo.astype(int)) >= 2

print(f"\nCharacters with 2+ variant types: {count_characters(has_multiple)}")


## Step 6: Create Lookup Tables and Export


In [None]:
# Create a character → all variants lookup dictionary.
# Each value is a list of (variant_type, variant_character) tuples.
lookup_dict = {}
for _, row in df_final.iterrows():
    char = row['character']
    variants = []

    # Scalar columns: a missing value is NaN, so pd.notna is a safe test here.
    for label in ('traditional', 'simplified', 'kyujitai'):
        value = row[label]
        if pd.notna(value):
            variants.append((label, value))

    # joyo_variants holds a list (or NaN when the character has none).
    # pd.notna on a list returns an array, and using that array in an `if`
    # raises ValueError once the list has two or more items, so test the
    # type instead.
    joyo = row['joyo_variants']
    if isinstance(joyo, list):
        variants.extend(('joyo', var) for var in joyo if pd.notna(var))

    if not variants:
        continue

    # A character can occupy several rows of df_final (one per matching
    # variant). Accumulate across those rows instead of overwriting, and skip
    # pairs already recorded so repeated rows do not create duplicates.
    entry = lookup_dict.setdefault(char, [])
    for item in variants:
        if item not in entry:
            entry.append(item)

print(f"Created lookup dictionary with {len(lookup_dict)} characters")
print("\nExample entries:")
for char, variants in list(lookup_dict.items())[:3]:
    print(f"  {char}: {variants}")


In [None]:
# Export to JSON.
# ensure_ascii=False keeps the characters readable in the file instead of
# writing them as \uXXXX escapes.
output_file = 'character_lookup.json'
lookup_json = json.dumps(lookup_dict, ensure_ascii=False, indent=2)
Path(output_file).write_text(lookup_json, encoding='utf-8')

print(f"Exported lookup table to {output_file}")

# Also export the full DataFrame to CSV, without the row index.
# The list-valued joyo_variants column is written in its string form.
df_final.to_csv('character_table.csv', encoding='utf-8', index=False)
print("Exported full table to character_table.csv")


## Project Summary

We've successfully:
1. ✅ Loaded data from multiple sources (CSV, JSON)
2. ✅ Cleaned and filtered the data
3. ✅ Merged datasets using left joins
4. ✅ Added computed columns and transformations
5. ✅ Performed basic statistical analysis
6. ✅ Created lookup tables
7. ✅ Exported results for use in applications

## What You've Learned

Throughout this series, you've learned:
- How to load data from various file formats
- How to explore and inspect DataFrames
- How to filter data based on conditions
- **How to merge datasets** (most critical skill!)
- How to transform data and create lookup tables
- How to perform basic statistics and grouping
- How to build complete data processing workflows

## Next Steps

You're now ready to:
- Work with AI assistance for more complex data analysis (Part 3)
- Build your own character lookup tools
- Process CHISE and other CJK character databases
- Create data structures for character composers and dictionaries

## Reference Materials

For deeper learning, continue with:
- **PANDAS-TUTORIAL** in `../PANDAS-TUTORIAL/` for advanced topics
- [Pandas Official Documentation](https://pandas.pydata.org/docs/)

## Try It Yourself

1. Extend this project by adding IDS data
2. Create a component → characters forward map
3. Build variant → standard reverse lookup tables
4. Experiment with different merge strategies
5. Add more data sources and see how they combine
