In [None]:
import nltk
import pandas as pd
import string, re
import numpy as np

In [None]:
# Fetch the NLTK 'punkt' data package (reported as already up to date when it
# is present locally).
# NOTE(review): no tokenizer call is visible in the cells shown here — confirm
# this download is still required.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Load the Latin-script token list; the preview shows one token per row
# together with the manuscript name in the 'file' column.
# NOTE(review): pandas' default NA parsing turns cells such as 'NA' or 'null'
# into missing values — confirm no token is silently affected.
latin_df = pd.read_excel('latin.xlsx')
latin_df.head()

Unnamed: 0,token,file
0,'abdi,Tarikhul Auliya
1,'abdil-muṭṭalib,Tarikhul Auliya
2,'abdillāh,Tarikhul Auliya
3,'abdul-jalīl,Tarikhul Auliya
4,'abdul-majīd,Tarikhul Auliya


In [None]:
# Load the Pegon-script token list; the preview shows one token per row
# together with the manuscript name in the 'file' column.
# NOTE(review): pandas' default NA parsing turns cells such as 'NA' or 'null'
# into missing values — confirm no token is silently affected.
pegon_df = pd.read_excel('pegon.xlsx')
pegon_df.head()

Unnamed: 0,token,file
0,﷽,Tarikhul Auliya
1,١,Tarikhul Auliya
2,١,Ilmu Tasawwuf
3,١٠,Tarikhul Auliya
4,١١,Tarikhul Auliya


# Compare Manuscript Characters to the Pegon-Latin Character Rules

In [None]:
def get_unique_characters_and_unicode_df(df, token_col='token'):
    """Tabulate the distinct characters that occur in a token column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tokens.
    token_col : str, default 'token'
        Name of the column that contains the tokens.

    Returns
    -------
    pandas.DataFrame
        One row per distinct character, sorted by code point, with the
        columns 'char' (the character) and 'utf16' (its code point as
        upper-case hex, zero-padded to at least four digits).

    Notes
    -----
    Despite its name, 'utf16' holds the Unicode code point returned by
    ``ord``, not UTF-16 code units; characters above U+FFFF are therefore
    rendered with five or more hex digits.
    """
    # Skip missing cells and stringify the rest so that str.join does not
    # raise on NaN or numeric tokens.
    tokens = df[token_col].dropna().astype(str)
    unique_characters = sorted(set(''.join(tokens)))
    unicode_data = [{'char': char, 'utf16': f'{ord(char):04X}'} for char in unique_characters]
    # Naming the columns keeps 'char' and 'utf16' present even when no
    # character was found, so later column look-ups still work.
    return pd.DataFrame(unicode_data, columns=['char', 'utf16'])

In [None]:
'''
Extract unique characters from Pegon tokens and its utf16
'''

# The resulting 'utf16' column is the hex code point that
# get_unique_characters_and_unicode_df derives with ord(); rows are sorted by
# character.
pegon_unicode_df = get_unique_characters_and_unicode_df(pegon_df)
pegon_unicode_df.head()

Unnamed: 0,char,utf16
0,I,0049
1,K,004B
2,N,004E
3,R,0052
4,S,0053


In [None]:
'''
Extract unique characters from Latin tokens and its utf16
'''

# The resulting 'utf16' column is the hex code point that
# get_unique_characters_and_unicode_df derives with ord(); rows are sorted by
# character.
latin_unicode_df = get_unique_characters_and_unicode_df(latin_df)
latin_unicode_df.head()

Unnamed: 0,char,utf16
0,',0027
1,-,002D
2,0,0030
3,1,0031
4,2,0032


In [None]:
# Hex code points (as upper-case strings) of the characters covered by the
# Pegon rules. They are matched against the 'utf16' column, so each entry must
# use the same formatting: at least four hex digits, five for code points
# above U+FFFF (e.g. "10EC2").
pegon_rules_unicodes = [
    "0621", "0622", "0623", "0624", "0625", "0626", "0627", "0628", "0629",
    "062A", "062B", "062C", "062D", "062E", "062F", "0630", "0631", "0632",
    "0633", "0634", "0635", "0636", "0637", "0638", "0639", "063A", "063C",
    "0641", "0642", "0643", "0644", "0645", "0646", "0647", "0648", "0649",
    "064A", "064B", "064C", "064D", "064E", "064F", "0650", "0651", "0652",
    "0653", "0656", "0657", "0660", "0661", "0662", "0663", "0664", "0665",
    "0666", "0667", "0668", "0669", "0670", "067E", "0684", "0686", "068A",
    "06A0", "06A4", "06AC", "06AD", "06AE", "06BD", "06CB", "06D0", "06D1",
    "06E1", "0762", "0763", "088B", "088C", "088D", "08E4", "08AE", "08B4",
    "10EC2", "10EC3", "10EC4"
]

In [None]:
# Hex code points (as upper-case strings) of the characters covered by the
# Latin transliteration rules. They are matched against the 'utf16' column, so
# each entry must be formatted as at least four upper-case hex digits.
# NOTE(review): "0070" was listed twice and "0078" (x) is absent from the
# otherwise contiguous a-z run; the duplicate is dropped here — confirm whether
# "0078" was the intended second entry.
latin_rules_unicodes = [
    "0027", "0030", "0031", "0032", "0033", "0034", "0035", "0036", "0037",
    "0038", "0039", "0060", "0061", "0062", "0063", "0064", "0065", "0066",
    "0067", "0068", "0069", "006A", "006B", "006C", "006D", "006E", "006F",
    "0070", "0071", "0072", "0073", "0074", "0075", "0076", "0077",
    "0079", "007A", "00F1", "0101", "011B", "0121", "012B", "015B", "016B",
    "1E0B", "1E0D", "1E0F", "1E35", "1E45", "1E63", "1E6B", "1E6D", "1E6F",
    "1E95", "1E96"
]

In [None]:
'''
Find character in Pegon and Latin Token which not included on rules
'''

# Keep only the rows whose hex code is absent from the matching rule list.
non_pegon_unicode = pegon_unicode_df.loc[
    lambda chars: ~chars['utf16'].isin(pegon_rules_unicodes)
]
non_latin_unicode = latin_unicode_df.loc[
    lambda chars: ~chars['utf16'].isin(latin_rules_unicodes)
]

In [None]:
# Characters found in the Pegon tokens whose code is not in pegon_rules_unicodes.
non_pegon_unicode

Unnamed: 0,char,utf16
0,I,0049
1,K,004B
2,N,004E
3,R,0052
4,S,0053
5,T,0054
6,i,0069
33,ؿ,063F
69,ک,06A9
71,گ,06AF


In [None]:
# Characters found in the Latin tokens whose code is not in latin_rules_unicodes.
non_latin_unicode

Unnamed: 0,char,utf16
1,-,002D
36,ê,00EA
41,ō,014D
44,ž,017E
48,ḥ,1E25
51,ṡ,1E61
56,ẑ,1E91
57,ẓ,1E93
59,ⅲ,2172


# Analyze Characters

In [None]:
def char_stats(series):
    """Summarise the characters contained in a collection of tokens.

    Parameters
    ----------
    series : pandas.Series or iterable
        Tokens to analyse. Missing values are ignored and non-string values
        are converted with ``str`` before counting.

    Returns
    -------
    tuple
        ``(char_count, unique_char_count, unique_char_list)`` where
        ``char_count`` is the total number of characters over all tokens,
        ``unique_char_count`` the number of distinct characters, and
        ``unique_char_list`` those characters sorted by code point.
    """
    # dtype=object keeps the values as given (no numeric upcasting) and avoids
    # dtype inference on empty input; NaN/None would make str.join raise.
    tokens = pd.Series(series, dtype=object).dropna().astype(str)
    concatenated = ''.join(tokens)
    # Sorting makes the list reproducible; plain set order changes between
    # interpreter sessions.
    unique_char_list = sorted(set(concatenated))
    return len(concatenated), len(unique_char_list), unique_char_list

In [None]:
def calculate_char_stats(df, group_by_col='language', token_col='token'):
    """Compute character statistics separately for every group.

    Groups ``df`` by ``group_by_col``, applies ``char_stats`` to the
    ``token_col`` values of each group and spreads the resulting tuple over
    the columns 'char count', 'unique char count' and 'unique char list'.
    Returns one row per group, with ``group_by_col`` as the first column.
    """
    stat_columns = ['char count', 'unique char count', 'unique char list']

    # One row per group; the token column temporarily holds the stats tuple.
    per_group = df.groupby(group_by_col)[token_col].agg(char_stats).reset_index()

    # Expand each tuple into its own columns, aligned on the row index.
    expanded = pd.DataFrame(per_group[token_col].tolist(), index=per_group.index)
    per_group[stat_columns] = expanded

    return per_group.drop(columns=token_col)

In [None]:
def calculate_overall_stats(df, group_by_col='language', token_col='token'):
    """Compute character statistics over the whole frame.

    Applies ``char_stats`` to every value in ``token_col`` and returns a
    single-row frame whose ``group_by_col`` cell is the label 'all', followed
    by 'char count', 'unique char count' and 'unique char list'.
    """
    header = [group_by_col, 'char count', 'unique char count', 'unique char list']
    total_chars, distinct_count, distinct_chars = char_stats(df[token_col])
    summary_row = ['all', total_chars, distinct_count, distinct_chars]
    return pd.DataFrame([summary_row], columns=header)

In [None]:
def find_char_stats(df, token_col='token', group_by_col='language'):
    """Return per-group character statistics followed by an overall row.

    The per-group rows come from ``calculate_char_stats`` and the final
    'all' row from ``calculate_overall_stats``; the combined frame is
    re-indexed from zero.
    """
    frames = [
        calculate_char_stats(df, group_by_col, token_col),
        calculate_overall_stats(df, group_by_col, token_col),
    ]
    return pd.concat(frames, ignore_index=True)

## Analyze Characters per Language

In [None]:
'''
Add language column to analyze per language
'''

# Files labelled 'Jawa'; every other file is labelled 'Sunda'.
JAWA_FILES = ['Tarikhul Auliya', 'Mitro Sejati']

# Both labels are capitalised, matching the 'language' values of the final
# dataset ('Jawa' / 'Sunda'), so the per-language tables use the same keys.
latin_df['language'] = np.where(latin_df['file'].isin(JAWA_FILES), 'Jawa', 'Sunda')
pegon_df['language'] = np.where(pegon_df['file'].isin(JAWA_FILES), 'Jawa', 'Sunda')

In [None]:
'''
Count Latin Characters per Language
'''

# One row per language plus a final 'all' row.
latin_char_df = find_char_stats(latin_df, token_col='token', group_by_col='language')
latin_char_df

Unnamed: 0,language,char count,unique char count,unique char list
0,Jawa,15058,57,"[ṭ, ā, ṅ, z, g, ḥ, ḏ, f, ñ, d, ḋ, o, 7, ū, 3, ..."
1,sunda,27361,53,"[ā, ṭ, z, g, ḥ, f, d, ḋ, o, 7, ū, 3, 5, 9, ẑ, ..."
2,all,42419,60,"[ṭ, ā, z, ṅ, g, ḥ, ḏ, f, ñ, d, ḋ, o, 7, ū, 3, ..."


In [None]:
'''
Count Pegon Characters per Language
'''

# One row per language plus a final 'all' row.
pegon_char_df = find_char_stats(pegon_df, token_col='token', group_by_col='language')
pegon_char_df

Unnamed: 0,language,char count,unique char count,unique char list
0,Jawa,24461,78,"[آ, ی, K, م, ا, N, ٦, ٠, ٥, ظ, ر, ٨, ّ, گ, ل, ..."
1,sunda,39593,69,"[آ, ف, ة, ی, ٢, ‌, ذ, پ, چ, ڠ, ى, ض, ۲, ٰ, م, ..."
2,all,64054,86,"[آ, ی, ‌, K, ݘ, م, ا, N, ٦, ٠, ٥, ظ, ر, ٨, ّ, ..."


## Analyze Characters per File

In [None]:
'''
Count Latin Characters per Manuscripts
'''

# One row per manuscript file plus a final 'all' row.
latin_char_df = find_char_stats(latin_df, token_col='token', group_by_col='file')
latin_char_df

Unnamed: 0,file,char count,unique char count,unique char list
0,Ilmu Tasawwuf,5037,46,"[ā, ṭ, z, g, ḥ, f, d, o, 7, ū, 3, 5, 9, ẑ, ṣ, ..."
1,Mitro Sejati,3970,37,"[ā, ṭ, z, g, ḥ, f, d, o, ū, ẑ, ṣ, w, ṡ, c, y, ..."
2,Qisasul Anbiya,22324,51,"[ā, z, ṭ, g, ḥ, f, d, ḋ, o, 7, ū, 3, 5, ẑ, ṣ, ..."
3,Tarikhul Auliya,11088,57,"[ṭ, ā, ṅ, z, g, ḥ, ḏ, f, ñ, d, ḋ, o, 7, ū, 3, ..."
4,all,42419,60,"[ṭ, ā, z, ṅ, g, ḥ, ḏ, f, ñ, d, ḋ, o, 7, ū, 3, ..."


In [None]:
'''
Count Pegon Characters per Manuscripts
'''

# One row per manuscript file plus a final 'all' row, computed on the Pegon
# tokens (the description above previously said "Latin" by mistake).
pegon_char_df = find_char_stats(pegon_df, group_by_col='file')
pegon_char_df

Unnamed: 0,file,char count,unique char count,unique char list
0,Ilmu Tasawwuf,6817,62,"[آ, ف, ة, ی, ٢, ذ, پ, چ, ڠ, ى, ض, ٰ, إ, ا, م, ..."
1,Mitro Sejati,6085,49,"[ف, ة, ٢, ذ, پ, چ, ڠ, ى, ض, ٰ, م, ا, ࢍ, ْ, ب, ..."
2,Qisasul Anbiya,32776,58,"[آ, ف, ة, ٢, ‌, ذ, پ, چ, ڠ, ى, ض, ۲, ٰ, م, ا, ..."
3,Tarikhul Auliya,18376,76,"[آ, ف, ﷽, ة, ی, ٢, ذ, چ, ڤ, ڠ, ى, Ⅲ, K, ض, ࢌ, ..."
4,all,64054,86,"[آ, ی, ‌, K, ݘ, م, ا, N, ٦, ٠, ٥, ظ, ر, ٨, ّ, ..."


## Analyze Characters in the Final Dataset

In [None]:
# Load the final paired dataset; the preview shows a 'language' label and the
# 'pegon' / 'latin' forms of each token side by side.
# NOTE(review): pandas' default NA parsing turns cells such as 'NA' or 'null'
# into missing values — confirm no token is silently affected.
df = pd.read_csv('Jawa-Sunda.csv')
df.head()

Unnamed: 0,language,pegon,latin
0,Jawa,تاريخ,tariḵ
1,Jawa,والى,wali
2,Jawa,سڠا,sṅa
3,Jawa,نࣤرَاڠَاكࣤنْ,něraṅakěn
4,Jawa,بَبَادْاِيْفُونْ,babadipun


In [None]:
'''
Count Latin Characters per Language
'''

# Same statistics as before, now read from the 'latin' column of the final dataset.
latin_char_df = find_char_stats(df, token_col='latin', group_by_col='language')
latin_char_df

Unnamed: 0,language,char count,unique char count,unique char list
0,Jawa,15135,54,"[ṭ, ḏ, ṅ, ā, z, g, f, ñ, d, ḋ, o, 7, 3, 5, 9, ..."
1,Sunda,26656,42,"[ā, ḏ, ṅ, z, g, ñ, d, ḋ, o, ẕ, ṣ, w, ḵ, ġ, c, ..."
2,all,41791,54,"[ṭ, ḏ, ṅ, ā, z, g, f, ñ, d, ḋ, o, 7, 3, 5, 9, ..."


In [None]:
'''
Count Pegon Characters per Language
'''

# Same statistics as before, now read from the 'pegon' column of the final dataset.
pegon_char_df = find_char_stats(df, token_col='pegon', group_by_col='language')
pegon_char_df

Unnamed: 0,language,char count,unique char count,unique char list
0,Jawa,23177,65,"[آ, ࣤ, ف, ة, ٢, ذ, چ, ڤ, ڠ, ى, ࢌ, ض, ٰ, م, ا, ..."
1,Sunda,37423,54,"[آ, ࣤ, ف, ة, ٢, ذ, چ, ڠ, ى, ض, ٰ, م, ا, ڮ, إ, ..."
2,all,60600,68,"[آ, ࣤ, ف, ة, ٢, ذ, چ, ڤ, ڠ, ى, ࢌ, ض, ٰ, م, ا, ..."
