In [None]:
import nltk
import pandas as pd
import string, re
import numpy as np
from collections import Counter

In [None]:
# Fetch NLTK's 'punkt' data package (the log below shows it is a no-op when already present).
# NOTE(review): no NLTK tokenizer call is visible in this part of the notebook - confirm the download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Load the Latin-script tokens; the preview shows one token per row plus the manuscript (`file`) it came from.
latin_df = pd.read_excel('latin.xlsx')
latin_df.head()

Unnamed: 0,token,file
0,'abdi,Tarikhul Auliya
1,'abdil-muṭṭalib,Tarikhul Auliya
2,'abdillāh,Tarikhul Auliya
3,'abdul-jalīl,Tarikhul Auliya
4,'abdul-majīd,Tarikhul Auliya


In [None]:
# Load the Pegon-script tokens; same layout as the Latin sheet (`token`, `file`).
pegon_df = pd.read_excel('pegon.xlsx')
pegon_df.head()

Unnamed: 0,token,file
0,﷽,Tarikhul Auliya
1,١,Tarikhul Auliya
2,١,Ilmu Tasawwuf
3,١٠,Tarikhul Auliya
4,١١,Tarikhul Auliya


# Compare Character on Manuscript to Pegon-Latin Character Rules

In [None]:
def get_unique_characters_and_unicode_df(df, token_col='token'):
    """Count every distinct character found in a column of tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tokens.
    token_col : str, default 'token'
        Name of the column whose values are scanned.

    Returns
    -------
    pandas.DataFrame
        One row per distinct character, in order of first appearance, with
        the columns ``char``, ``frequency`` and ``utf16``. Despite its name,
        ``utf16`` holds the Unicode code point as upper-case hex, zero-padded
        to at least four digits (e.g. ``'0061'``; characters outside the BMP
        yield five digits such as ``'10EC2'``). The three columns are present
        even when no character was found.
    """
    columns = ['char', 'frequency', 'utf16']

    # Missing cells (NaN/None) would make str.join raise TypeError, so drop
    # them and coerce whatever is left to str before concatenating.
    tokens = df[token_col].dropna().astype(str)

    # Counter keeps first-seen order, which fixes the row order of the result.
    character_counts = Counter(''.join(tokens))

    unicode_data = [
        {'char': char, 'frequency': freq, 'utf16': f'{ord(char):04X}'}
        for char, freq in character_counts.items()
    ]

    # Passing `columns` guarantees the schema for an empty result as well.
    return pd.DataFrame(unicode_data, columns=columns)

In [None]:
'''
Extract the unique characters of the Pegon tokens together with their
frequency and hexadecimal code (the `utf16` column)
'''

pegon_unicode_df = get_unique_characters_and_unicode_df(pegon_df)
pegon_unicode_df

Unnamed: 0,char,frequency,utf16
0,﷽,1,FDFD
1,١,59,0661
2,٠,9,0660
3,٢,137,0662
4,٣,12,0663
...,...,...,...
81,i,1,0069
82,ک,8,06A9
83,ࢌ,2,088C
84,‌,5,200C


In [None]:
'''
Extract the unique characters of the Latin tokens together with their
frequency and hexadecimal code (the `utf16` column)
'''

latin_unicode_df = get_unique_characters_and_unicode_df(latin_df)
latin_unicode_df

Unnamed: 0,char,frequency,utf16
0,',357,0027
1,a,8263,0061
2,b,1095,0062
3,d,960,0064
4,i,2918,0069
5,l,1851,006C
6,-,228,002D
7,m,1708,006D
8,u,2898,0075
9,ṭ,71,1E6D


In [None]:
# Hex codes of the characters covered by the Pegon side of the rule set.
# They are compared as strings against the `utf16` column, so the spelling
# (upper-case hex, at least four digits) must match exactly; the last three
# entries have five digits.
pegon_rules_unicodes = [
    "0621", "0622", "0623", "0624", "0625", "0626", "0627", "0628", "0629",
    "062A", "062B", "062C", "062D", "062E", "062F", "0630", "0631", "0632",
    "0633", "0634", "0635", "0636", "0637", "0638", "0639", "063A", "063C",
    "0641", "0642", "0643", "0644", "0645", "0646", "0647", "0648", "0649",
    "064A", "064B", "064C", "064D", "064E", "064F", "0650", "0651", "0652",
    "0653", "0656", "0657", "0660", "0661", "0662", "0663", "0664", "0665",
    "0666", "0667", "0668", "0669", "0670", "067E", "0684", "0686", "068A",
    "06A0", "06A4", "06AC", "06AD", "06AE", "06BD", "06CB", "06D0", "06D1",
    "06E1", "0762", "0763", "088B", "088C", "088D", "08E4", "08AE", "08B4",
    "10EC2", "10EC3", "10EC4"
]

In [None]:
# Hex codes of the characters covered by the Latin side of the rule set.
# They are compared as strings against the `utf16` column, so the spelling
# (upper-case hex, four digits) must match exactly. Each code is listed once.
# NOTE(review): "0078" ('x') is absent between "0077" and "0079" - confirm
# that this is intentional.
latin_rules_unicodes = [
    "0027", "0030", "0031", "0032", "0033", "0034", "0035", "0036", "0037",
    "0038", "0039", "0060", "0061", "0062", "0063", "0064", "0065", "0066",
    "0067", "0068", "0069", "006A", "006B", "006C", "006D", "006E", "006F",
    "0070", "0071", "0072", "0073", "0074", "0075", "0076", "0077", "0079",
    "007A", "00F1", "0101", "011B", "0121", "012B", "015B", "016B", "1E0B",
    "1E0D", "1E0F", "1E35", "1E45", "1E63", "1E6B", "1E6D", "1E6F", "1E95",
    "1E96"
]

In [None]:
'''
Find the characters in the Pegon and Latin tokens that are not covered by the rules
'''

non_pegon_unicode = pegon_unicode_df[~pegon_unicode_df['utf16'].isin(pegon_rules_unicodes)]
non_latin_unicode = latin_unicode_df[~latin_unicode_df['utf16'].isin(latin_rules_unicodes)]

In [None]:
# Pegon-token characters whose code is missing from `pegon_rules_unicodes`.
non_pegon_unicode

Unnamed: 0,char,frequency,utf16
0,﷽,1,FDFD
12,۳,1,06F3
13,Ⅲ,1,2162
14,K,1,004B
15,T,1,0054
16,N,1,004E
17,R,1,0052
18,I,1,0049
19,S,1,0053
51,ی,19,06CC


In [None]:
# Latin-token characters whose code is missing from `latin_rules_unicodes`.
non_latin_unicode

Unnamed: 0,char,frequency,utf16
6,-,228,002D
28,ẑ,71,1E91
29,ẓ,25,1E93
45,ê,900,00EA
46,ḥ,138,1E25
52,ṡ,36,1E61
57,ⅲ,1,2172
58,ō,1,014D
59,ž,1,017E


# Analyze Character

In [None]:
def char_stats(series):
    """Summarise the characters contained in a collection of tokens.

    Parameters
    ----------
    series : iterable
        Tokens to analyse, typically a pandas Series of strings. Missing
        values (None/NaN) are skipped; any other non-string value is
        converted with ``str()``.

    Returns
    -------
    tuple
        ``(char_count, unique_char_count, unique_char_list)`` where
        ``char_count`` is the total number of characters over all tokens,
        ``unique_char_count`` the number of distinct characters, and
        ``unique_char_list`` those characters sorted by code point so the
        result is reproducible between kernel runs.
    """
    # str.join raises TypeError on NaN/None items, so filter them out first.
    concatenated = ''.join(str(token) for token in series if not pd.isna(token))

    # Sorting replaces the arbitrary iteration order of a set of strings.
    unique_chars = sorted(set(concatenated))

    return len(concatenated), len(unique_chars), unique_chars

In [None]:
def calculate_char_stats(df, group_by_col='language', token_col='token'):
    """Compute character statistics separately for every group of rows.

    Rows of ``df`` are grouped by ``group_by_col`` and ``char_stats`` is
    applied to the ``token_col`` values of each group.

    Returns a DataFrame with one row per group and the columns
    ``group_by_col``, 'char count', 'unique char count' and 'unique char list'.
    """
    stat_columns = ['char count', 'unique char count', 'unique char list']

    # Each cell of `token_col` now holds the 3-tuple returned by char_stats.
    per_group = df.groupby(group_by_col)[token_col].agg(char_stats).reset_index()

    # Unpack the tuples into three proper columns, aligned on the same index.
    unpacked = pd.DataFrame(per_group[token_col].tolist(), index=per_group.index)
    per_group[stat_columns] = unpacked

    # The tuple column is no longer needed once it has been expanded.
    return per_group.drop(columns=token_col)

In [None]:
def calculate_overall_stats(df, group_by_col='language', token_col='token'):
    """Compute character statistics over all rows of ``df`` at once.

    Returns a single-row DataFrame whose ``group_by_col`` cell is the label
    'All', followed by 'char count', 'unique char count' and
    'unique char list' as produced by ``char_stats`` on ``df[token_col]``.
    """
    header = [group_by_col, 'char count', 'unique char count', 'unique char list']

    total_chars, n_unique, unique_list = char_stats(df[token_col])
    summary_row = ['All', total_chars, n_unique, unique_list]

    return pd.DataFrame([summary_row], columns=header)

In [None]:
def find_char_stats(df, token_col='token', group_by_col='language'):
    """Build the per-group character statistics followed by an 'All' row.

    Note that the positional order of ``token_col`` and ``group_by_col`` here
    is the reverse of the two helpers, so they are forwarded by keyword.

    Returns a DataFrame with a fresh 0..n-1 index: one row per group from
    ``calculate_char_stats`` and a final row from ``calculate_overall_stats``.
    """
    frames = [
        calculate_char_stats(df, group_by_col=group_by_col, token_col=token_col),
        calculate_overall_stats(df, group_by_col=group_by_col, token_col=token_col),
    ]
    return pd.concat(frames, ignore_index=True)

## Analyze Character per Language

In [None]:
'''
Add a language column so the tokens can be analysed per language.
Tokens from 'Tarikhul Auliya' and 'Mitro Sejati' are labelled 'Jawa'; every
other file is labelled 'Sunda' (capitalised to match the labels of the final
dataset).
'''

latin_df['language'] = np.where(latin_df['file'].isin(['Tarikhul Auliya', 'Mitro Sejati']), 'Jawa', 'Sunda')
pegon_df['language'] = np.where(pegon_df['file'].isin(['Tarikhul Auliya', 'Mitro Sejati']), 'Jawa', 'Sunda')

In [None]:
'''
Count the Latin characters per language (plus an 'All' row)
'''

latin_char_df = find_char_stats(latin_df)
latin_char_df

Unnamed: 0,language,char count,unique char count,unique char list
0,Jawa,15058,57,"[ḋ, ś, ẑ, ḍ, ṅ, n, t, ī, s, ḥ, p, o, l, 1, ṣ, ..."
1,sunda,27361,53,"[ś, ḋ, ẑ, ḍ, n, t, ī, s, ḥ, p, o, l, 1, ṣ, i, ..."
2,All,42419,60,"[ś, ḋ, ẑ, ḍ, ṅ, n, t, ī, s, ḥ, p, o, l, 1, ṣ, ..."


In [None]:
'''
Count the Pegon characters per language (plus an 'All' row)
'''

pegon_char_df = find_char_stats(pegon_df)
pegon_char_df

Unnamed: 0,language,char count,unique char count,unique char list
0,Jawa,24461,78,"[ى, ل, ٠, ِ, آ, ج, ے, ڤ, ث, س, ّ, ۳, ه, ط, ب, ..."
1,sunda,39593,69,"[ش, ح, ‌, ى, ٦, م, ٩, أ, ل, ٰ, پ, ظ, ٤, َ, ِ, ..."
2,All,64054,86,"[ى, ل, ٠, ِ, آ, ۲, ج, ݤ, ے, ڤ, ؿ, ث, س, إ, ّ, ..."


## Analyze Character per File

In [None]:
'''
Count the Latin characters per manuscript (grouped by the `file` column)
'''

latin_char_df = find_char_stats(latin_df, group_by_col='file')
latin_char_df

Unnamed: 0,file,char count,unique char count,unique char list
0,Ilmu Tasawwuf,5037,46,"[ẑ, ḍ, n, t, ī, s, ḥ, p, o, l, 1, ṣ, i, ṭ, z, ..."
1,Mitro Sejati,3970,37,"[ẑ, ḍ, t, n, ī, ḥ, s, o, l, ṣ, i, ṭ, z, ṡ, ū, ..."
2,Qisasul Anbiya,22324,51,"[ś, ḋ, ẑ, ḍ, n, t, ī, s, ḥ, p, o, l, 1, ṣ, i, ..."
3,Tarikhul Auliya,11088,57,"[ḋ, ś, ẑ, ḍ, ṅ, n, t, ī, s, ḥ, p, o, l, 1, ṣ, ..."
4,All,42419,60,"[ś, ḋ, ẑ, ḍ, ṅ, n, t, ī, s, ḥ, p, o, l, 1, ṣ, ..."


In [None]:
'''
Count the Pegon characters per manuscript (grouped by the `file` column)
'''

pegon_char_df = find_char_stats(pegon_df, group_by_col='file')
pegon_char_df

Unnamed: 0,file,char count,unique char count,unique char list
0,Ilmu Tasawwuf,6817,62,"[ش, ح, ى, ٦, م, ٩, ٰ, ل, پ, ظ, أ, ٤, َ, ِ, ه, ..."
1,Mitro Sejati,6085,49,"[ش, ح, ى, م, ٰ, ظ, أ, ل, پ, َ, ِ, ه, ط, ع, ب, ..."
2,Qisasul Anbiya,32776,58,"[ش, ح, ‌, ى, م, أ, ٰ, ظ, ل, پ, َ, ِ, ه, ع, ط, ..."
3,Tarikhul Auliya,18376,76,"[ش, ح, ى, م, ٦, ٩, ۳, ٰ, K, ل, أ, ظ, ٠, ٤, َ, ..."
4,All,64054,86,"[ى, ل, ٠, ِ, آ, ۲, ج, ݤ, ے, ڤ, ؿ, ث, س, إ, ّ, ..."


## Analyze Character on Final Dataset

In [None]:
# Load the final paired dataset; the preview shows the columns `language`, `pegon` and `latin`.
# NOTE(review): the character-frequency output for the `pegon` column further down lists one
# U+FEFF (byte-order mark) and one '-' - presumably stray artefacts; confirm whether they
# should be cleaned before the per-language CSV files are written.
df = pd.read_csv('Jawa-Sunda.csv')
df.head()

Unnamed: 0,language,pegon,latin
0,Jawa,تاريخ,tariḵ
1,Jawa,والى,wali
2,Jawa,سڠا,sṅa
3,Jawa,نࣤرَاڠَاكࣤنْ,něraṅakěn
4,Jawa,بَبَادْاِيْفُونْ,babadipun


In [None]:
'''
Count the Latin characters per language in the final dataset
'''

latin_char_df = find_char_stats(df, token_col='latin')
latin_char_df

Unnamed: 0,language,char count,unique char count,unique char list
0,Jawa,15147,54,"[ḋ, ś, ḍ, ṅ, t, n, ī, s, o, l, 1, ṣ, i, ṭ, z, ..."
1,Sunda,26693,42,"[ḋ, ś, ṅ, t, n, ī, s, p, o, l, ṣ, i, z, ṯ, `, ..."
2,All,41840,54,"[ḋ, ś, ḍ, ṅ, t, n, ī, s, o, l, 1, ṣ, i, ṭ, z, ..."


In [None]:
'''
Count the Pegon characters per language in the final dataset
'''

pegon_char_df = find_char_stats(df, token_col='pegon')
pegon_char_df

Unnamed: 0,language,char count,unique char count,unique char list
0,Jawa,23177,65,"[ش, ح, ى, م, ظ, ٰ, أ, ل, ٦, ٩, ٠, ٤, َ, ِ, ه, ..."
1,Sunda,37423,54,"[ش, ح, ى, م, ٰ, أ, ظ, ل, ِ, َ, ه, ع, ط, ب, ق, ..."
2,All,60600,68,"[ش, ح, ى, م, ظ, ٰ, أ, ل, ٦, ٩, ٠, ٤, َ, ِ, ه, ..."


# Save Final dataset

In [None]:
'''
Save the Javanese (Jawa) part of the dataset
'''

# Row filter and column selection in a single .loc call.
df_jawa = df.loc[df['language'] == 'Jawa', ['pegon', 'latin']]
df_jawa.to_csv('Jawa.csv', index=False)
df_jawa.shape

(2483, 2)

In [None]:
'''
Save the Sundanese (Sunda) part of the dataset
'''

# Row filter and column selection in a single .loc call; the frame already
# holds only the two columns that are written out.
df_sunda = df.loc[df['language'] == 'Sunda', ['pegon', 'latin']]
df_sunda.to_csv('Sunda.csv', index=False)
df_sunda.shape

(4103, 2)

In [None]:
'''
Save the dataset for all languages
'''

# Keep every row, but only the two token columns.
df_all = df.loc[:, ['pegon', 'latin']]
df_all.to_csv('All.csv', index=False)
df_all.shape

(6586, 2)

## Analyze Character on Each Final Dataset

### All Dataset

In [None]:
'''
Frequency of each character in the pegon column (all languages), rarest first
'''

freq_df = get_unique_characters_and_unicode_df(df_all, 'pegon')
freq_df.sort_values('frequency').head(15)

Unnamed: 0,char,frequency,utf16
66,﻿,1,FEFF
63,-,1,002D
25,ࢌ,2,088C
43,ٖ,5,0656
57,٠,9,0660
60,٦,11,0666
56,٧,12,0667
55,٣,12,0663
59,٤,13,0664
61,٨,13,0668


In [None]:
# Most frequent characters of the `freq_df` assigned in the preceding cell (pegon column, all languages).
freq_df.sort_values('frequency').tail(15)

Unnamed: 0,char,frequency,utf16
8,س,1375,0633
0,ت,1501,062A
13,ك,1624,0643
23,م,1648,0645
11,ࣤ,1691,08E4
6,ل,1741,0644
2,ر,1935,0631
3,ي,2402,064A
19,ُ,2649,064F
5,و,2845,0648


In [None]:
'''
Frequency of each character in the latin column (all languages), rarest first
'''

freq_df = get_unique_characters_and_unicode_df(df_all, 'latin')
freq_df.sort_values('frequency').head(15)

Unnamed: 0,char,frequency,utf16
21,ṭ,2,1E6D
39,ī,5,012B
48,0,9,0030
51,6,11,0036
47,7,12,0037
46,3,12,0033
50,4,13,0034
52,8,13,0038
53,9,16,0039
49,5,17,0035


In [None]:
# Most frequent characters of the `freq_df` assigned in the preceding cell (latin column, all languages).
freq_df.sort_values('frequency').tail(15)

Unnamed: 0,char,frequency,utf16
17,h,953,0068
14,p,981,0070
12,b,1085,0062
8,ṅ,1260,1E45
7,s,1389,0073
11,k,1630,006B
0,t,1672,0074
19,m,1678,006D
10,ě,1692,011B
6,l,1714,006C


### Sunda Dataset

In [None]:
'''
Frequency of each character in the pegon column of the Sunda dataset, in ascending order
'''

freq_df = get_unique_characters_and_unicode_df(df_sunda, 'pegon')
freq_df.sort_values('frequency')

Unnamed: 0,char,frequency,utf16
51,﻿,1,FEFF
52,ٖ,1,0656
48,ظ,19,0638
44,ث,32,062B
50,ز,36,0632
27,غ,36,063A
47,ض,38,0636
20,ٰ,41,0670
35,ٌ,41,064C
33,ط,42,0637


In [None]:
'''
Frequency of each character in the latin column of the Sunda dataset, in ascending order
'''

freq_df = get_unique_characters_and_unicode_df(df_sunda, 'latin')
freq_df.sort_values('frequency')

Unnamed: 0,char,frequency,utf16
41,ī,1,012B
39,ẕ,20,1E95
36,ṫ,35,1E6B
25,ġ,36,0121
40,z,38,007A
38,ḏ,39,1E0F
30,ṯ,43,1E6F
32,2,51,0032
24,ḋ,53,1E0B
31,ś,56,015B


### Jawa Dataset

In [None]:
'''
Frequency of each character in the pegon column of the Jawa dataset, rarest first
'''

freq_df = get_unique_characters_and_unicode_df(df_jawa, 'pegon')
freq_df.sort_values('frequency').head(15)

Unnamed: 0,char,frequency,utf16
63,-,1,002D
34,ٌ,2,064C
25,ࢌ,2,088C
50,آ,2,0622
43,ٖ,4,0656
32,ظ,4,0638
42,ً,6,064B
36,ث,8,062B
39,ٍ,8,064D
57,٠,9,0660


In [None]:
# Most frequent characters of the `freq_df` assigned in the preceding cell (pegon column, Jawa dataset).
freq_df.sort_values('frequency').tail(15)

Unnamed: 0,char,frequency,utf16
7,ى,510,0649
8,س,526,0633
13,ك,557,0643
6,ل,587,0644
23,م,645,0645
11,ࣤ,774,08E4
2,ر,787,0631
3,ي,917,064A
19,ُ,953,064F
10,ن,1041,0646


In [None]:
'''
Frequency of each character in the latin column of the Jawa dataset
(shown unsorted, in order of first appearance)
'''

freq_df = get_unique_characters_and_unicode_df(df_jawa, 'latin')
freq_df

Unnamed: 0,char,frequency,utf16
0,t,556,0074
1,a,2934,0061
2,r,793,0072
3,i,1162,0069
4,ḵ,15,1E35
5,w,239,0077
6,l,577,006C
7,s,531,0073
8,ṅ,509,1E45
9,n,1063,006E
