In [None]:
import nltk
import pandas as pd
import string, re

In [None]:
# Download the NLTK 'punkt' tokenizer package.
# NOTE(review): none of the cells visible in this notebook call nltk afterwards
# (tokens are produced with str.split) — confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Read File Content

In [None]:
def read_file_content(path):
    """Return the complete text of the file at ``path``, decoded as UTF-8.

    Args:
        path: Location of the text file to read.

    Returns:
        The whole file content as one string.
    """
    with open(path, encoding='utf-8') as handle:
        return handle.read()

In [None]:
# Load the raw text of every manuscript into its own variable.
# Prefixes: ta = Tarikhul Auliya, it = Ilmu Tasawwuf, ms = Mitro Sejati,
# qa = Qisasul Anbiya.

# "Transkripsi" files (processed later with the 'pegon' rules).
ta_transkripsi, it_transkripsi, ms_transkripsi, qa_transkripsi = [
    read_file_content(filename)
    for filename in (
        "Transkripsi Tarikhul Auliya.txt",
        "Transkripsi Ilmu Tasawwuf.txt",
        "Transkripsi Mitro Sejati.txt",
        "Transkripsi Qisasul Anbiya.txt",
    )
]

# "Transliterasi" files (processed later with the 'latin' rules).
# NOTE(review): the last name uses a lower-case "anbiya", unlike its
# "Transkripsi" counterpart — confirm it matches the file on disk.
ta_transliterasi, it_transliterasi, ms_transliterasi, qa_transliterasi = [
    read_file_content(filename)
    for filename in (
        "Transliterasi Tarikhul Auliya.txt",
        "Transliterasi Ilmu Tasawwuf.txt",
        "Transliterasi Mitro Sejati.txt",
        "Transliterasi Qisasul anbiya.txt",
    )
]

In [None]:
# Preview the loaded text: print characters 500-699 of the Tarikhul Auliya
# transcription and transliteration. The two slices use the same character
# offsets, so they are not guaranteed to cover the same passage.
# NOTE(review): the first label is misspelled ("Trancription"); it is left
# untouched here so the printed text keeps matching the saved cell output.
print("Trancription: ", ta_transkripsi[500:700], sep="\n")

print()

print("Transliteration: ", ta_transliterasi[500:700], sep="\n")

Trancription: 
هُ تَعَالٰى وَكُلًّا نَقُصُّ عَلَيْكَ مِنْ اَنْبَاءِ الرُّسُلِ مَا نُثَبِّتُ بِهٖ فُؤَادَكَ، وَعَنْ سَعْدِ ابْنِ اَبِىْ وَقَّاصٍ رَضِيَ الله عَنْهُ اَنَّهُ قَالَ كَانَ اَبِىْ يُعَلِّمُنَا مَغَازِيَ رَ

Transliteration: 
ni abī waqqāṣir raḍiyallāh 'anhu annahu qāla kāna abī yu'allimunā magāziya rasūlillāhi wasarāyāhu fayaqūlu yā bunayya hāẑihī syarafu ābā`ikum falā tansau ẑikrahā) -
(qālaz-zainul-'irāqiyyu raḥimahullā


# Tokenize and Save to Dataframe

# Remove Punctuation

In [None]:
import re

# Demo input mixing hyphens that touch whitespace with in-word hyphens.
sample = "This is a -5 sample - text with - hyphens - surrounded by spaces. Also, consider 5-5 and 6-7."

# Delete (rather than replace with a space) every hyphen that has whitespace
# on at least one side: ' - ', ' -5' and '5- ' lose the hyphen, while '5-5'
# keeps it. The surrounding spaces are left alone, so ' - ' becomes two spaces.
hyphen_touching_space = re.compile(r'(?<=\s)-(?=\s)|(?<=\s)-|-(?=\s)')
text = hyphen_touching_space.sub('', sample)

print(text)


This is a 5 sample  text with  hyphens  surrounded by spaces. Also, consider 5-5 and 6-7.


In [None]:
def remove_latin_punctuation(text, aksara):
    """Strip punctuation from ``text`` and normalise hyphens and whitespace.

    Despite the name, both scripts are handled; ``aksara`` picks the rule set.

    Args:
        text: Raw manuscript text.
        aksara: ``'latin'`` lower-cases the text, maps curly apostrophes to
            ``'``, drops underscores and removes every character that is not
            a word character, whitespace, apostrophe or hyphen. Any other
            value applies the Pegon rule set, which deletes a fixed list of
            code points.

    Returns:
        The cleaned text with words separated by single spaces and no
        leading or trailing whitespace.
    """
    if aksara != 'latin':
        # Pegon rule set: delete the listed code points (BOM, ASCII and Arabic
        # punctuation, dashes, ornaments). The range \u002A-\u003A runs from
        # '*' to ':' and therefore also deletes ASCII digits and the ASCII
        # hyphen, which makes the hyphen clean-up below a no-op for Pegon.
        # NOTE(review): U+2018/U+201C (opening quotes) are listed but their
        # closing partners U+2019/U+201D are not — confirm this is intended.
        pegon_marks = r'[\ufeff\u066A-\u066C\u0023\u002B-\u002F\u003D\u0021-\u0022\u0026-\u0029\u002A-\u003A\u003F-\u0040\u005C\u005F\u060C\u060D\u061B\u061F\u0640\u06D4\u06DD\u06DE\u06E9\u2018\u201C\u25CC\u2663\uFD3E\uFD3F\u2013\u2014\u003B]'
        cleaned = re.sub(pegon_marks, '', text)
    else:
        cleaned = text.lower()
        # Unify curly apostrophes with the ASCII one and drop underscores,
        # which \w would otherwise keep.
        for old, new in (('‘', "'"), ('’', "'"), ('_', "")):
            cleaned = cleaned.replace(old, new)
        # Keep only word characters, whitespace, apostrophes and hyphens.
        cleaned = re.sub(r'[^\w\s\'\-]', '', cleaned)

    # Hyphen clean-up, in this order: hyphens at the very start/end of the
    # text, runs of two or more hyphens, then any hyphen touching whitespace.
    # Hyphens between two non-space characters survive.
    cleaned = cleaned.strip('-')
    cleaned = re.sub(r'-{2,}', '', cleaned)
    cleaned = re.sub(r'(?<=\s)-(?=\s)|(?<=\s)-|-(?=\s)', '', cleaned)

    # Collapse every whitespace run to one space and trim the ends.
    return re.sub(r'\s+', ' ', cleaned).strip()

# Tokenize and Create DataFrame

In [None]:
def create_df(file, aksara):
    """Clean a text and return its tokens as a one-column DataFrame.

    Args:
        file: The text to tokenise. This is the content itself, not a path;
            it is handed straight to ``remove_latin_punctuation``.
        aksara: Script selector forwarded to ``remove_latin_punctuation``.

    Returns:
        A DataFrame with a single ``'token'`` column holding one
        whitespace-separated token per row, indexed from 0.
    """
    tokens = remove_latin_punctuation(file, aksara).split()
    return pd.DataFrame(tokens, columns=['token']).reset_index(drop=True)

In [None]:
# (text, manuscript title) pairs for the transliterated sources.
latin_files = [
    (ta_transliterasi, 'Tarikhul Auliya'),
    (it_transliterasi, 'Ilmu Tasawwuf'),
    (ms_transliterasi, 'Mitro Sejati'),
    (qa_transliterasi, 'Qisasul Anbiya')
]

# One token table per manuscript, each row tagged with the manuscript title.
latin_dfs = []

for text, title in latin_files:
    tokens_df = create_df(text, 'latin').assign(file=title)
    if title == 'Ilmu Tasawwuf':
        # Keep only token positions 11..1899 of this manuscript.
        # NOTE(review): the reason for this window is not recorded in the
        # notebook — presumably it trims material absent from the Pegon
        # source; confirm and document.
        tokens_df = tokens_df.iloc[11:1900].reset_index(drop=True)
    latin_dfs.append(tokens_df)

In [None]:
# (text, manuscript title) pairs for the transcribed sources.
pegon_files = [
    (ta_transkripsi, 'Tarikhul Auliya'),
    (it_transkripsi, 'Ilmu Tasawwuf'),
    (ms_transkripsi, 'Mitro Sejati'),
    (qa_transkripsi, 'Qisasul Anbiya')
]

# One token table per manuscript, each row tagged with the manuscript title.
pegon_dfs = [
    create_df(text, 'pegon').assign(file=title)
    for text, title in pegon_files
]

In [None]:
# Stack the per-manuscript token tables into one table per script,
# renumbering the rows from 0 across all manuscripts.
latin_df = pd.concat(latin_dfs, ignore_index=True)
pegon_df = pd.concat(pegon_dfs, ignore_index=True)

In [None]:
'''
Concatenated Latin Result
'''

# Bare last expression: Jupyter renders the combined Latin token table
# (columns 'token' and 'file') as the cell output.
latin_df

Unnamed: 0,token,file
0,tariḵ,Tarikhul Auliya
1,wali,Tarikhul Auliya
2,saṅa,Tarikhul Auliya
3,nêraṅakên,Tarikhul Auliya
4,babadipun,Tarikhul Auliya
...,...,...
18136,iṣlahkeun,Qisasul Anbiya
18137,amal,Qisasul Anbiya
18138,žohir,Qisasul Anbiya
18139,batin,Qisasul Anbiya


In [None]:
'''
Concatenated Pegon Result
'''

# Bare last expression: Jupyter renders the combined Pegon token table
# (columns 'token' and 'file') as the cell output.
pegon_df

Unnamed: 0,token,file
0,تاريخ,Tarikhul Auliya
1,والى,Tarikhul Auliya
2,سڠا,Tarikhul Auliya
3,نٓرَاڠَاكٓنْ,Tarikhul Auliya
4,بَبَادْاِيْفُونْ,Tarikhul Auliya
...,...,...
18171,أفضل,Qisasul Anbiya
18172,الصلوات,Qisasul Anbiya
18173,على,Qisasul Anbiya
18174,سيد,Qisasul Anbiya


In [None]:
# Report the size of each combined token table. `.shape` is (rows, columns),
# so the first number is the token count and the second the column count.
print(f"Count Pegon Words: {pegon_df.shape}")
print(f"Count Latin Words: {latin_df.shape}")

Count Pegon Words: (18176, 2)
Count Latin Words: (18141, 2)


In [None]:
# Count the tokens of each manuscript, for both scripts.
pegon_word_counts = (
    pegon_df
    .groupby('file')
    .size()
    .reset_index(name='word_count')
)
print("Count of Pegon Words per File:\n", pegon_word_counts)

print()

latin_word_counts = (
    latin_df
    .groupby('file')
    .size()
    .reset_index(name='word_count')
)
print("Count of Latin Words per File:\n", latin_word_counts)

Count of Pegon Words per File:
               file  word_count
0    Ilmu Tasawwuf        1921
1     Mitro Sejati        1337
2   Qisasul Anbiya        9579
3  Tarikhul Auliya        5339

Count of Latin Words per File:
               file  word_count
0    Ilmu Tasawwuf        1889
1     Mitro Sejati        1320
2   Qisasul Anbiya        9655
3  Tarikhul Auliya        5277


# Remove Duplicates

In [None]:
# Remove duplicate rows from both token tables.
# No `subset` is given, so a row is a duplicate only when BOTH 'token' and
# 'file' match an earlier row: a word is kept once per manuscript, not once
# overall. The surviving rows keep their original index labels.
# The names are rebound instead of using inplace=True, so the frames produced
# by the concatenation cell are not mutated behind the reader's back.
pegon_df = pegon_df.drop_duplicates()
latin_df = latin_df.drop_duplicates()

In [None]:
# Report the table sizes again, now after duplicate rows were dropped.
# `.shape` is (rows, columns); the first number is the remaining row count.
print(f"Count Pegon Words: {pegon_df.shape}")
print(f"Count Latin Words: {latin_df.shape}")

Count Pegon Words: (7087, 2)
Count Latin Words: (6426, 2)


In [None]:
# Count the remaining tokens of each manuscript after duplicate removal.
# This overwrites the per-file count tables computed before de-duplication.
pegon_word_counts = (
    pegon_df
    .groupby('file')
    .size()
    .reset_index(name='word_count')
)
print("Count of Pegon Words per File:\n", pegon_word_counts)

print()

latin_word_counts = (
    latin_df
    .groupby('file')
    .size()
    .reset_index(name='word_count')
)
print("Count of Latin Words per File:\n", latin_word_counts)

Count of Pegon Words per File:
               file  word_count
0    Ilmu Tasawwuf         819
1     Mitro Sejati         731
2   Qisasul Anbiya        3609
3  Tarikhul Auliya        1928

Count of Latin Words per File:
               file  word_count
0    Ilmu Tasawwuf         755
1     Mitro Sejati         687
2   Qisasul Anbiya        3241
3  Tarikhul Auliya        1743


In [None]:
# Write each token table to its own Excel workbook in the working directory.
# index=False leaves the row labels out of the sheet.
for frame, workbook in ((latin_df, 'latin.xlsx'), (pegon_df, 'pegon.xlsx')):
    frame.to_excel(workbook, index=False)