# Corpora Cleaning, Tokenizing, Pickling

### Libraries

In [61]:
import pandas as pd
import arabic_cleaning as ac
import os, re
import xml.etree.ElementTree as ET
from datetime import datetime



In [21]:
#import nltk, glob, os, pickle

### Paths

Home Directory

In [24]:
# Home directory of the current user; every path below is built from it
hdir = os.path.expanduser('~')

# Inbox folder that receives generated reports
inbox_path = f"{hdir}/Dropbox/Active_Directories/Inbox"


# Root folder of the external corpora
ext_corp_path = f"{hdir}/Dropbox/Active_Directories/Digital_Humanities/Corpora"

# Root folder of the internal primary-source notes
int_corp_path = f"{hdir}/Dropbox/Active_Directories/Notes/Primary_Sources"

# Folder for the pickled, tokenized, cleaned corpora
pickle_path = f"{hdir}/Dropbox/Active_Directories/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

##### Pre-existing Corpora

In [26]:
# Indic narrative sources
indo_path = f"{ext_corp_path}/indo-persian_corpora"

# Transoxania narrative sources (Persian)
trans_path = f"{ext_corp_path}/machine_readable_persian_transoxania_texts"

# Khiva chancery documents
khiva_path = f"{ext_corp_path}/khiva_khanate_chancery_corpus"

# Muscovite Persian diplomatic documents
musc_path = f"{ext_corp_path}/khorezm_muscovy_diplomatic"

# Persian literature
# NOTE(review): this resolves to the same folder as pickle_path -- confirm
# that the Persian literature corpus really lives in the pickle directory.
perslit_path = f"{ext_corp_path}/pickled_tokenized_cleaned_corpora"

# Turkic narrative sources
turk_path = f"{ext_corp_path}/turkic_corpora"

##### Self-created Corpora

In [28]:
# Transoxania notes (not machine-readable)
trans_notes = f"{int_corp_path}/non-machine-readable_notes/transoxania_notes"

# Indian narrative manuscripts
indo_man_path = f"{int_corp_path}/non-machine-readable_notes/india_manuscripts"

# Transoxania narrative manuscripts
trans_man_path = f"{int_corp_path}/non-machine-readable_notes/transoxania_manuscripts"

# Transoxania documents (Bukhara XML)
trans_man_docs_path = f"{int_corp_path}/xml_notes_stage3_final/bukhara_xml"

# Hyderabad documents
hyd_man_docs_path = f"{int_corp_path}/xml_notes_stage3_final/hyderabad_xml"

# Indian documents (misc. transcribed)
indo_man_docs_path = f"{int_corp_path}/xml_notes_stage3_final/indic_corpus_xml"

# Qajar documents (misc. transcribed)
qajar_man_docs_path = f"{int_corp_path}/xml_notes_stage3_final/qajar_xml"

# NOTE(review): the `saf_` name suggests a separate (Safavid?) collection, yet
# this resolves to the same qajar_xml folder as qajar_man_docs_path -- confirm
# the intended directory.
saf_man_docs_path = f"{int_corp_path}/xml_notes_stage3_final/qajar_xml"

# Miscellaneous documents (misc. transcribed)
misc_man_docs_path = f"{int_corp_path}/xml_notes_stage3_final/misc_xml"


##### Unorganized Documents

In [30]:
# Stage 2: converted to XML, not yet sorted
parser_xml_path = f"{int_corp_path}/xml_notes_stage2/parser_depository"

# Stage 3: converted to XML, not yet sorted
updated_docs_path = f"{int_corp_path}/xml_notes_stage3_final/updater_repository"

# XML transcriptions from the old system, still to be updated
xml_old_sys_path = f"{int_corp_path}/xml_notes_stage2/xml_transcriptions_old_system"

# Stage 1: Markdown drafts
markdown_path = f"{int_corp_path}/transcription_markdown_drafting_stage1"

# Stage 1: Markdown backlog from the old system
md_backlog_path = f"{int_corp_path}/transcription_markdown_drafting_stage1/document_conversion_backlog"

Function to read plain-text and Markdown files into a dictionary keyed by file name

In [32]:
def read_text_and_md_files(directory, existing_data=None):
    """Collect the contents of every ``.txt`` and ``.md`` file under a folder.

    The tree rooted at ``directory`` is walked recursively and each matching
    file is read as UTF-8.

    Parameters
    ----------
    directory : str
        Root folder to search.
    existing_data : dict, optional
        Mapping merged into the result after the files are read; on a key
        clash the entry from ``existing_data`` is the one that is kept.

    Returns
    -------
    dict
        Bare file name (no directory part) -> file content. Files sharing a
        name in different sub-folders overwrite one another.
    """
    collected = {}

    for folder, _subfolders, names in os.walk(directory):
        for name in names:
            # Anything that is neither plain text nor Markdown is ignored
            if not name.endswith(('.txt', '.md')):
                continue
            with open(os.path.join(folder, name), 'r', encoding='utf-8') as handle:
                collected[name] = handle.read()

    if existing_data is not None:
        # Applied last, so pre-existing entries take precedence
        collected.update(existing_data)

    return collected

# Example usage
# directory_path = 'path/to/your/files'
# existing_data = {'existing_file.txt': 'Existing content'}
# combined_data = read_text_and_md_files(directory_path, existing_data)
#print(combined_data)  # This will print the combined dictionary

Function that does the same as above, except for XML files (converting them to plain text)

In [41]:
def read_xml_files_as_text(directory, existing_data=None):
    """Read every ``.xml`` file under a folder and reduce it to plain text.

    The tree rooted at ``directory`` is walked recursively. Each XML file is
    read as UTF-8, parsed, and flattened to the concatenation of all its text
    nodes (tags and attributes are dropped). Files that cannot be decoded as
    UTF-8 or are not well-formed XML are reported on stdout and skipped, so a
    single bad file never aborts the whole run.

    Parameters
    ----------
    directory : str
        Root folder to search.
    existing_data : dict, optional
        Mapping merged into the result after the files are read; on a key
        clash the entry from ``existing_data`` is the one that is kept.

    Returns
    -------
    dict
        Bare file name (no directory part) -> extracted text. Files sharing a
        name in different sub-folders overwrite one another.
    """
    text_files = {}

    for root, _dirs, files in os.walk(directory):  # recursive traversal
        for filename in files:
            if not filename.endswith('.xml'):
                continue
            file_path = os.path.join(root, filename)

            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    xml_content = file.read()
            except UnicodeDecodeError:
                # Treated like a malformed file: report and move on instead of
                # letting one badly encoded file abort the whole walk.
                print(f"Error decoding {filename} as UTF-8. Skipping this file.")
                continue

            try:
                root_element = ET.fromstring(xml_content)
            except ET.ParseError:
                print(f"Error parsing {filename}. Skipping this file.")
                continue

            # itertext() yields every text and tail node in document order
            text_files[filename] = ''.join(root_element.itertext())

    if existing_data is not None:
        # Applied last, so pre-existing entries take precedence
        text_files.update(existing_data)

    return text_files

In [43]:
# Try the XML reader on the Transoxania documents folder; as the last
# expression in the cell, the returned dict is shown as the cell output.
read_xml_files_as_text(trans_man_docs_path)

{'ser934.xml': '\n\n\t\n\t\n\n\t\n\t\t\n\t\t\n\t\t بنده نوازا \n\t\n\t\t\n\t\t تصدق سرمبارک جنابعالیحضرت سیدم شوم از روی مرحمت جنابعالیحضرتم\n\t\t\n\t\t از برای احوال گرفتن و خواهش فقرا\n\t\t\n\t\t و\n\t\tتیمور شاه دادر اکبر شاه\n\t\t\tاشیک اقاباشی\n\t\t\n\t\t غلامشانرا فهمیدن\n\t\tمیرزا حسن خواجه\n\t\t غلامشان را فرستاده بودم که \n\t\t\n\t\t بموضع\n\t\tونج\n\t\t توابیع\n\t\tدرواز\n\t\t رفته\n\t\tتیمورشاه\n\t\t مذکور را مع فقرایان شغنانی همراهش بوده گی دیده احوال و خواهش \n\t\t\n\t\t آنها را پرسیده است\n\t\tتیمور شاه\n\t\t مذکور مع فقرایان جنابعالیحضرت سیدم را دعا نموده گفته اند که درینجا آمدن\n\t\t\n\t\t مایان سه سال شد بوقت از شغنان آمدن مقدار سیصد خانه وار بودیم که اکثر آنها فقرا بودند باز \n\t\t\n\t\t بشغنان رفتند الحال شصت یک خانه وار مانده ایم که اکثرا ما سپاه میباشیم از وجه ظلم و زجرا\n\t\tافغانیه بوطن\n\t\t\n\t\t اما رفته نمیتوانیم تیمورشاه دادر اکبر شاه اشیک اقاباشی نیز از برای سریشته و\n\t\tسرابانی\n\t\t مایان بهمین جا\n\t\t \n\t\t\n\t\t میباشد همین\n\t\tونج\n\t\t جای وسیع می

Build one flat dictionary of the main Central Asian plain texts I want to examine:

In [45]:
# Start from the plain-text / Markdown notes, then merge in the two XML sources.
# Each reader applies `existing_data` last, so on a file-name clash the entry
# loaded earlier in this chain is the one that survives.
text_notes = read_text_and_md_files (trans_notes)
cent_texts_prelim = read_xml_files_as_text (trans_man_docs_path, text_notes)
cent_texts = read_xml_files_as_text(parser_xml_path, cent_texts_prelim)

Error parsing ser1270.xml. Skipping this file.
Error parsing ser2037.xml. Skipping this file.
Error parsing ser1258.xml. Skipping this file.
Error parsing ser2023.xml. Skipping this file.
Error parsing ser811.xml. Skipping this file.
Error parsing ser1884.xml. Skipping this file.
Error parsing ser1489.xml. Skipping this file.
Error parsing ser1891.xml. Skipping this file.
Error parsing ser1926.xml. Skipping this file.
Error parsing ser1701.xml. Skipping this file.
Error parsing ser2022.xml. Skipping this file.
Error parsing ser2036.xml. Skipping this file.
Error parsing ser1271.xml. Skipping this file.
Error parsing ser1501.xml. Skipping this file.
Error parsing ser1529.xml. Skipping this file.
Error parsing ser812.xml. Skipping this file.
Error parsing ser1918.xml. Skipping this file.
Error parsing ser1930.xml. Skipping this file.
Error parsing ser1924.xml. Skipping this file.
Error parsing ser1887.xml. Skipping this file.
Error parsing ser1893.xml. Skipping this file.
Error parsing s

In [47]:
# clean up Arabic script in a dictionary:

def clean_dictionary_values(data_dict):
    """Return a new dict with every value passed through ``ac.clean_document``.

    Keys are carried over unchanged; the result is a freshly built dict rather
    than an in-place update of ``data_dict``.
    """
    return {name: ac.clean_document(text) for name, text in data_dict.items()}

In [49]:
# Run every collected text through the Arabic-script cleaner (ac.clean_document)
cent_texts_clean = clean_dictionary_values(cent_texts)

In [55]:
def regex_search_in_dict(data_dict, regex_pattern, additional_chars=30):
    """Search every value of a dict with a regex and keep each hit in context.

    Parameters
    ----------
    data_dict : dict
        Mapping of document name -> text to search.
    regex_pattern : str
        Pattern handed to ``re.finditer``.
    additional_chars : int, optional
        Number of characters of surrounding text kept on each side of a hit.

    Returns
    -------
    dict
        ``{document name: {"reg_<matched text>_no<k>": context}}`` where ``k``
        counts the occurrences of that exact matched text within the document.
        Documents without any hit are left out.
    """
    hits_by_doc = {}

    for doc_name, text in data_dict.items():
        seen = {}  # matched text -> occurrences so far in this document

        for hit in re.finditer(regex_pattern, text):
            matched = hit.group()
            seen[matched] = seen.get(matched, 0) + 1

            # Context window around the hit, clamped at the start of the text
            left = max(hit.start() - additional_chars, 0)
            right = hit.end() + additional_chars

            label = f"reg_{matched}_no{seen[matched]}"
            hits_by_doc.setdefault(doc_name, {})[label] = text[left:right]

    return hits_by_doc

In [57]:
# Pattern: whitespace, a token ending in "ده", whitespace, "بوده", whitespace, "است".
# Written as a raw string so the regex escapes (\s, \S) reach `re` untouched and
# Python does not warn about invalid string escape sequences.
search = regex_search_in_dict(cent_texts_clean, r"\s(\S*?ده)\sبوده\sاست")

  search = regex_search_in_dict(cent_texts_clean, "\s(\S*?ده)\sبوده\sاست")


In [59]:
# Dump the full nested result dict as plain text
print (search)

{'ser1932.xml': {'reg_ داده بوده است_no1': 'ه گی ها را مع اشتر ها شان جواب داده بوده است کار سوداگران بخارایی و قراکول'}, 'ser1935.xml': {'reg_ فوتیده بوده است_no1': 'در سرای بحجره صوفی هندی اکه اش فوتیده بوده است که اینغلام دعاگوی شنیده آدم ف'}, 'ser1934.xml': {'reg_ شناسنانیده بوده است_no1': 'وده خودش را بامارتپناه بی محرم شناسنانیده بوده است که بی محرم مذکور بعضی خذمت مخ'}, 'ser237.xml': {'reg_ آمده بوده است_no1': 'د طبابت مینمایم گفته در وابکند آمده بوده است مدت یکهفته گذشته بیگاه سه شنب', 'reg_ فوتیده بوده است_no1': 'ته بیگاه سه شنبه سزدهم ماه رجب فوتیده بوده است که دعاگویشان فضیلت پناه قاضی '}, 'ser1880.xml': {'reg_ خوابکرده بوده است_no1': 'از حوالی اش نمی بر امده گی شده خوابکرده بوده است که جناب عالیحضرتم بازار شب ول'}, 'ser1561.xml': {'reg_ فرستاده بوده است_no1': 'ه از جهت بی کسی او در نزد پدرش فرستاده بوده است که از روی غلامی و نادانی معلو'}, 'ser906.xml': {'reg_ گردیده بوده است_no1': 'وی مرحمت عالیحضرتموایاما مامور گردیده بوده است که روز پنجشنبه بقرشی آمده گذش', 'reg_ فرستاده

In [59]:
def generate_markdown_report(matches_dict, inbox_path=None):
    """Write the results of a regex search to a timestamped Markdown file.

    Parameters
    ----------
    matches_dict : dict
        Nested mapping ``{file name: {"reg_<matched text>_no<k>": context}}``.
    inbox_path : str, optional
        Folder the report is written to. Defaults to the current working
        directory.

    Notes
    -----
    The file is named ``regex_search_report[<YYYYmmdd_HHMMSS>].md`` and opened
    in write mode, so two reports generated within the same second in the same
    folder overwrite each other.
    """
    # Timestamped, standardised file name
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_file_name = f"regex_search_report[{timestamp}].md"

    if inbox_path is None:
        inbox_path = os.getcwd()
    full_path = os.path.join(inbox_path, report_file_name)

    prefix = 'reg_'
    parts = []
    for file_name, matches in matches_dict.items():
        # One section per source file
        parts.append(f"### {file_name}\n\n")

        for match_key, context in matches.items():
            # Recover the matched text by removing the 'reg_' prefix and the
            # trailing '_no<k>' suffix. Splitting on every '_' would truncate
            # matched text that itself contains an underscore.
            regex_match = match_key[len(prefix):] if match_key.startswith(prefix) else match_key
            regex_match = regex_match.rsplit('_no', 1)[0]

            parts.append(f"**Regex Match:** {regex_match}\n")
            parts.append(f"**Context:** {context}\n\n")

    with open(full_path, 'w', encoding='utf-8') as markdown_file:
        markdown_file.write(''.join(parts))

In [60]:
# Write the report for the search results into the inbox folder
generate_markdown_report(search, inbox_path)