# Comparing Verb Forms in Modern Tajik (Newspaper Corpus) with Bukhari Persian

In [3]:
import os, re
import pandas as pd

Reading in data

In [4]:
#set home directory path
hdir = os.path.expanduser('~')

# Tajik corpus directory
taj_path = os.path.join(hdir, "Dropbox/Active_Directories/Digital_Humanities/Corpora/tajik_newspaper_corpus")


In [5]:
# List to hold data
data = []

# Walk through the directory structure
for subdir, dirs, files in os.walk(taj_path):
    for file in files:
        if file.endswith('.txt'):
            file_path = os.path.join(subdir, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            subdir_name = os.path.basename(subdir)
            data.append({'sub_directory': subdir_name, 'filename': file, 'content': content})

# Create a DataFrame
df = pd.DataFrame(data)

In [6]:
# Total number of Tajik newspaper articles
len(df)

141572

In [7]:
#df.sample(5)

### Regex search pattern for Tajik newspapers

`ме` marks the beginning of the participle (unlike می in Persian, there it is always attached to the verb without a space); then comes the verb participle; which ends with `дагӣ`.  

In [8]:
taj_medagi_pattern = r'\Sи\sме[^ ]*?дагӣ\s'

In [9]:
# Filter the DataFrame and select only rows where 'content' column matches the pattern
filtered_df = df[df['content'].str.contains(taj_medagi_pattern, regex=True, na=False)]

# Keep only the columns where 'content' column matches the pattern
filtered_df = filtered_df[['sub_directory', 'filename', 'content']]


In [10]:
def extract_sentences_with_pattern(df, pattern):
    """
    Extract sentences containing the regex pattern from the dataframe
    
    Args:
        df: DataFrame with 'content' column
        pattern: regex pattern to search for
    
    Returns:
        DataFrame with additional columns for matched sentences
    """
    results = []
    
    for idx, row in df.iterrows():
        content = row['content']
        
        # Split content into sentences (simple approach using common sentence endings)
        # This handles Cyrillic punctuation as well
        sentences = re.split(r'[.!?։]\s+', content)
        
        # Find sentences that contain the pattern
        matching_sentences = []
        for sentence in sentences:
            if re.search(pattern, sentence):
                # Clean up the sentence (remove extra whitespace)
                clean_sentence = ' '.join(sentence.split())
                matching_sentences.append(clean_sentence)
        
        # If we found matching sentences, add them to results
        if matching_sentences:
            for sentence in matching_sentences:
                results.append({
                    'sub_directory': row['sub_directory'],
                    'filename': row['filename'],
                    'matching_sentence': sentence
                })
    
    return pd.DataFrame(results)

In [11]:
pat_df = extract_sentences_with_pattern(df, taj_medagi_pattern)

In [12]:
print(pat_df.head())


        sub_directory                      filename  \
0                Oila         oila_2022-12-31-2.txt   
1  Javonon_Tojikiston      javonon_13_Июн_20173.txt   
2  Javonon_Tojikiston     javonon_11_Май_20231.txt   
3         osiyoavrupo  osiyoavrupo_2022_03_19_0.txt   
4         osiyoavrupo  osiyoavrupo_2022_03_19_0.txt   

                                   matching_sentence  
0  Дар муносибатҳои ошқонаи Ханда ва Карим ҳанӯз ...  
1  Акнун қатъӣ талаб кард: - Ман ба ту чизи мехӯр...  
2  Аммо волидон бо ҳар баҳона дар маҷлисҳои падар...  
3  Ҳанӯз пеш аз фарорасии Наврӯз қазоқҳо анъанае ...  
4  Ба касоне, ки хусумат доштанд, сулҳу оштиро ба...  


In [13]:
len(filtered_df)

16

In [17]:
len(df)

141572

In [16]:
ratio=len(filtered_df)/len(df)
print(f"Ratio of total articles to articles with 'и медагӣ': {ratio:.2f}")

Ratio of total articles to articles with 'и медагӣ': 0.00


In [14]:
def export_sentences_to_file(df, hdir=None):
    """Export all sentences to a text file with timestamp"""
    import datetime
    
    # Use home directory if not provided
    if hdir is None:
        hdir = os.path.expanduser('~')
    
    # Create timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"taj_news_export_{timestamp}.txt"
    
    # Create full path
    full_path = os.path.join(hdir, "Dropbox/Active_Directories/Inbox", filename)
    
    # Export the sentences
    with open(full_path, 'w', encoding='utf-8') as f:
        for i, (idx, row) in enumerate(df.iterrows()):
            f.write(f"--- Example {i+1} ---\n")
            f.write(f"File: {row['filename']}\n")
            f.write(f"Directory: {row['sub_directory']}\n")
            f.write(f"Sentence: {row['matching_sentence']}\n")
            f.write("-" * 80 + "\n\n")
    
    print(f"Exported {len(df)} sentences to {full_path}")
    return full_path


In [18]:
export_sentences_to_file(pat_df)

Exported 18 sentences to /Users/pickettj/Dropbox/Active_Directories/Inbox/taj_news_export_20250815_095448.txt


'/Users/pickettj/Dropbox/Active_Directories/Inbox/taj_news_export_20250815_095448.txt'

In [19]:
pickle_path = hdir + "/Dropbox/Active_Directories/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

df_eurcorp = pd.read_csv (os.path.join(pickle_path,r'eurasia_corpus.csv'))

In [20]:
df_eurcorp.sample(5)

Unnamed: 0,Category,Text,No,Token
1357385,indo_nar_ext_toks,ain-i_akbari_murty,51235,از
6311361,pers_lit_toks,attar.ma,37544,خود
9491062,pers_lit_toks,hojviri.kashfol-mahjoob,19777,می
352423,indo_nar_ext_toks,sjn2,53215,ازیشان
8986365,pers_lit_toks,ghaani.divan,191497,جسم


In [21]:
unique_categories = df_eurcorp['Category'].unique()
print(unique_categories)


['indo_nar_ext_toks' 'trans_nar_ext_toks' 'khiva_doc_toks'
 'oldsys_xml_toks' 'indo_nar' 'trans_xml_toks' 'indo_xml_toks'
 'presort_xml_toks' 'md_oldsys_toks' 'pers_lit_toks']


In [22]:
categories_to_keep = ['trans_xml_toks', 'presort_xml_toks', 'md_oldsys_toks']
trans_docs = df_eurcorp[df_eurcorp['Category'].isin(categories_to_keep)]



In [25]:
trans_docs.sample(5)

Unnamed: 0,Category,Text,No,Token
5274612,md_oldsys_toks,apsa_119,18,اعلام
5257004,presort_xml_toks,ser972,9,تعالی
5282138,md_oldsys_toks,tsgaruz_i126_1_1867_5_93,36,تصدق
5266172,presort_xml_toks,ser105,30,بنیاد
5284842,md_oldsys_toks,tsgaruz_ i126-1-1904-4_ser518,443,جای


In [None]:
# сози мекардаги
# ساز می کرده گی

# regex: limit the length that the word can be so that you don't get the whole document, maybe 20 character limit on stuff that is not dagi

# state that they are (or are not) comparable in terms of length

In [26]:
import pandas as pd
import re

def find_persian_medagi_pattern(df):
    """
    Find Persian می...گی patterns in tokenized corpus
    Handles both single-token and multi-token cases
    
    Args:
        df: DataFrame with columns ['Category', 'Text', 'No', 'Token']
    
    Returns:
        DataFrame with matched patterns and context
    """
    results = []
    
    # Group by text to work with sequences of tokens
    for text_name, text_group in df.groupby('Text'):
        tokens = text_group.sort_values('No')['Token'].tolist()
        
        for i, token in enumerate(tokens):
            # Case 1: Single token containing می...گی pattern
            single_token_pattern = r'می.*?گی'
            if re.search(single_token_pattern, token):
                # Get context (5 tokens before and after)
                start_idx = max(0, i - 5)
                end_idx = min(len(tokens), i + 6)
                context_tokens = tokens[start_idx:end_idx]
                
                results.append({
                    'text_name': text_name,
                    'pattern_type': 'single_token',
                    'matched_token': token,
                    'token_position': i,
                    'context': ' '.join(context_tokens),
                    'match_detail': token
                })
            
            # Case 2: Multi-token pattern می + ... + گی
            elif token == 'می':
                # Look ahead for گی within reasonable distance (up to 3 tokens)
                for j in range(i + 1, min(i + 4, len(tokens))):
                    if tokens[j].endswith('گی'):
                        # Found a match
                        matched_sequence = tokens[i:j+1]
                        
                        # Get broader context
                        start_idx = max(0, i - 5)
                        end_idx = min(len(tokens), j + 6)
                        context_tokens = tokens[start_idx:end_idx]
                        
                        results.append({
                            'text_name': text_name,
                            'pattern_type': 'multi_token',
                            'matched_token': ' '.join(matched_sequence),
                            'token_position': f"{i}-{j}",
                            'context': ' '.join(context_tokens),
                            'match_detail': f"می + {' + '.join(tokens[i+1:j])} + {tokens[j]}"
                        })
                        break  # Stop at first match to avoid overlaps
    
    return pd.DataFrame(results)

def find_alternative_persian_patterns(df):
    """
    Alternative approach: reconstruct text and use regex
    """
    results = []
    
    for text_name, text_group in df.groupby('Text'):
        # Reconstruct the text from tokens
        tokens = text_group.sort_values('No')['Token'].tolist()
        reconstructed_text = ' '.join(tokens)
        
        # Find all می...گی patterns (allowing for spaces)
        pattern = r'می\s+.*?\s+گی|می.*?گی'
        matches = list(re.finditer(pattern, reconstructed_text))
        
        for match in matches:
            # Find which tokens this spans
            match_text = match.group()
            
            results.append({
                'text_name': text_name,
                'matched_sequence': match_text,
                'full_context': reconstructed_text,
                'match_start': match.start(),
                'match_end': match.end()
            })
    
    return pd.DataFrame(results)

# Usage with your trans_docs dataframe:
# Method 1: Token-by-token approach
persian_matches = find_persian_medagi_pattern(trans_docs)

# Method 2: Text reconstruction approach  
# persian_alt_matches = find_alternative_persian_patterns(trans_docs)

# Display results
def display_persian_matches(matches_df, num_to_show=10):
    """Display Persian matches in readable format"""
    pd.set_option('display.max_colwidth', None)
    
    print(f"Found {len(matches_df)} Persian می...گی patterns\n")
    
    for i, (idx, row) in enumerate(matches_df.iterrows()):
        if i >= num_to_show:
            break
        print(f"--- Match {i+1} ---")
        print(f"Text: {row['text_name']}")
        print(f"Type: {row['pattern_type']}")
        print(f"Match: {row['matched_token']}")
        print(f"Context: {row['context']}")
        if 'match_detail' in row:
            print(f"Detail: {row['match_detail']}")
        print("-" * 80)

# Export function for Persian matches
def export_persian_sentences_to_file(df, hdir=None):
    """Export Persian sentences to text file with timestamp"""
    import datetime
    
    if hdir is None:
        hdir = os.path.expanduser('~')
    
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"persian_medagi_export_{timestamp}.txt"
    full_path = os.path.join(hdir, "Dropbox/Active_Directories/Inbox", filename)
    
    with open(full_path, 'w', encoding='utf-8') as f:
        f.write(f"Persian می...گی Pattern Matches\n")
        f.write(f"Total matches found: {len(df)}\n")
        f.write("=" * 80 + "\n\n")
        
        for i, (idx, row) in enumerate(df.iterrows()):
            f.write(f"--- Match {i+1} ---\n")
            f.write(f"Text: {row['text_name']}\n")
            f.write(f"Pattern Type: {row['pattern_type']}\n")
            f.write(f"Matched Token(s): {row['matched_token']}\n")
            f.write(f"Position: {row['token_position']}\n")
            f.write(f"Context: {row['context']}\n")
            if 'match_detail' in row:
                f.write(f"Detail: {row['match_detail']}\n")
            f.write("-" * 80 + "\n\n")
    
    print(f"Exported {len(df)} Persian matches to {full_path}")
    return full_path

# Run the analysis:
print("Searching for Persian می...گی patterns...")
persian_matches = find_persian_medagi_pattern(trans_docs)
print(f"Found {len(persian_matches)} matches")

# To view results:
# display_persian_matches(persian_matches, 5)

# To export:
# export_persian_sentences_to_file(persian_matches)

Searching for Persian می...گی patterns...
Found 6 matches


In [27]:
display_persian_matches(persian_matches, 5)

Found 6 Persian می...گی patterns

--- Match 1 ---
Text: ser179
Type: single_token
Match: میگیرید
Context: و بیچاره گان ولایت دعا میگیرید تصدق شوم اینغلام دعاگوی موافق
Detail: میگیرید
--------------------------------------------------------------------------------
--- Match 2 ---
Text: ser193
Type: multi_token
Match: می ایستاده گی
Context: عملدار دولتخانه بحضور ایلچی خانه می ایستاده گی گردیده بعد از چند وقت
Detail: می + ایستاده + گی
--------------------------------------------------------------------------------
--- Match 3 ---
Text: ser706
Type: multi_token
Match: می شده گی
Context: از وجه تعفن ناک جمع می شده گی باشد و آب باران بخندقهای
Detail: می + شده + گی
--------------------------------------------------------------------------------
--- Match 4 ---
Text: ser967
Type: single_token
Match: میانگی
Context: افروزی سواری اینمکان از در میانگی حکم شهزاد اومیج صاحب که
Detail: میانگی
--------------------------------------------------------------------------------
--- Match 5 ---
Text: tsgar

In [None]:
start over using indexed texts instead of pickles