# Comparing Verb Forms in Modern Tajik (Newspaper Corpus) with Bukhari Persian

In [3]:
import os, re
import pandas as pd

Reading in data

In [4]:
#set home directory path
hdir = os.path.expanduser('~')

# Tajik corpus directory
taj_path = os.path.join(hdir, "Dropbox/Active_Directories/Digital_Humanities/Corpora/tajik_newspaper_corpus")


In [5]:
# List to hold data
data = []

# Walk through the directory structure
for subdir, dirs, files in os.walk(taj_path):
    for file in files:
        if file.endswith('.txt'):
            file_path = os.path.join(subdir, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            subdir_name = os.path.basename(subdir)
            data.append({'sub_directory': subdir_name, 'filename': file, 'content': content})

# Create a DataFrame
df = pd.DataFrame(data)

In [6]:
# Total number of Tajik newspaper articles
len(df)

141572

In [7]:
#df.sample(5)

### Regex search pattern for Tajik newspapers

`ме` marks the beginning of the participle (unlike می in Persian, there it is always attached to the verb without a space); then comes the verb participle; which ends with `дагӣ`.  

In [48]:
taj_medagi_pattern = r'\sме(?:[^\s]*\s){0,2}[^\s]{0,15}гӣ(?=\s|$)'

In [49]:
# Filter the DataFrame and select only rows where 'content' column matches the pattern
filtered_df = df[df['content'].str.contains(taj_medagi_pattern, regex=True, na=False)]

# Keep only the columns where 'content' column matches the pattern
filtered_df = filtered_df[['sub_directory', 'filename', 'content']]


In [50]:
def extract_sentences_with_pattern(df, pattern):
    """
    Extract sentences containing the regex pattern from the dataframe
    
    Args:
        df: DataFrame with 'content' column
        pattern: regex pattern to search for
    
    Returns:
        DataFrame with additional columns for matched sentences
    """
    results = []
    
    for idx, row in df.iterrows():
        content = row['content']
        
        # Split content into sentences (simple approach using common sentence endings)
        # This handles Cyrillic punctuation as well
        sentences = re.split(r'[.!?։]\s+', content)
        
        # Find sentences that contain the pattern
        matching_sentences = []
        for sentence in sentences:
            if re.search(pattern, sentence):
                # Clean up the sentence (remove extra whitespace)
                clean_sentence = ' '.join(sentence.split())
                matching_sentences.append(clean_sentence)
        
        # If we found matching sentences, add them to results
        if matching_sentences:
            for sentence in matching_sentences:
                results.append({
                    'sub_directory': row['sub_directory'],
                    'filename': row['filename'],
                    'matching_sentence': sentence
                })
    
    return pd.DataFrame(results)

In [51]:
pat_df = extract_sentences_with_pattern(df, taj_medagi_pattern)

In [12]:
print(pat_df.head())


        sub_directory                      filename  \
0                Oila         oila_2022-12-31-2.txt   
1  Javonon_Tojikiston      javonon_13_Июн_20173.txt   
2  Javonon_Tojikiston     javonon_11_Май_20231.txt   
3         osiyoavrupo  osiyoavrupo_2022_03_19_0.txt   
4         osiyoavrupo  osiyoavrupo_2022_03_19_0.txt   

                                   matching_sentence  
0  Дар муносибатҳои ошқонаи Ханда ва Карим ҳанӯз ...  
1  Акнун қатъӣ талаб кард: - Ман ба ту чизи мехӯр...  
2  Аммо волидон бо ҳар баҳона дар маҷлисҳои падар...  
3  Ҳанӯз пеш аз фарорасии Наврӯз қазоқҳо анъанае ...  
4  Ба касоне, ки хусумат доштанд, сулҳу оштиро ба...  


In [52]:
len(filtered_df)

3285

In [53]:
len(df)

141572

In [56]:
ratio=len(filtered_df)/len(df)
print(f"Ratio of total articles to articles with 'и медагӣ': {ratio:.2f}")

Ratio of total articles to articles with 'и медагӣ': 0.02


In [14]:
def export_sentences_to_file(df, hdir=None):
    """Export all sentences to a text file with timestamp"""
    import datetime
    
    # Use home directory if not provided
    if hdir is None:
        hdir = os.path.expanduser('~')
    
    # Create timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"taj_news_export_{timestamp}.txt"
    
    # Create full path
    full_path = os.path.join(hdir, "Dropbox/Active_Directories/Inbox", filename)
    
    # Export the sentences
    with open(full_path, 'w', encoding='utf-8') as f:
        for i, (idx, row) in enumerate(df.iterrows()):
            f.write(f"--- Example {i+1} ---\n")
            f.write(f"File: {row['filename']}\n")
            f.write(f"Directory: {row['sub_directory']}\n")
            f.write(f"Sentence: {row['matching_sentence']}\n")
            f.write("-" * 80 + "\n\n")
    
    print(f"Exported {len(df)} sentences to {full_path}")
    return full_path


In [57]:
export_sentences_to_file(pat_df)

Exported 2807 sentences to /Users/pickettj/Dropbox/Active_Directories/Inbox/taj_news_export_20250815_103311.txt


'/Users/pickettj/Dropbox/Active_Directories/Inbox/taj_news_export_20250815_103311.txt'

In [30]:
import historical_corpus_plain_text as hcp

In [31]:
corpus_dict = hcp.get_corpus()

📚 Loading corpus for the first time...
Error parsing ser811.xml. Skipping this file.
Error parsing ser812.xml. Skipping this file.
Error parsing ser813.xml. Skipping this file.
Error parsing ser817.xml. Skipping this file.
Error parsing ser816.xml. Skipping this file.
Error parsing ser814.xml. Skipping this file.
Error parsing ser815.xml. Skipping this file.
Error parsing ser1004.xml. Skipping this file.
Error parsing ser904.xml. Skipping this file.
Error parsing ser1003.xml. Skipping this file.
Error parsing ser876.xml. Skipping this file.
Error parsing ser842.xml. Skipping this file.
Error parsing ser843.xml. Skipping this file.
Error parsing ser857.xml. Skipping this file.
Error parsing ser809.xml. Skipping this file.
Error parsing ser808.xml. Skipping this file.
✅ Loaded 1066 documents
🧹 Cleaning Arabic script...
✅ Cleaning complete


In [37]:
len(corpus_dict)

1066

In [54]:

def filter_corpus_by_pattern(corpus_dict, pattern=None):
    """
    Filter corpus dictionary to keep only entries that contain at least one match
    of the specified regex pattern.
    
    Args:
        corpus_dict (dict): Dictionary where keys are filenames and values are text content
        pattern (str, optional): Regex pattern to search for. If None, uses default Persian pattern.
    
    Returns:
        dict: Filtered dictionary containing only entries with matches
        
    Example:
        filtered_corpus = filter_corpus_by_pattern(corpus_dict)
    """
    
    # Default pattern: می + up to 15 chars (max 2 whitespaces) + گی
    if pattern is None:
        # Pattern breakdown:
        # می - literal "می"
        # (?:[^\s]*\s){0,2} - non-capturing group for non-whitespace chars followed by space, 0-2 times
        # [^\s]{0,15} - up to 15 non-whitespace characters at the end
        # گی - literal "گی"
        pattern = r'می(?:[^\s]*\s){0,2}[^\s]{0,15}گی'
    
    filtered_dict = {}
    
    for filename, text_content in corpus_dict.items():
        # Search for the pattern in the text content
        if re.search(pattern, text_content):
            filtered_dict[filename] = text_content
    
    return filtered_dict

def get_pattern_matches(corpus_dict, pattern=None, context_chars=30):
    """
    Get all pattern matches with context from the corpus.
    
    Args:
        corpus_dict (dict): Dictionary where keys are filenames and values are text content
        pattern (str, optional): Regex pattern to search for
        context_chars (int): Number of characters to include before/after each match
    
    Returns:
        dict: Nested dictionary with matches and their context
    """
    
    if pattern is None:
        pattern = r'\sمی(?:[^\s]*\s){0,2}[^\s]{0,15}گی(?=\s|$)'
    
    results = {}
    
    for filename, text_content in corpus_dict.items():
        matches = re.finditer(pattern, text_content)
        file_matches = {}
        
        for i, match in enumerate(matches, 1):
            start_idx = max(0, match.start() - context_chars)
            end_idx = match.end() + context_chars
            context = text_content[start_idx:end_idx]
            
            match_key = f"match_{i}_{match.group()}"
            file_matches[match_key] = context
        
        if file_matches:  # Only add files that have matches
            results[filename] = file_matches
    
    return results

# Example usage:
# filtered_corpus = filter_corpus_by_pattern(corpus_dict)
# print(f"Original corpus: {len(corpus_dict)} documents")
# print(f"Filtered corpus: {len(filtered_corpus)} documents")

# To get matches with context:
# matches_with_context = get_pattern_matches(corpus_dict)

In [55]:
# Filter your corpus
filtered_corpus = filter_corpus_by_pattern(corpus_dict)

# Check results
print(f"Original corpus: {len(corpus_dict)} documents")
print(f"Filtered corpus: {len(filtered_corpus)} documents")


Original corpus: 1066 documents
Filtered corpus: 198 documents


In [39]:
# Calculate percentage
total_docs = len(corpus_dict)
matching_docs = len(filtered_corpus)
percentage = (matching_docs / total_docs) * 100

print(f"Total documents: {total_docs}")
print(f"Documents with pattern: {matching_docs}")
print(f"Percentage: {percentage:.2f}%")

Total documents: 1066
Documents with pattern: 198
Percentage: 18.57%
