# Comparing Verb Forms in Modern Tajik (Newspaper Corpus) with Bukhari Persian

In [2]:
import os
import pandas as pd

Reading in data

In [3]:
# Resolve the current user's home directory
hdir = os.path.expanduser('~')

# Root folder of the Tajik newspaper corpus, located under the home directory
taj_corpus_subpath = "Dropbox/Active_Directories/Digital_Humanities/Corpora/tajik_newspaper_corpus"
taj_path = os.path.join(hdir, taj_corpus_subpath)


In [5]:
# One record per .txt file found anywhere below the corpus root
data = []

for folder, _subfolders, names in os.walk(taj_path):
    # Name of the folder that directly contains the files (same for every file in it)
    folder_name = os.path.basename(folder)
    for name in names:
        # Only plain-text files are part of the corpus
        if not name.endswith('.txt'):
            continue
        with open(os.path.join(folder, name), 'r', encoding='utf-8') as handle:
            text = handle.read()
        data.append({'sub_directory': folder_name, 'filename': name, 'content': text})

# Assemble the collected records into a DataFrame
# (columns: sub_directory, filename, content)
df = pd.DataFrame(data)

In [36]:
# Row count of the corpus frame, i.e. how many article files were loaded
df.shape[0]

141572

In [37]:
#df.sample(5)

### Regex search pattern for Tajik newspapers

`ме` marks the beginning of the participle (unlike می in Persian, in Tajik it is always attached to the verb without a space). It is followed by the verb participle, which ends with `дагӣ`. The search pattern additionally requires that the preceding word end in `и`.  

In [38]:
# Matches: a word ending in "и", one whitespace character, then a single word that
# starts with "ме" and ends with "дагӣ", followed by whitespace.
# \S (rather than [^ ]) keeps the middle of the match inside one word: [^ ] only
# excluded the plain space character, so a match could run across newlines, tabs or
# non-breaking spaces and join unrelated words.
taj_medagi_pattern = r'\Sи\sме\S*?дагӣ\s'

In [39]:
# Boolean mask: True for articles whose text contains at least one match of the
# pattern (rows with missing content count as "no match")
has_medagi = df['content'].str.contains(taj_medagi_pattern, regex=True, na=False)

# Keep the matching rows only, with the three columns in a fixed order
filtered_df = df.loc[has_medagi, ['sub_directory', 'filename', 'content']]


In [40]:
# Number of articles containing at least one match
filtered_df.shape[0]

16

In [41]:
# Show up to 10 matching articles. The sample size is capped at the number of
# matches so the cell cannot raise when fewer than 10 articles match, and a fixed
# random_state makes the displayed rows reproducible across re-runs.
filtered_df.sample(n=min(10, len(filtered_df)), random_state=42)

Unnamed: 0,sub_directory,filename,content
73231,Pressa.tj,pressatj_07.10.2022_7.txt,Александра Кудашкина ва Николай Назриев — ду с...
77358,Pressa.tj,pressatj_14.01.2020_0.txt,Аброри Зоҳир пурмаҳсултарин нависандаи тоҷик а...
75968,Pressa.tj,pressatj_06.06.2022_1036.txt,Олимони иқлимшиноси (климатологи) аврупоӣ ҳам...
101747,Farazh,farazh_2019-07-27-2.txt,Баъзан мо аз ғояти нодониву ҷоҳилӣ бо ғаму шик...
83620,Pressa.tj,pressatj_06.06.2022_345.txt,Суҳбат бо Зебунисо Қутбиддинова ҳамсари Шоири ...
79582,Pressa.tj,pressatj_14.12.2016_9.txt,Салом хонандагон ва аҳли коригарони сомонаи «...
58445,Ozodi,ozodi_2009-11-30-2.txt,"Албатта, дар гузашта низ блогнависоне гаҳ-гоҳ ..."
61659,Ozodi,ozodi_2010-01-06-3.txt,"Ҳамин, гӯед, дар кӯдакиям як латифаеро хонда б..."
93129,Pressa.tj,pressatj_06.06.2022_1271.txt,"Хонандагони азиз, тавре ки огоҳӣ доред, ҳафтан..."
719,Oila,oila_2022-12-31-2.txt,Шумо ошиқи сериалҳои туркӣ ҳастед ва ба ишқу м...


In [31]:
# Directory "pickled_tokenized_cleaned_corpora" under the home directory.
# Built with os.path.join (as taj_path is) instead of concatenating "/" by hand,
# so the separator handling is consistent and portable.
pickle_path = os.path.join(hdir, "Dropbox/Active_Directories/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora")

# Load eurasia_corpus.csv from that directory into a DataFrame
df_eurcorp = pd.read_csv(os.path.join(pickle_path, 'eurasia_corpus.csv'))

In [32]:
# Peek at 5 random rows; the fixed random_state keeps the displayed rows the same on every run
df_eurcorp.sample(5, random_state=42)

Unnamed: 0,Category,Text,No,Token
3555209,khiva_doc_toks,MIRZAEV_JULY_2016,3059,قطعه
9162340,pers_lit_toks,gorgani.veysoramin,69511,مشکبارش
13285183,pers_lit_toks,ouhadi.jaamejam,36626,مردان
15901049,pers_lit_toks,saeb.divan,441189,بال
8708252,pers_lit_toks,forughi.divan,2355,فروغی


In [33]:
# Distinct values of the Category column, in order of first appearance
category_column = df_eurcorp['Category']
unique_categories = category_column.unique()
print(unique_categories)


['indo_nar_ext_toks' 'trans_nar_ext_toks' 'khiva_doc_toks'
 'oldsys_xml_toks' 'indo_nar' 'trans_xml_toks' 'indo_xml_toks'
 'presort_xml_toks' 'md_oldsys_toks' 'pers_lit_toks']


In [34]:
# Category labels retained for the comparison
categories_to_keep = ['trans_xml_toks', 'presort_xml_toks', 'md_oldsys_toks']

# Rows whose Category is one of the retained labels
in_kept_category = df_eurcorp['Category'].isin(categories_to_keep)
trans_docs = df_eurcorp[in_kept_category]



In [35]:
# Peek at 5 random rows of the selection; the fixed random_state keeps the displayed rows the same on every run
trans_docs.sample(5, random_state=42)

Unnamed: 0,Category,Text,No,Token
5267546,presort_xml_toks,ser706,2,تصدق
5261481,presort_xml_toks,ser1006,27,سعادت
5255325,presort_xml_toks,ser970,96,کرده
5255773,presort_xml_toks,ser958,271,رضا
5283215,md_oldsys_toks,tsgaruz_i_323_1_1171_100,116,الی


In [None]:
# Working notes, kept as comments so that this cell does not raise a SyntaxError
# when the notebook is run top to bottom.
#
# Example phrase (Cyrillic script, then Arabic script):
#   сози мекардаги
#   ساز می کرده گی

# TODO regex: limit the length that the word can be so that you don't get the whole document, maybe 20 character limit on stuff that is not dagi

# TODO state that they are (or are not) comparable in terms of length