# Corpora Cleaning, Tokenizing, Pickling

### Libraries

In [54]:
import arabic_cleaning as ac
import pandas as pd

In [55]:
import nltk, glob, os, pickle

### Paths

Home Directory

In [56]:
# Home directory of the current user; every path below is built on top of it.
hdir = os.path.expanduser('~')

# Root folder of the externally sourced corpora.
ext_corp_path = f"{hdir}/Dropbox/Active_Directories/Digital_Humanities/Corpora"

# Root folder of the self-created notes and transcriptions.
int_corp_path = f"{hdir}/Dropbox/Active_Directories/Notes/Primary_Sources"

# Folder where the pickled, tokenized, cleaned corpora are written.
pickle_path = f"{hdir}/Dropbox/Active_Directories/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

##### Pre-existing Corpora

In [57]:
# Each pre-existing corpus lives in its own sub-folder of ext_corp_path.

# Indic Narrative
indo_path = f"{ext_corp_path}/indo-persian_corpora"

# Transoxania Narrative (Persian)
trans_path = f"{ext_corp_path}/machine_readable_persian_transoxania_texts"

# Khiva documents
khiva_path = f"{ext_corp_path}/khiva_khanate_chancery_corpus"

# Muscovite Persian diplomatic documents
musc_path = f"{ext_corp_path}/khorezm_muscovy_diplomatic"

# Persian Lit (stored already pickled, in the pickle folder)
perslit_path = f"{ext_corp_path}/pickled_tokenized_cleaned_corpora"

# Turkic Narrative sources
turk_path = f"{ext_corp_path}/turkic_corpora"

##### Self-created Corpora

In [58]:
# Each self-created corpus lives in a sub-folder of int_corp_path.

# Indian Narrative
indo_man_path = f"{int_corp_path}/non-machine-readable_notes/indian_manuscripts"

# Transoxania Narrative
trans_man_path = f"{int_corp_path}/non-machine-readable_notes/bactriana_notes"

# Transoxania Documents
trans_man_docs_path = f"{int_corp_path}/xml_notes_stage3_final/bukhara_xml"

# Hyderabad Documents
hyd_man_docs_path = f"{int_corp_path}/xml_notes_stage3_final/hyderabad_xml"

# Indian Documents (misc. transcribed)
indo_man_docs_path = f"{int_corp_path}/xml_notes_stage3_final/indic_corpus_xml"

# Qajar Documents (misc. transcribed)
qajar_man_docs_path = f"{int_corp_path}/xml_notes_stage3_final/qajar_xml"

# NOTE(review): this points at the same "qajar_xml" folder as qajar_man_docs_path
# and was labelled "Qajar Documents" as well -- looks like a copy-paste of the
# entry above. Confirm the intended folder for this variable.
saf_man_docs_path = f"{int_corp_path}/xml_notes_stage3_final/qajar_xml"

# Misc Documents (misc. transcribed)
misc_man_docs_path = f"{int_corp_path}/xml_notes_stage3_final/misc_xml"


##### Unorganized Documents

In [59]:
# Folders for documents that have not yet been sorted into a regional corpus.

# Converted to XML, pre-sorted, Stage 2
parser_xml_path = f"{int_corp_path}/xml_notes_stage2/parser_depository"

# Converted to XML, pre-sorted, Stage 3
updated_docs_path = f"{int_corp_path}/xml_notes_stage3_final/updater_repository"

# Old system, yet to update
xml_old_sys_path = f"{int_corp_path}/xml_notes_stage2/xml_transcriptions_old_system"

# Markdown stage
markdown_path = f"{int_corp_path}/transcription_markdown_drafting_stage1"

# Markdown backlog (old system); a sub-folder of the markdown stage folder
md_backlog_path = f"{int_corp_path}/transcription_markdown_drafting_stage1/document_conversion_backlog"

## Corpus Globbing Section

### Pre-existing Corpora

#### Indic Narrative
Thackston corpus

In [60]:
# Collect every .txt file below indo_path (sub-folders included).
indo_corpus_files = glob.glob(indo_path + r'//**/*.txt', recursive=True)

# Map each file's base name (extension removed) to the file's full text.
# Files that share a base name in different sub-folders overwrite one another.
indo_corpus = {}
for longname in indo_corpus_files:
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default (assumes the corpus files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    indo_corpus[short[0]] = txt
    
#indo_corpus.keys()

#### Transoxania Narrative

In [61]:
# Collect every .txt file below trans_path (sub-folders included).
trans_corpus_files = glob.glob(trans_path + r'//**/*.txt', recursive=True)

# Map each file's base name (extension removed) to the file's full text.
# Files that share a base name in different sub-folders overwrite one another.
trans_corpus = {}
for longname in trans_corpus_files:
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default (assumes the corpus files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    trans_corpus[short[0]] = txt
    
    
#trans_corpus.keys()

#### Persian Literature
*See below*

#### Khiva Documents

In [62]:
# Collect every .txt file below khiva_path (sub-folders included).
khiva_corpus_files = glob.glob(khiva_path + r'//**/*.txt', recursive=True)

# Map each file's base name (extension removed) to the file's full text.
# Files that share a base name in different sub-folders overwrite one another.
khiva_corpus = {}
for longname in khiva_corpus_files:
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default (assumes the corpus files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    khiva_corpus[short[0]] = txt
    
    
#khiva_corpus.keys()

#### Turkic Documents
*TBD*

#### Muscovite Persian diplomatic documents
*TBD*

### Self-created Corpora

*Note: need to update processes below to reflect new file organization*

#### Indic Narrative

In [63]:
# Collect every .txt file below indo_man_path (sub-folders included).
indo_man_files = glob.glob(indo_man_path + r'//**/*.txt', recursive=True)

# Map each file's base name (extension removed) to the file's full text.
# Files that share a base name in different sub-folders overwrite one another.
indo_man = {}
for longname in indo_man_files:
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default (assumes the note files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    indo_man[short[0]] = txt
    
    
#indo_man.keys()

#### Transoxania Narrative
Corpus based on partially transcribed manuscripts from early modern Transoxania.

In [64]:
# Collect the .txt files sitting directly in trans_man_path (not recursive).
trans_man_files = glob.glob(trans_man_path + r'/*.txt')

# Map each file's base name (extension removed) to the file's full text.
trans_man = {}
for longname in trans_man_files:
    # `with` closes the handle even if the read fails; UTF-8 is requested
    # explicitly rather than relying on the platform's locale default
    # (assumes the note files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    trans_man[short[0]] = txt

# Display the loaded text names as the cell output.
trans_man.keys()

dict_keys(['sayyid_muhammad_nasir_tuhfat_al-zairin_lithograph_ser191', 'qari_masihai_tamhid_samarqandi_samariyyah_1957', 'lal_travels_in_the_panjab_afghanistan_turkistan_to_balk_bokhara_and_herat', 'vambery_travels_in_central_asia', 'kurbanov_pechati_bukharskogo_khanstva_xix_nachala_xx_vekov', 'kuns_list_of_manuscripts_kept_in_kitab_city_library_tsgaruz_f_i1_o_69_d_15', 'tarikh_i_muaziya_orenburg1908', 'muhammad_azam_waqiat_i_kashmir_indian_national_archive_no_319', 'fawq_hikayat_i_kashmir', 'shishkov_tadzhiki_1910', 'the_muftis_library_islamic_sources_popular_in_early_modern_transoxiana', 'qursavi_ulege_khekem_itelue_bukhara_1823_kul_f520', 'shams_bukharayi_tarikh_i_bukhara_khuqand_va_kashghar', 'marjani_wafiyyat_al_aslaf_19th_century_ser5', 'muradabadi_fihrist_i_makhtutat_i_amir_al_dawla_public_library', 'materialy_po_istorii_ura_tiube_trans_mukhtarov', 'sadr_al_din_ahmad_al_buhari_al_bardavani_ravayih_al_mustafa_min_azhar_al_murtaza_khuda_bakhsh_1884_ce', 'halat_i_shai_ghulam_ali_ah

#### Transoxania Documents
Qushbegi documents at XML stage

In [65]:
# Collect the .xml files sitting directly in trans_man_docs_path (not recursive).
trans_man_doc_files = glob.glob(trans_man_docs_path + r'/*.xml')

# Map each file's base name (extension removed) to the raw XML text.
trans_man_docs = {}
for longname in trans_man_doc_files:
    # `with` closes the handle even if the read fails; UTF-8 is requested
    # explicitly rather than relying on the platform's locale default
    # (assumes the XML files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    trans_man_docs[short[0]] = txt

# Display the loaded document names as the cell output.
trans_man_docs.keys()

dict_keys(['ser934', 'ser89'])

#### Hyderabad Documents

In [66]:
# Hyderabad Documents

# Collect the .xml files sitting directly in hyd_man_docs_path (not recursive).
hyd_man_doc_files = glob.glob(hyd_man_docs_path + r'/*.xml')

# Map each file's base name (extension removed) to the raw XML text.
hyd_man_docs = {}
for longname in hyd_man_doc_files:
    # `with` closes the handle even if the read fails; UTF-8 is requested
    # explicitly rather than relying on the platform's locale default
    # (assumes the XML files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    hyd_man_docs[short[0]] = txt

hyd_man_docs.keys()
#Note: nothing in that folder yet.


dict_keys([])

#### Indic Documents
Misc. Indic documents other than those from the Nizam State collection

In [67]:
# Collect the .xml files sitting directly in indo_man_docs_path (not recursive).
ind_man_doc_files = glob.glob(indo_man_docs_path + r'/*.xml')

# Map each file's base name (extension removed) to the raw XML text.
ind_man_docs = {}
for longname in ind_man_doc_files:
    # `with` closes the handle even if the read fails; UTF-8 is requested
    # explicitly rather than relying on the platform's locale default
    # (assumes the XML files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    ind_man_docs[short[0]] = txt

# Display the loaded document names as the cell output.
ind_man_docs.keys()

dict_keys(['ser935', 'ser936'])

### Unorganized Documents
E.g. documents still at the markdown stage, and not yet sorted by region.

#### XML, pre-sorted

In [68]:
# Collect the .xml files sitting directly in parser_xml_path (not recursive).
xml_presort_files = glob.glob(parser_xml_path + r'/*.xml')

# Map each file's base name (extension removed) to the raw XML text.
xml_presort_docs = {}
for longname in xml_presort_files:
    # `with` closes the handle even if the read fails; UTF-8 is requested
    # explicitly rather than relying on the platform's locale default
    # (assumes the XML files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    xml_presort_docs[short[0]] = txt

# Display the loaded document names as the cell output.
xml_presort_docs.keys()

dict_keys(['ser193', 'ser187', 'ser811', 'ser596', 'ser970', 'ser958', 'ser179', 'ser812', 'ser621', 'ser972', 'ser967', 'ser973', 'ser813', 'ser817', 'ser963', 'ser988', 'ser989', 'ser816', 'ser814', 'ser960', 'ser237', 'ser961', 'ser626', 'ser183', 'ser815', 'ser906', 'ser537', 'ser898', 'ser1004', 'ser1006', 'ser939', 'ser84', 'ser905', 'ser904', 'ser85', 'ser938', 'ser91', 'ser1003', 'ser81', 'ser80', 'ser929', 'ser108', 'ser877', 'ser903', 'ser97', 'ser902', 'ser876', 'ser106', 'ser105', 'ser72', 'ser501', 'ser110', 'ser706', 'ser842', 'ser937', 'ser843', 'ser857', 'ser818', 'ser944', 'ser993', 'ser561', 'ser212', 'ser560', 'ser945', 'ser979', 'ser990', 'ser991', 'ser952', 'ser215', 'ser809', 'ser808'])

In [69]:
# Collect the .xml files sitting directly in updated_docs_path (not recursive).
xml_updated_files = glob.glob(updated_docs_path + r'/*.xml')

# Map each file's base name (extension removed) to the raw XML text.
xml_updated_docs = {}
for longname in xml_updated_files:
    # `with` closes the handle even if the read fails; UTF-8 is requested
    # explicitly rather than relying on the platform's locale default
    # (assumes the XML files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    xml_updated_docs[short[0]] = txt

# Display the loaded document names as the cell output.
xml_updated_docs.keys()

dict_keys([])

#### XML, old system

In [70]:
# Collect every .xml file below xml_old_sys_path (sub-folders included).
xml_oldsys_files = glob.glob(xml_old_sys_path + r'//**/*.xml', recursive=True)

# Map each file's base name (extension removed) to the raw XML text.
# Files that share a base name in different sub-folders overwrite one another.
xml_oldsys_docs = {}
for longname in xml_oldsys_files:
    # `with` closes the handle even if the read fails; UTF-8 is requested
    # explicitly rather than relying on the platform's locale default
    # (assumes the XML files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    xml_oldsys_docs[short[0]] = txt

# Display the loaded document names as the cell output.
xml_oldsys_docs.keys()

dict_keys(['NLR_f-940_ser190', 'IVANUz_1936_ser185', 'TsGARUz_R-2678_ser184', 'TsGARUz_i126_1_1990_20_ser186', 'TsGARUZ_i126_1_1729_101_ser213', 'TsGARUZ_i126_1_1730_81_ser227', 'TsGARUZ_i126_1_1986_1_ser201', 'TsGARUz_i126_1_1730_19_ser218', 'TsGARUz_i126-1-938-2_ser82', 'TsGARUZ_i126_1_1990_3_ser192', 'TsGARUz_i126_1_1730_2_ser188', 'TsGARUz_i126_1_1730_22_ser217', 'RGVIA_400-1-1015_ser143'])

### Pickling XML Corpora

In [71]:
# Merges
# On a duplicated key the value from the later dictionary wins.

## All final stage XML documents
combo_xml_final = dict(ind_man_docs)
combo_xml_final.update(hyd_man_docs)
combo_xml_final.update(trans_man_docs)

## All XML all stages
combo_xml_all = dict(combo_xml_final)
combo_xml_all.update(xml_oldsys_docs)
combo_xml_all.update(xml_presort_docs)
combo_xml_all.update(xml_updated_docs)


# Display the merged document names as the cell output.
combo_xml_all.keys()

dict_keys(['ser935', 'ser936', 'ser934', 'ser89', 'NLR_f-940_ser190', 'IVANUz_1936_ser185', 'TsGARUz_R-2678_ser184', 'TsGARUz_i126_1_1990_20_ser186', 'TsGARUZ_i126_1_1729_101_ser213', 'TsGARUZ_i126_1_1730_81_ser227', 'TsGARUZ_i126_1_1986_1_ser201', 'TsGARUz_i126_1_1730_19_ser218', 'TsGARUz_i126-1-938-2_ser82', 'TsGARUZ_i126_1_1990_3_ser192', 'TsGARUz_i126_1_1730_2_ser188', 'TsGARUz_i126_1_1730_22_ser217', 'RGVIA_400-1-1015_ser143', 'ser193', 'ser187', 'ser811', 'ser596', 'ser970', 'ser958', 'ser179', 'ser812', 'ser621', 'ser972', 'ser967', 'ser973', 'ser813', 'ser817', 'ser963', 'ser988', 'ser989', 'ser816', 'ser814', 'ser960', 'ser237', 'ser961', 'ser626', 'ser183', 'ser815', 'ser906', 'ser537', 'ser898', 'ser1004', 'ser1006', 'ser939', 'ser84', 'ser905', 'ser904', 'ser85', 'ser938', 'ser91', 'ser1003', 'ser81', 'ser80', 'ser929', 'ser108', 'ser877', 'ser903', 'ser97', 'ser902', 'ser876', 'ser106', 'ser105', 'ser72', 'ser501', 'ser110', 'ser706', 'ser842', 'ser937', 'ser843', 'ser857'

In [72]:
# No need to pickle sub-directories of unsorted XML files
# The five dictionaries are stored as one tuple; whoever unpickles the file
# must unpack them in this same order.
_xml_bundle = (
    ind_man_docs,
    hyd_man_docs,
    trans_man_docs,
    combo_xml_final,
    combo_xml_all,
)
with open(pickle_path + "/xml_corpora.pkl", "wb") as f:
    pickle.dump(_xml_bundle, f)

#### Markdown Stage
Transcribed docs, yet to be ported over to XML

In [73]:
# Collect the .xml files sitting directly in markdown_path (not recursive).
# NOTE(review): this folder is the markdown drafting stage, yet the pattern
# matches only *.xml -- confirm that is the intended extension.
markdown_files = glob.glob(markdown_path + r'/*.xml')

# Map each file's base name (extension removed) to the file's full text.
markdown_docs = {}
for longname in markdown_files:
    # `with` closes the handle even if the read fails; UTF-8 is requested
    # explicitly rather than relying on the platform's locale default
    # (assumes the files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    markdown_docs[short[0]] = txt

markdown_docs.keys()
#Will be empty if everything was recently parsed and transferred, per workflow

dict_keys([])

#### Markdown, old system

In [74]:
# Collect every .txt file below md_backlog_path (sub-folders included).
markdown_old_files = glob.glob(md_backlog_path + r'//**/*.txt', recursive=True)

# Map each file's base name (extension removed) to the file's full text.
# Files that share a base name in different sub-folders overwrite one another.
markdown_old_docs = {}
for longname in markdown_old_files:
    # `with` closes the handle even if the read fails; UTF-8 is requested
    # explicitly rather than relying on the platform's locale default
    # (assumes the files are UTF-8 encoded -- confirm).
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    markdown_old_docs[short[0]] = txt

# Display the loaded document names as the cell output.
markdown_old_docs.keys()

dict_keys(['apsa_119', 'apsa_520', 'apsa_534', 'apsa_118', 'apsa_536', 'apsa_527', 'apsa_533', 'apsa_532', 'apsa_526', 'apsa_530', 'apsa_524', 'apsa_531', 'apsa_109', 'apsa_556', 'apsa_557', 'apsa_555', 'apsa_554', 'apsa_550', 'apsa_551', 'apsa_545', 'apsa_69', 'apsa_553', 'apsa_546', 'apsa_552', 'apsa_70', 'apsa_549', 'apsa_71', 'apsa_76', 'apsa_77', 'apsa_113', 'apsa_107', 'apsa_117', 'apsa_115', 'apsa_511', 'apsa_114', 'tsgaruz_i126_1_1897_3_181', 'tsgaruz_i126_1_1953_2_86', 'tsgaruz_i_323_1_749_99', 'tsgaruz_i126_1_1867_3_90', 'tsgaruz_i126_2_317_1_74', 'tsgaruz_i_323_1_581_102', 'tsgaruz_i126_1_1953_5_88', 'tsgaruz_i126_1_1756_2_79', 'tsgaruz_i126_1_1953_4_87', 'tsgaruz_i_323_1_53_98', 'perepiska_glavnogo_shtaba_o_vnutripoliticheskom_polozhenii_v_bukhare_i_afganistane_rgvia_483_1_132', 'tsgaruz_i126_1_1867_5_93', 'tsgaruz_i126_1_1897_1_180', 'tsgaruz_i126_1_1906_1_130', 'tsgaruz_i_323_1_1125_101', 'tsgaruz_i126_1_1953_1_111', 'tsgaruz_i_323_1_1171_100', 'tsgaruz_i126_1_1990_1_177'

## Persian Literature Digital Corpus
Massive corpus of Persian literature, pulled from Ganjoor (http://ganjoor.net/) by Roshan (https://persdigumd.github.io/PDL/)

*The corpus is pre-cleaned, tokenized, and pickled by a separate script. Cleaning takes a long time and this corpus rarely changes, so that step does not need to be re-run here.*

In [75]:
# Load the pre-tokenized Persian literature corpus from its pickle file.
# `with` guarantees the file is closed even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load pickles that you produced yourself.
with open(perslit_path + '/persian_lit_toks.pkl', 'rb') as f:
    pers_lit_toks = pickle.load(f)

In [76]:
#pers_lit_toks.keys()
#pers_lit_toks["hafez.masnavi"][:50]
#pers_lit_toks['ferdowsi.shahnameh']

#type (pers_lit_toks['ferdowsi.shahnameh'][5])

### Cleaning edited texts and notes

In [77]:
# possible to do this once by iterating over the following? crashed computer last time...

# indo_corpus, trans_corpus, khiva_corpus
# indo_man, trans_man
# trans_man_docs, hyd_man_docs, ind_man_docs
# xml_presort_docs, xml_oldsys_docs, markdown_docs, markdown_old_docs


def _clean_corpus(corpus):
    """Return a new dict mapping each name in *corpus* to ac.clean_document(text)."""
    return {fn: ac.clean_document(doc) for fn, doc in corpus.items()}


# External corpora
clean_indo = _clean_corpus(indo_corpus)
clean_trans = _clean_corpus(trans_corpus)
clean_khiva = _clean_corpus(khiva_corpus)

# Manually entered manuscript notes
clean_indo_man = _clean_corpus(indo_man)
clean_trans_man = _clean_corpus(trans_man)

# Final-stage XML documents
clean_trans_man_docs = _clean_corpus(trans_man_docs)
clean_hyd_man_docs = _clean_corpus(hyd_man_docs)
clean_ind_man_docs = _clean_corpus(ind_man_docs)

# Unsorted XML and markdown documents
clean_xml_presort_docs = _clean_corpus(xml_presort_docs)
clean_xml_oldsys_docs = _clean_corpus(xml_oldsys_docs)
clean_markdown_docs = _clean_corpus(markdown_docs)
clean_markdown_old_docs = _clean_corpus(markdown_old_docs)



#clean_trans['ikromcha'][:1000]
#clean_trans['ikromcha'][:1000]


#clean_xml['ser561']

#clean_indo['mu_vol1'][:1000]

## Tokenizing

In [94]:
#apparently this dependency is needed for below
#nltk.download('punkt')

In [79]:

# Every dictionary below maps a text name to the list of tokens that
# nltk.word_tokenize produces from the cleaned text.

# External Corpora Toks

indo_nar_ext_toks = {fn: nltk.word_tokenize(txt) for fn, txt in clean_indo.items()}

trans_nar_ext_toks = {fn: nltk.word_tokenize(txt) for fn, txt in clean_trans.items()}

khiva_doc_toks = {fn: nltk.word_tokenize(txt) for fn, txt in clean_khiva.items()}


# Manually Entered Manuscript Toks

indo_nar_toks = {fn: nltk.word_tokenize(txt) for fn, txt in clean_indo_man.items()}

trans_nar_toks = {fn: nltk.word_tokenize(txt) for fn, txt in clean_trans_man.items()}


# Clean XML-stage Document Toks

trans_xml_toks = {fn: nltk.word_tokenize(txt) for fn, txt in clean_trans_man_docs.items()}

hyd_xml_toks = {fn: nltk.word_tokenize(txt) for fn, txt in clean_hyd_man_docs.items()}

indo_xml_toks = {fn: nltk.word_tokenize(txt) for fn, txt in clean_ind_man_docs.items()}


# Unsorted Document Toks (pre-sorted XML, old-system XML, markdown stages)

presort_xml_toks = {fn: nltk.word_tokenize(txt) for fn, txt in clean_xml_presort_docs.items()}

oldsys_xml_toks = {fn: nltk.word_tokenize(txt) for fn, txt in clean_xml_oldsys_docs.items()}

md_stage_toks = {fn: nltk.word_tokenize(txt) for fn, txt in clean_markdown_docs.items()}

md_oldsys_toks = {fn: nltk.word_tokenize(txt) for fn, txt in clean_markdown_old_docs.items()}



*First-stage combinations*: Collapse unsorted documents

In [80]:
# Fold the four unsorted token dictionaries into one; on a duplicated key the
# later dictionary's tokens replace the earlier ones.
unsorted_doc_toks = {}
for _group in (presort_xml_toks, oldsys_xml_toks, md_stage_toks, md_oldsys_toks):
    unsorted_doc_toks.update(_group)

In [81]:
#unsorted_doc_toks['ser560']

### Pickling Corpora

In [82]:
# Output folder for the pickles (same value as the pickle_path set in the Paths section).
pickle_path = f"{hdir}/Dropbox/Active_Directories/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

In [83]:
# All token dictionaries are stored as one tuple; whoever unpickles the file
# must unpack them in this same order.
_tok_bundle = (
    unsorted_doc_toks,
    indo_xml_toks, hyd_xml_toks, trans_xml_toks,
    trans_nar_toks, indo_nar_toks,
    trans_nar_ext_toks, indo_nar_ext_toks, khiva_doc_toks,
)
with open(pickle_path + "/corpora.pkl", "wb") as f:
    pickle.dump(_tok_bundle, f)

In [84]:
#trans_nar_toks["ziyarat_bukhara_kazan_manuscript_ser492"]

# Trial dataframe for a single text: one row per token, plus a 'Text' column
# filled with the constant placeholder 'title'.
df = pd.DataFrame(
    trans_nar_toks["ziyarat_bukhara_kazan_manuscript_ser492"],
    columns=['Token'],
).assign(Text='title')

### Corpus Formation: Dataframes

In [85]:
# External Corpora Toks

concat_indo_nar_ext_toks = sum([[("indo_nar_ext_toks", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in indo_nar_ext_toks.items()], [])

concat_trans_nar_ext_toks = sum([[("trans_nar_ext_toks", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in trans_nar_ext_toks.items()], [])

concat_khiva_doc_toks = sum([[("khiva_doc_toks", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in khiva_doc_toks.items()], [])


    
# Manually Entered Manuscript Toks

concat_trans_nar = sum([[("trans_nar", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in trans_nar_toks.items()], [])

concat_indo_nar = sum([[("indo_nar", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in indo_nar_toks.items()], [])


# Clean XML-stage Document Toks
 
concat_trans_xml_toks = sum([[("trans_xml_toks", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in trans_xml_toks.items()], [])

concat_hyd_xml_toks = sum([[("hyd_xml_toks", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in hyd_xml_toks.items()], [])

concat_indo_xml_toks = sum([[("indo_xml_toks", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in indo_xml_toks.items()], [])



# Unorganized Markdown-stage Toks

concat_presort_xml_toks = sum([[("presort_xml_toks", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in presort_xml_toks.items()], [])

concat_trans_nar = sum([[("oldsys_xml_toks", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in oldsys_xml_toks.items()], [])

concat_md_stage_toks = sum([[("md_stage_toks", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in md_stage_toks.items()], [])

concat_md_oldsys_toks = sum([[("md_oldsys_toks", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in md_oldsys_toks.items()], [])




In [86]:
# Persian Lit

concat_pers_lit_toks = sum([[("pers_lit_toks", text, idx, tok) for idx, tok in enumerate(toks)] for text, toks in pers_lit_toks.items()], [])




In [87]:
# just delete the 'test' column above; do the different categories separately, manually specifying the category
# then just concat them all together at the end.

In [88]:
#concat_indo_nar[0:10]

concat = \
concat_indo_nar_ext_toks + concat_trans_nar_ext_toks + concat_khiva_doc_toks + \
concat_trans_nar + concat_indo_nar + \
concat_trans_xml_toks + concat_hyd_xml_toks + concat_indo_xml_toks +\
concat_presort_xml_toks + concat_trans_nar + concat_md_stage_toks + concat_md_oldsys_toks




In [89]:
# Append the Persian literature rows after all other categories.
# (Re-running this cell appends them again.)
concat = [*concat, *concat_pers_lit_toks]

In [90]:
# One row per token; the column names follow the order of the fields in each row tuple.
_corpus_columns = ["Category", "Text", "No", "Token"]
df = pd.DataFrame(concat, columns=_corpus_columns)

In [91]:
# Spot-check: display one randomly chosen row.
df.sample(n=1)

Unnamed: 0,Category,Text,No,Token
2859345,indo_nar_ext_toks,psn1,164137,و


In [92]:
# Write the combined token table to the pickle folder as CSV, leaving out the index column.
_csv_path = os.path.join(pickle_path, 'eurasia_corpus.csv')
df.to_csv(_csv_path, index=False)

In [93]:
# Display rows 5 through 9 (selected by position).
df.iloc[5:10]

Unnamed: 0,Category,Text,No,Token
5,indo_nar_ext_toks,sjn1,5,خنده
6,indo_nar_ext_toks,sjn1,6,ریزی
7,indo_nar_ext_toks,sjn1,7,گلبن
8,indo_nar_ext_toks,sjn1,8,سخن
9,indo_nar_ext_toks,sjn1,9,از
