# Import

In [1]:
import re
import glob
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm
from uuid import uuid4

In [None]:
# Download NLTK's 'punkt' resource (Punkt sentence-tokenizer data); the
# nltk.sent_tokenize call in process_wiki_file relies on it being installed.
nltk.download('punkt')

# Reading texts from Uzbek Wiki

In [None]:
def split_keep_sep(string: str, sep: str) -> list:
    """Split ``string`` on ``sep``, keeping the separator at the start of each chunk.

    Args:
        string: Text to split.
        sep: Literal separator. It is escaped before use, so regex
            metacharacters in it are matched verbatim.

    Returns:
        The non-empty chunks in their original order. Every chunk that
        followed an occurrence of ``sep`` starts with ``sep``. Text that
        precedes the first separator is returned as-is instead of having a
        separator fabricated in front of it. Empty chunks (adjacent or
        trailing separators, or an empty input) are dropped.
    """
    # The first piece precedes any separator; each remaining piece followed
    # exactly one occurrence of it.
    head, *tail = re.split(re.escape(sep), string)
    chunks = [head] if head else []
    chunks.extend(sep + piece for piece in tail if piece)
    return chunks

def remove_html_tags(text: str) -> str:
    """Strip every ``<...>`` span from ``text`` and return the remainder.

    The match is non-greedy, so each tag is removed individually and the text
    between tags is preserved. ``.`` does not match a newline here, so a ``<``
    whose closing ``>`` is on a later line is left in place.
    """
    return re.compile('<.*?>').sub('', text)

def remove_special_chars(text: str, char_list: list) -> str:
    """Delete each entry of ``char_list`` from ``text`` and normalize NBSPs.

    Entries are removed one after another, in list order, each over the
    result of the previous removal. Afterwards every non-breaking space
    (``\\xa0``) is replaced with a regular space.
    """
    stripped = text
    for unwanted in char_list:
        stripped = stripped.replace(unwanted, '')
    normalized = stripped.replace(u'\xa0', u' ')
    return normalized

def process_wiki_file(wiki_file: str) -> pd.DataFrame:
    """Turn one extracted-wiki text file into a sentence-level table.

    The file content is split into articles at each ``<doc id=`` marker. For
    every article, ``<...>`` tags are stripped, newlines are deleted,
    non-breaking spaces become regular spaces, and the remaining text is
    split into sentences with ``nltk.sent_tokenize``.

    Args:
        wiki_file: Path to a UTF-8 encoded text file.

    Returns:
        DataFrame with columns ``article_uuid`` and ``sentence``, one row per
        sentence, indexed 0..n-1. All sentences of one article share the same
        randomly generated ``uuid4``. A file with no articles or sentences
        yields an empty frame with the same two columns.
    """
    chars = ['\n']
    with open(wiki_file, encoding='utf-8') as f:
        content = f.read()

    article_uuids = []
    sentences = []
    for article in split_keep_sep(content, '<doc id='):
        uuid = uuid4()
        # NOTE(review): newlines are deleted rather than replaced with a
        # space, so text on adjacent lines is joined without whitespace
        # before tokenization - confirm this is intended.
        article = remove_special_chars(remove_html_tags(article), chars)
        article_sentences = nltk.sent_tokenize(article)
        article_uuids.extend([uuid] * len(article_sentences))
        sentences.extend(article_sentences)

    # Build the frame once at the end: DataFrame.append was removed in
    # pandas 2.0, and appending per article re-copied every accumulated row.
    return pd.DataFrame({'article_uuid': article_uuids, 'sentence': sentences},
                        columns=['article_uuid', 'sentence'])

In [None]:
# Collect every extracted file two levels below UzWiki/. glob yields matches
# in arbitrary, filesystem-dependent order, so sort them to make the
# processing order (and the row order of the resulting CSV) reproducible.
wiki_files = sorted(glob.iglob("UzWiki/*/*"))

In [None]:
# Process every file first and concatenate once; calling pd.concat inside the
# loop re-copies all previously accumulated rows on each iteration.
wiki_frames = [process_wiki_file(file_name) for file_name in tqdm(wiki_files)]
if wiki_frames:
    dataframe = pd.concat(wiki_frames, ignore_index=True)
else:
    # No input files: keep the expected schema so the cells below still run.
    dataframe = pd.DataFrame(columns=['article_uuid', 'sentence'])
# UUID objects are stored as plain strings.
dataframe['article_uuid'] = dataframe['article_uuid'].astype(str)

In [None]:
# Persist the sentence table; index=False keeps the row index out of the CSV.
dataframe.to_csv("UzWikiTexts.csv", index=False)

# Check the saved CSV

In [2]:
# Reload the table from disk so the check below inspects what was actually
# written to UzWikiTexts.csv rather than the in-memory frame.
dataframe = pd.read_csv("UzWikiTexts.csv")

In [4]:
# Display the reloaded frame; the notebook renders a truncated preview
# (first and last rows) rather than every row.
dataframe

Unnamed: 0,article_uuid,sentence
0,2508e29b-6b60-42c5-bc8e-577f3a124cf1,ShoʻrvaShoʻrva — suyuq ovqat turi.
1,2508e29b-6b60-42c5-bc8e-577f3a124cf1,"Tayyorlash usuliga koʻra, shoʻrvaning qovurma ..."
2,2508e29b-6b60-42c5-bc8e-577f3a124cf1,"Goʻsht boʻlaklarga boʻlinib, yogʻ bilan birga ..."
3,2508e29b-6b60-42c5-bc8e-577f3a124cf1,"Suv qaynab chiqquncha qozonga tuz, bir dona qi..."
4,2508e29b-6b60-42c5-bc8e-577f3a124cf1,"Shoʻrva qaynab chiqqach, olovi pasaytirilib, y..."
...,...,...
1157634,665ef546-82b2-4ac8-ab4d-07667cf39271,"Aholi zichligi – har kvadrat kilometrga 25,4 n..."
1157635,665ef546-82b2-4ac8-ab4d-07667cf39271,"Shundan 3,70 km2 quruqlik."
1157636,665ef546-82b2-4ac8-ab4d-07667cf39271,Dengiz sathidan oʻrtacha 290 m balandlikda joy...
1157637,de26d018-dbc2-4bc4-a55b-2567f60747bf,"Amenia, North Dakota"
