# Import

In [1]:
# !pip install devon

In [2]:
import re
import json
import nltk
import numpy as np
import pandas as pd
from devon.devon import FSMStemmer

In [3]:
# Extra junk tokens appended to the stop-word list that is loaded from
# UzStopWords.json later in the notebook.
STOPWORDS_ADD = ["a", "aa", "aaa", "aaaa", "aaaaa", "aaaaaa", "aaba"]

# Group data by page (pg_index)

In [4]:
def group_texts(dataframe: pd.DataFrame, text_column_name: str) -> pd.DataFrame:
    """Concatenate all texts that belong to the same article into one row.

    Every distinct ``article_uuid`` gets an integer ``pg_index`` (numbered in
    order of first appearance). The texts of each page are then joined with a
    single space.

    Args:
        dataframe: Frame with an ``article_uuid`` column and the text column.
        text_column_name: Name of the column holding the text fragments.

    Returns:
        A new frame with the columns ``pg_index`` and ``text_column_name``,
        one row per page, sorted by ``pg_index``. The input is not modified.
    """
    # Map each article id to a compact integer page index.
    page_ids = pd.DataFrame({'article_uuid': dataframe['article_uuid'].unique()})
    page_ids['pg_index'] = range(len(page_ids))

    merged = dataframe.merge(page_ids, how='left', on='article_uuid')
    # Missing texts must be removed *before* joining: ' '.join raises a
    # TypeError as soon as a group contains a non-string (NaN) value.
    merged = merged[[text_column_name, 'pg_index']].dropna(subset=[text_column_name])

    grouped = merged.groupby(by='pg_index').agg({text_column_name: ' '.join})
    return grouped.reset_index().dropna()

In [5]:
# Load the raw sentences and the stop-word list (extended with STOPWORDS_ADD).
dataframe = pd.read_csv("UzWikiTexts.csv")
# Use a context manager so the file handle is closed, and read explicitly as
# UTF-8 so the result does not depend on the platform's default encoding.
with open("UzStopWords.json", encoding="utf-8") as stop_words_file:
    stop_words = json.load(stop_words_file) + STOPWORDS_ADD

In [6]:
# Merge the rows of each article into one text per page (column 'sentence').
df_grouped_texts = group_texts(dataframe, 'sentence')

In [7]:
def create_textshape_data(dataframe: pd.DataFrame, text_shape: list, column: str) -> pd.DataFrame:
    """Count how many texts are longer than each word-count threshold.

    Args:
        dataframe: Frame containing the text column.
        text_shape: Word-count thresholds to evaluate.
        column: Name of the text column; words are whitespace-separated.

    Returns:
        A frame with one row per threshold and the columns ``text_shape``
        (the threshold) and ``pages_amount`` (number of rows whose text has
        strictly more words than the threshold).
    """
    # Count the words once instead of re-splitting every text per threshold.
    word_counts = dataframe[column].apply(lambda text: len(text.split()))

    # Build all rows first and create the frame in one go: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    records = [
        {'text_shape': shape, 'pages_amount': int((word_counts > shape).sum())}
        for shape in text_shape
    ]
    return pd.DataFrame(records, columns=['text_shape', 'pages_amount'])

In [8]:
# For several thresholds, show how many pages have more words than the threshold.
create_textshape_data(df_grouped_texts, [100, 150, 300, 500, 1000], 'sentence')

Unnamed: 0,text_shape,pages_amount
0,100,21019
1,150,12640
2,300,4788
3,500,2193
4,1000,780


Since we need a corpus of about 10,000 documents, we keep only the pages that contain more than 150 words (12,640 pages in the table above).

In [9]:
# Keep only pages whose raw text has strictly more than 150 whitespace-separated
# words, then renumber the rows from 0.
lambda_ = lambda text: len(text.split()) > 150
df_grouped_texts = df_grouped_texts[df_grouped_texts['sentence'].apply(lambda_)].reset_index(drop=True)

In [10]:
# Preview the pages that passed the 150-word filter.
df_grouped_texts

Unnamed: 0,pg_index,sentence
0,0,ShoʻrvaShoʻrva — suyuq ovqat turi. Tayyorlash ...
1,12,Shoʻrlangan tuproqlarShoʻrlangan tuproqlar - t...
2,20,"ShoʻrlanishShoʻrlanish, tuproq shoʻrlanishi — ..."
3,62,"Shoʻroi islomiya""SHOʻROI ISLOMIYA"" (""Islom ken..."
4,102,Shoʻrtan gaz-kimyo majmuasiShoʻrtan gaz-kimyo ...
...,...,...
12635,437305,"BjörkBjörk (toʻliq ismi — ""Björk Guðmundsdótti..."
12636,437308,YozYoz toʻrt fasldan biridir. Yoz fasli bahor ...
12637,437889,№ 5-Maksim Gorkiy nomli umumiy oʻrta maktab№ 5...
12638,439612,Bayanavul (tuman)Bayanavul tumani — Pavlodar v...


# Clean Texts

In [11]:
def _split_upper(word: str) -> list:
    upper_word_list = re.split("(?=[A-Z])", word)
    if upper_word_list[0] == "":
        return upper_word_list[1:]
    return upper_word_list

def _check_word_len(word: str, min_len: int = 3, max_len: int = 30) -> bool:
    if len(word) >= min_len and len(word) <= max_len:
        return word
    return None

def _delete_apostrof(word: str) -> str:
    apostrofs = ["'", "ʻ", "ʼ"]
    if word in apostrofs:
        return ""
    if word[0] in apostrofs:
        word = word[1:]
    if word[-1] in apostrofs:
        word = word[:-1]
    return word

def _check_stop_words(word: str, stop_words: list) -> str:
    if word in stop_words:
        return ""
    return word

def _clean_pipline(words_list: np.ndarray, stop_words: list = None) -> pd.Series:
    """Normalise, filter and stem a sequence of raw words.

    Each word goes through these steps: strip edge apostrophes, lower-case,
    drop stop words, stem, drop words outside the accepted length range, and
    drop stop words again (stemming can turn a word into a stop word).

    Args:
        words_list: Raw words, one per element.
        stop_words: Words to discard. When omitted, the notebook-level
            ``stop_words`` variable is used, as earlier versions did.

    Returns:
        A Series with the surviving stemmed words in their original order.
    """
    if stop_words is None:
        # Backward compatibility: callers that pass no list still get the
        # global one this helper used to read implicitly.
        stop_words = globals()["stop_words"]
    # Set membership is O(1); the list lookup was repeated for every word.
    stop_set = set(stop_words)
    # One stemmer for the whole call instead of one per word.
    # NOTE(review): assumes FSMStemmer keeps no per-word state - confirm.
    stemmer = FSMStemmer()

    kept_words = []
    for raw_word in words_list:
        word = _check_stop_words(_delete_apostrof(raw_word).lower(), stop_set)
        # _check_stop_words marks rejects with "" (not NaN), so they have to
        # be skipped explicitly; dropna() never removed them.
        if not word:
            continue
        word = _check_word_len(stemmer.stem(words=word)[0])
        if word is None or not _check_stop_words(word, stop_set):
            continue
        kept_words.append(word)
    return pd.Series(kept_words, dtype=object)

def clean_text(text: str, stop_words: list) -> str:
    """Turn a raw page text into a space-separated string of cleaned stems.

    Only runs of ASCII letters, spaces and the apostrophe characters
    ``'``, ``ʻ``, ``ʼ`` are kept; everything else acts as a separator. Tokens
    are further split by ``_split_upper`` and then passed through
    ``_clean_pipline``.

    Args:
        text: Raw page text.
        stop_words: Words to remove from the result.

    Returns:
        The cleaned words joined by single spaces (``""`` if none survive).
    """
    raw_words = []
    for fragment in re.findall(r"[A-Za-z 'ʻʼ]+", text):
        for token in fragment.split():
            # Collect in a list; np.append copied the array on every call.
            raw_words.extend(_split_upper(token))
    clean_words = _clean_pipline(np.array(raw_words, dtype=object), stop_words)
    return ' '.join(clean_words.values)

In [12]:
# Clean every page text and store the result in a new 'clean_text' column,
# then drop rows that contain missing values.
lambda_ = lambda text: clean_text(text, stop_words)
df_grouped_texts['clean_text'] = df_grouped_texts.loc[:, 'sentence'].apply(lambda_)
df_grouped_texts = df_grouped_texts.dropna()

In [13]:
# Compare the raw text ('sentence') with its cleaned version ('clean_text').
df_grouped_texts

Unnamed: 0,pg_index,sentence,clean_text
0,0,ShoʻrvaShoʻrva — suyuq ovqat turi. Tayyorlash ...,shoʻrva shoʻrva suyuq ovqat tur tayyorlash usu...
1,12,Shoʻrlangan tuproqlarShoʻrlangan tuproqlar - t...,shoʻrlangan tuproq shoʻrlangan tuproq tarkib s...
2,20,"ShoʻrlanishShoʻrlanish, tuproq shoʻrlanishi — ...",shoʻrlanish shoʻrlanish tuproq shoʻrlanish suv...
3,62,"Shoʻroi islomiya""SHOʻROI ISLOMIYA"" (""Islom ken...",shoʻro islomiya islo kengash turkiston taraqqi...
4,102,Shoʻrtan gaz-kimyo majmuasiShoʻrtan gaz-kimyo ...,shoʻrtan gaz kimyo majmua shoʻrtan gaz kimyo m...
...,...,...,...
12635,437305,"BjörkBjörk (toʻliq ismi — ""Björk Guðmundsdótti...",toʻliq mundsd ttir talaffuz byork gudmundsdott...
12636,437308,YozYoz toʻrt fasldan biridir. Yoz fasli bahor ...,yoz yoz toʻrt fasl yoz fasl bahor kuz fasl jo...
12637,437889,№ 5-Maksim Gorkiy nomli umumiy oʻrta maktab№ 5...,maks gorkiy noml umumiy oʻrta maktab maks gork...
12638,439612,Bayanavul (tuman)Bayanavul tumani — Pavlodar v...,bayanavul tuman bayanavul tuma pavlodar viloya...


In [14]:
# The corpus keeps only the page index and the cleaned text.
df_uz_corpus = df_grouped_texts[['pg_index', 'clean_text']]

Let's repeat the same check on the cleaned texts: cleaning removes stop words and words that are too short or too long, so the word count of each page may have decreased.

In [15]:
# Same threshold table as before, now computed on the cleaned texts.
create_textshape_data(df_uz_corpus, [100, 130, 150, 300, 500, 1000], 'clean_text')

Unnamed: 0,text_shape,pages_amount
0,100,12555
1,130,10409
2,150,8512
3,300,3091
4,500,1367
5,1000,504


In [16]:
# Keep only pages whose cleaned text has strictly more than 130 words
# (10,409 pages according to the table above), then renumber the rows from 0.
lambda_ = lambda text: len(text.split()) > 130
df_uz_corpus = df_uz_corpus[df_uz_corpus['clean_text'].apply(lambda_)].reset_index(drop=True)

In [17]:
# Preview the final corpus.
df_uz_corpus

Unnamed: 0,pg_index,clean_text
0,12,shoʻrlangan tuproq shoʻrlangan tuproq tarkib s...
1,20,shoʻrlanish shoʻrlanish tuproq shoʻrlanish suv...
2,62,shoʻro islomiya islo kengash turkiston taraqqi...
3,102,shoʻrtan gaz kimyo majmua shoʻrtan gaz kimyo m...
4,114,shoʻrtepa neft gaz shoʻrtepa neft gaz qashqada...
...,...,...
10404,437284,favvora favvora biror manba suv biror tushib i...
10405,437305,toʻliq mundsd ttir talaffuz byork gudmundsdott...
10406,437889,maks gorkiy noml umumiy oʻrta maktab maks gork...
10407,439612,bayanavul tuman bayanavul tuma pavlodar viloya...


In [18]:
# Write the corpus (columns pg_index, clean_text) to disk without the row index.
df_uz_corpus.to_csv("UzCleanCorpus.csv", index=False)