# Russian language dataset

First, install all prerequisites. To convert FB2 files to text you need the conversion stylesheet `FB2_2_txt.xsl` from https://github.com/mgrankin/ru_transformers/blob/master/corpus/FB2_2_txt.xsl in the folder the notebook runs from (the setup cell below downloads it with `curl`). Without it, the conversion produces empty .txt files.

In [None]:
!pip install langdetect tqdm
!pip install "tqdm==4.43.0"
!sudo apt-get install -y xsltproc
from fastai.basics import *
from tqdm import *
from tqdm.contrib.concurrent import process_map, thread_map
from multiprocessing import Pool
import regex as re
import time
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Token that replaces line breaks when the texts are concatenated into the corpus file.
NEW_LINE = '<|n|>'

# Location of the librusec archive; not referenced in the part of the notebook shown here.
librusec = '/home/u/nas/librusec/lib.rus.ec'
# Working directories, relative to the directory the notebook runs from.
tmpzips = './tmp/zip'  # zipped fb2 archives to unpack
tmptxt = './tmp/txt'  # output of the fb2 -> txt conversion
tmpfb2clean = './tmp/fb2clean'  # fb2 files after file-name sanitization
tmpfb2unzip = './tmp/fb2unzip'  # fb2 files extracted from the zips
# Destination of the texts that pass the length/language filter.
data = Path('../data/full')

# Fetch the FB2 -> txt stylesheet that xsltproc uses in the conversion step.
!curl -LJO https://raw.githubusercontent.com/mgrankin/ru_transformers/master/corpus/FB2_2_txt.xsl
# -p creates missing parents and is a no-op for directories that already
# exist, so this cell can be re-run without "File exists" errors.
!mkdir -p ../data/full ../data/classic
!mkdir -p tmp/fb2unzip tmp/fb2clean tmp/txt tmp/zip

### Unpack ZIPs

Before running this step, upload the zipped fb2 archives to `./tmp/zip` (the `tmpzips` directory, relative to the notebook), then run the cells below.

In [23]:
# clean the output directory: delete everything inside tmpfb2unzip (-v lists each removed file)
!rm -rfv {tmpfb2unzip + '/*'}

In [59]:
zips = get_files(tmpzips, '.zip')
print(f'{len(zips)} zip file(s) found')

def unpack(fn):
    """Extract one zip archive into `tmpfb2unzip`, discarding its directory tree.

    fn: path of the archive. It is interpolated unquoted into the shell
        command, so it must not contain spaces or shell metacharacters.
    """
    # unzip flags used below:
    #   -qq / -q  quiet; remove them for more logging
    #   -j        junk paths (do not make directories)
    #   -o        overwrite existing files; replace with -n to keep them
    #   -L        lower-case names coming from upper-case-only file systems
    #   -O        character set used to decode names in DOS/Windows archives
    # -qo also removes an annoying warning, see
    # https://www.directadmin.com/features.php?id=2213
    # NOTE(review): `cp396` is not a code page I can identify; Cyrillic DOS
    # archives normally use cp866 -- confirm the intended value.
    !unzip -qq -joL -qo -O cp396 {fn} -d {tmpfb2unzip} >>/dev/null

# Unpack zips in parallel
thread_map(unpack, zips, max_workers=64)
print(f'Unzipped all - DONE')

# Sanitize file and folder names - replace spaces with underscores.
# The search is rooted explicitly at the unzip directory. The braces of find's
# placeholder are doubled because IPython interpolates single-brace expressions
# in `!` commands (and gives up on the whole line if one of them fails).
!find {tmpfb2unzip} -depth -name "* *" -execdir rename 's/ /_/g' "{{}}" \;

1 zip file(s) found


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Unzipped all


### Convert fb2 to txt

In [None]:
# clean the output directory: delete everything inside tmpfb2clean (-v lists each removed file)
!rm -rfv {tmpfb2clean + '/*'}

In [None]:
# clean the output directory: delete everything inside tmptxt (-v lists each removed file)
!rm -rfv {tmptxt + '/*'}

In [None]:
# Get fb2s
fbs = get_files(tmpfb2unzip, '.fb2', recurse=True)

def _clean_fb2_name(name):
    """Return the fb2 file name `name` without unwanted characters, ending in '.fb2'.

    Spaces, '_quot;', '!', ',', '(', ')', non-breaking spaces and dots are
    removed from the base name only; the extension is re-attached afterwards.
    Rewriting every 'fb2' substring to '.fb2' instead would also change base
    names that merely contain 'fb2' and would drop an upper-case '.FB2'
    extension.
    """
    stem = Path(name).stem
    for junk in (' ', '_quot;', '!', ',', '(', ')', '\xa0', '.'):
        stem = stem.replace(junk, '')
    return f'{stem}.fb2'

# Sanitize filenames and move to the 'clean' dir
# NOTE(review): names that sanitize to the same result end up at the same
# destination path.
for fn in fbs:
    shutil.move(fn, f'{tmpfb2clean}/{_clean_fb2_name(fn.name)}')
print(f'{len(fbs)} fb2(s) sanitized')

# The conversion needs the stylesheet FB2_2_txt.xsl in the working directory:
# https://github.com/mgrankin/ru_transformers/blob/master/corpus/FB2_2_txt.xsl
def convert_fb2(fn):
    """Convert one fb2 file to plain text with xsltproc.

    fn: Path of the fb2 file. The text is written to `tmptxt` under the same
        base name with a .txt extension; xsltproc's stderr is discarded.
    Returns a one-element set containing `fn`, as before.
    """
    # Swap only the extension; replacing every 'fb2' substring would also
    # rewrite base names that contain 'fb2'.
    out = f'{tmptxt}/{fn.stem}.txt'
    !xsltproc FB2_2_txt.xsl {fn} > {out} 2>>/dev/null
    return {fn}

# Get fb2s from the clean sanitized dir
fbs = get_files(tmpfb2clean, '.fb2')

# convert all to .txt
thread_map(convert_fb2, fbs, max_workers=64)
print('FB2(s) conversion done')

### Filter and concat txt files

In [None]:
txts = get_files('./tmp/txt', '.txt')
print(f'Found {len(txts)} txt(s)')

# Copy into `data` only the texts that are longer than 10,000 characters and
# that langdetect classifies as Russian.
# this will take time, bcs langdetect fails on multithreading
print('Running langdetect . . . ')
for fn in progress_bar(txts):
    # Open the path as returned by get_files: prefixing './' would turn an
    # absolute path into a wrong relative one.
    with open(fn, 'r', encoding='utf-8') as f:
        lines = f.read()
    try:
        keep = len(lines) > 1e+4 and detect(lines) == 'ru'
    except LangDetectException:
        # langdetect could not classify this text; leave the file out
        keep = False
    if keep:
        with open(f'{data}/{fn.name}', 'w', encoding='utf-8') as c:
            c.write(lines)

# Add space before each word. It's not really necessary.
# It just makes encoding a bit more meaningful to the model and the text smaller(after encoding).
print('Running text sanitization . . . ')

# A non-space, non-word character directly followed by a word character.
_GLUED_WORD = re.compile(r'(?=[^ ])([\W])([\w])')
# The same character (newlines included) three or more times in a row.
_LONG_RUN = re.compile(r'(.|\s)\1\1+')

def process_fn(fn):
    """Normalize the text file `fn` in place.

    * prepends a space when the text does not already start with one,
    * inserts a space between a punctuation mark and a word glued to it,
    * shortens runs of three or more identical characters to exactly three.

    The file is read and written as UTF-8. Empty files stay empty.
    """
    with open(fn, 'r', encoding='utf-8') as f:
        lines = f.read()
    if lines and lines[0] != ' ': lines = ' ' + lines
    lines = _GLUED_WORD.sub(r'\g<1> \g<2>', lines)
    lines = _LONG_RUN.sub(r'\1\1\1', lines)
    with open(fn, 'w', encoding='utf-8') as c:
        c.write(lines)
        
# Sanitize the filtered copies in `data`, which the corpus is built from.
# Up to here `txts` lists the unfiltered ./tmp/txt files, so it is re-listed.
txts = get_files(data, '.txt')
thread_map(process_fn, txts)

In [None]:
txts = get_files(data, '.txt')
print(f'Amount of txt(s): {len(txts)}')

# Classic texts come first, followed by the filtered texts from smallest to largest.
classic_txts = get_files('../data/classic', '.txt')
fsorted = classic_txts + sorted(txts, key=os.path.getsize)

print('Building text corpus')
sz = 0
newline_token = f' {NEW_LINE} '
with open('./tmp/russian_corpus_for_vocab.txt', 'w') as corpus:
    for fn in fsorted:
        with open(fn, 'r') as src:
            text = src.read()
        # One source file per output line; inner line breaks become NEW_LINE tokens.
        sz += corpus.write(text.replace('\n', newline_token) + '\n')
        # Stop once more than 5e9 characters have been written.
        if sz > 5e+9:
            break
print('Done ./tmp/russian_corpus_for_vocab.txt')

The text corpus is now available at `./tmp/russian_corpus_for_vocab.txt` and can be used with GPT-2.

## Historical workbook


### more data

In [67]:
!rm -R /share/CE_CACHEDEV1_DATA/data/classic_lit/*
!cp -R /share/CE_CACHEDEV1_DATA/data/downloads/txt /share/CE_CACHEDEV1_DATA/data/classic_lit/

rm: cannot remove '/share/CE_CACHEDEV1_DATA/data/classic_lit/*': No such file or directory
cp: cannot stat '/share/CE_CACHEDEV1_DATA/data/downloads/txt': No such file or directory


In [68]:
fbs = get_files('/home/u/nas/classic_lit/txt/', '.fb2', recurse=True); len(fbs)

0

In [69]:
!rm -R /home/u/nas/classic_lit/out

rm: cannot remove '/home/u/nas/classic_lit/out': No such file or directory


In [70]:
# Create the output directory. mkdir() is called without parents=True, so it
# raises FileNotFoundError when /home/u/nas/classic_lit does not exist
# (see the recorded output of this cell).
path_out = Path('/home/u/nas/classic_lit/out'); path_out.mkdir(exist_ok = True)

FileNotFoundError: ignored

In [None]:
# Replace spaces, '_quot;', '!', ',', '(', ')' and non-breaking spaces in each
# file name with a dot, then move the file into path_out under the new name.
_to_dot = str.maketrans({ch: '.' for ch in ' !,()\xa0'})
for fn in fbs:
    nn = str(fn.name).replace('_quot;', '.').translate(_to_dot)
    shutil.move(fn, path_out/nn)

In [None]:
fbs = get_files(path_out, '.fb2'); len(fbs)

In [None]:
!rm ../data/classic/*

In [None]:
data = Path('../data/classic')

In [None]:
def convert_fb2(fn):
    """Convert the fb2 file `fn` to text with xsltproc.

    The output goes to `data` under the same file name with '.fb2' replaced
    by '.txt'. FB2_2_txt.xsl is looked up in the working directory and
    xsltproc's stderr is discarded.
    """
    name = str(fn.name).replace('.fb2','.txt')
    !xsltproc FB2_2_txt.xsl {fn} > {data/name} 2>>/dev/null

In [None]:
# Convert all files in 64 worker processes. The context manager shuts the
# pool down after every result has been consumed, so no workers are left behind.
with Pool(64) as pool:
    for _ in progress_bar(pool.imap_unordered(convert_fb2, fbs), len(fbs)):
        pass

In [None]:
!rm ../data/classic/*месяцеслов*

In [None]:
!mkdir ../data/classic/valid

In [None]:
mv ../data/classic/Tolstoy_Dva_pisma_k_M_Gandi.56185.txt ../data/classic/valid

In [None]:
mv ../data/classic/Tolstoy_Sobranie_sochineniy_v_dvadtsati_dvuh_tomah_22_Tom_22._Izbrannyie_dnevniki_1895-1910.142868.txt ../data/classic/valid

In [None]:
mv ../data/classic/Tolstoy_Sobranie_sochineniy_v_dvadtsati_dvuh_tomah_20_Tom_20._Izbrannyie_pisma_1900-1910.142866.txt ../data/classic/valid

### filter txts

In [None]:
txts = get_files('./tmp/txt', '.txt')
print(f'Found {len(txts)} txt(s)')

In [None]:
# test: collapse runs of 3+ identical characters to exactly 3, on the first file
fn = txts[0]
# Raw string: in a normal literal '\s' is an invalid escape sequence.
match = re.compile(r'(.|\s)\1\1+')
with open(fn, 'r') as f:
    lines = f.read()
# Prefix with known repeated characters so the effect is visible in the output.
lines = 'asdf aaaa dddddfffff' + lines
lines = match.sub(r'\1'*3, lines)
lines[:1000]

In [None]:
# test
detect(lines)

In [None]:
# test
# Break over-long lines: wherever '. ' has at least 150 non-newline characters
# on each side, replace it with '.\n'. One substitution is made per iteration
# and the loop repeats until the pattern no longer matches.
expr = re.compile('([^\n]{150,})([.] )([^\n]{150,})')
while expr.search(lines):
    lines = expr.sub(r'\g<1>.\n\g<3>', lines, 1)
lines[:1000]

In [None]:
# Copy into `data` only the texts that are longer than 10,000 characters and
# that langdetect classifies as Russian.
# this will take time, bcs langdetect fails on multithreading
for fn in progress_bar(txts):
    # Open the path as listed: prefixing './' would turn an absolute path
    # into a wrong relative one.
    with open(fn, 'r', encoding='utf-8') as f:
        lines = f.read()
    try:
        keep = len(lines) > 1e+4 and detect(lines) == 'ru'
    except LangDetectException:
        # langdetect could not classify this text; leave the file out
        keep = False
    if keep:
        with open(f'{data}/{fn.name}', 'w', encoding='utf-8') as c:
            c.write(lines)

In [None]:
txts = get_files(data, '.txt')
print(txts)

In [None]:
txts += ['../data/poetry_base.txt']

In [None]:
txts += get_files('../data/classic', '.txt', recurse=True)

In [None]:
# Add space before each word. It's not really necessary.
# It just makes encoding a bit more meaningful to the model and the text smaller(after encoding).

# A non-space, non-word character directly followed by a word character.
_GLUED_WORD = re.compile(r'(?=[^ ])([\W])([\w])')
# The same character (newlines included) three or more times in a row.
_LONG_RUN = re.compile(r'(.|\s)\1\1+')

def process_fn(fn):
    """Normalize the text file `fn` in place.

    * prepends a space when the text does not already start with one,
    * inserts a space between a punctuation mark and a word glued to it,
    * shortens runs of three or more identical characters to exactly three.

    The file is read and written as UTF-8. Empty files stay empty.
    """
    with open(fn, 'r', encoding='utf-8') as f:
        lines = f.read()
    if lines and lines[0] != ' ': lines = ' ' + lines
    lines = _GLUED_WORD.sub(r'\g<1> \g<2>', lines)
    lines = _LONG_RUN.sub(r'\1\1\1', lines)
    with open(fn, 'w', encoding='utf-8') as c:
        c.write(lines)
        
thread_map(process_fn, txts)

In [None]:
# Split the poetry file by line: the first 95% of the lines go to
# poetry_dry.txt, the remaining lines to poetry_eval.txt.
with open('../data/poetry_base.txt', 'r') as src:
    lines = src.readlines()

split = int(len(lines)*0.95)

for target, part in (('../data/poetry_dry.txt', lines[:split]),
                     ('../data/poetry_eval.txt', lines[split:])):
    with open(target, 'w') as out:
        out.writelines(part)

### concat for vocab

Take smallest files to increase word diversity

In [None]:
txts = get_files(data, '.txt')
print(f'Amount of txt(s): {len(txts)}')

# Delete every file of at most 1000 bytes, then list what is left.
for fn in txts:
    if os.path.getsize(fn) > 1e+3:
        continue
    os.remove(fn)

txts = get_files(data, '.txt')
print(f'Amount of txt(s): {len(txts)}')

In [None]:
fsorted = get_files('../data/classic', '.txt') + sorted(txts, key=lambda fn: os.path.getsize(fn))  

In [None]:
# Concatenate the files in fsorted into one corpus file, one source file per
# line, stopping once more than 5e9 characters have been written.
sz = 0
newline_token = f' {NEW_LINE} '
with open('./tmp/russian_corpus_for_vocab.txt', 'w') as corpus:
    for fn in fsorted:
        with open(fn, 'r') as src:
            text = src.read()
        sz += corpus.write(text.replace('\n', newline_token) + '\n')
        if sz > 5e+9:
            break

### cache tokenization (optional)

In [None]:
# Put the parent of the current frame's directory at the front of sys.path,
# presumably so the project modules imported in the next cell can be found.
# NOTE(review): inside a notebook inspect.getfile() does not return a real
# file path, so `currentdir` likely resolves to the working directory -- confirm.
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

In [None]:
from run_lm_finetuning import TextDataset
from yt_encoder import YTEncoder

In [None]:
txts = get_files(data, '.txt'); len(txts)

In [None]:
def cache_fn(fn):
    """Run TextDataset.process_file on `fn` with a freshly loaded YTEncoder.

    The tokenizer is loaded from '../bpe/yt.model' on every call. The
    arguments 1024 and shuffle=True are passed straight to process_file
    (1024 is presumably the block size -- confirm against run_lm_finetuning).
    """
    tokenizer = YTEncoder.from_pretrained('../bpe/yt.model')
    TextDataset.process_file(fn, tokenizer, 1024, shuffle=True)

In [None]:
# Process all files in 32 worker processes. The context manager shuts the
# pool down after every result has been consumed, so no workers are left behind.
with Pool(32) as pool:
    for _ in progress_bar(pool.imap_unordered(cache_fn, txts), len(txts)):
        pass

# Prepare cached dataset for upload (for GCloud)

In [None]:
files = get_files('upload', '.txt', True); len(files)

In [None]:
files = [item for item in files if '/full' in str(item) and '/cached' not in str(item)]; len(files)

In [None]:
# WARNING: destructive. Opening a file in 'w' mode and writing nothing
# truncates it, so every path in `files` is left as an empty file.
for item in files:
    with open(item, 'w'):
        pass