In [1]:
# Mount Google Drive inside the Colab VM. The data directory used by this
# notebook (`base_dir`) is located under this mount point, so this cell has to
# run before any cell that touches the files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pathlib import Path


# Root folder on the mounted Drive holding the source PDFs and all derived files.
base_dir = '/content/drive/MyDrive/ОмГТУ/ВКР/Реализация/books'

# glob() yields entries in arbitrary (filesystem-dependent) order, while the
# derived files below are named by list position (0.txt, 1.txt, ...). Sorting
# keeps the position -> document pairing identical across kernel restarts;
# otherwise an already existing `<i>.txt` could silently belong to another PDF.
# NOTE: derived files produced earlier with a different ordering should be
# regenerated.
pdf_paths = sorted(Path(base_dir).glob('**/*.pdf'))
n_files = len(pdf_paths)

# i-th PDF -> <base_dir>/txt/<i>.txt (text extracted from the PDF).
txt_paths = [
    Path(base_dir, 'txt', f'{i}.txt') for i in range(n_files)
]
# i-th PDF -> <base_dir>/clean/<i>.txt (preprocessed version of the text).
clean_paths = [
    Path(base_dir, 'clean', f'{i}.txt') for i in range(n_files)
]

## Necessary Imports

In [None]:
!pip install -q spacy==3.2.1 pymorphy2 nltk pdfplumber
!python -m spacy download ru_core_news_lg

In [None]:
import os
import re
from string import punctuation
from pprint import pprint
from typing import List

import pdfplumber
import spacy
import nltk
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer


# Download the NLTK stop-word corpus only when it is not available locally:
# nltk.data.find() raises LookupError for a missing resource.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

## Extract PDF to TXT

In [None]:
def save_as_txt(pdf_path: Path, txt_path: Path) -> None:
    """Extract the text of a PDF into a UTF-8 text file.

    The call is a no-op when ``txt_path`` already exists, so the (slow)
    extraction can be resumed after an interruption.

    Args:
        pdf_path: PDF document to read.
        txt_path: Destination text file; its parent directory is created
            when missing.

    Raises:
        Whatever ``pdfplumber`` or the file system raises. In that case no
        file is left at ``txt_path``, so the document is retried on the next
        run instead of being skipped as "already done".
    """
    if txt_path.exists():
        return
    txt_path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a temporary sibling and rename it only after every page has
    # been extracted; a crash half-way must not leave a truncated result that
    # the existence check above would accept.
    partial_path = txt_path.with_name(txt_path.name + '.part')
    try:
        with pdfplumber.open(pdf_path) as pdf:
            with open(file=partial_path, mode='w', encoding='utf-8') as outfile:
                for page in pdf.pages:
                    # extract_text() may return None for a page without any
                    # text (e.g. a scanned image) - treat it as empty.
                    page_text = page.extract_text(x_tolerance=2) or ''
                    # Terminate every page with a newline so the last word of
                    # one page is not glued to the first word of the next one.
                    outfile.write(page_text + '\n')
        partial_path.replace(txt_path)
    finally:
        # Only still present when something above failed.
        if partial_path.exists():
            partial_path.unlink()

In [None]:
%%time
# Convert every PDF to its text file. save_as_txt() returns immediately when
# the target file already exists, so re-running this cell only processes the
# documents that are still missing.
for pdf_path, txt_path in zip(pdf_paths, txt_paths):
    save_as_txt(pdf_path, txt_path)

CPU times: user 30min 19s, sys: 38.3 s, total: 30min 57s
Wall time: 31min 52s


## Preprocess text

In [None]:
# spaCy pipeline used for tokenisation and lemmatisation.
RU_MODEL = spacy.load('ru_core_news_lg')
# Lemmas dropped from the output. Stored as a frozenset because membership is
# tested once per token and a list would be scanned linearly every time.
# 'cid' is presumably residue of "(cid:N)" glyph placeholders left by the PDF
# text extraction - TODO confirm.
STOPWORDS = frozenset(
    stopwords.words('russian') + stopwords.words('english') + ['это', 'cid']
)
# ASCII punctuation plus the typographic dashes/quotes seen in the books.
PUNC = punctuation + '–—“”´'

In [None]:
def preprocess_text(raw_text: str) -> List[str]:
    """Normalise a piece of text and return its filtered lemmas.

    Steps: drop soft hyphens, delete every character listed in ``PUNC``,
    delete digits, collapse whitespace runs to a single space, lemmatise with
    ``RU_MODEL`` and keep only lemmas that are longer than two characters and
    are neither stop words nor punctuation.

    Args:
        raw_text: Text to process.

    Returns:
        The remaining lemmas in their original order (possibly empty).
    """
    clean_text = raw_text.replace('\xad', '')
    # PUNC must be escaped before it is placed inside a character class:
    # unescaped, its `\]` sequence is parsed as an escaped bracket, so the
    # backslash itself was never removed and the class only closed by luck.
    clean_text = re.sub(f'[{re.escape(PUNC)}]', '', clean_text)
    clean_text = re.sub(r'\d+', '', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    lemmas = (token.lemma_ for token in RU_MODEL(clean_text))
    return [
        lemma for lemma in lemmas
        if len(lemma) > 2
        and lemma not in STOPWORDS
        and lemma not in PUNC
    ]

In [None]:
# Trial run of the cleaning procedure on a single document (index 1).
clean_paths[1].parent.mkdir(parents=True, exist_ok=True)
with open(txt_paths[1], encoding='utf-8') as reader:
    with open(clean_paths[1], mode='w', encoding='utf-8') as writer:
        to_preprocess = ''
        # Iterate lazily instead of loading the whole book with readlines().
        for line in reader:
            stripped = line.strip()
            # The hyphen test must look at the stripped line: the raw line
            # still ends with '\n', so `line.endswith('-')` never matched and
            # words split across lines were never re-joined.
            if stripped.endswith(('-', '\xad')):
                to_preprocess += stripped[:-1]
                continue
            to_preprocess += stripped
            tokens = preprocess_text(to_preprocess)
            # Always start a fresh buffer; keeping the text of a line that
            # produced no tokens would glue it onto the next line's first word.
            to_preprocess = ''
            if tokens:
                writer.write(' '.join(tokens) + ' ')
        # The file may end on a hyphenated line - do not lose that fragment.
        if to_preprocess:
            tokens = preprocess_text(to_preprocess)
            if tokens:
                writer.write(' '.join(tokens) + ' ')

In [None]:
%%time
def _clean_text_file(txt_path, clean_path):
    """Preprocess one extracted text file line by line into ``clean_path``.

    A line ending with a hyphen (or soft hyphen) is treated as a word broken
    across lines: the hyphen is removed and the next line is appended before
    the text is passed to ``preprocess_text``. The resulting tokens are
    written space-separated on a single line.
    """
    clean_path.parent.mkdir(parents=True, exist_ok=True)
    with open(txt_path, encoding='utf-8') as reader:
        with open(clean_path, mode='w', encoding='utf-8') as writer:
            to_preprocess = ''
            # Iterate lazily instead of loading the whole book with readlines().
            for line in reader:
                stripped = line.strip()
                # The hyphen test must look at the stripped line: the raw line
                # still ends with '\n', so `line.endswith('-')` never matched.
                if stripped.endswith(('-', '\xad')):
                    to_preprocess += stripped[:-1]
                    continue
                to_preprocess += stripped
                tokens = preprocess_text(to_preprocess)
                # Always start a fresh buffer; keeping the text of a line that
                # produced no tokens would glue it onto the next line.
                to_preprocess = ''
                if tokens:
                    writer.write(' '.join(tokens) + ' ')
            # The file may end on a hyphenated line - keep that fragment.
            if to_preprocess:
                tokens = preprocess_text(to_preprocess)
                if tokens:
                    writer.write(' '.join(tokens) + ' ')


for txt_path, clean_path in zip(txt_paths, clean_paths):
    _clean_text_file(txt_path, clean_path)

CPU times: user 1h 22min 37s, sys: 14.2 s, total: 1h 22min 51s
Wall time: 1h 22min 43s


## Vowpal Wabbit

In [5]:
# Export the corpus in Vowpal Wabbit format: one document per line, all tokens
# in a single `text` namespace.
vw = Path(base_dir, 'vowpal_wabbit', 'data.txt')
# open(..., 'w') does not create missing directories.
vw.parent.mkdir(parents=True, exist_ok=True)
with open(vw, mode='w', encoding='utf-8') as f:
    for clean_path in clean_paths:
        # Collapse all whitespace (including any stray newline) so a document
        # can never spill over into a second line / example.
        document = ' '.join(clean_path.read_text(encoding='utf-8').split())
        f.write(f"|text {document}\n")