In [79]:
import spacy
from tqdm import tqdm

from pathlib import Path
import re

In [2]:
# Show the notebook's working directory; relative paths such as Path("voc") resolve against it.
!pwd

/Users/asbabiy/Documents/study/fcm


In [3]:
# !python -m spacy download ru_core_news_lg

In [96]:
# Load the large Russian pipeline and keep only the components listed below
# active; select_pipes disables every other component of the pipeline.
nlp = spacy.load("ru_core_news_lg")
kept_components = ["tok2vec", "morphologizer", "lemmatizer"]
nlp.select_pipes(enable=kept_components)

['parser', 'attribute_ruler', 'ner']

In [100]:
def get_ud(token):
    """
    Render a token as ``lemma<POS,value1,value2,...>``.

    The part after the POS tag lists only the *values* of the token's
    morphological features (taken from ``token.morph.to_dict()``), joined
    with commas in the order the dict yields them. A token without any
    features therefore renders as ``lemma<POS,>``.
    """
    feature_values = token.morph.to_dict().values()
    joined_values = ",".join(feature_values)
    return "{}<{},{}>".format(token.lemma_, token.pos_, joined_values)

In [101]:
def process_text(text):
    """
    Convert raw text into a space-separated string of UD-style token tags.

    The text is normalised in two steps: words split by a hyphen at a line
    break (a hyphen immediately followed by a newline) are re-joined, then
    every run of whitespace is collapsed into a single space. The result is
    run through the module-level ``nlp`` pipeline and each kept token is
    rendered with ``get_ud``.

    Tokens whose ``morph`` is empty, or whose ``morph`` contains
    ``Foreign=Yes``, are dropped.

    :param text: raw document text
    :return: the rendered tokens joined by single spaces
    """

    # Re-join hyphenated line breaks BEFORE collapsing whitespace: once the
    # newline has been turned into a space, the hyphen+newline pattern can
    # never match and the split words would stay broken.
    text = text.replace("-\n", "")
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)

    doc = nlp(text)
    tokens = [
        get_ud(token)
        for token in doc
        if token.morph and "Foreign=Yes" not in token.morph
    ]

    return " ".join(tokens)

In [102]:
# Convert every "*_raw.txt" file under the corpus directory into a sibling
# "*_ud.txt" file holding the output of process_text.
corpus_path = Path("voc")

# Glob once and reuse the list: the reported length and the files actually
# processed cannot diverge, and the directory tree is walked a single time.
# Sorting makes the processing order reproducible between runs.
raw_paths = sorted(corpus_path.glob("**/*_raw.txt"))
corpus_len = len(raw_paths)

print("CORPUS LENGTH:", corpus_len)

for raw_path in tqdm(raw_paths, total=corpus_len, desc="Processing"):
    # Rewrite only the file-name suffix; a substitution over the whole path
    # would also alter any directory component that contains "_raw".
    ud_path = raw_path.with_name(raw_path.name[: -len("_raw.txt")] + "_ud.txt")

    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the corpus files are UTF-8 — confirm.
    with open(raw_path, encoding="utf-8") as raw_file:
        text = raw_file.read()

    processed_text = process_text(text)

    with open(ud_path, "w", encoding="utf-8") as ud_file:
        ud_file.write(processed_text)

CORPUS LENGTH: 3706


Processing: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3706/3706 [19:39<00:00,  3.14it/s]


## Interface preprocessing

In [242]:
import spacy
import json
import re
from itertools import chain
from tqdm.notebook import tqdm
import pandas as pd
from pathlib import Path
from spacy.tokens import Doc, DocBin
from spacy import Language
from spacy.matcher import Matcher, PhraseMatcher

In [10]:
# !python -m spacy download ru_core_news_sm

In [17]:
# Load the small Russian pipeline without the components listed below, then
# append a "sentencizer" component to the pipeline.
excluded_components = ["ner", "parser", "senter"]
nlp = spacy.load("ru_core_news_sm", exclude=excluded_components)
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x16b759e80>

In [1]:
# ---------------------------------------------------------------------

In [303]:
# Build a DocBin of the whole corpus: each "*_raw.txt" file is parsed with
# `nlp` and tagged with metadata read from its sibling "*_meta.json" file.
# store_user_data=True keeps each doc's user_data when the DocBin is serialised.
doc_bin = DocBin(store_user_data=True)

corpus_path = Path("voc")

# Glob once and reuse the list so the count and the processed files agree.
raw_paths = sorted(corpus_path.glob("**/*_raw.txt"))
corpus_len = len(raw_paths)

print("CORPUS LENGTH:", corpus_len)

# For a quick smoke test, slice the list first (e.g. raw_paths = raw_paths[:250])
# and recompute corpus_len.
for raw_path in tqdm(raw_paths, total=corpus_len, desc="Processing"):
    # Swap only the file-name suffix. The previous regex had an unescaped "."
    # and was applied to the whole path, so it could match inside directories.
    meta_path = raw_path.with_name(raw_path.name[: -len("_raw.txt")] + "_meta.json")

    # NOTE(review): assumes the corpus files are UTF-8 — confirm.
    with open(meta_path, encoding="utf-8") as meta_file:
        meta = json.load(meta_file)

    # "source" is the name of the directory holding the file; the remaining
    # fields fall back to "NOT FOUND" when missing from the metadata JSON.
    meta_data = {
        "source": meta_path.parent.name,
        "url": meta.get("url", "NOT FOUND"),
        "title": meta.get("title", "NOT FOUND"),
        "date": meta.get("date", "NOT FOUND"),
        "author": meta.get("author", "NOT FOUND"),
    }

    with open(raw_path, encoding="utf-8") as raw_file:
        text = raw_file.read()

    doc = nlp(text)
    doc.user_data = meta_data

    doc_bin.add(doc)

CORPUS LENGTH: 3706


Processing:   0%|          | 0/250 [00:00<?, ?it/s]

In [299]:
# Bare expression: the notebook displays the DocBin's repr as the cell output.
doc_bin

<spacy.tokens._serialize.DocBin at 0x17b1a8dc0>

In [51]:
# Show the working directory; the relative output paths "model" and "corpora/corpus.spacy" resolve against it.
!pwd

/Users/asbabiy/Documents/study/fcm


In [53]:
# Serialise the current `nlp` pipeline to the relative directory "model".
nlp.to_disk("model")

In [305]:
# DocBin.to_disk opens the target file directly and does not create missing
# parent directories, so make sure "corpora/" exists before writing.
corpus_file = Path("corpora/corpus.spacy")
corpus_file.parent.mkdir(parents=True, exist_ok=True)
doc_bin.to_disk(corpus_file)

In [None]:
# ---------------------------------------------------------------------