In [2]:
import pandas as pd
import spacy
from tqdm import tqdm
from typing import List
from functools import lru_cache
from collections import Counter
import re
from io import StringIO
import my_downloads
tqdm.pandas()

In [3]:
def drop_tail(text):
    """Cut agency-specific boilerplate off a rating press release.

    The issuing agency is recognised by a marker substring in ``text``.
    Markers are checked in a fixed order ('агентство НКР',
    'Национальное Рейтинговое Агентство', 'АКРА', 'Эксперт РА') and the first
    match decides which tail marker is used; everything from that tail marker
    onwards is discarded.  For 'Национальное Рейтинговое Агентство' the part
    before the '(далее – НРА, Агентство)' intro marker is dropped as well.

    Args:
        text: Raw press-release text.

    Returns:
        The trimmed text, or ``text`` unchanged when no agency marker matches.
    """
    if 'агентство НКР' in text:
        return text.split('Регуляторное раскрытие')[0]
    if 'Национальное Рейтинговое Агентство' in text:
        parts = text.split('(далее – НРА, Агентство)')
        # When the intro marker is absent there is no parts[1]; keep the whole
        # text instead of raising IndexError.
        if len(parts) > 1:
            text = parts[1]
        return text.split('ДОПОЛНИТЕЛЬНАЯ ИНФОРМАЦИЯ')[0]
    if 'АКРА' in text:
        return text.split('Регуляторное раскрытие')[0]
    if 'Эксперт РА' in text:
        return text.split('Контакты для СМИ')[0]
    return text

In [4]:
@lru_cache(100000)
def clear(text):
    """Normalise ``text`` to lower-case Cyrillic words separated by single spaces.

    Steps:
      1. Run ``my_downloads.nlp`` on the text (presumably a spaCy pipeline —
         confirm in ``my_downloads``) and drop every token whose entity type
         is ``ORG`` or ``LOC``.
      2. Lower-case the remaining tokens and join them with spaces.
      3. Replace 'ё' with 'е'.
      4. Remove every character that is not a Cyrillic letter or a space.
      5. Collapse runs of whitespace and strip the ends.

    Results are memoised for up to 100000 distinct inputs.

    Args:
        text: Text to clean.

    Returns:
        The cleaned string.
    """
    skipped_entities = ('ORG', 'LOC')
    doc = my_downloads.nlp(text)
    kept = [
        token.text.lower()
        for token in doc
        if token.ent_type_ not in skipped_entities
    ]
    # Tokens are already lower-cased, so only the lower-case 'ё' can occur.
    text = ' '.join(kept).replace('ё', 'е')
    # This also removes newlines, dots and commas, so no separate handling of
    # those characters is needed afterwards.
    text = re.sub(r'[^а-яА-Я ]', '', text)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [5]:
@lru_cache(100000)
def lemmatize(text):
    """Replace every token of ``text`` with its lemma.

    The text is processed with ``my_downloads.nlp`` and the ``lemma_`` of each
    resulting token is joined with single spaces.  Results are memoised for up
    to 100000 distinct inputs.

    Args:
        text: Text to lemmatise.

    Returns:
        Space-separated lemmas in token order.
    """
    return ' '.join(token.lemma_ for token in my_downloads.nlp(text))

In [6]:
@lru_cache(100000)
def tokenize_it(text: str) -> List[str]:
    """Split ``text`` into tokens, dropping stop words and punctuation.

    The text is processed with ``my_downloads.nlp``; a token is kept only when
    neither its ``is_stop`` nor its ``is_punct`` flag is set.

    Results are memoised for up to 100000 distinct inputs, so repeated calls
    with the same text return the *same* list object — treat it as read-only.

    Args:
        text: Text to tokenise.

    Returns:
        The surface forms of the kept tokens, in order.
    """
    doc = my_downloads.nlp(text)
    return [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct)
    ]

In [None]:
def data_preprocessing(df):
    """Add the cleaned, lemmatised and tokenised text columns to ``df``.

    Reads the ``pr_txt`` column and writes three columns into the same frame:

    * ``clear_text``    - ``pr_txt`` after ``drop_tail``, ``clear`` and
      ``lemmatize`` applied in that order;
    * ``tokenized``     - ``tokenize_it`` applied to ``clear_text``;
    * ``tokenized_str`` - the tokens joined back with single spaces.

    The per-row steps go through ``progress_apply`` so each one shows a
    progress bar.

    Args:
        df: DataFrame with a ``pr_txt`` column; it is modified in place.

    Returns:
        The same ``df`` object with the new columns.
    """
    prepared = df['pr_txt'].progress_apply(drop_tail)
    for step in (clear, lemmatize):
        prepared = prepared.progress_apply(step)
    df['clear_text'] = prepared

    df['tokenized'] = prepared.progress_apply(tokenize_it)
    df['tokenized_str'] = df['tokenized'].apply(' '.join)
    return df

In [7]:
def get_prection_cat(text):
    """Predict a label for ``text`` with a two-stage classifier.

    ``my_downloads.nlp_cat`` scores the text first; its best-scoring label
    ('1', '2' or '3') selects the second model (``nlp_a``, ``nlp_b`` or
    ``nlp_c`` respectively), whose best-scoring label is the result.

    Args:
        text: Pre-processed text to classify.

    Returns:
        The best-scoring label of the selected second-stage model, or ``None``
        when the first-stage label is not '1', '2' or '3'.
    """
    # First-stage label -> attribute name of the second-stage model.  The
    # attribute is looked up only for the label that was actually chosen.
    second_stage = {'1': 'nlp_a', '2': 'nlp_b', '3': 'nlp_c'}

    coarse_scores = my_downloads.nlp_cat(text).cats
    coarse_label = max(coarse_scores, key=coarse_scores.get)

    model_name = second_stage.get(coarse_label)
    if model_name is None:
        return None

    fine_scores = getattr(my_downloads, model_name)(text).cats
    return max(fine_scores, key=fine_scores.get)

In [None]:
def predict(df):
    """Pre-process ``df`` and predict a rating and a category for every row.

    The frame is run through ``data_preprocessing``; ``get_prection_cat`` is
    then applied to its ``tokenized_str`` column to obtain ``pred_rating``.
    ``category`` is ``pred_rating`` with every character other than ``A``-``Z``
    removed (e.g. 'BBB+' -> 'BBB').

    Args:
        df: DataFrame accepted by ``data_preprocessing``.

    Returns:
        A 2-D array with one row per input row and two columns:
        ``pred_rating`` and ``category``.
    """
    df = data_preprocessing(df)
    df['pred_rating'] = df['tokenized_str'].apply(get_prection_cat)
    # get_prection_cat returns None when the first-stage label is unknown;
    # re.sub would raise TypeError on that, so non-string ratings are passed
    # through unchanged instead of aborting the whole batch.
    df['category'] = df['pred_rating'].apply(
        lambda rating: re.sub('[^A-Z]', '', rating) if isinstance(rating, str) else rating
    )

    return df[['pred_rating', 'category']].values

In [None]:
def predict_text(text):
    """Run ``predict`` on a single text.

    Args:
        text: Press-release text; it becomes the only row of a one-column
            (``pr_txt``) DataFrame.

    Returns:
        Whatever ``predict`` returns for that one-row frame.
    """
    return predict(pd.DataFrame({'pr_txt': [text]}))

In [None]:
def predict_csv(csv):
    """Run ``predict`` on the rows of a comma-separated CSV document.

    The columns 'Категория' and 'Уровень рейтинга' are renamed to
    ``category`` and ``rating`` before the frame is handed to ``predict``.

    Args:
        csv: The CSV content itself as a string (not a file path).

    Returns:
        Whatever ``predict`` returns for the parsed frame.
    """
    column_aliases = {'Категория': 'category', 'Уровень рейтинга': 'rating'}
    frame = pd.read_csv(StringIO(csv), sep=",").rename(columns=column_aliases)
    return predict(frame)