In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from num2words import num2words
from functools import lru_cache

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
nltk.download('stopwords')

import spacy

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sergey_qt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
import warnings; warnings.simplefilter('ignore')

In [2]:
from tqdm import tqdm
tqdm.pandas()

In [3]:
random_state=12345

In [4]:
data = pd.read_excel('CRA_train_1200.xlsx')

In [5]:
data_nkr = data[data.pr_txt.str.contains('агентство НКР')]
data_nra = data[data.pr_txt.str.contains('Национальное Рейтинговое Агентство')]
data_akra = data[data.pr_txt.str.contains('АКРА')]
data_ra = data[data.pr_txt.str.contains('Эксперт РА')]

In [6]:
data_nkr['cuted_text'] = data_nkr.loc[:,'pr_txt'].apply(lambda x: x.split('Регуляторное раскрытие')[0])
data_nra['cuted_text'] = data_nra.loc[:,'pr_txt'].apply(lambda x: x.split('ДОПОЛНИТЕЛЬНАЯ ИНФОРМАЦИЯ')[0])
data_nra['cuted_text'] = data_nra.loc[:,'cuted_text'].apply(lambda x: x.split('(далее – НРА, Агентство)')[1])
data_ra['cuted_text'] = data_ra.loc[:,'pr_txt'].apply(lambda x: x.split('Контакты для СМИ')[0])
data_akra['cuted_text'] = data_akra.loc[:,'pr_txt'].apply(lambda x: x.split('Регуляторное раскрытие')[0])
data = pd.concat([data_nkr, data_nra, data_akra, data_ra])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1200 entries, 2 to 1199
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                1200 non-null   int64 
 1   pr_txt            1200 non-null   object
 2   Категория         1200 non-null   object
 3   Уровень рейтинга  1200 non-null   object
 4   cuted_text        1200 non-null   object
dtypes: int64(1), object(4)
memory usage: 56.2+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_nkr['cuted_text'] = data_nkr.loc[:,'pr_txt'].apply(lambda x: x.split('Регуляторное раскрытие')[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_nra['cuted_text'] = data_nra.loc[:,'pr_txt'].apply(lambda x: x.split('ДОПОЛНИТЕЛЬНАЯ ИНФОРМАЦИЯ')[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

In [7]:
nlp = spacy.load("ru_core_news_lg")

In [8]:
def clean_text(input_text):    
    
    # HTML-теги: первый шаг - удалить из входного текста все HTML-теги
    clean_text = re.sub('<[^<]+?>', '', input_text)
    
    # URL и ссылки: далее - удаляем из текста все URL и ссылки
    clean_text = re.sub(r'http\S+', '', clean_text)
    
    # Приводим все входные данные к нижнему регистру
    clean_text = clean_text.lower()

    # Убираем все лишние пробелы
    # Так как все данные теперь представлены словами - удалим пробелы
    clean_text = re.sub('\s+', ' ', clean_text)
    
    # Убираем специальные символы: избавляемся от всего, что не является "словами"
    clean_text = re.sub('[^а-яА-Яa-zA-z0-9\s\.]', '', clean_text)

    # Записываем числа прописью: 100 превращается в "сто" (для компьютера)
    words = []
    for word in clean_text.split():
        if word.isdigit():
            words.append(num2words(word, lang='ru'))
        else:
            words.append(word)
    clean_text = ' '.join(words)

    # Стоп-слова: удаление стоп-слов - это стандартная практика очистки текстов
    stop_words = set(stopwords.words('russian'))
    tokens = word_tokenize(clean_text)
    tokens = [token for token in tokens if token not in stop_words]
    clean_text = ' '.join(tokens)

    # Знаки препинания: далее - удаляем из текста все знаки препинания кроме точек
    clean_text = re.sub(r'[^\w\s.!?]', '', clean_text)

    # И наконец - возвращаем очищенный текст
    return clean_text

In [9]:
data['cuted_text'] = data['cuted_text'].apply(clean_text)

In [10]:
@lru_cache(100000)
def lemmatize(text):
    result = []
    doc = nlp(text)
    for token in doc:
        result.append(token.lemma_)
    return ' '.join(result)

In [12]:
data['lemm_spacy'] = data['cuted_text'].progress_apply(lemmatize)

100%|██████████████████████████████████████████████████████████████████████████████| 1200/1200 [06:49<00:00,  2.93it/s]


In [13]:
X = data['lemm_spacy']
y = data['Уровень рейтинга']
#y = data['Категория']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [14]:
logreg = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5,class_weight='balanced',solver='newton-cg')),
               ])
logreg.fit(X_train, y_train);

In [20]:
y_test = pd.DataFrame(y_test)
y_test.columns = ['rating']
y_test['category'] = y_test['rating'].apply(lambda x: re.sub('[^A-Z]', '', x))
y_test.head()

Unnamed: 0,rating,category
1169,A,A
668,BB-,BB
451,BBB+,BBB
874,A,A
606,BB+,BB


In [17]:
y_pred = pd.DataFrame(logreg.predict(X_test))
y_pred.columns = ['rating']
y_pred['category'] = y_pred['rating'].apply(lambda x: re.sub('[^A-Z]', '', x))
y_pred.head()

Unnamed: 0,rating,category
0,A,A
1,BB,BB
2,B-,B
3,A,A
4,BB-,BB


In [24]:
print('accuracy %s' % accuracy_score(y_pred['rating'].values, y_test['rating'].values))
print(classification_report(y_test['rating'].values, y_pred['rating'].values))

accuracy 0.7333333333333333
              precision    recall  f1-score   support

           A       0.67      0.75      0.71        24
          A+       0.81      0.72      0.76        29
          A-       0.72      0.84      0.78        31
          AA       0.79      0.79      0.79        19
         AA+       0.72      0.76      0.74        17
         AA-       0.70      0.70      0.70        10
         AAA       0.93      0.93      0.93        30
           B       0.00      0.00      0.00         2
          B+       0.50      0.25      0.33         4
          B-       0.33      0.25      0.29         4
          BB       0.56      0.38      0.45        13
         BB+       0.36      0.71      0.48         7
         BB-       0.40      0.33      0.36         6
         BBB       1.00      0.88      0.93        16
        BBB+       0.72      0.68      0.70        19
        BBB-       0.83      0.71      0.77         7
           C       0.67      1.00      0.80         2

In [25]:
print('accuracy %s' % accuracy_score(y_pred['category'].values, y_test['category'].values))
print(classification_report(y_test['category'].values, y_pred['category'].values))

accuracy 0.8625
              precision    recall  f1-score   support

           A       0.85      0.90      0.88        84
          AA       0.85      0.87      0.86        46
         AAA       0.93      0.93      0.93        30
           B       0.80      0.40      0.53        10
          BB       0.82      0.88      0.85        26
         BBB       0.89      0.81      0.85        42
           C       0.67      1.00      0.80         2

    accuracy                           0.86       240
   macro avg       0.83      0.83      0.82       240
weighted avg       0.86      0.86      0.86       240



In [None]:
import joblib
joblib.dump(logreg, 'logreg.pkl')
model = joblib.load('logreg.pkl')