In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from functools import reduce

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
import nltk
from nltk.tokenize import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer
import re

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
def remove_tags(line):
    char_regex = re.compile(r'<[a-z]>')
    line = char_regex.sub(' ', str(line).lower())
    return line

In [10]:
def spell_check(line):
    char_regex = re.compile(r'[^а-я]')
    line = char_regex.sub(' ', line.lower())
    
    chkr_ru = SpellChecker(d_ru)
    chkr_ru.set_text(line)
    mistakes_ru = set([err.word for err in chkr_ru])
    
    
    tokenized = toktok.tokenize(line)
    num_mistakes=0
    for i in tokenized:
        if i in mistakes_ru:
            num_mistakes+=1
            
    num_words = len(line.split(' '))
    
    mistakes_rate = num_mistakes / num_words
            
    return mistakes_rate

In [3]:
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';')

In [4]:
data = pd.read_csv('employment_preprocessed.csv')

In [5]:
data.shape

(923643, 8)

In [14]:
# data.id.isna().sum()

In [15]:
data = data[data['id'].notnull()]

In [18]:
data.tail(2)

Unnamed: 0,id,position,employer,achievements,responsibilities,start_date,finish_date,text
892714,99998.0,Специалист по микрофинансовым операциям,"АО ""Вайнемейнен""",,<ul><li>Оформление договоров займа</li><li>При...,2016-04-01,2020-03-01,оформлен договор займ при наличн формирован ар...
892715,99999.0,Секретарь,Департамент здравоохранения и социальной помощ...,,"<ol><li>Ведения делопроизводства, выполнения п...",2006-06-01,2014-12-01,веден делопроизводств выполнен поручен руковод...


In [22]:
data['text'].isna().sum()

10301

In [23]:
data['text'].fillna('',inplace=True)

### Стат. признаки текста

In [7]:
# уберем только html тэги
data['resp_no_tags'] = data['responsibilities'].apply(remove_tags)

In [9]:
from string import punctuation

# длина комментария
data["answer_len"] = data['resp_no_tags'].apply(len)

# кол-во слов с заглавной буквы
data['upper_case_word_count'] = data['resp_no_tags'].apply(lambda x: len([wrd for wrd in x.split() if wrd[0].isupper()]))

# кол-во знаков препинания
data['punctuation_count'] = data['resp_no_tags'].apply(lambda x: len("".join(_ for _ in x if _ in punctuation)))

In [14]:
data[['answer_len', 'upper_case_word_count','punctuation_count']]

Unnamed: 0,answer_len,upper_case_word_count,punctuation_count
0,99,0,5
1,100,0,6
2,99,0,5
3,70,0,4
4,147,0,29
...,...,...,...
923638,290,0,27
923639,119,0,11
923640,134,0,9
923641,292,0,13


In [15]:
# # Определим процент грамматических ошибок
# import re
# import pandas as pd
# import enchant
# from enchant.checker import SpellChecker
# from nltk.tokenize import ToktokTokenizer

# # инициализируем словари
# d_ru = enchant.DictWithPWL('ru')

# #инициализируем токенайзер
# toktok = ToktokTokenizer()

# data['resp_grammar_mistakes_per_cent'] = data['resp_no_tags'].apply(spell_check)

In [None]:
from functools import reduce

def aggregate_grammar(data):

    mean_df = data.groupby(['id'], as_index=False).agg(np.mean)
    mean_df.columns = [str(col) + '_mean' if col != 'cluster' else 'cluster' for col in mean_df.columns]
    
    max_df = data.groupby(['resp_grammar_mistakes_per_cent'], as_index=False).agg(np.max)
    max_df.columns = [str(col) + '_max' if col != 'cluster' else 'cluster' for col in max_df.columns]
    
    min_df = data.groupby(['resp_grammar_mistakes_per_cent'], as_index=False).agg(np.min)
    min_df.columns = [str(col) + '_min' if col != 'cluster' else 'cluster' for col in min_df.columns]
    
    data_frames = [mean_df, max_df, min_df]

    df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['cluster'], how='right'), data_frames)
    return df_merged

In [18]:
stat_text_feats = data[['id','answer_len', 'upper_case_word_count','punctuation_count']].groupby(['id'], as_index=False).agg(np.mean)

In [21]:
train.shape, test.shape

((306270, 25), (131259, 24))

In [24]:
train_text_stat = pd.merge(train, stat_text_feats,how='left', on='id')

In [25]:
test_text_stat = pd.merge(test, stat_text_feats,how='left', on='id')

In [30]:
train_text_stat.fillna(0, inplace=True)
test_text_stat.fillna(0, inplace=True)

In [32]:
train_text_stat[['id','answer_len', 'upper_case_word_count','punctuation_count']].to_csv('train_text_stat.csv',index=False)
test_text_stat[['id','answer_len', 'upper_case_word_count','punctuation_count']].to_csv('test_text_stat.csv',index=False)