In [709]:
import json
import os
from copy import deepcopy
import tqdm

import numpy as np
import pandas as pd

import pymorphy2
import re

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, TruncatedSVD

import matplotlib.pyplot as plt

import pickle

In [532]:
def parse_publication(path):
    with open(path, 'r') as f:
        data = eval(f.readlines()[0])
        data = {'raw_text': data['title'] + \
                            data['intro'] + \
                            blocks2txt(data['blocks']),
                   'genre': data['subsite']['name']}
    return data


def blocks2txt(blocks: list):
    txt_pieces = []
    for block in blocks:
        if 'text' not in block['data']:
            continue
        txt_pieces.append(block['data']['text'])
    return ' '.join(txt_pieces)


m = pymorphy2.MorphAnalyzer()
def preprocessing_rus(text):
    text = text.lower()
    text = re.sub('[^А-Яа-яA-Za-z]+', ' ', text).strip()
    text = word_tokenize(text)
    text = [m.parse(w)[0].normal_form for w in text if w not in stopwords.words('russian')]
    return ' '.join(text)

def label_changer(label):
    label_map = {'Офис':    'Карьера',
                 'Истории': 'Личный опыт',
                 'vc.ru':   'Офтоп',
                 'Промо':   'Маркетинг',
                 'Будущее': 'Техника',
                 'SEO':     'Финансы',
                 'Торговля': 'Финансы',
                 'Транспорт': 'Техника',
                 'Офлайн':  'Офтоп',
                 'Еда':     'Офтоп',
                 'Медиа':   'Офтоп',
                 'Соцсети': 'Сервисы',
                 'Дизайн': 'Офтоп'}
    
    if label in label_map:
        return label_map[label]
    return label

In [533]:
all_data = []
path_to_data = 'db/'
for fname in os.listdir(path_to_data):
    if '.txt' in fname:
        all_data.append(parse_publication(path_to_data+fname))

In [534]:
df = pd.DataFrame(all_data).drop_duplicates()
df = df[df['genre'] != 'DataGang']
df['genre'] = df['genre'].apply(label_changer)
for bad_gen in 'Трибуна', 'Приёмная', 'Вопросы':
    id_drop = df[df['genre'] == bad_gen].index
    df = df.drop(id_drop)

df['genre'].value_counts()

Маркетинг      262
Офтоп          181
Финансы        173
Сервисы        126
Техника         88
Карьера         80
Личный опыт     68
Право           49
Name: genre, dtype: int64

In [539]:
splitted = []
for i, data in df.iterrows():
    for j in textwrap.wrap(data['raw_text'], 3000):
        splitted.append([j, data['genre']])

df = pd.DataFrame(splitted)
df.columns = ['raw_text', 'label']

In [540]:
df['label'].value_counts()

Маркетинг      677
Офтоп          408
Финансы        408
Сервисы        250
Личный опыт    250
Карьера        206
Техника        177
Право           77
Name: label, dtype: int64

In [543]:
tqdm.tqdm.pandas()

df['raw_text'] = df['raw_text'].progress_apply(preprocessing_rus)

100%|██████████| 2453/2453 [05:36<00:00,  6.11it/s]


In [712]:
df.head()

Unnamed: 0,raw_text,label
0,создание система контроль сдача отчётность бух...,Финансы
1,вести руководитель отдел именно проходить весь...,Финансы
2,аналитика который основать дать чек лист собст...,Финансы
3,google запустить подписка приложение игра andr...,Сервисы
4,эффективный воронка продажа построить воронка ...,Маркетинг


In [711]:
tsvd_svc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('svd', TruncatedSVD(n_components=1000)),
                     ('svc', LinearSVC(C=0.15, class_weight='balanced'))])

In [715]:
tsvd_svc.fit(df['raw_text'], df['label'])

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('svd',
                 TruncatedSVD(algorithm='randomized', n_components=1000,
                              n_iter=5, random_state=None, tol=0.0)),
                ('sv

In [703]:
# cv_scores_baseline = cross_val_score(LinearSVC(C=0.2, class_weight='balanced'), X, y, cv=10, scoring='f1_micro')

In [707]:
cv_scores_baseline

array([0.5951417 , 0.53846154, 0.60323887, 0.62348178, 0.63967611,
       0.5951417 , 0.65853659, 0.64609053, 0.6473029 , 0.61410788])

In [716]:
pickle.dump(tsvd_svc, open('classification_pipeline.pkl', 'wb'))

In [719]:
with open('test.txt') as t:
    test = t.readlines()
    for sent in range(len(test)):
        test[sent] = test[sent].strip('\n')
    test = '.'.join(test)
    test = re.sub(r'\.+', ".", test)

In [722]:
tsvd_svc.predict([test])[0]

'Право'