In [1]:
from pyaspeller import YandexSpeller, Word
from spellchecker import SpellChecker
from datetime import datetime
import pandas as pd

# Spell-checker clients: Yandex Speller and a Russian-language SpellChecker.
# NOTE(review): neither object is referenced again in the visible cells of this
# notebook — confirm whether they are still needed.
speller = YandexSpeller()
russian = SpellChecker(language='ru')

In [2]:
# Test data: load data.csv into a DataFrame and display it.
data = pd.read_csv('data.csv')
data

Unnamed: 0,text,R
0,"Хочу в Абхазию, чтобы отель были видны горы и ...",4.1
1,"смотрим терклюр, в общем нам нужен хороший пля...",4.1
2,"Примерно с 6 сентября на на 7-9 ночей, море в ...",4.1
3,7 ночей. Бюджет 100 000. Пожелания по отелю: х...,4.1
4,"Хотим с дочкой в Турцию, на конец июля(26-28) ...",4.1
5,Здравствуйте. Мы бы хотели подобрать тур у вас...,4.1
6,Добрый день! Ищем пляжный отдых для матери с р...,4.1
7,"Хотим нормальную пятерку, первая линия, чтобы ...",4.1
8,"Здравствуйте, выбрали Грецию. 2 семьи(2 взросл...",4.1
9,"Привет, ищу качественный отель категории 3* не...",4.1


In [None]:
import importlib

# importlib.reload() needs the module object itself. The rest of the notebook
# only does `from nlp_functions import ...`, which does not bind the name
# `nlp_functions`, so import the module explicitly to avoid a NameError on a
# fresh kernel.
import nlp_functions

# Names previously pulled in with `from nlp_functions import ...` keep pointing
# at the old objects; re-run that import cell after reloading.
importlib.reload(nlp_functions)

In [4]:
from nlp_functions import nlp_preprocessing
from preprocessing_functions.tf_idf_preprocessing import tf_idf

In [4]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Препроцессинг текста

In [5]:
# Full configuration for nlp_preprocessing(), declared in one literal.
nlp_params = {
    # --- Core parameters ---
    'train': data,         # main data set
    'oos': None,           # out-of-sample set, if available (optional)
    'oot': None,           # out-of-time set, if available (optional)
    'text_field': 'text',  # name of the column holding the text

    # --- Simple preprocessing ---
    'need_del_dash': False,         # remove dashes
    'need_lower_case': True,        # convert to lower case
    'need_del_number': False,       # remove numbers
    'need_del_in_brackets': False,  # remove text inside brackets
    'need_del_eng': False,          # remove English letters

    # --- Typo detection (spellchecker) ---
    'need_spellchecker': True,  # run typo detection [True, False]
    # what to do with typos [delete, replace, nothing (default)];
    # 'replace' is recommended: it is several times faster
    'need_del_spell': 'replace',

    # --- Lemmatization ---
    'need_lemma': True,      # run lemmatization [True, False]
    'need_lru_cache': True,  # use a decorator to speed up lemmatization [True, False] (recommended)

    # --- Named-entity recognition (NER); should only be run after lemmatization ---
    'need_ner': True,                  # run entity search [True, False]
    'need_del_number_ner': 'nothing',  # action for numbers [delete, replace, nothing (default)]
    'need_del_name': 'replace',        # action for person names (same options)
    'need_del_org': 'replace',         # action for organization names (same options)
    'need_del_geo': 'nothing',         # action for locations (same options)
    'need_del_months': False,          # action for months (True - delete, False - nothing)

    # --- Stop-word settings ---
    'need_del_stopwords': True,  # act on stop words [True, False]
    'new_stopwords': [],         # additional stop-word list, e.g. ['word1', 'word2']
    # which stop-word lists are used
    # ['default list', 'default list + additional list', 'only additional list']
    'mode_stopwords': 'default list',
}

train, oos, oot = nlp_preprocessing(**nlp_params)

Trash chars done! - 0:00:00.009283



Lemmatization / Spellcheker / NER done! - 0:00:05.563338

Delete stop words done! - 0:00:00.013134
Preprocessing done!


In [6]:
# Value of the input `text` column (the column passed as `text_field`) for row 6.
train['text'][6]

'Добрый день! Ищем пляжный отдых для матери с ребенком (13), отель не ниже 4, чтобы был пляж и трансфер из аэропорта. Не особо дорого'

In [7]:
# Value of the `new_prep_text` column for row 6 (the column later fed to TF-IDF).
train['new_prep_text'][6]

'добрый день искать пляжный отдых мать ребенок 13 отель ниже 4 пляж трансфер аэропорт особо дорого'

## Векторизация текста с помощью TF-IDF

In [8]:
# TfidfVectorizer() parameters
params = {
    'ngram_range': (1, 2),  # n-gram sizes
}

# Arguments for tf_idf(), declared in one literal.
tf_idf_params = {
    # --- Core parameters ---
    'train': train,                 # main data set
    'oos': None,                    # out-of-sample set, if available (optional)
    'oot': None,                    # out-of-time set, if available (optional)
    'text_field': 'new_prep_text',  # name of the column holding the text
    'target_name': None,            # name of the target field

    # --- TF-IDF parameters ---
    'params': params,
}

train, oos, oot, feature_names = tf_idf(**tf_idf_params)

In [9]:
# Synthetic binary target: the first 16 rows get class 0 and the remaining
# 17 rows get class 1. That is 33 labels in total, so `train` must have exactly
# 33 rows or the column assignment raises ValueError.
# NOTE(review): the `R` column of `data` is not used as the target — these
# hand-made labels replace it; confirm this is intended.
d = [0] * 16 + [1] * 17
train['R'] = d
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,651,652,653,654,655,656,657,658,659,R
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.197124,0.197124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.176031,0.176031,0.0,0.0,0.0,0.0,0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170449,0.170449,0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Классификация

In [184]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Split features (every column except `R`) and target into a 60% training part
# and the complementary hold-out part, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    train.drop(columns='R'),
    train['R'],
    train_size=0.6,
    random_state=5,
)

# Multinomial naive Bayes on the TF-IDF features.
nb = MultinomialNB()
nb.fit(x_train, y_train)


MultinomialNB()

In [192]:
from joblib import Parallel, delayed


def chk(i, x_train=None, x_test=None, y_train=None, y_test=None):
    """Return the hold-out accuracy of MultinomialNB for one random 50/50 split.

    Parameters
    ----------
    i : int
        Seed passed to ``train_test_split`` as ``random_state``.
    x_train, x_test, y_train, y_test : ignored
        Accepted only so that existing five-argument calls keep working. The
        split is always rebuilt from the notebook-level ``train`` frame, so
        these values never influence the result.

    Returns
    -------
    float
        ``accuracy_score`` of the fitted model on the held-out half.
    """
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        train.drop('R', axis=1), train['R'], train_size=0.5, random_state=i
    )

    model = MultinomialNB()
    model.fit(features_fit, target_fit)
    return accuracy_score(target_eval, model.predict(features_eval))


# Only the seed is handed to each task; the previously passed split frames were
# discarded inside chk() and just added per-task transfer overhead.
# NOTE(review): this prints the best accuracy over 100000 random splits, i.e.
# the luckiest split — an optimistic figure, not an estimate of model quality.
print(max(Parallel(n_jobs=12)(delayed(chk)(i) for i in range(100000))))

0.9411764705882353


In [194]:
# Accuracy of the fitted naive Bayes model `nb` on the current hold-out split.
accuracy_score(y_true=y_test, y_pred=nb.predict(x_test))

0.7058823529411765

## Логистическая регрессия

In [193]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier with default settings, fitted on the same
# training split and scored on the same hold-out split.
lm = LogisticRegression().fit(x_train, y_train)
accuracy_score(y_true=y_test, y_pred=lm.predict(x_test))

0.7058823529411765

## Кластеризация

In [None]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics
from time import time

minibatch = False  # True -> MiniBatchKMeans, False -> KMeans
true_k = 15        # number of clusters to fit
random_seed = 5    # fixed seed so that reruns produce the same clusters

# Both estimators are randomly initialised; with n_init=1 and no seed every run
# could yield different clusters, so the seed is passed explicitly.
if minibatch:
    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=True,
                         random_state=random_seed)
else:
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True, random_state=random_seed)

print("Clustering data with %s" % km)
t0 = time()
# Cluster on the TF-IDF features only; the label column `R` is excluded.
X = train.drop('R', axis = 1)
km.fit(X)
print("done in %0.3fs" % (time() - t0))


In [None]:
print("Top terms per cluster:")
# For every centroid, feature indices ordered from largest to smallest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for cluster_id in range(true_k):
    # Names of the 13 highest-weighted features, each rendered as " _term_".
    top_terms = ''.join(
        ' _%s_' % feature_names[term_idx]
        for term_idx in order_centroids[cluster_id, :13]
    )
    print(("Cluster %d:" % cluster_id) + top_terms)

In [None]:
print("Original texts per cluster:\n")
# Cluster label predicted by the fitted model for every row of X.
clusters = km.predict(X)
for cluster_id in range(true_k):
    # Rows of `data` selected by position-aligned boolean mask.
    # NOTE(review): assumes `data` and `X` have the same row order — confirm.
    members = data.loc[clusters == cluster_id, 'text'].to_numpy()
    print("Cluster %d:" % cluster_id, end='')
    print(members)
    print()