In [None]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict

import re
import string # библиотека для работы со строками
import nltk   # Natural Language Toolkit

# загружаем библиотеку для лемматизации
import pymorphy2 # Морфологический анализатор

#from pymystem3 import Mystem
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import silhouette_samples
from sklearn.metrics import roc_auc_score, mean_squared_error, \
    accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
#from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

#import xgboost as xgb
#import lightgbm as lgb
from catboost import CatBoostClassifier

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
#import umap

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN

from sklearn.base import BaseEstimator
from scipy.spatial.distance import cdist

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) applied to every plot created below.
plt.rcParams["figure.figsize"] = [10, 10]

In [None]:
#!pip install --upgrade gensim

In [None]:
#!pip install umap-learn

In [None]:
#!pip install catboost

In [None]:
#!pip install pymorphy2

In [None]:
# Mount Google Drive in the Colab runtime; the corpus, test text and model
# files read further down live under "drive/MyDrive/...".
# NOTE(review): those later paths are relative ("drive/MyDrive/..."), so they
# resolve to this mount point only if the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Colour names of the personality "vectors". Each name is the stem of a text
# file under drive/MyDrive/vectors/, and its position in this tuple is used as
# the numeric class label (vectorId) of the sentences read from that file.
person_vectors = ("brown", "black", "red", "orange", "yellow", "green", "blue", "purple")

In [None]:
# Build the training corpus: one row per sentence of every vector description file.
# Rows are collected in a plain list and turned into a DataFrame once, because
# DataFrame.append was removed in pandas 2.0 and, where it still exists, copies
# the whole frame on every call (quadratic cost inside a loop).
records = []

# enumerate() gives the class label directly instead of searching the tuple
# with person_vectors.index() for every sentence.
for vector_id, person_vector in enumerate(person_vectors):
    with open("drive/MyDrive/vectors/" + person_vector + ".txt", encoding="utf8") as rf:
        texts = rf.read()

    # Treat '?' and '!' as sentence terminators and remove line breaks, so the
    # text can be split into sentences on '.' alone.
    # NOTE(review): removing '\n' without inserting a space glues together words
    # that are separated only by a line break — confirm the input files never
    # wrap a sentence across lines.
    texts = texts.replace('?', '.').replace('!', '.').replace('\n', '')

    for txt in texts.split("."):
        if len(txt) <= 1:
            continue  # skip empty / one-character fragments produced by the split
        records.append({"description": txt.strip(), "vectorId": vector_id})

df_vectors = pd.DataFrame(records, columns=["description", "vectorId"])

In [None]:
df_vectors.shape

In [None]:
df_vectors.head(10)

In [None]:
# Download the NLTK stop-word corpus and load the list of Russian stop words.
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('russian')

In [None]:
# Tokenizer used below to split every description into tokens.
word_tokenizer = nltk.WordPunctTokenizer()

In [None]:
# Initialise the lemmatizer (pymorphy2 morphological analyzer).
morph = pymorphy2.MorphAnalyzer()

**Предобработка данных.**

In [None]:
# Matches runs of Russian letters. 'Ё'/'ё' are listed explicitly because they
# lie outside the contiguous А-я code-point range; without them a word such as
# "ещё" is cut to "ещ". Compiled once instead of on every call.
_RUSSIAN_WORD_RE = re.compile("[А-Яа-яЁё]+")


def words_only(text):
    """Keep only the Russian words of ``text``.

    Parameters
    ----------
    text : str
        Raw text. Anything that is not a run of Russian letters (digits,
        punctuation, Latin letters, whitespace) acts as a separator.

    Returns
    -------
    str
        The Russian words of ``text`` joined by single spaces, or an empty
        string when ``text`` is not a string (e.g. ``None`` or ``NaN`` coming
        from a missing DataFrame cell).
    """
    try:
        return " ".join(_RUSSIAN_WORD_RE.findall(text))
    except TypeError:
        # re raises TypeError for non-string input; treat it as "no words"
        # rather than hiding every possible error behind a bare except.
        return ""

In [None]:
#df_vectors["description"] = df_vectors["description"].apply(lambda txt: words_only(txt))

# Stop words as a set: constant-time membership tests instead of scanning a list.
stop_word_set = set(stop_words)

# Tokenize -> lowercase -> drop non-alphabetic tokens and stop words -> lemmatize.
# Tokens are lowercased BEFORE the stop-word test: the stop-word list is
# lowercase, so a capitalised stop word at the start of a sentence ("И", "Он")
# would otherwise pass the filter. str.isalpha() already rejects punctuation,
# so no separate punctuation check is needed.
# NOTE: this replaces the raw strings in the "description" column with token
# lists, so the cell must not be re-run without re-creating df_vectors first.
df_vectors["description"] = df_vectors["description"].apply(
    lambda text: [
        morph.parse(word)[0].normal_form
        for word in (token.lower() for token in word_tokenizer.tokenize(text))
        if word.isalpha() and word not in stop_word_set
    ]
)

#mystem = Mystem()

#df_vectors["description"] = df_vectors["description"].apply(lambda txt: ' '.join([mystem.lemmatize(word) for word in txt]))

In [None]:
df_vectors.head()

**========================= WORD2VEC =============================**

In [None]:
#model = Word2Vec(df_vectors["description"].values, vector_size=300, window=5, min_count=5, workers=4)

# Load pre-trained word vectors stored in the binary word2vec format.
model = KeyedVectors.load_word2vec_format('drive/MyDrive/186/model.bin', binary=True) # tayga-func_upos_skipgram_300_5_2019

**Средний вектор с весами tf-idf**

In [None]:
class TfidfEmbeddingVectorizer(object):
    """Embed tokenized documents as the mean of their IDF-weighted word vectors.

    Each document is a sequence of tokens. Tokens that have no vector in
    ``word2vec`` are ignored; a document without any known token is mapped to
    a zero vector of length ``dim``.
    """

    def __init__(self, word2vec):
        """Store the embedding table and read its dimensionality.

        Parameters
        ----------
        word2vec : dict
            Mapping ``word -> 1-D vector``. It is only read, never modified.

        Raises
        ------
        ValueError
            If ``word2vec`` is empty, because the vector size cannot be inferred.
        """
        if not word2vec:
            raise ValueError("word2vec must contain at least one word vector")
        self.word2vec = word2vec
        self.word2weight = None
        # Take the size from the mapping that was passed in (not from a global)
        # and without popitem(), which would delete a word from the caller's dict.
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Learn an IDF weight for every token that occurs in ``X``.

        Parameters
        ----------
        X : iterable of token sequences
            The documents, already tokenized.
        y : ignored
            Accepted only for scikit-learn style call compatibility.

        Returns
        -------
        TfidfEmbeddingVectorizer
            ``self``, so calls can be chained.
        """
        # Documents are already token lists, so the analyzer passes them through.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # A token never seen during fit gets the largest IDF observed, i.e. it
        # is weighted like the rarest known token.
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            {word: tfidf.idf_[column] for word, column in tfidf.vocabulary_.items()},
        )

        return self

    def transform(self, X):
        """Return one embedding per document of ``X``.

        Parameters
        ----------
        X : iterable of token sequences

        Returns
        -------
        numpy.ndarray
            One row per document: the mean of ``vector * idf_weight`` over the
            document's tokens present in ``word2vec``, or zeros if none are.
        """
        rows = []
        for tokens in X:
            weighted = [
                self.word2vec[token] * self.word2weight[token]
                for token in tokens
                if token in self.word2vec
            ]
            rows.append(np.mean(weighted, axis=0) if weighted else np.zeros(self.dim))
        return np.array(rows)

In [None]:
# Bare words of the model vocabulary, in the same order as model.index_to_key:
# everything before the first underscore of each key is kept.
# str.partition returns the whole key when it contains no underscore, whereas
# str.index would raise ValueError and abort the cell on the first such key.
words = [key.partition('_')[0] for key in model.index_to_key]

In [None]:
#w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))
# Lookup table: bare word -> embedding vector.
# When several model keys reduce to the same bare word, the vector of the
# last such key is the one that remains in the table.
w2v = {word: vector for word, vector in zip(words, model.vectors)}

In [None]:
# Token lists of the training sentences
train_tokens = df_vectors["description"].values

# Learn the IDF weights on the training sentences, then embed those sentences
tfidfEmbVect = TfidfEmbeddingVectorizer(w2v)
tfidfEmbVect.fit(train_tokens)
embedding_train = tfidfEmbVect.transform(train_tokens)

# Class labels as a separate int8 array (astype already returns a new array)
y_train = df_vectors["vectorId"].values.astype(np.int8)

Находим аномальные значения.

In [None]:
# Density-based clustering of the training embeddings
dbscan_params = {"eps": 0.5, "min_samples": 2, "metric": "euclidean"}
dbs = DBSCAN(**dbscan_params)

# Cluster label per training sentence
y_dbs = dbs.fit_predict(embedding_train)

In [None]:
# Positions of the samples that DBSCAN labelled -1.
# NOTE(review): the rows selected here are the ones KEPT further down, while the
# rows with a label >= 0 are the ones treated as anomalous. DBSCAN uses -1 for
# points it could not assign to any cluster, so this is the reverse of the usual
# reading — presumably intentional (clustered points being duplicates or
# all-zero embeddings); confirm.
idx = np.where(y_dbs == -1)
# Per-class counts of the samples that received a cluster label (>= 0).
np.unique(y_train[np.where(y_dbs >= 0)], return_counts=True)

В описании желтого вектора большее количество аномальных слов.

In [None]:
# Shape of the rows selected by `idx` versus the shape of the full matrix.
print(embedding_train[idx].shape)
embedding_train.shape

In [None]:
# Discard the anomalous samples: keep only the rows selected by `idx`.
# NOTE: embedding_train and y_train are overwritten in place, so re-running this
# cell without re-running the cells above applies the old positions to the
# already filtered arrays.
embedding_train = embedding_train[idx]
y_train = y_train[idx]

In [None]:
#np.save("OTUS/project/embedding.npy", embedding_train)
#np.save("OTUS/project/vectorIds.npy", y_train)

In [None]:
# Project the training embeddings to 2-D for plotting.
# t-SNE with init='random' is stochastic; a fixed random_state (the same seed
# the classifier below uses) makes the layout reproducible between runs.
tsne = TSNE(n_components=2, learning_rate='auto', init='random', random_state=42)
embedding_tsne = tsne.fit_transform(embedding_train)

In [None]:
fig, ax = plt.subplots()
points = ax.scatter(embedding_tsne[:, 0], embedding_tsne[:, 1], c=y_train, cmap='viridis')

# One legend entry per class. Passing the whole label array as `label=` creates
# a single legend entry showing the array's repr, which says nothing about the
# colours. legend_elements(num=None) returns one handle per distinct value of
# `c` in ascending order, i.e. the same order as np.unique(y_train).
handles, _ = points.legend_elements(num=None)
ax.legend(handles, [person_vectors[label] for label in np.unique(y_train)], title="vector")
plt.show()

Все векторы располагаются в одном кластере.

In [None]:
# Positions of the samples that belong to the first two vectors (labels 0 and 1)
idxs = np.where((y_train == 0) | (y_train == 1))[0]

fig, ax = plt.subplots()
points = ax.scatter(embedding_tsne[idxs, 0], embedding_tsne[idxs, 1], c=y_train[idxs], cmap='viridis')

# One legend entry per plotted class. `label=[0, 1]` produced a single entry
# reading "[0, 1]". legend_elements(num=None) returns one handle per distinct
# value of `c` in ascending order, matching np.unique().
handles, _ = points.legend_elements(num=None)
ax.legend(handles, [person_vectors[label] for label in np.unique(y_train[idxs])], title="vector")
plt.show()

Видно, что два вектора разделимы.

In [None]:
#classifier = RandomForestClassifier(n_estimators=200)
# Settings of the multi-class gradient-boosting classifier
catboost_params = {
    "loss_function": "MultiClass",
    "eval_metric": "AUC",
    "custom_metric": "F1",
    "random_seed": 42,
    "logging_level": "Silent",
    "use_best_model": False,
}
classifier = CatBoostClassifier(**catboost_params)

# Train on the filtered training embeddings and their class labels
classifier.fit(embedding_train, y_train)

In [None]:
classifier.best_score_

In [None]:
# Read the test text and split it into sentences, one row per sentence.
with open("drive/MyDrive/test/darvin.txt", encoding="utf8") as rf:
    texts = rf.read()

# Treat '?' and '!' as sentence terminators and remove line breaks, so the text
# can be split into sentences on '.' alone.
texts = texts.replace('?', '.').replace('!', '.').replace('\n', '')

# Build the frame in one step: DataFrame.append was removed in pandas 2.0 and,
# where it still exists, copies the whole frame on every call.
data = pd.DataFrame(
    [{"description": txt.strip()} for txt in texts.split(".") if len(txt) > 1],
    columns=["description"],
)

In [None]:
# Stop words as a set: constant-time membership tests instead of scanning a list.
stop_word_set = set(stop_words)

# Tokenize -> lowercase -> drop non-alphabetic tokens and stop words -> lemmatize.
# Tokens are lowercased BEFORE the stop-word test: the stop-word list is
# lowercase, so a capitalised stop word at the start of a sentence ("И", "Он")
# would otherwise pass the filter. str.isalpha() already rejects punctuation,
# so no separate punctuation check is needed.
# NOTE: this replaces the raw strings in the "description" column with token
# lists, so the cell must not be re-run without re-creating `data` first.
data["description"] = data["description"].apply(
    lambda text: [
        morph.parse(word)[0].normal_form
        for word in (token.lower() for token in word_tokenizer.tokenize(text))
        if word.isalpha() and word not in stop_word_set
    ]
)

In [None]:
data.head()

In [None]:
# Embed the test sentences with the vectorizer fitted on the training
# descriptions; only transform() is called, so the IDF weights are not re-fitted.
embedding_test = tfidfEmbVect.transform(data["description"].values)

In [None]:
#np.save("OTUS/project/embedding_test.npy", embedding_test)

In [None]:
# Test sentences get the extra label 8, one past the last vector label
test_labels = np.array([8] * embedding_test.shape[0])

# Training rows first, test rows after them
labels = np.concatenate((y_train, test_labels))
embedding = np.concatenate((embedding_train, embedding_test), axis=0)

In [None]:
# Joint 2-D projection of the training and test embeddings.
# t-SNE with init='random' is stochastic; a fixed random_state (the same seed
# the classifier uses) makes the layout reproducible between runs.
tsne = TSNE(n_components=2, learning_rate='auto', init='random', random_state=42)
embedding_tsne = tsne.fit_transform(embedding)

fig, ax = plt.subplots()
points = ax.scatter(embedding_tsne[:, 0], embedding_tsne[:, 1], c=labels, cmap='viridis')

# One legend entry per class. Passing the whole label array as `label=` creates
# a single legend entry showing the array's repr. legend_elements(num=None)
# returns one handle per distinct value of `c` in ascending order, matching
# np.unique(labels). Labels 0..7 are the vectors, 8 marks the test sentences.
legend_names = list(person_vectors) + ["test"]
handles, _ = points.legend_elements(num=None)
ax.legend(handles, [legend_names[label] for label in np.unique(labels)], title="vector")
plt.show()

In [None]:
# Discard the test samples that do not fall inside the bounding box of any
# vector's training cluster.

# Axis-aligned bounding box (per-dimension min and max) of every class.
# The boxes depend only on the training data, so they are computed once here
# instead of being recomputed for every test sample.
class_bounds = []
for vectorId in range(len(person_vectors)):
    class_points = embedding_train[y_train == vectorId]
    if class_points.shape[0] == 0:
        # No training sample of this class survived the filtering above;
        # min/max of an empty array would raise, and no point can lie in it.
        continue
    class_bounds.append((class_points.min(axis=0), class_points.max(axis=0)))

# Positions (rows of embedding_test) of the samples lying inside at least one box
idxs = [
    row
    for row, values in enumerate(embedding_test)
    if any(
        np.all(values >= arr_min) and np.all(values <= arr_max)
        for arr_min, arr_max in class_bounds
    )
]

print(embedding_test.shape[0], len(idxs))

Отбросили 117 значений как аномальные, не входящие в кластер векторов человека.

In [None]:
# Test rows that passed the bounding-box filter
embedding_test_kept = embedding_test[idxs]

# Retained test sentences get the extra label 8, one past the last vector label
kept_labels = np.array([8] * embedding_test_kept.shape[0])

# Training rows first, retained test rows after them
labels = np.concatenate((y_train, kept_labels))
embedding = np.concatenate((embedding_train, embedding_test_kept), axis=0)

In [None]:
# Joint 2-D projection of the training embeddings and the retained test embeddings.
# t-SNE with init='random' is stochastic; a fixed random_state (the same seed
# the classifier uses) makes the layout reproducible between runs.
tsne = TSNE(n_components=2, learning_rate='auto', init='random', random_state=42)
embedding_tsne = tsne.fit_transform(embedding)

fig, ax = plt.subplots()
points = ax.scatter(embedding_tsne[:, 0], embedding_tsne[:, 1], c=labels, cmap='viridis')

# One legend entry per class. Passing the whole label array as `label=` creates
# a single legend entry showing the array's repr. legend_elements(num=None)
# returns one handle per distinct value of `c` in ascending order, matching
# np.unique(labels). Labels 0..7 are the vectors, 8 marks the test sentences.
legend_names = list(person_vectors) + ["test"]
handles, _ = points.legend_elements(num=None)
ax.legend(handles, [legend_names[label] for label in np.unique(labels)], title="vector")
plt.show()

Желтым цветом обозначены тестовые данные, видно, что они в кластере векторов человека.

In [None]:
#np.save("OTUS/project/embedding_test.npy", embedding_test[idxs])

In [None]:
# Class probabilities for every retained test sentence (one row per sentence).
# NOTE(review): the cells below read column k as person_vectors[k]. That holds
# only if every label 0..7 is still present in y_train after the anomaly
# filtering — confirm against classifier.classes_.
predict_proba = classifier.predict_proba(embedding_test[idxs])
#predict_proba

In [None]:
# Average predicted probability of each vector over all retained test sentences
for column, vector_name in enumerate(person_vectors):
    print(np.mean(predict_proba[:, column]), "-", vector_name)

Коричневый вектор преобладает, о чём и говорил В. К. Толкачев.

- Оранжевый — дисциплина.
- Чёрный — любит физическую работу.

In [None]:
# Highest predicted probability of each vector over all retained test sentences
for column, vector_name in enumerate(person_vectors):
    print(np.max(predict_proba[:, column]), "-", vector_name)

In [None]:
# Which qualities would a person have if three vectors dominate
# (brown, black, red)?
# Whether this holds in reality is still an open question.
trait_words = (
    "аккуратность",     # brown vector
    "скромность",       # black vector
    "решительность",    # red vector
)

# Map each bare word back to its full model key through its position in `words`
trait_keys = [model.index_to_key[words.index(trait)] for trait in trait_words]

# Sum of the three word vectors
vec = model[trait_keys[0]] + model[trait_keys[1]] + model[trait_keys[2]]

model.similar_by_vector(vec)

In [None]:
# Which qualities would a person have if three vectors dominate
# (brown, black, red) and the red one is not accepted (neurosis)?
trait_words = (
    "аккуратность",     # brown vector
    "скромность",       # black vector
    "трусливость",      # red vector
)

# Map each bare word back to its full model key through its position in `words`
trait_keys = [model.index_to_key[words.index(trait)] for trait in trait_words]

# Sum of the three word vectors
vec = model[trait_keys[0]] + model[trait_keys[1]] + model[trait_keys[2]]

model.similar_by_vector(vec)