#### Определения

In [None]:
!pip install gensim
!pip install compress-fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from gensim import models
import pandas as pd
import numpy as np
from collections import Counter, defaultdict

import re
import string # библиотека для работы со строками
import nltk   # Natural Language Toolkit

# загружаем библиотеку для лемматизации
import pymorphy2 # Морфологический анализатор

from sklearn.manifold import TSNE

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split

In [None]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(x, y):
    """Return the cosine similarity of the vectors ``x`` and ``y``.

    The value is the dot product of the two vectors divided by the product
    of their norms.
    """
    numerator = dot(x, y)
    denominator = norm(x) * norm(y)
    return numerator / denominator

# Sanity checks: identical, orthogonal and opposite unit vectors.
for other in ((1, 0, 0), (0, 1, 0), (-1, 0, 0)):
    print(cos_sim([1, 0, 0], other))

1.0
0.0
-1.0


In [None]:
# Mount Google Drive (Colab only) at /content/drive so the dataset stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load only the speaker name and the normalised line text from the script CSV.
data = pd.read_csv(
    "drive/MyDrive/Text-mining-with-Simpsons-Data-master/simpsons_script_lines.csv",
    usecols=["raw_character_text", "normalized_text"],
)

In [None]:
data.head()

Unnamed: 0,raw_character_text,normalized_text
0,Miss Hoover,no actually it was a little of both sometimes ...
1,Lisa Simpson,wheres mr bergstrom
2,Miss Hoover,i dont know although id sure like to talk to h...
3,Lisa Simpson,that life is worth living
4,Edna Krabappel-Flanders,the polls will be open from now until the end ...


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158271 entries, 0 to 158270
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   raw_character_text  140749 non-null  object
 1   normalized_text     132087 non-null  object
dtypes: object(2)
memory usage: 2.4+ MB


In [None]:
# Drop every row in which the speaker or the text is missing.
data = data.dropna(how="any", axis=0)

In [None]:
# Shorter column names for the rest of the notebook. The result is reassigned
# instead of using inplace=True, so the cell does not mutate an existing frame.
data = data.rename(columns={"raw_character_text": "person", "normalized_text": "text"})

In [None]:
data.head(1)

Unnamed: 0,person,text
0,Miss Hoover,no actually it was a little of both sometimes ...


In [None]:
data.shape

(132085, 2)

In [None]:
# Distinct speaker names as a plain list; preview the first ten.
persons = data["person"].unique().tolist()
persons[:10]

['Miss Hoover',
 'Lisa Simpson',
 'Edna Krabappel-Flanders',
 'Martin Prince',
 'Bart Simpson',
 'Landlady',
 'Nelson Muntz',
 'Terri/sherri',
 'Milhouse Van Houten',
 'Wendell Borton']

In [None]:
#data["person"] = data["person"].apply(lambda txt: persons.index(txt))
#data.rename(columns={"person": "personId"}, inplace=True)


**Предобработка данных.**

In [None]:
# Download the NLTK stop-word corpus and load the English stop-word list.
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words[:5]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we']

In [None]:
# Tokenizer used below to split each script line into tokens.
word_tokenizer = nltk.WordPunctTokenizer()

In [None]:
# Initialise the lemmatizer (morphological analyzer).
# NOTE(review): pymorphy2 is presumably aimed at Russian morphology — confirm
# that it actually lemmatizes the English text used in this notebook.
morph = pymorphy2.MorphAnalyzer()

In [None]:
# Tokenise, filter and lemmatise every line in a single pass over the column.
# A set gives constant-time stop-word lookups (stop_words itself is a list).
stop_word_set = set(stop_words)


def preprocess_text(txt):
    """Return the lower-cased, lemmatised alphabetic non-stop-word tokens of ``txt``."""
    cleaned = []
    for token in word_tokenizer.tokenize(txt):
        lowered = token.lower()
        # isalpha() already rejects punctuation tokens. Stop words are compared
        # after lower-casing so that capitalised forms are filtered as well.
        if token.isalpha() and lowered not in stop_word_set:
            # NOTE(review): morph is a pymorphy2 analyzer; the sample output
            # suggests English tokens pass through unchanged — confirm whether
            # an English lemmatizer was intended here.
            cleaned.append(morph.parse(lowered)[0].normal_form)
    return cleaned


data["text"] = data["text"].apply(preprocess_text)

In [None]:
data.head()

Unnamed: 0,person,text
0,Miss Hoover,"[actually, little, sometimes, disease, magazin..."
1,Lisa Simpson,"[wheres, mr, bergstrom]"
2,Miss Hoover,"[dont, know, although, id, sure, like, talk, d..."
3,Lisa Simpson,"[life, worth, living]"
4,Edna Krabappel-Flanders,"[polls, open, end, recess, case, decided, put,..."


#### Обучим модель word2vec

In [None]:
# Train a Word2Vec model with default hyper-parameters on the tokenised lines.
sentences = data["text"].values

model = models.Word2Vec()
model.build_vocab(sentences)
model.train(
    sentences,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(3190452, 3723840)

In [None]:
# Spot-check the similarity of a few word pairs. Only a word missing from the
# vocabulary (KeyError) is reported, and it no longer aborts the remaining
# pairs; any other error is left to surface instead of being swallowed.
word_pairs = [("good", "like"), ("good", "bad"), ("man", "woman"), ("point", "line")]

for first, second in word_pairs:
    try:
        print(cos_sim(model.wv[first], model.wv[second]))
    except KeyError as e:
        print(e)

0.472965
0.7801208
0.7263508
0.75965214


**Топ 30 слов.**

In [None]:
# First 30 vocabulary keys (presumably ordered by descending frequency — confirm).
model.wv.index_to_key[:30]

['im',
 'oh',
 'well',
 'dont',
 'like',
 'get',
 'youre',
 'one',
 'know',
 'thats',
 'hey',
 'homer',
 'right',
 'go',
 'got',
 'bart',
 'ill',
 'uh',
 'good',
 'see',
 'yeah',
 'think',
 'want',
 'cant',
 'look',
 'man',
 'marge',
 'gonna',
 'back',
 'little']

In [None]:
# Vectors of the first 1000 vocabulary keys, one row per word; used as t-SNE input below.
embedding = model.wv[model.wv.index_to_key[:1000]]
embedding

array([[-1.0733597 ,  0.38026398,  0.7549571 , ..., -0.9507026 ,
        -0.3013287 ,  0.6771909 ],
       [-0.43573132,  0.5829406 ,  0.07036531, ..., -0.5516442 ,
        -0.02049379,  0.24810845],
       [-0.5151944 , -0.2074301 , -0.19551836, ...,  0.19684526,
         0.5964999 ,  0.46189353],
       ...,
       [ 0.01749039, -0.04066623,  0.08196614, ..., -0.2777419 ,
         0.0899467 , -0.05879227],
       [-0.01466685,  0.0862652 , -0.11416172, ..., -0.15706713,
         0.14429362, -0.15392867],
       [-0.25587916,  0.29727313,  0.14887024, ..., -0.21414521,
        -0.00478979,  0.15554586]], dtype=float32)

In [None]:
# Project the word vectors to 2-D. With init='random' t-SNE is stochastic, so
# a fixed random_state keeps the layout reproducible between runs.
tsne = TSNE(n_components=2, learning_rate='auto', init='random', random_state=42)
embedding_new = tsne.fit_transform(embedding)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

# Take exactly as many labels as there are embedded points, so the names stay
# aligned with `embedding` instead of repeating its slice size here.
source = ColumnDataSource(data=dict(x1=embedding_new[:,0],
                                    x2=embedding_new[:,1],
                                    names=model.wv.index_to_key[:len(embedding)]))

p.scatter(x="x1", y="x2", size=8, source=source)

# Word labels drawn slightly above each point.
labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

**Самые близкие слова**

In [None]:
# Vector arithmetic "homer" - "marge" + "bart", then look up the vocabulary words closest to the result.
# NOTE(review): the query words themselves are not excluded from the results;
# consider model.wv.most_similar(positive=..., negative=...) — confirm intent.
vec = model.wv["homer"] - model.wv["marge"] + model.wv["bart"]
model.wv.similar_by_vector(vec)

[('bart', 0.8990170955657959),
 ('homer', 0.8600276708602905),
 ('lisa', 0.8358585238456726),
 ('eliza', 0.7657955884933472),
 ('mrs', 0.749053418636322),
 ('abe', 0.7397520542144775),
 ('grampa', 0.7330268025398254),
 ('abraham', 0.7203943133354187),
 ('bartholomew', 0.7185417413711548),
 ('j', 0.7182782292366028)]

In [None]:
# Vector arithmetic "bart" - "lisa" + "school", then look up the vocabulary words closest to the result.
# NOTE(review): the query words themselves are not excluded from the results — confirm intent.
vec = model.wv["bart"] - model.wv["lisa"] + model.wv["school"]
model.wv.similar_by_vector(vec)

[('school', 0.9336652755737305),
 ('together', 0.7543725371360779),
 ('lives', 0.7492193579673767),
 ('rest', 0.7427104711532593),
 ('high', 0.7379917502403259),
 ('away', 0.7366161346435547),
 ('tomorrow', 0.7279853820800781),
 ('town', 0.7231345176696777),
 ('game', 0.7192825078964233),
 ('sundays', 0.7146451473236084)]

In [None]:
# Vector arithmetic "marge" - "homer" + "home", then look up the vocabulary words closest to the result.
# NOTE(review): the query words themselves are not excluded from the results — confirm intent.
vec = model.wv["marge"] - model.wv["homer"] + model.wv["home"]
model.wv.similar_by_vector(vec)

[('home', 0.9009953737258911),
 ('bed', 0.8568058013916016),
 ('together', 0.7980523705482483),
 ('sleep', 0.7833420634269714),
 ('dinner', 0.7720690965652466),
 ('back', 0.7655618190765381),
 ('stay', 0.760530412197113),
 ('tomorrow', 0.7412276268005371),
 ('outta', 0.7350122928619385),
 ('college', 0.7165538668632507)]

**Классификатор bart/lisa**

**Средние вектора с весами tf-idf**

In [None]:
class TfidfEmbeddingVectorizer(object):
    """Embed tokenised documents as IDF-weighted means of their word vectors.

    Parameters
    ----------
    word2vec : mapping
        Maps a word to its vector. All vectors are assumed to have the same
        length; that length is read from one entry. The mapping is stored by
        reference and is never modified.

    Raises
    ------
    KeyError
        If ``word2vec`` is empty, so no vector dimension can be determined.
    """

    def __init__(self, word2vec):
        if not word2vec:
            raise KeyError("word2vec mapping is empty; cannot determine the vector dimension")
        self.word2vec = word2vec
        self.word2weight = None
        # Peek at one vector without mutating the mapping. The previous
        # popitem() call removed that word from the caller's dictionary.
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Learn one IDF weight per token from the tokenised documents ``X``.

        ``y`` is ignored. Returns ``self`` so calls can be chained.
        """
        # The documents are already token lists, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # Tokens never seen during fit get the largest IDF observed in X.
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return an array with one weighted mean vector per document in ``X``.

        Tokens absent from ``word2vec`` are skipped. A document with no known
        token is represented by a zero vector of length ``self.dim``.
        """
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [None]:
# Keep only the lines spoken by Bart or Lisa for the binary classifier.
data_train = data[data["person"].isin(["Bart Simpson", "Lisa Simpson"])]
data_train.head()

Unnamed: 0,person,text
1,Lisa Simpson,"[wheres, mr, bergstrom]"
3,Lisa Simpson,"[life, worth, living]"
7,Bart Simpson,"[victory, party, slide]"
9,Lisa Simpson,"[mr, bergstrom, mr, bergstrom]"
11,Lisa Simpson,"[know, could, find]"


In [None]:
# Hold out 10% of the Bart/Lisa lines for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    data_train["text"].values,
    data_train["person"].values,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Map every vocabulary word to its trained vector.
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))

tfidfEmbVect = TfidfEmbeddingVectorizer(w2v)

# Fit the weights on the training documents only, then embed those documents.
embedding_train = tfidfEmbVect.fit(X_train).transform(X_train)

In [None]:
from catboost.utils import get_gpu_device_count

# Train on the first GPU when one is visible. Otherwise fall back to CatBoost's
# default (CPU) so the cell does not fail on a CPU-only runtime.
gpu_params = {"task_type": "GPU", "devices": "0"} if get_gpu_device_count() > 0 else {}

classifier = CatBoostClassifier(loss_function="Logloss", iterations=500, depth=3,
                                eval_metric="Accuracy", custom_metric="AUC",
                                random_seed=42, logging_level="Silent", use_best_model=False,
                                **gpu_params)

classifier.fit(embedding_train, y_train)

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x7f734ab15bd0>

In [None]:
# Labels indexed by predicted-probability column in the accuracy cell below.
# NOTE(review): this order must match the classifier's own class order
# (presumably classifier.classes_) — confirm. Also rebinds the earlier `persons` list.
persons = ["Bart Simpson", "Lisa Simpson"]

In [None]:
# Embed the held-out documents using the already fitted vectorizer (no re-fit).
embedding_test = tfidfEmbVect.transform(X_test)

In [None]:
predict_proba = classifier.predict_proba(embedding_test)

# Translate each row's most probable column into a label through the
# classifier's own class order, rather than a hand-written label list that is
# only correct if it happens to be in the same order.
class_labels = np.asarray(classifier.classes_, dtype=object)
predicted_labels = class_labels[np.argmax(predict_proba, axis=1)]

# Share of held-out lines whose predicted speaker matches the true one.
accuracy = float(np.mean(predicted_labels == np.asarray(y_test, dtype=object)))

print("accuracy = {}".format(accuracy))

accuracy = 0.6320437342304458
