# Обучение модели Word2Vec на текстовых данных социальной сети VK, содержащих эмодзи

### Импорт необходимых библиотек

In [None]:
import pandas as pd
import nltk
from nltk.corpus import brown
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from gensim.models import Word2Vec as wv
from gensim.models import KeyedVectors
import pymorphy2
from pymorphy2 import MorphAnalyzer
import emoji
import zipfile
import numpy as np

In [None]:
# Fetch the NLTK stop-word corpus; stopwords.words('russian') is read from it later in this notebook.
nltk.download('stopwords')

In [None]:
# Fetch the 'punkt' tokenizer data (presumably what word_tokenize relies on — confirm for the installed NLTK version).
nltk.download('punkt')

In [None]:
# Fetch the default perceptron POS-tagger model.
# NOTE(review): only lang='rus' tagging is visible in this notebook, so this download may be unnecessary — confirm.
nltk.download('averaged_perceptron_tagger')

In [None]:
# Fetch the Russian perceptron POS-tagger model used by nltk.pos_tag(..., lang='rus') below.
nltk.download('averaged_perceptron_tagger_ru')

In [None]:
# Fetch the tag mapping used by nltk.pos_tag(..., tagset='universal') below.
nltk.download('universal_tagset')

### Подготовка датасета со смайликами

##### Чтение датасета 

In [None]:
# Load the emoji annotation table; the cells below rely on its 'emoji' and 'text' columns.
emojis = pd.read_csv('marking.csv')

In [None]:
# Discard the metadata columns that the rest of the notebook does not use.
unused_columns = ['image_id', 'name', 'group', 'sub_group', 'train']
emojis = emojis.drop(columns=unused_columns)

In [None]:
# Notebook display: the table after the metadata columns were dropped.
emojis

##### Токенизация описания эмодзи

In [None]:
# Characters removed from the descriptions before tokenization: Latin letters,
# digits and punctuation.
# Raw string: "\]" and "\-" are regex escapes, not Python string escapes; in a
# plain literal they trigger an "invalid escape sequence" warning.
regular = r"[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
regular_re = re.compile(regular)  # compiled once, outside the loop

# One token list per description, tokenized with NLTK's Russian settings.
full_tokens = [
    word_tokenize(regular_re.sub('', text), language='russian')
    for text in emojis['text']
]
emojis['tokens'] = full_tokens

In [None]:
# The raw description is no longer needed once it has been tokenized.
emojis = emojis.drop(columns='text')

In [None]:
# Notebook display: the table with the 'tokens' column in place of 'text'.
emojis

##### Приведение к нормальной форме всех слов в описании

In [None]:
# Lemmatize every description token and drop Russian stop words.
morph = pymorphy2.MorphAnalyzer()
# Load the stop-word list once and keep it as a set: the original re-read it
# from the corpus for every single token and then scanned it linearly.
russian_stopwords = frozenset(stopwords.words('russian'))

filtered = []
for tokens in emojis['tokens']:
    # The first normal form returned by pymorphy2 is used as the lemma.
    lemmas = (morph.normal_forms(token)[0] for token in tokens)
    # The stop-word test is applied to the lemma, not to the surface form.
    filtered.append([lemma for lemma in lemmas if lemma not in russian_stopwords])

In [None]:
# Store the lemmatized, stop-word-free token lists as a new column.
emojis['filtered'] = filtered

In [None]:
# The unfiltered tokens are superseded by the 'filtered' column.
emojis = emojis.drop(columns='tokens')

In [None]:
# Notebook display: the table with lemmatized, stop-word-free tokens in 'filtered'.
emojis

##### Добавление смайлика к его описанию: каждая строка датасета становится единым предложением для обучения модели

In [None]:
# Append each row's emoji to that row's own token list, so the emoji and its
# description form a single training sentence.
# The original looked the row up by emoji value and always wrote to the first
# match: a repeated emoji was appended several times to one row and never to
# the others. Pairing the two columns row by row fixes that and removes the
# per-row table scan. The lists are still mutated in place.
for description_tokens, e in zip(emojis['filtered'], emojis['emoji']):
    description_tokens.append(e)

In [None]:
# The emoji now lives inside each 'filtered' list, so the separate column can go.
emojis = emojis.drop(columns='emoji')

In [None]:
# Notebook display: the table after the emoji were appended to the 'filtered' lists.
emojis

In [None]:
# POS-tag every token list with the Russian tagger, mapped to universal tags.
emojis['filtered'] = emojis['filtered'].apply(
    lambda tokens: nltk.pos_tag(tokens, tagset='universal', lang='rus')
)

In [None]:
def tag_join(tag_list):
    """Turn (token, tag) pairs into ``token_TAG`` strings.

    Tokens recognised by ``emoji.is_emoji`` get the fixed tag ``SYM``,
    whatever tag the pair carries; every other token keeps its own tag.

    Args:
        tag_list: iterable of pairs; item 0 is the token, item 1 its tag.

    Returns:
        A list with one ``token_TAG`` string per input pair, in input order.
    """
    return [
        pair[0] + '_SYM' if emoji.is_emoji(pair[0]) else pair[0] + '_' + pair[1]
        for pair in tag_list
    ]

In [None]:
# Join every (token, tag) pair of a row into a token_TAG string.
emojis['u_tags'] = emojis['filtered'].apply(tag_join)

In [None]:
# Notebook display: 'filtered' holds the tagged pairs, 'u_tags' the joined token_TAG strings.
emojis

### Подготовка датасета с комментариями ВК

##### Чтение датасета

In [None]:
# Column names for the comments CSV, which is read below without a header row.
colnames = ['index', 'comment']

In [None]:
# Read the comments file without treating any row as a header; columns are named from colnames.
comments = pd.read_csv('comments.csv', names=colnames, header=None)

In [None]:
# Notebook display: the raw comments table as read from the CSV.
comments

In [None]:
# Drop the rows labelled 0 and 1.
# NOTE(review): presumably header/service rows that were read as data because of header=None — confirm against comments.csv.
comments = comments.drop(labels = [0,1],axis = 0)

In [None]:
# Only the comment text is needed; discard the 'index' column read from the file.
comments = comments.drop(columns='index')

In [None]:
# Renumber the rows from 0 and discard the old index labels.
comments = comments.reset_index(drop=True)

##### Предобработка комментариев

In [None]:
# Characters replaced by a space before tokenization: Latin letters, digits and
# punctuation. Raw string, because "\]" and "\-" are regex escapes rather than
# Python string escapes (a plain literal triggers an invalid-escape warning).
patterns = r"[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
stopwords_ru = stopwords.words("russian")
morph = MorphAnalyzer()

# Private helpers derived from the public names above: a pre-compiled pattern
# and a set for constant-time stop-word lookups.
_PATTERNS_RE = re.compile(patterns)
_STOPWORDS_RU = frozenset(stopwords_ru)


def _isolate_emoji(text):
    """Return *text* with every emoji surrounded by spaces."""
    pieces = []
    cursor = 0
    # emoji_list() reports each emoji together with its start/end offsets, so
    # a multi-code-point emoji is padded as a whole instead of being split.
    for found in emoji.emoji_list(text):
        pieces.append(text[cursor:found['match_start']])
        pieces.append(' ' + found['emoji'] + ' ')
        cursor = found['match_end']
    pieces.append(text[cursor:])
    return ''.join(pieces)


def lemmatize(doc):
    """Clean, tokenize and lemmatize one comment.

    Latin letters, digits and punctuation are replaced by spaces, emoji are
    separated from adjacent text, and the result is split on whitespace.
    Emoji tokens are kept as they are; stop words are dropped; every other
    token is replaced by its first pymorphy2 normal form.

    Args:
        doc: the raw comment. It is passed through ``str()`` first, so
            non-string values are accepted.

    Returns:
        The list of resulting tokens, or ``None`` when fewer than three
        tokens remain.
    """
    doc = _PATTERNS_RE.sub(' ', str(doc))
    # The original loop rebuilt the string as doc[:i] + doc[i:], which changes
    # nothing, so emoji glued to a word or to each other never became tokens.
    doc = _isolate_emoji(doc)

    tokens = []
    for token in doc.split():
        if emoji.is_emoji(token):
            tokens.append(token)
        # Compare case-insensitively: stopwords_ru is lower-case, so a
        # capitalised stop word would otherwise slip through.
        elif token.lower() not in _STOPWORDS_RU:
            tokens.append(morph.normal_forms(token)[0])

    if len(tokens) > 2:
        return tokens
    return None

In [None]:
# Replace every raw comment by its token list (or None when it is too short).
comments['comment'] = comments['comment'].map(lemmatize)

In [None]:
# Drop rows with missing values; lemmatize() returns None for comments with fewer than three tokens.
comments = comments.dropna()

In [None]:
# Notebook display: the comments that survived lemmatization and dropna().
comments

In [None]:
# POS-tag every comment's token list with the Russian tagger, mapped to universal tags.
comments['token_tag'] = comments['comment'].apply(
    lambda tokens: nltk.pos_tag(tokens, tagset='universal', lang='rus')
)

In [None]:
# Notebook display: comments with the new 'token_tag' column of tagged tokens.
comments

In [None]:
# Collapse every (token, tag) pair into a token_TAG string.
comments['token_tag'] = comments['token_tag'].apply(tag_join)

##### Проверка на наличие эмодзи в полученном датасете комментариев

In [None]:
def check(comment):
    """Return *comment* if at least one of its items is an emoji, else None.

    Args:
        comment: iterable of tokens; each item is tested with
            ``emoji.is_emoji``.

    Returns:
        The same ``comment`` object when an emoji item is found, otherwise
        ``None``.
    """
    # any() replaces the hand-rolled counter and stops at the first emoji.
    if any(emoji.is_emoji(item) for item in comment):
        return comment
    return None

In [None]:
# Run the emoji check on each comment's token list.
# DataFrame.apply defaults to axis=0 and would hand check() whole columns, so
# emoji.is_emoji would be called on token lists instead of on tokens; the
# check is therefore applied to the 'comment' column element by element.
contain_emoji = comments['comment'].apply(check)

In [None]:
# Drop missing values (check() returns None when no emoji was found).
contain_emoji = contain_emoji.dropna()

In [None]:
# Notebook display: the result of the emoji check after dropna().
contain_emoji

### Загрузка предобученной модели

##### Скачиваем предобученную модель (`model.bin`) и подгружаем её векторы в новую модель Word2Vec, словарь которой построен по комментариям

In [None]:
# Create an untrained Word2Vec model (wv is this notebook's alias for gensim's Word2Vec class).
# NOTE(review): vector_size presumably has to match the dimensionality of the vectors in model.bin — confirm.
model_1 = wv(vector_size=300, min_count=1, window=3)

In [None]:
# Build the vocabulary from the tagged comment tokens.
model_1.build_vocab(comments['token_tag'])

In [None]:
# One lock factor per vocabulary word, all set to 1.0.
# The array is created as float32 explicitly: np.ones() alone yields float64,
# while gensim stores its vectors and lock factors as float32 (its REAL type).
model_1.wv.vectors_lockf = np.ones(len(model_1.wv), dtype=np.float32)

In [None]:
# Merge vectors from the pretrained binary word2vec file 'model.bin' into the model.
# Per the gensim docs, only words already in the vocabulary are affected and lockf=1.0 leaves the merged vectors trainable.
model_1.wv.intersect_word2vec_format('model.bin', lockf=1.0, binary=True)

In [None]:
# Sanity check: nearest neighbours of a regular word, queried before the train() call below.
model_1.wv.most_similar('ощущение_NOUN')

In [None]:
# Train on the tagged comments for a single epoch, using the sentence count recorded on the model.
model_1.train(comments['token_tag'], total_examples=model_1.corpus_count, epochs=1)

In [None]:
# Extend the existing vocabulary with the tokens of the emoji descriptions.
# NOTE(review): no train() call on emojis['u_tags'] is visible after this update — confirm that the
# newly added words are trained before their vectors are relied on.
model_1.build_vocab(emojis['u_tags'], update=True)

In [None]:
# Query the nearest neighbours of an emoji token.
model_1.wv.most_similar('😀_SYM')