# Задание
Берем отызывы за лето (из архива с материалами или предыдущего занятия)
1. Учим conv сеть для классификации
2. Рассмотреть 2-а варианта сеточек 
2.1 Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/
2.2 Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)
Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше

In [27]:
import pandas as pd
import numpy as np

In [28]:
df = pd.read_excel('отзывы за лето.xls')
df.head(1)

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14


#### Предобработка

In [29]:
df.isna().sum()

Rating     0
Content    3
Date       0
dtype: int64

In [30]:
df = df.dropna()

In [31]:
df.shape

(20656, 3)

In [32]:
df.dtypes

Rating      int64
Content    object
Date       object
dtype: object

In [33]:
df.loc[:, 'Content'] = df['Content'].astype(str)

In [34]:
X = df['Content']
y = df['Rating']

In [35]:
y.value_counts()

5    14584
1     2276
4     2137
3      911
2      748
Name: Rating, dtype: int64

Виден дизбаланс классов

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
import nltk

In [38]:
train_corpus = " ".join(X_train)
train_corpus = train_corpus.lower()

In [39]:
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/maximdoroshenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Отфильтруем данные и соберём в корпус N наиболее частых токенов

In [40]:
tokens_filtered = [word for word in tokens if word.isalnum()]
tokens_filtered[:10]

['не',
 'приходит',
 'смс',
 'код',
 'для',
 'входа',
 'в',
 'приложение',
 'удобно',
 'надежно']

In [41]:
max_words = 200
max_len = 40
num_classes = 6

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [42]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

tokens_filtered_top[:10]

['приложение', 'не', 'и', 'очень', 'удобно', 'все', 'в', 'на', 'что', 'всё']

In [43]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [44]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [45]:
X_train = np.asarray([text_to_sequence(text, max_len) for text in X_train], dtype=np.int32)
X_test = np.asarray([text_to_sequence(text, max_len) for text in X_test], dtype=np.int32)

In [46]:
#!pip install imblearn

Устранение дизбаланса классов

In [49]:
from imblearn.over_sampling import SMOTE

sm = SMOTE()
X_train, y_train = sm.fit_resample(X_train, y_train)

In [51]:
y_train.value_counts()

1    11656
2    11656
3    11656
4    11656
5    11656
Name: Rating, dtype: int64

In [52]:
import keras
from keras.utils import np_utils

y_train = keras.utils.np_utils.to_categorical(y_train, num_classes)
y_test = keras.utils.np_utils.to_categorical(y_test, num_classes)

#### Conv сеть для классификации

In [53]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
# from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping  

In [54]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [55]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [56]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


In [57]:
preds_proba = model.predict(X_test)

In [58]:
preds = [np.argmax(l) for l in preds_proba]

In [59]:
from sklearn.metrics import classification_report

print(classification_report([np.argmax(l) for l in y_test], preds))

              precision    recall  f1-score   support

           1       0.36      0.85      0.51       445
           2       0.00      0.00      0.00       123
           3       0.13      0.14      0.13       203
           4       0.67      0.00      0.01       433
           5       0.92      0.88      0.90      2928

    accuracy                           0.72      4132
   macro avg       0.41      0.38      0.31      4132
weighted avg       0.76      0.72      0.70      4132



#### Вывод: Т.к. в целевой переменной сильный дизбаланс классов, то метрики плохие.
Сеть хорошо обучилась на 5-м классе. Также, путём овер-семлпинга удалось получить видимый результат по первому классу. 

### Классификация по отлично/не отлично

In [60]:
num_classes = 2

In [62]:
y = np.where(y == 5, 1, 0)

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = np.asarray([text_to_sequence(text, max_len) for text in X_train], dtype=np.int32)
X_test = np.asarray([text_to_sequence(text, max_len) for text in X_test], dtype=np.int32)

y_train = keras.utils.np_utils.to_categorical(y_train, num_classes)
y_test = keras.utils.np_utils.to_categorical(y_test, num_classes)

In [65]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [66]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [67]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


In [70]:
preds_proba = model.predict(X_test)
preds = [np.argmax(l) for l in preds_proba]
print(classification_report([np.argmax(l) for l in y_test], preds))

              precision    recall  f1-score   support

           0       0.80      0.70      0.74      1204
           1       0.88      0.93      0.90      2928

    accuracy                           0.86      4132
   macro avg       0.84      0.81      0.82      4132
weighted avg       0.86      0.86      0.86      4132



#### Для определения оценки 5/не 5 метрика заметно лучше

### Pretrained embedding

#### Испрользованные ресурсы:

- RusVectōrēs: семантические модели для русского языка
https://github.com/akutuzov/webvectors/blob/master/preprocessing/rusvectores_tutorial.ipynb


- Using Gensim Embeddings with Keras and Tensorflow
https://github.com/RaRe-Technologies/gensim/wiki/Using-Gensim-Embeddings-with-Keras-and-Tensorflow


- WV to keras embedding
https://stackoverflow.com/a/71550086/3484997


- Using pre-trained word embeddings
https://keras.io/examples/nlp/pretrained_word_embeddings/

In [82]:
from gensim.models import KeyedVectors
import gensim

In [78]:
wv_from_text = KeyedVectors.load_word2vec_format('araneum_upos_skipgram_300_2_2018.vec', binary=False)

2022-06-20 11:15:20,032 : INFO : loading projection weights from araneum_upos_skipgram_300_2_2018.vec
2022-06-20 11:15:50,062 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (196620, 300) matrix of type float32 from araneum_upos_skipgram_300_2_2018.vec', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-06-20T11:15:50.062673', 'gensim': '4.1.2', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'load_word2vec_format'}


In [89]:
from tensorflow.keras.layers import Embedding

def gensim_to_keras_embedding(keyed_vectors, train_embeddings=False):
    """Get a Keras 'Embedding' layer with weights set from Word2Vec model's learned word embeddings.

    Parameters
    ----------
    train_embeddings : bool
        If False, the returned weights are frozen and stopped from being updated.
        If True, the weights can / will be further updated in Keras.

    Returns
    -------
    `keras.layers.Embedding`
        Embedding layer, to be used as input to deeper network layers.

    """
#     keyed_vectors = model.wv  # structure holding the result of training
    weights = keyed_vectors.vectors  # vectors themselves, a 2D numpy array    
    index_to_key = keyed_vectors.index_to_key  # which row in `weights` corresponds to which word?

    layer = Embedding(
        input_dim=weights.shape[0],
        output_dim=weights.shape[1],
        weights=[weights],
        trainable=train_embeddings,
    )
    return layer

In [90]:
emb_layer = gensim_to_keras_embedding(wv_from_text)

In [91]:
model = Sequential()
model.add(emb_layer)
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [92]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [93]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Train on 14871 samples, validate on 1653 samples
Epoch 1/20
Epoch 2/20
  512/14871 [>.............................] - ETA: 1s - loss: 0.4536 - accuracy: 0.8145

  updates = self.state_updates




In [94]:
preds_proba = model.predict(X_test)
preds = [np.argmax(l) for l in preds_proba]
print(classification_report([np.argmax(l) for l in y_test], preds))

  updates=self.state_updates,


              precision    recall  f1-score   support

           0       0.78      0.66      0.72      1204
           1       0.87      0.92      0.90      2928

    accuracy                           0.85      4132
   macro avg       0.83      0.79      0.81      4132
weighted avg       0.84      0.85      0.84      4132



#### Вывод: Метрика слегка ухудшилась при использовании предобученных векторов.

Использовался небольшой набор данных: araneum_upos_skipgram_300_2_2018.vec. Возможно, при использовании более крупной статистической модели метрику удастся повысить.