In [1]:
# Download the feedback spreadsheet from Google Drive and save it as feedbacks.xls.
!wget -qq "https://drive.google.com/uc?export=download&id=1ACZpj5sxVRPw4wa8Z_07PHjLxqrPPCdo" -O feedbacks.xls

In [2]:
!pip install -qq pymorphy2

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.5 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 KB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for docopt (setup.py) ... [?25l[?25hdone


In [3]:
import warnings
warnings.filterwarnings("ignore")

import re
from collections import Counter
import string
import time

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import nltk
from nltk.tokenize import word_tokenize, WordPunctTokenizer
from nltk.corpus import stopwords
from nltk import FreqDist

from pymorphy2 import MorphAnalyzer

import tensorflow as tf
from tensorflow.keras import layers

# Do not truncate long cell values when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)
# Fetch the NLTK resources: the stop-word corpora read below via
# stopwords.words(...) and the 'punkt' tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Shared helpers for preprocess(): a pymorphy2 analyzer (lemmatization),
# a word/punctuation tokenizer and the ASCII punctuation characters.
pymorphy2_analyzer = MorphAnalyzer()
wp_tokenize = WordPunctTokenizer()
punctuation = string.punctuation

# Stop-word lists; preprocess() checks membership in both.
russian = stopwords.words("russian")
english = stopwords.words("english")

In [5]:
# Load the downloaded spreadsheet into a DataFrame.
# NOTE(review): the displayed head contains an "Unnamed: 0" column, which looks
# like an index saved into the file — consider index_col=0; confirm before changing.
df = pd.read_excel("feedbacks.xls")

In [7]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,"В целом удобноное приложение...из минусов хотят слишком большой доступ к персональным данным в телефоне,приходится пользоваться в ограниченном режиме",2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше никуда. Ранее больше года пользовался нормально.,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [8]:
# Summary statistics of the review length measured in characters.
df["Content"].str.len().describe()

count    20655.000000
mean        56.039942
std         78.161446
min          1.000000
25%         13.000000
50%         27.000000
75%         65.000000
max       1147.000000
Name: Content, dtype: float64

In [6]:
# Module-level memo: token -> pymorphy2 normal form, shared by all preprocess() calls.
cache = {}

def preprocess(text: str,
               remove_punct: bool = True,
               lemma: bool = True) -> list:
    """Normalize a raw review into a list of (optionally lemmatized) tokens.

    Steps: collapse whitespace runs, replace unwanted characters with spaces,
    lower-case, drop one-character words, drop Russian/English stop words
    (only when more than five words remain, so short reviews keep every
    word), tokenize with ``wp_tokenize`` and optionally lemmatize.

    Args:
        text: Raw review text. Non-string values are converted with ``str()``.
        remove_punct: If True, keep only Latin and Cyrillic letters. If False,
            the ASCII punctuation in ``punctuation`` is kept as well (commas
            are still removed afterwards). Digits are removed in both modes.
        lemma: If True, replace every token with the ``normal_form`` of the
            first parse returned by ``pymorphy2_analyzer``.

    Returns:
        List of token strings; empty when nothing survives the filtering.
    """
    if not isinstance(text, str):
        text = str(text)

    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\s\s+", " ", text)
    if remove_punct:
        # A single pass is enough: everything that is not a letter becomes a
        # space. "ёЁ" is listed explicitly because it lies outside "а-я"/"А-Я";
        # without it words such as "всё" or "ещё" are split and lose letters.
        text = re.sub(r"[^a-zA-Zа-яА-ЯёЁ]", " ", text)
    else:
        # re.escape stops "]", "\", "^" and "-" from being read as
        # character-class syntax, so every punctuation character is kept.
        text = re.sub(f"[^a-zA-Zа-яА-ЯёЁ{re.escape(punctuation)}]", " ", text)

    text = text.lower().strip().replace(',', '')
    words = [w for w in text.split() if len(w) > 1]

    # Stop words are stripped only from longer texts.
    if len(words) > 5:
        words = [w for w in words if w not in russian and w not in english]

    tokens = wp_tokenize.tokenize(' '.join(words))

    if not lemma:
        return tokens

    lemmas = []
    for token in tokens:
        # Morphological parsing is slow; reuse the stored normal form.
        if token not in cache:
            cache[token] = pymorphy2_analyzer.parse(token)[0].normal_form
        lemmas.append(cache[token])
    return lemmas

In [7]:
# Shift the ratings down by one so the first class is 0. The original values
# are kept in "Rating_raw", which makes this cell idempotent: re-running it
# no longer shifts the labels a second time.
if "Rating_raw" not in df.columns:
    df["Rating_raw"] = df["Rating"]
df["Rating"] = df["Rating_raw"] - 1

# preprocess() returns a token list; join it back into one space-separated string.
df["prep"] = df["Content"].apply(preprocess).apply(' '.join)

x_train, x_valid, y_train, y_valid = train_test_split(df["prep"], df["Rating"], test_size=0.2, random_state=42)

In [8]:
def prepare_dataset(x, y, batch_size=64, cache=True, shuffle=True):
    """Build a batched, prefetching ``tf.data.Dataset`` of ``(x, y)`` pairs.

    Args:
        x: Inputs, sliced along the first dimension; must support ``len()``.
        y: Targets aligned with ``x``.
        batch_size: Number of examples per batch.
        cache: If True, cache the elements before shuffling/batching.
        shuffle: If True, shuffle with a buffer as large as ``len(x)``.

    Returns:
        The dataset, batched and prefetched with ``tf.data.AUTOTUNE``.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((x, y))
    pipeline = pipeline.cache() if cache else pipeline
    pipeline = pipeline.shuffle(buffer_size=len(x)) if shuffle else pipeline
    batched = pipeline.batch(batch_size)
    return batched.prefetch(buffer_size=tf.data.AUTOTUNE)

In [9]:
# Training data uses the defaults (cached and shuffled); validation data is
# neither cached nor shuffled, so it keeps its original order.
train_ds = prepare_dataset(x_train, y_train)
valid_ds = prepare_dataset(x_valid, y_valid, cache=False, shuffle=False)

In [10]:
# Vocabulary cap for the vectorizer and the fixed output length in tokens.
vocab_size = 10000
seq_len = 40 # 65  

# standardize=None: the layer applies no text standardization of its own;
# x_train comes from the already preprocessed df["prep"] column.
vectorize_layer = layers.TextVectorization(
    standardize=None, 
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=seq_len)

# adapt() only needs the texts, so the labels are dropped from each (text, label) pair.
text_data = tf.data.Dataset.from_tensor_slices((x_train, y_train)).map(lambda x, y: x)
vectorize_layer.adapt(text_data)

In [11]:
!pip install -qq tensorflow_addons
from tensorflow_addons.layers import StochasticDepth
from tensorflow_addons.optimizers import AdamW

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
class ResBlock(layers.Layer):
    """Residual 1-D convolution block merged with stochastic depth.

    The main branch is Conv1D -> ReLU -> Conv1D. The shortcut branch is a
    single Conv1D with the same filter count and stride, so both branches
    produce tensors of the same shape. ``StochasticDepth`` combines them and
    the result goes through a ReLU.

    Args:
        out_channels: Number of filters of every convolution in the block.
        stride: Stride of the first main-branch convolution and of the
            shortcut convolution.
        dropout: Probability of dropping the main (residual) branch.
    """

    def __init__(self, out_channels, stride=1, dropout=0.2):
        super().__init__()
        # StochasticDepth expects the probability of KEEPING the residual
        # branch, so the drop rate has to be inverted. Passing `dropout`
        # directly was only correct for dropout == 0.5.
        self.dropout = StochasticDepth(survival_probability=1.0 - dropout)

        self.block = tf.keras.Sequential([
            layers.Conv1D(out_channels, 3, strides=stride, padding="same", activation=None),
            layers.Activation("relu"),
            layers.Conv1D(out_channels, 3, strides=1, padding="same", activation=None),
            ])

        self.downsample = tf.keras.Sequential([
            layers.Conv1D(out_channels, 3, strides=stride, padding="same", activation=None),
            ])
        # Not applied in call(); kept so the attribute still exists for any
        # code that refers to it.
        self.norm = layers.BatchNormalization(epsilon=1e-8)
        self.activation = layers.Activation("relu")

    def call(self, x):
        """Return ``relu(stochastic_depth([shortcut(x), block(x)]))``."""
        residual = self.block(x)
        shortcut = self.downsample(x)

        # StochasticDepth takes its inputs in the order [shortcut, residual].
        merged = self.dropout([shortcut, residual])
        return self.activation(merged)

class CNNModel(tf.keras.Model):
    """Convolutional text classifier operating on raw strings.

    Pipeline: module-level ``vectorize_layer`` -> embedding -> three parallel
    Conv1D layers with kernel sizes 1, 2 and 3 (64 filters each) whose outputs
    are concatenated on the channel axis -> ReLU -> ``ResBlock`` -> global max
    pooling -> dropout -> dense layer producing one raw score per class.

    Args:
        num_classes: Size of the output layer. The default,
            ``y_train.nunique()``, is evaluated once, when the class is defined.
        embedding_dim: Dimensionality of the token embeddings.
        dropout: Rate shared by the dense dropout and the residual block.
    """

    def __init__(self, num_classes=y_train.nunique(), embedding_dim=64, dropout=0.2):
        super().__init__()
        self.embedding = layers.Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim)

        # Parallel n-gram detectors over the embedded sequence.
        self.unigram = layers.Conv1D(filters=64, kernel_size=1, strides=1, padding="same", activation=None)
        self.bigram = layers.Conv1D(filters=64, kernel_size=2, strides=1, padding="same", activation=None)
        self.trigram = layers.Conv1D(filters=64, kernel_size=3, strides=1, padding="same", activation=None)

        self.activation = layers.Activation("relu")
        self.resnet_block = ResBlock(out_channels=64, stride=1, dropout=dropout)
        self.dense_dropout = layers.Dropout(rate=dropout)
        self.pool = layers.GlobalMaxPool1D()
        self.fc = layers.Dense(units=num_classes)

    def call(self, x):
        """Map a batch of strings to per-class scores (no softmax applied)."""
        token_ids = vectorize_layer(x)
        embedded = self.embedding(token_ids)

        ngram_maps = [conv(embedded) for conv in (self.unigram, self.bigram, self.trigram)]
        features = self.activation(tf.concat(ngram_maps, axis=-1))

        pooled = self.pool(self.resnet_block(features))
        return self.fc(self.dense_dropout(pooled))

In [13]:
# Instantiate the CNN classifier. Its final Dense layer has no activation,
# so the loss below is configured with from_logits=True.
model = CNNModel(embedding_dim=32, dropout=0.5)
optimizer = AdamW(learning_rate=0.001, weight_decay=0.0001)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

# Reduce the learning rate after 2 epochs without improvement; stop after 5
# epochs without val_loss improvement and restore the best weights.
callbacks = [
             tf.keras.callbacks.ReduceLROnPlateau(patience=2),
             tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
            ]

In [14]:
# Upper bound on the number of epochs; the EarlyStopping callback may end training sooner.
epochs = 30
history = model.fit(train_ds, validation_data=valid_ds, epochs=epochs, callbacks=callbacks)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30


In [8]:
!pip install -qq transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m115.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import DataCollatorWithPadding, create_optimizer

In [10]:
# Tokenizer of the ruGPT-3 small checkpoint. add_prefix_space=True is set because
# the inputs are later passed pre-split into words (is_split_into_words=True).
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2", add_prefix_space=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Register eos/bos/pad tokens on the tokenizer. The call returns the number of
# tokens that were actually added to the vocabulary.
tokenizer.add_special_tokens({ "eos_token": "</s>", "bos_token": "<s>", "pad_token": "<pad>"})

0

In [12]:
# Load the PyTorch checkpoint (from_pt=True) into a TF sequence-classification
# model with a 5-class head. The special-token ids are taken from the tokenizer
# so that the model config agrees with it (padded batches need pad_token_id).
model = TFAutoModelForSequenceClassification.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2", 
                                                             num_labels=5,
                                                             pad_token_id=tokenizer.pad_token_id,
                                                             eos_token_id=tokenizer.eos_token_id,
                                                             bos_token_id=tokenizer.bos_token_id, 
                                                             from_pt=True, 
                                                             use_cache=False)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/551M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2ForSequenceClassification: ['transformer.h.5.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'lm_head.weight', 'transformer.h.11.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.4.attn.masked_bias']
- This IS expected if you are initializing TFGPT2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassificat

In [14]:
# Split the raw texts and labels. Explicit copies are taken so that the .loc
# assignment below edits an independent frame instead of a slice of df
# (the SettingWithCopy warning is otherwise hidden by the global warnings filter).
train, valid = (
    part.copy()
    for part in train_test_split(df[["Content", "Rating"]], test_size=0.2, random_state=42)
)
# One-character training reviews are replaced with the word "отлично".
# NOTE(review): this ignores the row's rating and is applied to the training
# split only — confirm that this is intended.
train.loc[train["Content"].str.len() == 1, "Content"] = "отлично"

seq_len = 40

In [15]:
# Write both splits to CSV files without the DataFrame index.
train.to_csv("train.csv", index=False)
valid.to_csv("valid.csv", index=False)

In [16]:
# Load the CSV files with the "csv" builder; valid.csv is exposed as the "test" split.
dataset = load_dataset(path="csv", data_files={"train": ["train.csv"], "test": ["valid.csv"], })



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-5269a718e1228ed4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-5269a718e1228ed4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
# English names for the five zero-based rating classes (0 = lowest, 4 = highest).
mapping = {0: "horrible", 1: "bad", 2: "normal", 3: "good", 4: "perfect"}

In [18]:
def preprocess_strings(examples, placeholder="<empty>"):
    """Split one example's ``Content`` into a list of whitespace-separated words.

    Missing or blank content is replaced by a fixed placeholder word. The
    placeholder deliberately does not depend on ``Rating``: substituting the
    label's name for missing text would leak the target into the model input.

    Args:
        examples: A single example (mapping) with at least a ``"Content"`` key.
        placeholder: Word used when the content is missing or has no words.

    Returns:
        The same ``examples`` object with ``"Content"`` replaced by a non-empty
        list of words.
    """
    content = examples["Content"]
    # Missing values arrive as None; any other non-string value is stringified
    # so that its actual content is kept.
    words = [] if content is None else str(content).split()
    if not words:
        words = [placeholder]
    examples["Content"] = words
    return examples

In [19]:
# Apply preprocess_strings to every example of both splits (one example per call).
dataset = dataset.map(preprocess_strings)

  0%|          | 0/16527 [00:00<?, ?ex/s]

  0%|          | 0/4132 [00:00<?, ?ex/s]

In [20]:
def tokenize(examples):
    """Tokenize a batch of pre-split reviews and attach their labels.

    Args:
        examples: Batch with ``"Content"`` (one list of words per example) and
            ``"Rating"`` (one label per example).

    Returns:
        The tokenizer output, truncated and padded to 40 tokens, with the
        ratings stored under ``"labels"``.
    """
    encoded = tokenizer(
        examples["Content"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=40,
    )
    encoded["labels"] = list(examples["Rating"])
    return encoded

In [21]:
# Tokenize in batches and drop the original columns, so that only the
# tokenizer outputs and "labels" remain.
tokenized_datasets = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

  0%|          | 0/17 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [22]:
# Collator that pads each batch with the tokenizer and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [23]:
# Wrap the tokenized splits as tf.data datasets with batches of 8;
# only the training set is shuffled.
tf_train_set = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    tokenized_datasets["test"],
    shuffle=False,
    batch_size=8,
    collate_fn=data_collator,
)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [26]:
num_epochs = 3
# Total number of optimizer steps: full batches per epoch (batch size 8) times epochs.
total_train_steps = (len(tokenized_datasets["train"]) // 8) * num_epochs
# create_optimizer returns the optimizer together with its learning-rate schedule.
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

# No loss is passed, so the model's internal loss computation is used.
model.compile(optimizer=optimizer, metrics=["accuracy"])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [27]:
# Fine-tune for num_epochs epochs, validating on tf_test_set.
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=num_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4b5a103700>