<div style="display: flex;">
    <a href="https://t.me/nsbarsukov" style="margin-right:20px;">
        <img src="https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white" alt="Write me on Telegram"/>
    </a>
    <a href="https://github.com/nsbarsukov/toxic-comments-detector" style="margin-right:20px;">
        <img src="https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white" alt="Open GitHub"/>
    </a>
</div>
<div style="display: flex; margin-top: 20px;">
    <a href="https://colab.research.google.com/github/nsbarsukov/toxic-comments-detector/blob/master/models/08weightedRNN%2BmultiBert.ipynb">
        <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
    </a>
</div>

In [None]:
import os

def check_is_google_colab():
    """Tell whether the notebook is executing inside Google Colab.

    The check looks for the substring 'google.colab' in the string form of
    the object returned by ``get_ipython()``.

    Returns:
        bool: True when the substring is present, False otherwise.
    """
    shell_description = str(get_ipython())
    return 'google.colab' in shell_description


def prepare_environment():
    if check_is_google_colab():
        from google.colab import drive
        import shutil
        
        drive.mount('/content/gdrive/')

        PATH_TO_TEMP_REPO = '/content/gdrive/My Drive/toxic-comments-repo-temp'

        try:
            shutil.rmtree(PATH_TO_TEMP_REPO)
        except:
            pass

        os.mkdir(PATH_TO_TEMP_REPO)
        
        %cd ./gdrive/My Drive/toxic-comments-repo-temp
        ! git clone https://github.com/nsbarsukov/toxic-comments-detector.git .
        ! pip3 install -q tensorflow_text
    else:
        %cd ..

prepare_environment()

In [None]:
import sys
import tensorflow as tf
import numpy as np

# local (this repo) imports
from dev import (
    load_cleaned_russian_text_data,
    get_class_weights,
    get_initial_output_bias,
    evaluate_model,
    SEED,
    VALIDATION_SPLIT_PARTITION,
    DEFAULT_BATCH_SIZE,
    DEFAULT_EPOCHS
)
from wordEmbeddingsLayers import (
    create_multiBERT_preprocess_layer,
    create_multiBERT_encoder_layer,
    BERTLayer
)

# Seed NumPy's global RNG as well as TensorFlow's, so that any NumPy-based
# randomness in later cells is repeatable between runs too.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Record the interpreter and framework versions alongside the results
print('Python version:', sys.version)
print('Tensorflow version:', tf.__version__)

# Build RNN model

In [None]:
# Train/test inputs and labels come from the repo's `dev` helper.
# NOTE(review): labels are presumably binary 0/1, since np.bincount(y_train)
# is later unpacked into exactly two counts - confirm against the helper.
(X_train, X_test,
 y_train, y_test) = load_cleaned_russian_text_data()

In [None]:
from tensorflow.keras import layers, losses

# Callback that stops training once validation loss has not improved for
# 2 epochs. The best epoch's weights are NOT restored when it fires.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min', patience=2, verbose=1, restore_best_weights=False,
)

# Occurrences of each label value in the training labels:
# index 0 -> negative class, index 1 -> positive class.
negative_class_amount, pos_class_amount = np.bincount(y_train)

# Constant initializer for the output layer's bias, computed from the class counts
initial_output_bias = tf.keras.initializers.Constant(
    value=get_initial_output_bias(pos_class_amount, negative_class_amount)
)

# Note: an earlier tf.keras.Sequential draft of the same architecture
# (string Input -> BERTLayer(to_sequence=True) -> Bidirectional LSTM(256)
#  -> Dense(128, relu) -> Dropout(0.5) -> Dense(1) -> sigmoid, i.e. the model
#  emits probabilities) is not used; build_classifier_model() is what the
# notebook actually builds.


def build_classifier_model():
    """Assemble the text classifier: multiBERT -> bidirectional LSTM -> dense head.

    Strings enter through a string-typed input named 'text', pass through the
    multiBERT preprocessing and encoder layers, and the encoder's
    ``sequence_output`` is fed to a bidirectional LSTM. A ReLU dense layer with
    dropout and a single sigmoid-activated unit follow, so the model emits
    probabilities.

    Relies on the notebook-level ``initial_output_bias`` and ``SEED``.

    Returns:
        tf.keras.Model: an uncompiled model from the text input to the sigmoid output.
    """
    text_input = layers.Input(shape=[], dtype=tf.string, name='text')

    # BERT: the preprocessing layer produces the encoder inputs, the encoder consumes them
    bert_preprocessor = create_multiBERT_preprocess_layer()
    bert_inputs = bert_preprocessor(text_input)
    bert_encoder = create_multiBERT_encoder_layer()
    sequence_features = bert_encoder(bert_inputs)['sequence_output']

    # Recurrent part: bidirectional LSTM over the encoder's sequence output
    lstm_features = layers.Bidirectional(layers.LSTM(256))(sequence_features)

    # Fully connected head ending in one sigmoid unit
    hidden = layers.Dense(128, activation='relu')(lstm_features)
    hidden = layers.Dropout(rate=0.5, seed=SEED)(hidden)
    logit = layers.Dense(units=1, bias_initializer=initial_output_bias)(hidden)
    probability = layers.Activation('sigmoid')(logit)

    return tf.keras.Model(inputs=text_input, outputs=probability)
    
model = build_classifier_model()

# The network already ends in a sigmoid activation, so the loss receives
# probabilities rather than logits - hence from_logits=False.
model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=False),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.Precision(),
    ],
)

In [None]:
# Train with a batch 16x smaller than the project default.
# NOTE(review): presumably to fit the BERT encoder in memory - confirm.
# max(1, ...) keeps the batch size valid when DEFAULT_BATCH_SIZE is below 16,
# where the floor division alone would yield 0.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=DEFAULT_EPOCHS,
    batch_size=max(1, DEFAULT_BATCH_SIZE // 16),
    validation_split=VALIDATION_SPLIT_PARTITION,
    # Early stopping is currently switched off; uncomment the next line to enable it.
    # callbacks=[early_stopping],
    # Per-class loss weights computed from the class counts
    class_weight=get_class_weights(pos_class_amount, negative_class_amount),
)

# Model evaluation

In [None]:
# Turn the predicted probabilities into hard 0/1 labels with a 0.5 cut-off
y_test_pred = (model.predict(X_test).ravel() > 0.5).astype(int)

# Pass true labels, predicted labels and the training history to the repo's evaluation helper
evaluate_model(y_test, y_test_pred, history)