In [None]:
import fasttext
import numpy as np, pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import roc_auc_score
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from keras.metrics import AUC

import tensorflow_addons as tfa

# Ask TensorFlow which GPU devices it can see and report their count and details.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
print("GPU devices: ", gpus)

In [None]:
# Display the visible GPU devices as the cell output.
# NOTE(review): this repeats the `import tensorflow as tf` and the GPU listing
# already done in the first cell.
import tensorflow as tf
tf.config.list_physical_devices('GPU')

#tf.test.is_gpu_available()

In [None]:
# Print the installed TensorFlow version.
print(tf.__version__)

In [None]:
# Load the pretrained fastText model from disk
# (file name suggests 300-dimensional Russian vectors — TODO confirm).
# NOTE(review): absolute, user-specific Windows path; it only resolves on a
# machine with this exact directory layout.
model_ru = fasttext.load_model('C:\\Users\\astaf\\toxicComments\\fastTextModel\\cc.ru.300.bin')

In [None]:
# Read the training dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../dataset/dataset_lg_train_final.csv')

In [None]:
# Drop rows whose 'processed_comment' value is missing.
data = data.dropna(subset=['processed_comment'])

In [None]:
# Peek at a single comment. With the integer index produced by read_csv this
# lookup is by index label, not by position, so a label removed by the dropna
# above would raise KeyError.
data['processed_comment'][248295]

In [None]:
# Display (rows, columns) of the dataset after dropping missing comments.
data.shape

In [None]:
# Fixed sequence length (in tokens) used for the model input.
maxlen = 300

In [None]:
max_features = 254288  # vocabulary cap passed to the tokenizer as num_words
maxlen = 300           # every sequence is padded/truncated to this many tokens

# Fit the tokenizer on the comments; out-of-vocabulary words map to "<OOV>".
# NOTE(review): the tokenizer is fitted on the full dataset before the
# train/test split, so its vocabulary also contains words from the test rows.
tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
tokenizer.fit_on_texts(data['processed_comment'])

# Convert texts to integer sequences, then pad/truncate them at the end.
# The length comes from `maxlen` so it cannot drift from the model's input size.
sequences = tokenizer.texts_to_sequences(data['processed_comment'])
padded_sequences = pad_sequences(sequences, maxlen=maxlen, truncating='post', padding='post')

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, data['toxicity'], test_size=0.2, random_state=42)

In [None]:

# Oversample the positive class: find the positions of training rows labelled
# toxicity == 1 (positional indices, hence .iloc on the label Series below).
toxic_indices_train = np.where(y_train == 1)[0]

# Select the toxic rows and their labels.
X_train_toxic = X_train[toxic_indices_train]
y_train_toxic = y_train.iloc[toxic_indices_train]

# Append one extra copy of every toxic row, so each appears twice in the result.
X_train_balanced = np.concatenate([X_train, X_train_toxic], axis=0)
y_train_balanced = np.concatenate([y_train, y_train_toxic], axis=0)

# Shuffle features and labels with the same permutation. The generator is
# seeded (same seed as the train/test split) so that re-running the notebook
# reproduces the same ordering.
shuffle_indices_train = np.random.default_rng(42).permutation(len(X_train_balanced))
X_train_balanced_shuffled = X_train_balanced[shuffle_indices_train]
y_train_balanced_shuffled = y_train_balanced[shuffle_indices_train]

In [None]:
import pickle
# Persist the fitted tokenizer to disk with the highest available pickle protocol.
with open('tokenizer_bilstm_untrainableembedding.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
embed_size = 300
# Build the embedding matrix: row i holds the vector looked up in model_ru for
# the word whose tokenizer index is i. Rows that are never filled stay all-zero.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in tokenizer.word_index.items():
    # Only indices below max_features have a row in the matrix.
    if i < max_features:
        try:
            embedding_vector = model_ru[word]
        except KeyError:
            # Lookup failed: report the word and fall back to a random vector
            # drawn from a normal distribution (mean 0, std sqrt(0.25)).
            print(f'Error creating embedding for {word}')
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), embed_size)
        else:
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

In [None]:
# Save the embedding matrix to disk in NumPy .npy format.
np.save('embedding_matrix_fasttext_untrainableembedding.npy', embedding_matrix)

In [None]:
def roc_auc(y_true, y_pred):
    """Compute ROC AUC by running scikit-learn's ``roc_auc_score`` via ``tf.py_function``.

    Args:
        y_true: ground-truth labels, forwarded as the first argument of ``roc_auc_score``.
        y_pred: predicted scores, forwarded as the second argument of ``roc_auc_score``.

    Returns:
        The result of ``tf.py_function`` with output dtype ``tf.double``.
    """
    return tf.py_function(func=roc_auc_score, inp=(y_true, y_pred), Tout=tf.double)

# Early stopping: halt training once validation accuracy has not improved for
# two consecutive epochs, and restore the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Display the (rows, embedding dimension) shape of the embedding matrix.
embedding_matrix.shape

In [None]:
# Print the installed TensorFlow version (same check as earlier in the notebook).
print(tf.__version__)

In [None]:
# Number of distinct words the tokenizer has seen, plus one
# (presumably to account for index 0, which Keras word indices do not use — TODO confirm).
vocab_size = len(tokenizer.word_index) + 1

In [None]:

# Print the shape of the embedding matrix currently in memory.
print(embedding_matrix.shape)

In [None]:
# Replace the in-memory embedding matrix with one loaded from disk.
# NOTE(review): this reads 'embedding_matrix_fasttext.npy', while the np.save
# call earlier in the notebook writes
# 'embedding_matrix_fasttext_untrainableembedding.npy' — confirm the file is
# the intended one and that its row order matches the current tokenizer.
embedding_matrix = np.load('embedding_matrix_fasttext.npy')

In [None]:
# Binary classifier: frozen pretrained embeddings -> BiLSTM -> global max pooling
# -> dense head with a single sigmoid output.
inp = Input(shape=(maxlen,))
layer_stack = (
    # Embedding weights are initialised from embedding_matrix and kept frozen.
    Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    ),
    Bidirectional(LSTM(50, return_sequences=True, dropout=0.1)),
    GlobalMaxPool1D(),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(1, activation="sigmoid"),
)
# Apply the layers in order; x ends up as the model's output tensor.
x = inp
for layer in layer_stack:
    x = layer(x)
model = Model(inputs=inp, outputs=x)

In [None]:
# Compile with binary cross-entropy and Adam, tracking accuracy, ROC AUC,
# F1 (threshold 0.5), precision and recall.
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        'accuracy',
        AUC(name='roc_auc', curve='ROC'),
        tfa.metrics.F1Score(num_classes=1, threshold=0.5),
        precision,
        recall,
    ],
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Report the shapes of the oversampled, shuffled training features and labels.
# The second value is the training label array, so it is labelled as such
# rather than as test data.
print("Training data shape:", X_train_balanced_shuffled.shape)
print("Training labels shape:", y_train_balanced_shuffled.shape)

In [None]:
# Train for up to 100 epochs with batch size 32; the early_stopping callback may end training sooner.
# NOTE(review): this fits on X_train / y_train, not on the oversampled
# X_train_balanced_shuffled / y_train_balanced_shuffled built earlier — confirm which is intended.
# NOTE(review): X_test / y_test are used as validation data here and again for
# the final metrics further down, so early stopping is tuned on the reported split.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=32, callbacks=[early_stopping])

In [None]:
# Save the model's current weights to an HDF5 file.
model.save_weights('bilstm.weights_untrainableembedding.h5')

In [None]:
# Overwrite the model's weights with those stored in 'fasttext.weights.h5'.
# NOTE(review): this is a different file from the one written by the
# save_weights call above ('bilstm.weights_untrainableembedding.h5'), so the
# weights from the training run in this notebook are replaced — confirm this is intended.
model.load_weights('fasttext.weights.h5')

In [None]:
# Save the full model, including optimizer state, in the .keras format.
model.save('fasttext_final.keras', include_optimizer=True)

In [None]:
# Score the held-out set and flatten the model output to a 1-D array.
predictions = model.predict(X_test).flatten()

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

# Hard 0/1 labels obtained by thresholding the predicted probabilities at 0.5.
predicted_labels = (predictions > 0.5).astype(int)

# ROC AUC uses the raw probabilities; accuracy and the report use the hard labels.
roc_auc = roc_auc_score(y_test, predictions)
accuracy = accuracy_score(y_test, predicted_labels)
report = classification_report(y_test, predicted_labels)

In [None]:
# Print the evaluation metrics computed on the held-out set.
print("ROC AUC:", roc_auc)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

In [None]:
# Replace the in-memory tokenizer with one unpickled from disk.
# NOTE(review): this reads 'tokenizer_lstm.pickle', not the
# 'tokenizer_bilstm_untrainableembedding.pickle' file written earlier in the
# notebook — confirm its word index matches the embedding matrix the model uses.
# pickle.load can execute arbitrary code; only load files from a trusted source.
with open('tokenizer_lstm.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
test_sentences = ["ты ужасный человек, стоит плох что-то еще", "Я желаю чтобы ты утонул в колодце тюлень"]

# Turn the sentences into padded integer sequences with the current tokenizer.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=maxlen, truncating='post', padding='post')

# Run the model and flatten its output to one probability per sentence.
predictions = model.predict(test_padded).flatten()

# Print each sentence next to its predicted toxic probability.
for sentence, toxic_probability in zip(test_sentences, predictions):
    print(f"Sentence: '{sentence}' - Prediction (Toxic Probability): {toxic_probability:.4f}")