In [None]:
import  numpy as np, pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout
from tensorflow.keras.layers import  GlobalMaxPool1D
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC
import tensorflow as tf
import tensorflow_addons as tfa

# Ask TensorFlow which physical GPU devices it can see and report them.
gpus = tf.config.list_physical_devices('GPU')
for caption, detail in (("Num GPUs Available: ", len(gpus)), ("GPU devices: ", gpus)):
    print(caption, detail)

In [None]:
def load_data(csv_files):
    """Read several CSV files and return their rows as one shuffled DataFrame.

    Args:
        csv_files: Iterable of paths (or anything else ``pd.read_csv`` accepts).

    Returns:
        A DataFrame with the rows of every file. The frames are concatenated
        with a fresh 0..n-1 index and then shuffled with a fixed seed
        (``random_state=42``), so the row order is identical on every run.
        The shuffled frame keeps the index labels assigned by the concatenation.
    """
    frames = [pd.read_csv(path) for path in csv_files]
    merged = pd.concat(frames, ignore_index=True)
    return merged.sample(frac=1, random_state=42)

In [None]:
# Training data location, relative to the notebook's working directory.
# Forward slashes are used on purpose: the earlier backslash-separated literal
# depended on an invalid escape sequence (backslash + 'd'), which newer Python
# versions warn about, and it only resolved on Windows.
csv_files = ['../dataset/dataset_lg_train_final.csv']
data = load_data(csv_files)

# Per-column flag: does the column contain at least one missing value?
print(data.isnull().any())
data.head()


In [None]:
# Keep only the rows that have a value in 'processed_comment'
# (the text column that is tokenized further down).
data = data.dropna(subset=['processed_comment'])

In [None]:
# Pull the label column and the text column out of the cleaned frame.
y = data['toxicity']
list_sentences = data['processed_comment']

# Row count after the rows with missing text were dropped.
print(f"Количество записей в data: {data.shape[0]}")

In [None]:
# Show the installed TensorFlow version.
print(tf.__version__)

In [None]:
max_features = 176404  # vocabulary cap for the tokenizer; also the Embedding input size
maxlen = 200           # every comment is padded / truncated to this many tokens

# Build the word index from the comment texts. Words that fall outside the
# `max_features` cap are replaced by the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
tokenizer.fit_on_texts(data['processed_comment'])

# Convert each text to a list of word ids, then pad / truncate at the END so
# every row has exactly `maxlen` ids. `maxlen` is passed instead of a second
# hard-coded 200 so the sequence length cannot drift away from the model's
# Input shape.
sequences = tokenizer.texts_to_sequences(data['processed_comment'])
padded_sequences = pad_sequences(sequences, maxlen=maxlen, truncating='post', padding='post')

# 80/20 train/test split with a fixed seed.
# NOTE(review): the tokenizer is fitted before this split, so its vocabulary
# also covers the held-out texts - consider fitting on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, data['toxicity'], test_size=0.2, random_state=42
)

In [None]:
# Number of rows in the training feature matrix.
X_train.shape[0]

In [None]:
# Number of training labels; should match the row count of X_train.
y_train.shape[0]

In [None]:
# Count the positive (toxicity == 1) labels in the training split.
toxicity_1_count = (y_train == 1).sum()
print("Количество элементов с меткой toxicity = 1 в y_train:", toxicity_1_count)


In [None]:
# Oversample the toxic class: every training row labelled toxicity == 1 is
# appended one more time, so each toxic example appears twice in the result.

# Positional indices of the toxic rows. They are positions, not index labels,
# which is why the labels are selected with .iloc below.
toxic_indices_train = np.where(y_train == 1)[0]

# Copies of the toxic rows and their labels.
X_train_toxic = X_train[toxic_indices_train]
y_train_toxic = y_train.iloc[toxic_indices_train]

# Original training data followed by the duplicated toxic rows.
X_train_balanced = np.concatenate([X_train, X_train_toxic], axis=0)
y_train_balanced = np.concatenate([y_train, y_train_toxic], axis=0)

# Shuffle features and labels with one shared permutation so they stay aligned.
# The generator is seeded (42, the same seed the data split uses) so the
# shuffled order - and therefore the training run - is reproducible.
shuffle_indices_train = np.random.default_rng(42).permutation(len(X_train_balanced))
X_train_balanced_shuffled = X_train_balanced[shuffle_indices_train]
y_train_balanced_shuffled = y_train_balanced[shuffle_indices_train]


In [None]:
# Number of rows in the oversampled, shuffled training feature matrix.
X_train_balanced_shuffled.shape[0]

In [None]:
# Number of labels after oversampling; should match the feature row count.
y_train_balanced_shuffled.shape[0]

In [None]:
# Count the positive labels AFTER oversampling. The message now names the
# array that is actually counted (it previously said y_train).
toxicity_1_count = np.sum(y_train_balanced_shuffled == 1)
print("Количество элементов с меткой toxicity = 1 в y_train_balanced_shuffled:", toxicity_1_count)

In [None]:
import pickle

# Serialize the fitted tokenizer to disk using the highest pickle protocol.
tokenizer_path = 'tokenizer_lstm_lemmatized.pickle'
with open(tokenizer_path, 'wb') as out_file:
    pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Stop training when validation accuracy has not improved for `patience`
# epochs, and roll the model back to the best weights seen.
# NOTE(review): the fit call in this notebook runs only 2 epochs, so with
# patience=2 this callback appears unable to stop training early - confirm
# whether that is intended.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Network graph: token ids -> embeddings -> LSTM over the whole sequence ->
# max over time -> small dense head -> single sigmoid output.
inp = Input(shape=(maxlen, ))
embed_size = 128

# Layers are instantiated in the order they are applied.
layer_stack = [
    Embedding(max_features, embed_size),
    LSTM(60, return_sequences=True, name='lstm_layer'),  # one output per time step
    GlobalMaxPool1D(),                                   # collapse the time axis
    Dropout(0.1),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(1, activation="sigmoid"),
]

# Thread the input tensor through the stack; `x` ends up as the model output.
x = inp
for layer in layer_stack:
    x = layer(x)

In [None]:
# Metric objects that are tracked during training alongside plain accuracy.
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()

# Wrap the graph built above into a trainable model.
model = Model(inputs=inp, outputs=x)

training_metrics = [
    'accuracy',
    AUC(name='roc_auc', curve='ROC'),
    tfa.metrics.F1Score(num_classes=1, threshold=0.5),
    precision,
    recall,
]
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=training_metrics,
)

In [None]:
# Training hyper-parameters. They are passed to fit() by name so that editing
# them here actually changes the run (the call used to repeat the literals).
batch_size = 32
epochs = 2

# Train on the oversampled data.
# NOTE(review): X_test / y_test serve as validation data here and are scored
# again in the evaluation cells - consider a separate validation split.
model.fit(
    X_train_balanced_shuffled,
    y_train_balanced_shuffled,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)

In [None]:
# Write the trained model's weights to an HDF5 weights file.
model.save_weights('lstm_upsampled.weights.h5')

In [None]:
def prepare_input(text):
    """Convert one raw text into the padded id sequence the model expects.

    The text is tokenized with the notebook-level ``tokenizer`` and then padded
    / truncated exactly as the training data was: to ``maxlen`` ids, with
    padding and truncation applied at the END of the sequence. The previous
    version relied on the ``pad_sequences`` defaults ('pre' / 'pre') and a
    hard-coded length, so inference inputs were laid out differently from the
    sequences the model was trained on.

    Args:
        text: A single string.

    Returns:
        The result of ``pad_sequences`` for a one-element batch, i.e. one
        padded row of length ``maxlen``.
    """
    tokenized = tokenizer.texts_to_sequences([text])
    return pad_sequences(tokenized, maxlen=maxlen, padding='post', truncating='post')

# Example text to score.
text = "ну ты и плох"
prepared_text = prepare_input(text)

# Run the model on the one-row batch and show the single output value.
prediction = model.predict(prepared_text)
toxicity_score = prediction[0][0]
print("Toxicity Score:", toxicity_score)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Predicted scores for the held-out set, flattened to a 1-D array.
predictions = model.predict(X_test).flatten()

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

# Hard 0/1 labels from the scores (strictly above 0.5 counts as positive),
# computed once and shared by the label-based metrics.
predicted_labels = (predictions > 0.5).astype(int)

roc_auc = roc_auc_score(y_test, predictions)         # uses the raw scores
accuracy = accuracy_score(y_test, predicted_labels)
report = classification_report(y_test, predicted_labels)

In [None]:
# Print the scalar metrics first, then the multi-line per-class report.
for label, value in (("ROC AUC:", roc_auc), ("Accuracy:", accuracy)):
    print(label, value)
print("Classification Report:")
print(report)

In [None]:
test_sentences = ["ты плохой человек стоит поискать что-то еще", "аж блевать клоун"]

# Tokenize and pad the sentences the same way the training data was prepared.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=maxlen, truncating='post', padding='post')

# Score the sentences; flatten in case the model returns a 2-D array.
# NOTE(review): this overwrites `predictions` from the test-set evaluation.
predictions = model.predict(test_padded).flatten()

# Print one line per sentence with its predicted score.
for sentence, score in zip(test_sentences, predictions):
    print(f"Sentence: '{sentence}' - Prediction (Toxic Probability): {score:.4f}")