# Классификатор Спама

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense

In [2]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Category'])

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Message'])

In [6]:
sequences = tokenizer.texts_to_sequences(df['Message'])
max_len = max(len(x) for x in sequences)
X = pad_sequences(sequences, maxlen=max_len)
y = df['Label'].values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vocab_size = len(tokenizer.word_index) + 1

In [8]:
embedding_dim = 100

# RNN
rnn_model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    SimpleRNN(128),
    Dense(1, activation='sigmoid')
])
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [9]:
rnn_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 132ms/step - accuracy: 0.8446 - loss: 0.4090 - val_accuracy: 0.8664 - val_loss: 0.3597
Epoch 2/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 128ms/step - accuracy: 0.9113 - loss: 0.2280 - val_accuracy: 0.9767 - val_loss: 0.0689
Epoch 3/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 108ms/step - accuracy: 0.9913 - loss: 0.0303 - val_accuracy: 0.9910 - val_loss: 0.0395
Epoch 4/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 116ms/step - accuracy: 0.9982 - loss: 0.0089 - val_accuracy: 0.9901 - val_loss: 0.0350
Epoch 5/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 135ms/step - accuracy: 0.9991 - loss: 0.0057 - val_accuracy: 0.9874 - val_loss: 0.0439


<keras.src.callbacks.history.History at 0x79d31d382390>

In [10]:
# LSTM
lstm_model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    LSTM(128),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 447ms/step - accuracy: 0.8830 - loss: 0.3344 - val_accuracy: 0.9865 - val_loss: 0.0588
Epoch 2/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 441ms/step - accuracy: 0.9896 - loss: 0.0410 - val_accuracy: 0.9892 - val_loss: 0.0470
Epoch 3/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 419ms/step - accuracy: 0.9954 - loss: 0.0163 - val_accuracy: 0.9910 - val_loss: 0.0416
Epoch 4/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 426ms/step - accuracy: 0.9987 - loss: 0.0077 - val_accuracy: 0.9910 - val_loss: 0.0413
Epoch 5/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 495ms/step - accuracy: 0.9996 - loss: 0.0020 - val_accuracy: 0.9901 - val_loss: 0.0428


<keras.src.callbacks.history.History at 0x79d306814d50>

In [11]:
# GRU
gru_model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    GRU(128),
    Dense(1, activation='sigmoid')
])
gru_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
gru_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))
gru_model.evaluate(X_test, y_test)

Epoch 1/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 494ms/step - accuracy: 0.8424 - loss: 0.4087 - val_accuracy: 0.9874 - val_loss: 0.0626
Epoch 2/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 527ms/step - accuracy: 0.9888 - loss: 0.0410 - val_accuracy: 0.9901 - val_loss: 0.0472
Epoch 3/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 441ms/step - accuracy: 0.9971 - loss: 0.0117 - val_accuracy: 0.9910 - val_loss: 0.0441
Epoch 4/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 435ms/step - accuracy: 0.9987 - loss: 0.0085 - val_accuracy: 0.9910 - val_loss: 0.0410
Epoch 5/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 478ms/step - accuracy: 0.9994 - loss: 0.0014 - val_accuracy: 0.9901 - val_loss: 0.0512
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.9910 - loss: 0.0487


[0.05121458321809769, 0.9901345372200012]

##Сравнение моделей

In [12]:
rnn_model.evaluate(X_test, y_test)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9860 - loss: 0.0461


[0.043903328478336334, 0.9874439239501953]

In [13]:
lstm_model.evaluate(X_test, y_test)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 82ms/step - accuracy: 0.9900 - loss: 0.0470


[0.04277870059013367, 0.9901345372200012]

In [14]:
gru_model.evaluate(X_test, y_test)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.9910 - loss: 0.0487


[0.05121458321809769, 0.9901345372200012]