In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from keras.optimizers import Adam
from tensorflow.keras.layers import Dropout

In [2]:
# Read cleaned_data.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer=r'cleaned_data.csv')

In [None]:
# Number of missing values in each column (summed down the rows).
df.isna().sum(axis=0)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
# NOTE(review): the tokenizers below consume X_train as text - confirm that
# 'cleaned_text' contains no missing values (see the null-count cell above).
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['target'],
    test_size=0.2,
    random_state=42,
)

In [14]:
# Vocabulary cap handed to the tokenizer (and later used as the Embedding input size).
max_words_1 = 10_000

# Fit the word index on the training texts only.
tokenizer_1 = Tokenizer(num_words=max_words_1)
tokenizer_1.fit_on_texts(texts=X_train)

In [15]:
# Convert both splits to integer sequences with the tokenizer fitted on the training texts.
X_train_seq_1, X_test_seq_1 = (
    tokenizer_1.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [16]:
# Longest tokenized training example; used as the common padded length.
max_sequence_length_1 = max(map(len, X_train_seq_1))

In [17]:
# Pad both splits to the same length (derived from the training sequences).
X_train_pad_1, X_test_pad_1 = (
    pad_sequences(sequences, maxlen=max_sequence_length_1)
    for sequences in (X_train_seq_1, X_test_seq_1)
)

In [18]:
def create_model(max_sequence_length, embedding_dim, max_words, lstm_units, num_layers=1, num_classes=5):
    """Build and compile a stacked-LSTM classifier.

    Architecture: Embedding -> ``num_layers`` LSTM layers -> softmax Dense.
    Every LSTM except the last returns its full sequence so the next LSTM
    can consume it; the last one returns only its final output.

    Args:
        max_sequence_length: Length of the (padded) input sequences.
        embedding_dim: Size of each embedding vector.
        max_words: Vocabulary size, used as the Embedding input dimension.
        lstm_units: Number of units in every LSTM layer.
        num_layers: Total number of LSTM layers; must be at least 1.
        num_classes: Width of the softmax output layer. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        A Sequential model compiled with sparse categorical cross-entropy,
        the 'adam' optimizer and an accuracy metric.

    Raises:
        ValueError: If ``num_layers`` or ``num_classes`` is smaller than 1.
    """
    # Previously num_layers <= 0 silently produced a one-layer model.
    if num_layers < 1:
        raise ValueError(f"num_layers must be >= 1, got {num_layers}")
    if num_classes < 1:
        raise ValueError(f"num_classes must be >= 1, got {num_classes}")

    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_sequence_length))
    for layer_index in range(num_layers):
        is_last_lstm = layer_index == num_layers - 1
        # Intermediate LSTMs must emit the whole sequence for the next LSTM.
        model.add(LSTM(lstm_units, return_sequences=not is_last_lstm))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Hyperparameters for the single-LSTM run on the 10k-vocabulary inputs.
batch_size_2, embedding_dim_2, lstm_units_2 = 4, 10, 8

model_2 = create_model(
    max_sequence_length_1,
    embedding_dim_2,
    max_words_1,
    lstm_units_2,
    num_layers=1,
)

# NOTE(review): create_model already compiles the network with optimizer='adam';
# this second compile swaps in an explicitly configured Adam instance.
optimizer = Adam(learning_rate=0.001)
model_2.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model_2.fit(X_train_pad_1, y_train, batch_size=batch_size_2, epochs=10)

In [None]:
# Predicted class for each test row = index of the largest output score.
y_pred = np.argmax(model_2.predict(X_test_pad_1), axis=1)

# Per-class metrics, then the raw confusion matrix.
print(classification_report(y_true=y_test, y_pred=y_pred))
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# Draw the confusion matrix as an annotated heatmap with integer cell labels.
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues').set(
    xlabel='Predicted',
    ylabel='Actual',
)
plt.show()

In [6]:
# Larger vocabulary cap for the second experiment.
max_words_2 = 25_000

# Fit a separate tokenizer on the training texts only.
tokenizer_2 = Tokenizer(num_words=max_words_2)
tokenizer_2.fit_on_texts(texts=X_train)

In [7]:
# Convert both splits to integer sequences with the second tokenizer.
X_train_seq_2, X_test_seq_2 = (
    tokenizer_2.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [8]:
# Longest tokenized training example under the second tokenizer.
max_sequence_length_2 = max(map(len, X_train_seq_2))

In [9]:
# Pad both splits to the training-derived maximum length.
X_train_pad_2, X_test_pad_2 = (
    pad_sequences(sequences, maxlen=max_sequence_length_2)
    for sequences in (X_train_seq_2, X_test_seq_2)
)

In [10]:
def create_double_lstm_model(max_sequence_length, embedding_dim, max_words, lstm_units, dropout_rate=0.3, num_classes=5):
    """Build and compile a two-layer LSTM classifier with dropout.

    Architecture: Embedding -> LSTM (full sequence) -> Dropout ->
    LSTM (final output only) -> Dropout -> softmax Dense.

    Args:
        max_sequence_length: Length of the (padded) input sequences.
        embedding_dim: Size of each embedding vector.
        max_words: Vocabulary size, used as the Embedding input dimension.
        lstm_units: Number of units in each of the two LSTM layers.
        dropout_rate: Rate passed to both Dropout layers.
        num_classes: Width of the softmax output layer. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        A Sequential model compiled with sparse categorical cross-entropy,
        the 'adam' optimizer and an accuracy metric.

    Raises:
        ValueError: If ``num_classes`` is smaller than 1.
    """
    if num_classes < 1:
        raise ValueError(f"num_classes must be >= 1, got {num_classes}")

    layers = [
        Embedding(max_words, embedding_dim, input_length=max_sequence_length),
        # The first LSTM must emit the whole sequence for the second LSTM.
        LSTM(lstm_units, return_sequences=True),
        Dropout(dropout_rate),
        LSTM(lstm_units, return_sequences=False),
        Dropout(dropout_rate),
        Dense(num_classes, activation='softmax'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Hyperparameters for the two-layer LSTM run on the 25k-vocabulary inputs.
# NOTE(review): these *_2 names are also assigned (with different values) in the
# model_2 cell; rebinding them here overwrites those earlier values.
embedding_dim_2, lstm_units_2, batch_size_2 = 30, 16, 8

double_model_1 = create_double_lstm_model(
    max_sequence_length_2,
    embedding_dim_2,
    max_words_2,
    lstm_units_2,
    dropout_rate=0.3,
)

double_model_1.fit(X_train_pad_2, y_train, batch_size=batch_size_2, epochs=10)

In [None]:
# Predicted class for each test row = index of the largest output score.
y_pred = np.argmax(double_model_1.predict(X_test_pad_2), axis=1)

# Per-class metrics, then the raw confusion matrix.
print(classification_report(y_true=y_test, y_pred=y_pred))
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# Draw the most recently computed confusion matrix as an annotated heatmap.
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues').set(
    xlabel='Predicted',
    ylabel='Actual',
)
plt.show()