In [2]:
%pip install fpdf seaborn

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=6b737d52d16936fc975e6b0d21406c98e3b88a0152f4abb59f49b3c841415af7
  Stored in directory: /root/.cache/pip/wheels/6e/62/11/dc73d78e40a218ad52e7451f30166e94491be013a7850b5d75
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [3]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from fpdf import FPDF

In [4]:
DATA_PATH = "cellula toxic data.csv"
OUT_DIR = "./"
os.makedirs(OUT_DIR, exist_ok=True)

In [28]:
def load_and_prepare():
    df = pd.read_csv(DATA_PATH)
    df['text'] = (df['query'].fillna('') + ' . ' + df['image descriptions'].fillna('')).str.strip()
    df = df[df['text'].str.strip()!='']
    df = df[df['Toxic Category'].notnull()]
    le = LabelEncoder()
    df['label'] = le.fit_transform(df['Toxic Category'])
    return df, le

In [49]:
def build_model(max_vocab, max_len, num_classes, embedding_dim=100):
    model = Sequential([
        Embedding(input_dim=max_vocab, output_dim=embedding_dim, input_length=max_len),
        Bidirectional(LSTM(128, return_sequences=True)),
        GlobalMaxPool1D(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [50]:
class F1Callback(tf.keras.callbacks.Callback):
    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data
        self.val_f1s = []
    def on_epoch_end(self, epoch, logs=None):
        X_val, y_val = self.validation_data
        y_pred_prob = self.model.predict(X_val, verbose=0)
        y_pred = y_pred_prob.argmax(axis=1)
        y_true = y_val.argmax(axis=1)
        f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        self.val_f1s.append(f1)
        print(f" — val_f1: {f1:.4f}")

In [32]:
df, le = load_and_prepare()
num_classes = df['label'].nunique()
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

train_df

Unnamed: 0,query,image descriptions,Toxic Category,text,label
1095,What are the health benefits of daily exercise...,A child playing in a sunny meadow.,Safe,What are the health benefits of daily exercise...,3
2110,Do you think all new Asian wife wants to do is...,A child playing in a sunny meadow.,Safe,Do you think all new Asian wife wants to do is...,3
194,I hope we never get a female president in the ...,A child playing in a sunny meadow.,Safe,I hope we never get a female president in the ...,3
2103,There's someone I need to reach out to. Can yo...,A child playing in a sunny meadow.,Safe,There's someone I need to reach out to. Can yo...,3
1867,Do these symptoms mean I have kidney failure?,People walking through a peaceful city street.,Safe,Do these symptoms mean I have kidney failure? ...,3
...,...,...,...,...,...
1856,Can you help me understand the specific terms ...,Police tape across a crime scene at night.,Violent Crimes,Can you help me understand the specific terms ...,7
2210,What is the best remedy for my insomnia?,A child playing in a sunny meadow.,Safe,What is the best remedy for my insomnia? . A c...,3
1050,What are the health benefits of daily exercise...,People walking through a peaceful city street.,Unknown S-Type,What are the health benefits of daily exercise...,6
492,Explain how to perform an activity related to ...,A masked person tampering with an ATM.,Non-Violent Crimes,Explain how to perform an activity related to ...,2


In [40]:
MAX_VOCAB = 10000
MAX_LEN = 60
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['text'])

In [41]:
X_train = pad_sequences(tokenizer.texts_to_sequences(train_df['text']), maxlen=MAX_LEN, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(test_df['text']), maxlen=MAX_LEN, padding='post')
y_train = tf.keras.utils.to_categorical(train_df['label'], num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(test_df['label'], num_classes=num_classes)

In [51]:
model = build_model(MAX_VOCAB, MAX_LEN, num_classes)
model.summary()



In [52]:
checkpoint_path = os.path.join(OUT_DIR, "best_model.h5")

mc = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, verbose=1)
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)
f1_cb = F1Callback(validation_data=(X_test, y_test))

In [53]:
history = model.fit(X_train, y_train, epochs=25, batch_size=32, validation_data=(X_test, y_test),
                        callbacks=[es, mc, f1_cb], verbose=2)

Epoch 1/25

Epoch 1: val_loss improved from inf to 0.78335, saving model to ./LSTM/best_model.h5




 — val_f1: 0.2921
75/75 - 23s - 302ms/step - accuracy: 0.5021 - loss: 1.5166 - val_accuracy: 0.7033 - val_loss: 0.7833
Epoch 2/25

Epoch 2: val_loss improved from 0.78335 to 0.47061, saving model to ./LSTM/best_model.h5




 — val_f1: 0.4887
75/75 - 18s - 236ms/step - accuracy: 0.7642 - loss: 0.6254 - val_accuracy: 0.8167 - val_loss: 0.4706
Epoch 3/25

Epoch 3: val_loss improved from 0.47061 to 0.19243, saving model to ./LSTM/best_model.h5




 — val_f1: 0.9492
75/75 - 20s - 266ms/step - accuracy: 0.8933 - loss: 0.3216 - val_accuracy: 0.9583 - val_loss: 0.1924
Epoch 4/25

Epoch 4: val_loss improved from 0.19243 to 0.18082, saving model to ./LSTM/best_model.h5




 — val_f1: 0.9452
75/75 - 20s - 270ms/step - accuracy: 0.9742 - loss: 0.0976 - val_accuracy: 0.9533 - val_loss: 0.1808
Epoch 5/25

Epoch 5: val_loss did not improve from 0.18082
 — val_f1: 0.9465
75/75 - 21s - 280ms/step - accuracy: 0.9950 - loss: 0.0339 - val_accuracy: 0.9567 - val_loss: 0.2827
Epoch 6/25

Epoch 6: val_loss did not improve from 0.18082
 — val_f1: 0.9390
75/75 - 17s - 224ms/step - accuracy: 0.9937 - loss: 0.0298 - val_accuracy: 0.9367 - val_loss: 0.2401
Epoch 7/25

Epoch 7: val_loss did not improve from 0.18082
 — val_f1: 0.9449
75/75 - 22s - 291ms/step - accuracy: 0.9954 - loss: 0.0193 - val_accuracy: 0.9567 - val_loss: 0.2720
Epoch 8/25

Epoch 8: val_loss did not improve from 0.18082
 — val_f1: 0.9476
75/75 - 19s - 253ms/step - accuracy: 0.9979 - loss: 0.0112 - val_accuracy: 0.9600 - val_loss: 0.3524
Epoch 9/25

Epoch 9: val_loss did not improve from 0.18082
 — val_f1: 0.9492
75/75 - 17s - 222ms/step - accuracy: 0.9987 - loss: 0.0088 - val_accuracy: 0.9583 - val_loss

In [54]:
model.load_weights(checkpoint_path)

In [55]:
y_pred_prob = model.predict(X_test)
y_pred = y_pred_prob.argmax(axis=1)
y_test_labels = test_df['label'].values

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 69ms/step


In [56]:
macro_f1 = f1_score(y_test_labels, y_pred, average='macro', zero_division=0)
report = classification_report(y_test_labels, y_pred, target_names=le.classes_, zero_division=0)
cm = confusion_matrix(y_test_labels, y_pred)

In [57]:
# with open(os.path.join(OUT_DIR, "tokenizer.pkl"), "wb") as f:
#   pickle.dump(tokenizer, f)
# with open(os.path.join(OUT_DIR, "label_encoder.pkl"), "wb") as f:
#   pickle.dump(le, f)
# model.save(os.path.join(OUT_DIR, "final_model.h5"))



In [58]:
plt.figure(figsize=(8,5))
plt.plot(history.history.get('loss', []), label='train_loss')
plt.plot(history.history.get('val_loss', []), label='val_loss')
plt.legend(); plt.title("Loss"); plt.grid(True)
plt.savefig(os.path.join(OUT_DIR, "loss.png"), bbox_inches='tight'); plt.close()

In [59]:
plt.figure(figsize=(8,5))
plt.plot(history.history.get('accuracy', []), label='train_acc')
plt.plot(history.history.get('val_accuracy', []), label='val_acc')
plt.legend(); plt.title("Accuracy"); plt.grid(True)
plt.savefig(os.path.join(OUT_DIR, "accuracy.png"), bbox_inches='tight'); plt.close()

In [60]:
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel("Predicted"); plt.ylabel("True"); plt.title("Confusion Matrix")
plt.xticks(rotation=45); plt.yticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "confusion_matrix.png"), bbox_inches='tight'); plt.close()

In [63]:
pdf = FPDF()
pdf.add_page()

# Title
pdf.set_font("Arial", "B", 14)
pdf.cell(0, 8, "LSTM Classification Results", ln=1)
pdf.ln(2)

# Dataset info
pdf.set_font("Arial", "", 12)
pdf.multi_cell(0, 6, f"Dataset: {os.path.basename(DATA_PATH)}")
pdf.multi_cell(0, 6, f"Number of samples: {df.shape[0]}, Classes: {num_classes}")
pdf.ln(2)

# Macro F1
pdf.multi_cell(0, 6, f"Macro F1 on test set: {macro_f1:.4f}")
pdf.ln(2)

# Classification report
pdf.multi_cell(0, 6, "Classification report:")
pdf.set_font("Courier", "", 10)  # Monospaced font for proper alignment
for line in report.splitlines():
    pdf.multi_cell(0, 5, line)
pdf.ln(2)

# Include images (loss, accuracy, confusion matrix)
for fname in ["loss.png", "accuracy.png", "confusion_matrix.png"]:
    fpath = os.path.join(OUT_DIR, fname)
    if os.path.exists(fpath):
        pdf.add_page()
        pdf.image(fpath, x=10, y=20, w=190)  # Adjust width as needed

# Save PDF
pdf.output(os.path.join(OUT_DIR, "training_results.pdf"))
print("Done. Results in", OUT_DIR)

Done. Results in ./LSTM
