# Scam classification using a CNN
https://www.geeksforgeeks.org/text-classification-using-cnn/

In [None]:
# importing the necessary libraries
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Setting up the parameters
# maximum_features = 5000  # Maximum number of words to consider as features
# maximum_length = 100  # Maximum length of input sequences
# word_embedding_dims = 50  # Dimension of word embeddings
no_of_filters = 52  # Number of filters in the convolutional layer
kernel_size = 5  # Size of the convolutional filters
hidden_dims = 128  # Number of neurons in the hidden layer
batch_size = 32  # Batch size for training
epochs = 4  # Number of training epochs
threshold = 0.5  # Threshold for binary classification

# 1. Set global seeds
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)


# First dataset
train_df = pd.read_csv("processed_train.csv")
test_df = pd.read_csv("processed_test.csv")

# Second dataset
train_df1 = pd.read_csv("processed_train1.csv")
test_df1 = pd.read_csv("processed_test1.csv")

# Combined dataset
train_df2 = pd.read_csv("combined_processed_train.csv")
test_df2 = pd.read_csv("combined_processed_test.csv")

# Eval dataset
test_df3 = pd.read_csv("scam_dataset_eval_processed.csv")

In [None]:
from transformers import BertTokenizer

# Load Pretrained BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text
train_tokens = tokenizer(list(train_df['text']), padding="max_length", truncation=True, max_length=100, return_tensors="tf")
test_tokens = tokenizer(list(test_df['text']), padding="max_length", truncation=True, max_length=100, return_tensors="tf")

train_tokens1 = tokenizer(list(train_df1['text']), padding="max_length", truncation=True, max_length=100, return_tensors="tf")
test_tokens1 = tokenizer(list(test_df1['text']), padding="max_length", truncation=True, max_length=100, return_tensors="tf")

train_tokens2 = tokenizer(list(train_df2['text']), padding="max_length", truncation=True, max_length=100, return_tensors="tf")
test_tokens2 = tokenizer(list(test_df2['text']), padding="max_length", truncation=True, max_length=100, return_tensors="tf")

test_tokens3 = tokenizer(list(test_df3['text']), padding="max_length", truncation=True, max_length=100, return_tensors="tf")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
vocab_size = tokenizer.vocab_size

# Building the model
model = Sequential()

# Adding the embedding layer to convert input sequences to dense vectors
# model.add(Embedding(maximum_features, word_embedding_dims,
#                     input_length=maximum_length))
model.add(Embedding(vocab_size, 128))

# Adding the 1D convolutional layer with ReLU activation
model.add(Conv1D(no_of_filters, kernel_size, padding='valid',
                 activation='relu', strides=1))

# Adding the global max pooling layer to reduce dimensionality
model.add(GlobalMaxPooling1D())

# Adding the dense hidden layer with ReLU activation
model.add(Dense(hidden_dims, activation='relu'))

# Adding the output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compiling the model with binary cross-entropy loss and Adam optimizer
model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
                  metrics=['accuracy'])

# Training the model
# model.fit(x_train, y_train, batch_size=batch_size,
#           epochs=epochs, validation_data=(x_test, y_test))

X_train = train_tokens['input_ids']
y_train = train_df['label']
X_val = test_tokens['input_ids']
y_val = test_df['label']

X_train1 = train_tokens1['input_ids']
y_train1 = train_df1['label']
X_val1 = test_tokens1['input_ids']
y_val1 = test_df1['label']

X_train2 = train_tokens2['input_ids']
y_train2 = train_df2['label']
X_val2 = test_tokens2['input_ids']
y_val2 = test_df2['label']

X_val3 = test_tokens3['input_ids']
y_val3 = test_df3['label']

model.fit(X_train1, y_train1, epochs=3, validation_data=(X_val1, y_val1), verbose=2)

Epoch 1/3
100/100 - 13s - 127ms/step - accuracy: 0.8759 - loss: 8.9026 - val_accuracy: 0.9862 - val_loss: 1.2612
Epoch 2/3
100/100 - 11s - 109ms/step - accuracy: 0.8888 - loss: 2.1072 - val_accuracy: 0.9563 - val_loss: 0.1387
Epoch 3/3
100/100 - 11s - 105ms/step - accuracy: 0.9650 - loss: 0.1129 - val_accuracy: 0.9287 - val_loss: 0.1562


<keras.src.callbacks.history.History at 0x7f5a10991c50>

In [None]:
# Predicting the probabilities for test data
y_pred_prob = model.predict(X_val1)

# Converting the probabilities to binary classes based on threshold
y_pred = (y_pred_prob > threshold).astype(int)

# Calculating the evaluation metrics
y_test = y_val1
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Printing the evaluation metrics
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

# alternative way of calcualting accuracy
loss, accuracy = model.evaluate(test_tokens['input_ids'], test_df['label'], verbose=0)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Accuracy: 0.92875
Precision: 0.87527352297593
Recall: 1.0
F1-score: 0.9334889148191365


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading mako-1.3.10-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

In [None]:
import optuna
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

def objective(trial):
    embedding_dim = trial.suggest_categorical("embedding_dim", [32, 64, 128])
    no_of_filters = trial.suggest_int("no_of_filters", 32, 128)
    kernel_size = trial.suggest_int("kernel_size", 3, 7)
    hidden_dims = trial.suggest_categorical("hidden_dims", [32, 64, 128])
    learning_rate = trial.suggest_categorical("lr", [1e-4, 1e-3, 1e-2])
    epochs = trial.suggest_int("epochs", 1, 5)

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim))
    model.add(Conv1D(no_of_filters, kernel_size, padding='valid', activation='relu', strides=1))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(hidden_dims, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  metrics=['accuracy'])

    model.fit(train_tokens['input_ids'], train_df['label'], epochs=epochs,
              validation_data=(test_tokens['input_ids'], test_df['label']),
              verbose=0)

    y_pred_prob = model.predict(test_tokens['input_ids'])
    y_pred = (y_pred_prob > 0.5).astype(int)
    y_test = test_df['label']

    f1 = f1_score(y_test, y_pred)
    return f1

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

print("Best parameters found:")
print(study.best_trial.params)

[I 2025-04-23 04:33:56,540] A new study created in memory with name: no-name-37967efb-e681-4e79-b340-b9e15b9fc098


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


[I 2025-04-23 04:34:11,083] Trial 0 finished with value: 0.8652482269503546 and parameters: {'embedding_dim': 128, 'no_of_filters': 54, 'kernel_size': 3, 'hidden_dims': 32, 'lr': 0.001, 'epochs': 1}. Best is trial 0 with value: 0.8652482269503546.


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 


[I 2025-04-23 04:34:13,965] Trial 1 finished with value: 0.6971677559912854 and parameters: {'embedding_dim': 32, 'no_of_filters': 78, 'kernel_size': 6, 'hidden_dims': 32, 'lr': 0.0001, 'epochs': 1}. Best is trial 0 with value: 0.8652482269503546.


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


[I 2025-04-23 04:34:17,399] Trial 2 finished with value: 0.8851351351351351 and parameters: {'embedding_dim': 32, 'no_of_filters': 123, 'kernel_size': 6, 'hidden_dims': 64, 'lr': 0.0001, 'epochs': 1}. Best is trial 2 with value: 0.8851351351351351.


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


[I 2025-04-23 04:34:22,058] Trial 3 finished with value: 0.7644787644787645 and parameters: {'embedding_dim': 32, 'no_of_filters': 64, 'kernel_size': 4, 'hidden_dims': 128, 'lr': 0.0001, 'epochs': 2}. Best is trial 2 with value: 0.8851351351351351.


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 


[I 2025-04-23 04:34:25,164] Trial 4 finished with value: 0.9876543209876543 and parameters: {'embedding_dim': 32, 'no_of_filters': 127, 'kernel_size': 6, 'hidden_dims': 128, 'lr': 0.01, 'epochs': 1}. Best is trial 4 with value: 0.9876543209876543.


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step  


[I 2025-04-23 04:34:32,854] Trial 5 finished with value: 0.6938775510204082 and parameters: {'embedding_dim': 32, 'no_of_filters': 44, 'kernel_size': 4, 'hidden_dims': 128, 'lr': 0.0001, 'epochs': 5}. Best is trial 4 with value: 0.9876543209876543.


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


[I 2025-04-23 04:34:38,893] Trial 6 finished with value: 0.9815950920245399 and parameters: {'embedding_dim': 64, 'no_of_filters': 90, 'kernel_size': 5, 'hidden_dims': 128, 'lr': 0.01, 'epochs': 2}. Best is trial 4 with value: 0.9876543209876543.


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


[I 2025-04-23 04:34:47,125] Trial 7 finished with value: 0.9781931464174455 and parameters: {'embedding_dim': 128, 'no_of_filters': 38, 'kernel_size': 7, 'hidden_dims': 128, 'lr': 0.0001, 'epochs': 2}. Best is trial 4 with value: 0.9876543209876543.


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


[I 2025-04-23 04:34:54,266] Trial 8 finished with value: 0.9368770764119602 and parameters: {'embedding_dim': 64, 'no_of_filters': 128, 'kernel_size': 4, 'hidden_dims': 32, 'lr': 0.01, 'epochs': 2}. Best is trial 4 with value: 0.9876543209876543.


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


[I 2025-04-23 04:35:07,457] Trial 9 finished with value: 0.9905362776025236 and parameters: {'embedding_dim': 128, 'no_of_filters': 52, 'kernel_size': 5, 'hidden_dims': 128, 'lr': 0.01, 'epochs': 4}. Best is trial 9 with value: 0.9905362776025236.


Best parameters found:
{'embedding_dim': 128, 'no_of_filters': 52, 'kernel_size': 5, 'hidden_dims': 128, 'lr': 0.01, 'epochs': 4}
