In [2]:
# Import the huggingface dataset
from datasets import load_dataset

import pandas as pd
import numpy as np

# Deep learning (PyTorch)
import torch
from torch.utils.data import Dataset

# Statistics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from scipy import stats

# For text processing
import re
import string

# For sentiment analysis
from textblob import TextBlob

# For BERT
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Select the compute device: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU. The chosen device is echoed for the reader.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)


cuda


In [4]:
# General dataset info
# Load the training and validation splits from CSV and print a quick overview
# of the training split (first rows, shape, dtypes / null counts, column names).
train_dl = pd.read_csv('balanced_data_files/train_dl.csv')
val_dl = pd.read_csv('balanced_data_files/val_dl.csv')

print(train_dl.head())
print(train_dl.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
train_dl.info()
# NOTE(review): the recorded run reports fewer non-null 'Email Text' values
# than rows, i.e. some emails have no text. Make sure those rows are handled
# before the text is tokenized.
print(train_dl.columns)

                                          Email Text  label
0  re : rolex order details - m 3945 s rolex : $ ...      1
1  aisb96 call for workshop proposals - - - - - -...      0
2  query : not not dear linguists , i am working ...      0
3  Update of /cvsroot/spamassassin/spamassassin/m...      0
4  URL: http://boingboing.net/#85482211\nDate: No...      0
(13054, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13054 entries, 0 to 13053
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Email Text  13041 non-null  object
 1   label       13054 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 204.1+ KB
None
Index(['Email Text', 'label'], dtype='object')


In [5]:
# Step 1: Pre-process the text
def preprocess_text(texts, max_len=100, max_words=10000, tokenizer=None):
    """
    Tokenize and pad text data.

    Parameters
    ----------
    texts : iterable
        Raw documents. Entries that are ``None`` or float NaN (how pandas
        represents an empty CSV cell in an object column) are replaced with
        an empty string; every other entry is passed through unchanged.
    max_len : int, default 100
        Passed to ``pad_sequences`` as ``maxlen``.
    max_words : int, default 10000
        Passed to ``Tokenizer`` as ``num_words`` when a new tokenizer is
        created. Ignored when ``tokenizer`` is supplied.
    tokenizer : optional
        An already-fitted tokenizer to reuse, e.g. the one returned for the
        training split so that validation text is encoded with the same
        vocabulary. When ``None`` (the default) a new tokenizer is created
        and fitted on ``texts``, exactly as before.

    Returns
    -------
    tuple
        ``(padded_sequences, word_index, tokenizer)``.
    """
    # NOTE(review): Tokenizer and pad_sequences are not imported in the visible
    # import cell (presumably the Keras text-preprocessing utilities) - confirm
    # they are in scope before this function is called.

    # Missing emails arrive as None / float NaN; a tokenizer that expects
    # strings would presumably fail on them, so map them to an empty document.
    # `value != value` is only true for NaN. Materialising a list also lets
    # `texts` be a one-shot iterable, since it is consumed twice below.
    cleaned_texts = [
        "" if text is None or (isinstance(text, float) and text != text) else text
        for text in texts
    ]

    if tokenizer is None:
        # No tokenizer supplied: build the vocabulary from these texts.
        tokenizer = Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(cleaned_texts)

    sequences = tokenizer.texts_to_sequences(cleaned_texts)
    word_index = tokenizer.word_index
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

    # NOTE(review): len(word_index) is presumably the number of distinct words
    # seen while fitting, not the `max_words` cap - confirm before using this
    # figure to size an embedding layer.
    print(f"Vocabulary size: {len(word_index)}")
    print(f"Example padded sequence shape: {padded_sequences.shape}")

    return padded_sequences, word_index, tokenizer

In [6]:
# Step 2: Building ANN
def build_ann_model(max_len, vocab_size, embedding_dim=100):
    """
    Assemble and compile a small feed-forward text classifier.

    Layer order: Embedding -> Flatten -> Dense(256, relu) -> Dropout(0.5)
    -> Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) -> Dense(1, sigmoid).
    The model is compiled with the 'adam' optimizer, 'binary_crossentropy'
    loss and the 'accuracy' metric.

    Parameters
    ----------
    max_len : int
        Passed to the embedding layer as ``input_length``.
    vocab_size : int
        The embedding layer's ``input_dim`` is ``vocab_size + 1``
        (presumably to leave index 0 free for padding - confirm with caller).
    embedding_dim : int, default 100
        Passed to the embedding layer as ``output_dim``.

    Returns
    -------
    The compiled Sequential model.
    """
    # (units, dropout rate applied right after the layer); None = no dropout.
    hidden_spec = ((256, 0.5), (128, 0.3), (64, None))

    network = Sequential()

    # Token ids -> dense vectors, then flattened into one feature vector.
    network.add(
        Embedding(
            input_dim=vocab_size + 1,
            output_dim=embedding_dim,
            input_length=max_len,
        )
    )
    network.add(Flatten())

    # Fully connected hidden stack.
    for units, drop_rate in hidden_spec:
        network.add(Dense(units, activation='relu'))
        if drop_rate is not None:
            network.add(Dropout(drop_rate))

    # Single sigmoid unit for the binary label.
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network


In [9]:
# Step 3: Train the model
def train_model(X_train, y_train, X_val, y_val, model, batch_size=32, epochs=1):
    """
    Fit ``model`` on the training data while evaluating on the validation data.

    Parameters
    ----------
    X_train, y_train
        Training inputs and targets, forwarded positionally to ``model.fit``.
    X_val, y_val
        Validation inputs and targets, forwarded as ``validation_data``.
    model
        Object exposing a ``fit`` method; it is returned after the call.
    batch_size : int, default 32
        Forwarded to ``model.fit``.
    epochs : int, default 1
        Forwarded to ``model.fit``.

    Returns
    -------
    tuple
        ``(history, model)`` where ``history`` is whatever ``model.fit`` returned.
    """
    fit_options = {
        "validation_data": (X_val, y_val),
        "batch_size": batch_size,
        "epochs": epochs,
        "verbose": 1,  # always request progress output
    }
    training_history = model.fit(X_train, y_train, **fit_options)
    return training_history, model