In [19]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import keras
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU
from tensorflow.keras.optimizers import Adam

In [7]:
# data = pd.read_csv("/home/pavithra/projects/spam_or_ham_fsec/data/source/train_data.csv", header=None)
# data.head()
# target = pd.read_csv("/home/pavithra/projects/spam_or_ham_fsec/data/source/train_labels.csv", header=None)
# target.head()

# # Simple preprocessing without resampling: stratified split, scaling, PCA, then label remapping.
# X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1, random_state=42, stratify=target)


# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# n_components = 2000
# pca = PCA(n_components=n_components)

# X_train_pca = pca.fit_transform(X_train_scaled)
# X_test_pca = pca.transform(X_test_scaled)


In [8]:
# y_train.replace(1, 0, inplace=True)
# y_train.replace(-1, 1, inplace=True)


# y_test.replace(1, 0, inplace=True)
# y_test.replace(-1, 1, inplace=True)

# np.savetxt("data/x_train.csv", X_train_pca, delimiter=',')
# np.savetxt("data/x_test.csv", X_test_pca, delimiter=',')
# np.savetxt("data/y_train.csv", y_train, delimiter=',', fmt='%d')
# np.savetxt("data/y_test.csv", y_test, delimiter=',', fmt='%d')

In [3]:
# Reload the four pre-processed CSV splits (none of them has a header row).
X_train = pd.read_csv('data/x_train.csv', header=None)
X_test = pd.read_csv('data/x_test.csv', header=None)
y_train = pd.read_csv('data/y_train.csv', header=None)
y_test = pd.read_csv('data/y_test.csv', header=None)

# Report each split's (rows, columns) as a quick sanity check.
for split_label, split_frame in (
    ("X train ", X_train),
    ("X test", X_test),
    ("Y_Train", y_train),
    ("Y_test", y_test),
):
    print(split_label, split_frame.shape)


X train  (3375, 2000)
X test (375, 2000)
Y_Train (3375, 1)
Y_test (375, 1)


In [4]:
# Check the minority class: fraction of training labels that fall in each class.
y_train.value_counts() / y_train.shape[0]

0
0    0.900148
1    0.099852
Name: count, dtype: float64

In [6]:
# Append the labels as the last column, then separate the rows into the
# majority class (label 0) and the minority class (label 1).
merged_data = pd.concat([X_train, y_train], axis=1)
label_column = merged_data.iloc[:, -1]  # positional: the label is always the last column
not_spam_data = merged_data.loc[label_column == 0]
spam_data = merged_data.loc[label_column == 1]

In [7]:
# Sanity check: the majority split should contain only label 0.
# The label is selected by position because, after the header-less reads and the
# concat, it shares the column name 0 with the first feature column.
not_spam_data.iloc[:, -1].value_counts()

0
0    3038
Name: count, dtype: int64

In [8]:
# Sanity check: the minority split should contain only label 1.
# The label is selected by position because, after the header-less reads and the
# concat, it shares the column name 0 with the first feature column.
spam_data.iloc[:, -1].value_counts()

0
1    337
Name: count, dtype: int64

In [10]:
# Keep only the feature columns of the minority class (drop the trailing label).
# The frame is rebuilt from merged_data rather than sliced from spam_data itself,
# so re-running this cell is idempotent and can never strip a real feature column.
spam_data = merged_data[merged_data.iloc[:, -1] == 1].iloc[:, :-1]
spam_data.shape

(337, 2000)

In [None]:


# The GAN only ever sees the minority-class feature rows; the frame is laid out
# as (num_samples, embedding_dim), so the column count is the embedding width.
minority_class_embeddings = spam_data
embedding_dim = spam_data.shape[1]

# Define the Generator
def build_generator(noise_dim, embedding_dim):
    """Build the generator network that maps noise vectors to synthetic embeddings.

    Args:
        noise_dim: Length of each input noise vector.
        embedding_dim: Length of each generated embedding (width of the output layer).

    Returns:
        An uncompiled ``Sequential`` model:
        Dense(256) -> LeakyReLU -> Dense(512) -> LeakyReLU -> Dense(embedding_dim).
    """
    model = Sequential([
        # Declare the input shape with an explicit Input layer instead of the
        # `input_dim=` argument on the first Dense layer; this form is accepted
        # by both Keras 2 and Keras 3 (which warns on `input_dim`).
        tf.keras.Input(shape=(noise_dim,)),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(512),
        LeakyReLU(alpha=0.2),
        # Linear output: generated values are not squashed into a fixed range.
        Dense(embedding_dim, activation='linear'),
    ])
    return model

# Define the Discriminator
def build_discriminator(embedding_dim):
    """Build the discriminator network that scores an embedding as real or fake.

    Args:
        embedding_dim: Length of each input embedding.

    Returns:
        An uncompiled ``Sequential`` model:
        Dense(512) -> LeakyReLU -> Dense(256) -> LeakyReLU -> Dense(1, sigmoid).
        The single sigmoid output is a probability-like score in [0, 1].
    """
    model = Sequential([
        # Declare the input shape with an explicit Input layer instead of the
        # `input_dim=` argument on the first Dense layer; this form is accepted
        # by both Keras 2 and Keras 3 (which warns on `input_dim`).
        tf.keras.Input(shape=(embedding_dim,)),
        Dense(512),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid'),
    ])
    return model

# Hyperparameters
SEED = 42  # fixed seed so weight init, batch sampling and noise are repeatable
noise_dim = 2000
batch_size = 64
epochs = 10
learning_rate = 0.0002

# Seed NumPy and TensorFlow before any model is built, so that weight
# initialisation and the later np.random draws are reproducible run to run.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Build and compile the discriminator.
# The loss is given by its string identifier so it is resolved by the same Keras
# implementation as the model (tf.keras), rather than by the standalone `keras`
# package, which may be a different version.
discriminator = build_discriminator(embedding_dim)
discriminator.compile(
    loss="binary_crossentropy",
    optimizer=Adam(learning_rate=learning_rate),
    metrics=['accuracy'],
)

# Build the generator (trained only through the stacked GAN model below).
generator = build_generator(noise_dim, embedding_dim)

# Build and compile the GAN: noise -> generator -> discriminator -> real/fake score.
discriminator.trainable = False  # Freeze the discriminator during generator training
gan_input = tf.keras.Input(shape=(noise_dim,))
gan_output = discriminator(generator(gan_input))
gan = tf.keras.Model(gan_input, gan_output)
gan.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=learning_rate))

# Training the GAN
# Convert the real samples to a float32 array once, instead of slicing the
# DataFrame and re-converting it on every step.
real_pool = np.asarray(minority_class_embeddings, dtype=np.float32)

# Run at least one step per epoch so d_loss / g_loss are always defined when
# logging, even if there are fewer real samples than batch_size.
steps_per_epoch = max(1, len(real_pool) // batch_size)
log_every = 10  # print progress every `log_every` epochs

for epoch in range(epochs):
    for step in range(steps_per_epoch):
        # ---- Train Discriminator ----
        # Sample a batch of real rows (with replacement).
        idx = np.random.randint(0, real_pool.shape[0], batch_size)
        real_embeddings = real_pool[idx]
        real_labels = np.ones((batch_size, 1))  # Label 1 for real data

        noise = np.random.normal(0, 1, (batch_size, noise_dim))
        # verbose=0: avoid printing a progress bar on every training step.
        fake_embeddings = generator.predict(noise, verbose=0)
        fake_labels = np.zeros((batch_size, 1))  # Label 0 for fake data

        # Combine real and fake data
        combined_embeddings = np.vstack([real_embeddings, fake_embeddings])
        combined_labels = np.vstack([real_labels, fake_labels])

        # Set the flag explicitly for each step: Keras versions that do not
        # snapshot `trainable` at compile time would otherwise keep the
        # discriminator frozen here as well. On versions that do snapshot it,
        # these assignments are harmless.
        discriminator.trainable = True
        d_loss = discriminator.train_on_batch(combined_embeddings, combined_labels)

        # ---- Train Generator (through the frozen discriminator) ----
        discriminator.trainable = False
        noise = np.random.normal(0, 1, (batch_size, noise_dim))
        valid_labels = np.ones((batch_size, 1))  # Label 1 for fooling the discriminator
        g_loss = gan.train_on_batch(noise, valid_labels)

    # Logging (reports the losses of the last step of the epoch)
    if (epoch + 1) % log_every == 0:
        print(f"Epoch {epoch + 1}/{epochs} | D Loss: {d_loss[0]:.4f}, Acc: {d_loss[1]*100:.2f}% | G Loss: {g_loss:.4f}")

# Generate synthetic embeddings
def generate_synthetic_data(generator, num_samples, noise_dim):
    """Draw standard-normal noise and run it through the generator.

    Args:
        generator: Object exposing ``predict(batch)``; called once with the
            whole noise batch.
        num_samples: Number of noise rows (and therefore outputs) to produce.
        noise_dim: Length of each noise vector.

    Returns:
        Whatever ``generator.predict`` returns for the
        ``(num_samples, noise_dim)`` noise batch.
    """
    latent_batch = np.random.normal(loc=0, scale=1, size=(num_samples, noise_dim))
    return generator.predict(latent_batch)

# Sample additional minority-class rows from the generator.
num_new_samples = 500  # how many synthetic rows to draw
synthetic_embeddings = generate_synthetic_data(
    generator=generator,
    num_samples=num_new_samples,
    noise_dim=noise_dim,
)

print("Synthetic embeddings generated with shape:", synthetic_embeddings.shape)


In [22]:
# NOTE(review): this import sits apart from the notebook's top import cell;
# consider moving it there so all dependencies are visible in one place.
from tensorflow.keras.backend import clear_session

# Reset Keras' global state. Presumably meant to free the graph/memory held by
# the models built above before re-running the GAN cell — confirm intent.
clear_session()
