In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import FastText
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, SpatialDropout1D, GlobalAveragePooling1D, Layer, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.initializers import Constant
from captum.attr import IntegratedGradients
import requests
import zipfile
import os
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from tensorflow.keras.metrics import Precision, Recall, AUC

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc



In [2]:
# Load data
df = pd.read_csv('Training_Essay_Data.csv', encoding='latin1')


In [3]:
# Data Preprocessing and Cleaning
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text


In [4]:
df['clean_text'] = df['text'].apply(clean_text)



In [5]:
# Tokenization, stopwords removal, and lemmatization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()



In [6]:

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens


In [7]:
df['tokens'] = df['clean_text'].apply(preprocess_text)



In [8]:
# Train FastText model
tokenized_sentences = df['tokens'].tolist()
fasttext_model = FastText(vector_size=300, window=5, min_count=1, workers=4)
fasttext_model.build_vocab(corpus_iterable=tokenized_sentences)
fasttext_model.train(corpus_iterable=tokenized_sentences, total_examples=len(tokenized_sentences), epochs=10)


(50502678, 54888100)

In [9]:
# Function to load GloVe embeddings
def load_glove_embeddings(filepath):
    embeddings_index = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index



In [10]:

# Download and load GloVe embeddings
def download_glove_embeddings(url, filename):
    response = requests.get(url)
    with open(filename, 'wb') as f:
        f.write(response.content)


In [11]:

def unzip_glove_embeddings(zip_filepath, extract_dir):
    with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)



In [12]:
glove_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
glove_zip_filepath = 'glove.6B.zip'
glove_extract_dir = 'glove.6B'
glove_embedding_filepath = 'glove.6B/glove.6B.100d.txt'



In [13]:
download_glove_embeddings(glove_url, glove_zip_filepath)
unzip_glove_embeddings(glove_zip_filepath, glove_extract_dir)



In [14]:
glove_embeddings = load_glove_embeddings(glove_embedding_filepath)



In [15]:
# Convert text data into numerical representations: Padding
max_seq_length = 100
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['clean_text'])
X_seq = tokenizer.texts_to_sequences(df['clean_text'])
X_pad = pad_sequences(X_seq, maxlen=max_seq_length)




In [16]:
# Define features (X) and target variable (y)
X = X_pad
y = df['generated']



In [185]:
# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)



In [18]:
# Define the Attention layer
class Attention(Layer):
    def __init__(self, return_sequences=True):
        super(Attention, self).__init__()
        self.return_sequences = return_sequences

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="zeros")
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = tf.keras.activations.tanh(tf.keras.backend.dot(x, self.W) + self.b)
        a = tf.keras.activations.softmax(e, axis=1)
        output = x * a
        if self.return_sequences:
            return output
        return tf.keras.backend.sum(output, axis=1)



In [19]:
# Function to create embedding matrix
def create_embedding_matrix(tokenizer, model, embedding_dim):
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if word in model.wv:
            embedding_matrix[i] = model.wv[word]
    return embedding_matrix



In [20]:
# Create embedding matrices for FastText and GloVe
embedding_matrix_fasttext = create_embedding_matrix(tokenizer, fasttext_model, 300)
embedding_matrix_glove = np.zeros((len(tokenizer.word_index) + 1, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix_glove[i] = embedding_vector




In [21]:
# Define LSTM model with embeddings and attention
def create_lstm_model(input_dim, output_dim, max_sequence_length, embedding_matrix):
    model = Sequential()
    model.add(Embedding(input_dim, output_dim, input_length=max_sequence_length,
                        embeddings_initializer=Constant(embedding_matrix),
                        trainable=False))
    model.add(SpatialDropout1D(0.2))
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Attention(return_sequences=True))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [22]:
# Create LSTM models with different embeddings
lstm_model_fasttext = create_lstm_model(len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)
lstm_model_glove = create_lstm_model(len(tokenizer.word_index) + 1, 100, max_seq_length, embedding_matrix_glove)





In [23]:
# Train the LSTM model with FastText embeddings
history_fasttext = lstm_model_fasttext.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=40, batch_size=32)



Epoch 1/40

[1m603/603[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 45ms/step - accuracy: 0.8081 - loss: 0.5910 - val_accuracy: 0.9458 - val_loss: 0.3915
Epoch 2/40
[1m603/603[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 43ms/step - accuracy: 0.9337 - loss: 0.3686 - val_accuracy: 0.9555 - val_loss: 0.2730
Epoch 3/40
[1m603/603[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 44ms/step - accuracy: 0.9450 - loss: 0.2695 - val_accuracy: 0.9557 - val_loss: 0.2163
Epoch 4/40
[1m603/603[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 45ms/step - accuracy: 0.9588 - loss: 0.2045 - val_accuracy: 0.9632 - val_loss: 0.1776
Epoch 5/40
[1m603/603[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 46ms/step - accuracy: 0.9590 - loss: 0.1726 - val_accuracy: 0.9672 - val_loss: 0.1402
Epoch 6/40
[1m603/603[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 44ms/step - accuracy: 0.9628 - loss: 0.1490 - val_accuracy: 0.9718 - val_loss: 0.1244
Epoch 7/40
[1m

In [24]:
# Evaluate the LSTM model on the test set
y_pred_fasttext = (lstm_model_fasttext.predict(X_val) > 0.5).astype(int)
print(classification_report(y_val, y_pred_fasttext))


[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5234
           1       0.98      0.97      0.98      3028

    accuracy                           0.98      8262
   macro avg       0.98      0.98      0.98      8262
weighted avg       0.98      0.98      0.98      8262



In [25]:
# Save the model
lstm_model_fasttext.save('lstm_model_fasttext.h5')





In [27]:
# Additional steps for privacy and security:

# 1. Data Anonymization
# Remove sensitive columns like 'user_id', 'email', etc., if present.
#df_anonymized = df.drop(columns=['user_id', 'email'])



In [109]:
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasAdamOptimizer

In [189]:
import tensorflow as tf
from tensorflow_privacy.privacy.analysis.compute_dp_sgd_privacy import compute_dp_sgd_privacy
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import make_optimizer_class



ImportError: cannot import name 'compute_dp_sgd_privacy' from 'tensorflow_privacy.privacy.analysis.compute_dp_sgd_privacy' (C:\Users\Ahmedma\AppData\Local\anaconda3\Lib\site-packages\tensorflow_privacy\privacy\analysis\compute_dp_sgd_privacy.py)

In [190]:
# Define differential privacy parameters
learning_rate = 0.001
noise_multiplier = 1.1
l2_norm_clip = 1.0
batch_size = 32
epochs = 40



In [193]:
import math



In [195]:


def compute_dp_epsilon(samples, batch_size, noise_multiplier, epochs, delta):
    """Compute epsilon for differential privacy."""
    if noise_multiplier == 0.0:
        return float('inf')
    
    # Gaussian mechanism formula for epsilon calculation
    sigma = noise_multiplier * l2_norm_clip / batch_size
    epsilon = (sigma * math.sqrt(2 * epochs * math.log(1/delta))) + (math.exp(sigma) - 1) * (epochs / samples)
    
    return epsilon

# Example parameters
samples = len(X_train)
batch_size = 32
noise_multiplier = 1.1
epochs = 40
delta = 1e-5

# Compute epsilon
epsilon = compute_dp_epsilon(samples, batch_size, noise_multiplier, epochs, delta)
print(f"Achieved epsilon: {epsilon:.2f}")


Achieved epsilon: 1.04


In [196]:
# Create your model (example LSTM model with FastText embeddings)
def create_lstm_model(input_dim, embedding_dim, max_seq_length, embedding_matrix):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim, embedding_dim, weights=[embedding_matrix],
                                  input_length=max_seq_length, trainable=False),
        tf.keras.layers.LSTM(128),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model



In [197]:
# Example: Assuming you have tokenizer and embedding_matrix_fasttext defined
lstm_model_fasttext_dp = create_lstm_model(len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)



In [198]:
# Compile the model with a native TensorFlow optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
lstm_model_fasttext_dp.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])



In [199]:
# Custom training loop with differential privacy
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(len(X_train)).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size)



In [200]:
# Define the loss function
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)



In [201]:
# Function to clip gradients and add noise
def apply_dp_noise_and_clip(grads_and_vars, l2_norm_clip, noise_multiplier):
    clipped_gradients = []
    for gradient, variable in grads_and_vars:
        if gradient is not None:
            noise = tf.random.normal(tf.shape(gradient), stddev=noise_multiplier * l2_norm_clip)
            noised_gradient = gradient + noise
            clipped_gradient = tf.clip_by_norm(noised_gradient, l2_norm_clip)
            clipped_gradients.append((clipped_gradient, variable))
        else:
            clipped_gradients.append((gradient, variable))
    return clipped_gradients



In [202]:
# Training loop with differential privacy
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    
    # Training step
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            predictions = lstm_model_fasttext_dp(x_batch_train, training=True)
            loss = loss_fn(y_batch_train, predictions)
        
        gradients = tape.gradient(loss, lstm_model_fasttext_dp.trainable_variables)
        clipped_gradients = apply_dp_noise_and_clip(zip(gradients, lstm_model_fasttext_dp.trainable_variables), l2_norm_clip, noise_multiplier)
        
        # Apply gradients to model
        optimizer.apply_gradients(clipped_gradients)
        
        if step % 100 == 0:
            print(f"Step {step}, Loss: {loss.numpy()}")

    # Validation step
    val_loss = 0
    val_steps = 0
    for x_batch_val, y_batch_val in val_dataset:
        val_predictions = lstm_model_fasttext_dp(x_batch_val, training=False)
        val_loss += tf.reduce_mean(loss_fn(y_batch_val, val_predictions))
        val_steps += 1

    val_loss /= val_steps
    print(f"Validation Loss: {val_loss.numpy()}")



Epoch 1/40
Step 0, Loss: 0.7195061445236206
Step 100, Loss: 0.7181477546691895
Step 200, Loss: 0.6606565713882446
Step 300, Loss: 0.5793500542640686
Step 400, Loss: 0.5711864829063416
Step 500, Loss: 0.6737239360809326
Step 600, Loss: 0.6145614385604858
Validation Loss: 0.6185742616653442
Epoch 2/40
Step 0, Loss: 0.6071523427963257
Step 100, Loss: 0.647794246673584
Step 200, Loss: 0.6053821444511414
Step 300, Loss: 0.5587713122367859
Step 400, Loss: 0.6395576000213623
Step 500, Loss: 0.6156624555587769
Step 600, Loss: 0.5641543865203857
Validation Loss: 0.5822763442993164
Epoch 3/40
Step 0, Loss: 0.5897873640060425
Step 100, Loss: 0.5177196264266968
Step 200, Loss: 0.5647162199020386
Step 300, Loss: 0.5352736115455627
Step 400, Loss: 0.6954275369644165
Step 500, Loss: 0.636247456073761
Step 600, Loss: 0.5569354295730591
Validation Loss: 0.5632286071777344
Epoch 4/40
Step 0, Loss: 0.6082323789596558
Step 100, Loss: 0.46491801738739014
Step 200, Loss: 0.45711013674736023
Step 300, Loss: 

In [206]:
# Compute the privacy budget
def compute_dp_epsilon(samples, batch_size, noise_multiplier, epochs, delta):
    if noise_multiplier == 0.0:
        return float('inf')
    orders = [1 + x / 10. for x in range(1, 100)] + list(range(12, 64))
    rdp = compute_rdp(q=samples / len(X_train), noise_multiplier=noise_multiplier, steps=epochs, orders=orders)
    eps, _, _ = get_privacy_spent(orders, rdp, target_delta=delta)
    return eps



In [207]:
samples = len(X_train)
delta = 1e-5



In [209]:
import tensorflow as tf
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasAdamOptimizer
from tensorflow_privacy.privacy.analysis import compute_rdp, get_privacy_spent

# Define differential privacy parameters
learning_rate = 0.001
noise_multiplier = 1.1
l2_norm_clip = 1.0
batch_size = 32
num_microbatches = batch_size
epochs = 40

# Create the DPKerasAdamOptimizer
dp_optimizer = DPKerasAdamOptimizer(
    l2_norm_clip=l2_norm_clip,
    noise_multiplier=noise_multiplier,
    num_microbatches=num_microbatches,
    learning_rate=learning_rate
)

# Create your model
lstm_model_fasttext_dp = create_lstm_model(len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)

# Compile the model with the differentially private optimizer
lstm_model_fasttext_dp.compile(optimizer=dp_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Custom training loop
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size)

# Define the loss function
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)

# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    
    # Training step
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            predictions = lstm_model_fasttext_dp(x_batch_train, training=True)
            loss = loss_fn(y_batch_train, predictions)
        
        gradients = tape.gradient(loss, lstm_model_fasttext_dp.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(g, l2_norm_clip) for g in gradients]
        
        # Explicitly call apply_gradients on the optimizer
        dp_optimizer.apply_gradients(zip(clipped_gradients, lstm_model_fasttext_dp.trainable_variables))
        
        if step % 100 == 0:
            print(f"Step {step}, Loss: {loss.numpy()}")
    
    # Validation step
    val_loss = 0
    val_steps = 0
    for x_batch_val, y_batch_val in val_dataset:
        val_predictions = lstm_model_fasttext_dp(x_batch_val, training=False)
        val_loss += tf.reduce_mean(loss_fn(y_batch_val, val_predictions))
        val_steps += 1

    val_loss /= val_steps
    print(f"Validation Loss: {val_loss.numpy()}")

# Compute the privacy budget
def compute_dp_epsilon(samples, batch_size, noise_multiplier, epochs, delta):
    if noise_multiplier == 0.0:
        return float('inf')
    orders = [1 + x / 10. for x in range(1, 100)] + list(range(12, 64))
    rdp = compute_rdp(q=samples / len(X_train), noise_multiplier=noise_multiplier, steps=epochs * (samples // batch_size), orders=orders)
    eps, _, _ = get_privacy_spent(orders, rdp, target_delta=delta)
    return eps

samples = len(X_train)
delta = 1e-5

# Compute epsilon
epsilon = compute_dp_epsilon(samples, batch_size, noise_multiplier, epochs, delta)
print(f"Achieved epsilon: {epsilon:.2f}")


ModuleNotFoundError: No module named 'tensorflow_privacy.privacy.analysis.rdp_accountant'

In [208]:
# Compute epsilon
epsilon = compute_dp_epsilon(samples, batch_size, noise_multiplier, epochs, delta)
print(f"Achieved epsilon: {epsilon:.2f}")

ModuleNotFoundError: No module named 'tensorflow_privacy.privacy.analysis.rdp_accountant'

In [203]:
# Compute the privacy budget
epsilon, _ = compute_dp_sgd_privacy(n=len(X_train), batch_size=batch_size, noise_multiplier=noise_multiplier, epochs=epochs, delta=1e-5)
print(f"Achieved epsilon: {epsilon:.2f}")


TypeError: 'module' object is not callable

In [204]:
# Evaluate the differentially private LSTM model
y_pred_fasttext_dp = (lstm_model_fasttext_dp.predict(X_val) > 0.5).astype(int)
print("Differentially Private LSTM Model:")
print(classification_report(y_val, y_pred_fasttext_dp))


[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step
Differentially Private LSTM Model:
              precision    recall  f1-score   support

           0       0.88      0.91      0.90      5234
           1       0.83      0.79      0.81      3028

    accuracy                           0.87      8262
   macro avg       0.86      0.85      0.85      8262
weighted avg       0.86      0.87      0.87      8262



In [205]:

# Compute privacy budget
epsilon, _ = compute_dp_sgd_privacy(n=len(X_train), batch_size=batch_size, noise_multiplier=noise_multiplier, epochs=epochs, delta=1e-5)
print(f"Privacy budget (ε, δ=1e-5): ε = {epsilon:.2f}")



# 3. Secure Model Deployment
# Ensure the model is deployed in a secure environment using HTTPS

# 4. Data Encryption
# Encrypt sensitive data both at rest and in transit

# These steps help integrate privacy and security into your text classification project, ensuring compliance with responsible AI practices.


TypeError: 'module' object is not callable

In [210]:
!pip install Flask




In [211]:
from flask import Flask, request, jsonify
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import os

app = Flask(__name__)

# Load the LSTM model
model = load_model('lstm_model_fasttext.h5')  # Adjust the path to your saved model file

# Function for text preprocessing
max_seq_length = 100  # Adjust according to your model's input shape

def preprocess_text(text):
    # Implement your text preprocessing steps here
    return text

# Define route for model prediction
@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    text = data['text']

    # Preprocess the input text
    preprocessed_text = preprocess_text(text)

    # Tokenize and pad the preprocessed text
    sequence = tokenizer.texts_to_sequences([preprocessed_text])
    X_pad = pad_sequences(sequence, maxlen=max_seq_length)

    # Make predictions
    prediction = model.predict(X_pad)

    # Prepare response
    result = {
        'prediction': prediction.tolist()[0][0]
    }

    return jsonify(result)

if __name__ == '__main__':
    app.run(ssl_context='adhoc', debug=True)  # Run the app with HTTPS support


TypeError: Error when deserializing class 'Attention' using config={'return_sequences': True, 'trainable': True, 'dtype': 'float32'}.

Exception encountered: Unrecognized keyword arguments passed to Attention: {'return_sequences': True}

1. Adversarial Training
Adversarial training involves generating adversarial examples during the model training process to improve its robustness against adversarial attacks.

In [213]:
!pip install cleverhans

Collecting cleverhans
  Downloading cleverhans-4.0.0-py3-none-any.whl.metadata (846 bytes)
Collecting nose (from cleverhans)
  Downloading nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Collecting mnist (from cleverhans)
  Downloading mnist-0.2.2-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting easydict (from cleverhans)
  Downloading easydict-1.13-py3-none-any.whl.metadata (4.2 kB)
Downloading cleverhans-4.0.0-py3-none-any.whl (92 kB)
   ---------------------------------------- 0.0/92.3 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/92.3 kB ? eta -:--:--
   ------------------------------- -------- 71.7/92.3 kB 975.2 kB/s eta 0:00:01
   ---------------------------------------- 92.3/92.3 kB 872.6 kB/s eta 0:00:00
Downloading easydict-1.13-py3-none-any.whl (6.8 kB)
Downloading mnist-0.2.2-py2.py3-none-any.whl (3.5 kB)
Downloading nose-1.3.7-py3-none-any.whl (154 kB)
   ---------------------------------------- 0.0/154.7 kB ? eta -:--:--
   ------------------------------

In [214]:
import tensorflow as tf
from cleverhans.tf2.attacks.projected_gradient_descent import projected_gradient_descent

# Define adversarial training function
def adversarial_training(model, X_train, y_train):
    # Create an FGSM instance
    pgd = projected_gradient_descent.ProjectGradientDescent(model, sess=None)

    # Train the model with adversarial examples
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(pgd.generate(X_train), y_train, epochs=10, batch_size=32, verbose=1)


2. Data Augmentation
Data augmentation helps improve model generalization by creating variations of existing data during training.

In [218]:
import random
from nltk.corpus import wordnet
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym)
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)

def synonym_replacement(sentence, n):
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set(words))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break

    sentence = ' '.join(new_words)
    return sentence

# Convert sequences back to text
def sequences_to_texts(sequences, tokenizer):
    return [' '.join([tokenizer.index_word.get(idx, '') for idx in seq if idx != 0]) for seq in sequences]

# Convert texts back to sequences
def texts_to_sequences(texts, tokenizer):
    return tokenizer.texts_to_sequences(texts)

# Assume tokenizer is already fitted on your data
X_train_texts = sequences_to_texts(X_train, tokenizer)

# Augment the training data
X_augmented_texts = []
for sentence in X_train_texts:
    augmented_sentence = synonym_replacement(sentence, 2)
    X_augmented_texts.append(augmented_sentence)

# Convert the augmented data back to sequences
X_augmented = texts_to_sequences(X_augmented_texts, tokenizer)

# Pad the sequences
X_augmented = pad_sequences(X_augmented, maxlen=max_seq_length, padding='post')

# Now you can proceed with the rest of your pipeline using X_augmented


3. Model Regularization
Regularization techniques such as dropout and weight regularization help prevent overfitting.

In [219]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras import regularizers

# Add dropout layers to the model
model.add(Dropout(0.2))

# Apply weight regularization
model.add(Dense(64, kernel_regularizer=regularizers.l2(0.01), activation='relu'))


4. Cross-validation
Cross-validation ensures the model's performance is consistent across different data splits.

In [220]:
from sklearn.model_selection import KFold

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    # Train and evaluate the model on each fold


In [225]:
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras import regularizers
from sklearn.model_selection import KFold
import numpy as np
import tensorflow as tf


In [226]:
# Define the model architecture with dropout and weight regularization
def create_lstm_model_with_regularization(vocab_size, embedding_dim, max_seq_length, embedding_matrix):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(input_dim=vocab_size,
                                        output_dim=embedding_dim,
                                        weights=[embedding_matrix],
                                        input_length=max_seq_length,
                                        trainable=False))
    model.add(tf.keras.layers.LSTM(128, return_sequences=True))
    model.add(Dropout(0.2))  # Add dropout layer
    model.add(tf.keras.layers.LSTM(128))
    model.add(Dense(64, kernel_regularizer=regularizers.l2(0.01), activation='relu'))  # Apply weight regularization
    model.add(Dense(1, activation='sigmoid'))
    return model


In [227]:



# Assuming you have tokenized your data and have an embedding matrix
# tokenizer, max_seq_length, embedding_matrix_fasttext

# Prepare your data
X = np.array(X_augmented)  # Assuming X_augmented is your augmented dataset
y = np.array(y_train)      # Assuming y_train is your training labels

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
fold_no = 1
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]
    
    # Create a new instance of the model for each fold
    model = create_lstm_model_with_regularization(len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train the model on the training data of the current fold
    print(f"Training fold {fold_no}...")
    history = model.fit(X_train_fold, y_train_fold, validation_data=(X_val_fold, y_val_fold), epochs=40, batch_size=32)
    
    # Evaluate the model on the validation data of the current fold
    loss, accuracy = model.evaluate(X_val_fold, y_val_fold)
    print(f"Fold {fold_no} validation accuracy: {accuracy}")
    fold_no += 1

print("Cross-validation completed.")


Training fold 1...
Epoch 1/40
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 100ms/step - accuracy: 0.6640 - loss: 0.9375 - val_accuracy: 0.6773 - val_loss: 0.6331
Epoch 2/40
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 99ms/step - accuracy: 0.6688 - loss: 0.6369 - val_accuracy: 0.6773 - val_loss: 0.6314
Epoch 3/40
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 100ms/step - accuracy: 0.6661 - loss: 0.6331 - val_accuracy: 0.6773 - val_loss: 0.6320
Epoch 4/40
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 101ms/step - accuracy: 0.6723 - loss: 0.6203 - val_accuracy: 0.6773 - val_loss: 0.6440
Epoch 5/40
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 100ms/step - accuracy: 0.6752 - loss: 0.5952 - val_accuracy: 0.6493 - val_loss: 0.6616
Epoch 6/40
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 98ms/step - accuracy: 0.7250 - loss: 0.5544 - val_accuracy: 0.6591 - val_loss: 

In [230]:
from sklearn.model_selection import KFold
from tensorflow.keras.layers import Dropout, Dense, BatchNormalization
from tensorflow.keras import regularizers
import numpy as np
import tensorflow as tf

# Define the model architecture with dropout, batch normalization, and weight regularization
def create_lstm_model_with_regularization(vocab_size, embedding_dim, max_seq_length, embedding_matrix):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(input_dim=vocab_size,
                                        output_dim=embedding_dim,
                                        weights=[embedding_matrix],
                                        input_length=max_seq_length,
                                        trainable=False))
    model.add(tf.keras.layers.LSTM(128, return_sequences=True))
    model.add(Dropout(0.5))  # Increase dropout rate
    model.add(BatchNormalization())  # Add batch normalization
    model.add(tf.keras.layers.LSTM(128))
    model.add(Dropout(0.5))  # Increase dropout rate
    model.add(Dense(64, kernel_regularizer=regularizers.l2(0.01), activation='relu'))  # Apply weight regularization
    model.add(Dense(1, activation='sigmoid'))
    return model

# Prepare your data
X = np.array(X_augmented)  # Assuming X_augmented is your augmented dataset
y = np.array(y_train)      # Assuming y_train is your training labels

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
fold_no = 1
val_accuracies = []
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]
    
    # Create a new instance of the model for each fold
    model = create_lstm_model_with_regularization(len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train the model on the training data of the current fold
    print(f"Training fold {fold_no}...")
    history = model.fit(X_train_fold, y_train_fold, validation_data=(X_val_fold, y_val_fold), epochs=50, batch_size=32, verbose=1)
    
    # Evaluate the model on the validation data of the current fold
    loss, accuracy = model.evaluate(X_val_fold, y_val_fold)
    print(f"Fold {fold_no} validation accuracy: {accuracy}")
    val_accuracies.append(accuracy)
    fold_no += 1

print("Cross-validation completed.")
print(f"Average validation accuracy: {np.mean(val_accuracies)}")




Training fold 1...
Epoch 1/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 107ms/step - accuracy: 0.6424 - loss: 1.0228 - val_accuracy: 0.6773 - val_loss: 0.6450
Epoch 2/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 106ms/step - accuracy: 0.6689 - loss: 0.6447 - val_accuracy: 0.6773 - val_loss: 0.6370
Epoch 3/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 106ms/step - accuracy: 0.6695 - loss: 0.6367 - val_accuracy: 0.6773 - val_loss: 0.6341
Epoch 4/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 107ms/step - accuracy: 0.6688 - loss: 0.6314 - val_accuracy: 0.6773 - val_loss: 0.6353
Epoch 5/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 107ms/step - accuracy: 0.6723 - loss: 0.6252 - val_accuracy: 0.6773 - val_loss: 0.6459
Epoch 6/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 107ms/step - accuracy: 0.6677 - loss: 0.6182 - val_accuracy: 0.6773 - val_loss

KeyboardInterrupt: 

In [232]:
from sklearn.model_selection import KFold
from tensorflow.keras.layers import Dropout, Dense, BatchNormalization, LSTM, Embedding, Bidirectional, SpatialDropout1D
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import numpy as np
import tensorflow as tf

# Define the Attention layer
class Attention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='att_weight', shape=(input_shape[-1], 1), initializer='random_normal', trainable=True)
        self.b = self.add_weight(name='att_bias', shape=(input_shape[1], 1), initializer='zeros', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = tf.keras.backend.dot(x, self.W) + self.b
        e = tf.keras.backend.tanh(e)
        a = tf.keras.backend.softmax(e, axis=1)
        output = x * a
        return tf.keras.backend.sum(output, axis=1)

# Define the model architecture with dropout, batch normalization, and weight regularization
def create_lstm_model_with_regularization(vocab_size, embedding_dim, max_seq_length, embedding_matrix):
    model = tf.keras.Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_seq_length,
                        trainable=False))
    model.add(SpatialDropout1D(0.2))
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Dropout(0.5))  # Adjust dropout rate
    model.add(BatchNormalization())  # Add batch normalization
    model.add(Attention())
    model.add(Dense(32, kernel_regularizer=regularizers.l2(0.01), activation='relu'))  # Reduce number of units
    model.add(Dense(1, activation='sigmoid'))
    return model

# Prepare your data
X = np.array(X_augmented)  # Assuming X_augmented is your augmented dataset
y = np.array(y_train)      # Assuming y_train is your training labels

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
fold_no = 1
val_accuracies = []
val_losses = []
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]
    
    # Create a new instance of the model for each fold
    model = create_lstm_model_with_regularization(len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Callbacks for early stopping and learning rate reduction
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0001)
    
    # Train the model on the training data of the current fold
    print(f"Training fold {fold_no}...")
    history = model.fit(X_train_fold, y_train_fold,
                        validation_data=(X_val_fold, y_val_fold),
                        epochs=50,
                        batch_size=32,
                        callbacks=[early_stopping, reduce_lr],
                        verbose=1)
    
    # Evaluate the model on the validation data of the current fold
    loss, accuracy = model.evaluate(X_val_fold, y_val_fold)
    print(f"Fold {fold_no} validation accuracy: {accuracy}, validation loss: {loss}")
    val_accuracies.append(accuracy)
    val_losses.append(loss)
    fold_no += 1

print("Cross-validation completed.")
print(f"Average validation accuracy: {np.mean(val_accuracies)}")
print(f"Average validation loss: {np.mean(val_losses)}")


Training fold 1...
Epoch 1/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 56ms/step - accuracy: 0.6393 - loss: 0.8814 - val_accuracy: 0.6773 - val_loss: 0.6386 - learning_rate: 0.0010
Epoch 2/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 56ms/step - accuracy: 0.6734 - loss: 0.6371 - val_accuracy: 0.6773 - val_loss: 0.6314 - learning_rate: 0.0010
Epoch 3/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 55ms/step - accuracy: 0.6743 - loss: 0.6316 - val_accuracy: 0.6773 - val_loss: 0.6323 - learning_rate: 0.0010
Epoch 4/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 55ms/step - accuracy: 0.6681 - loss: 0.6340 - val_accuracy: 0.6773 - val_loss: 0.6332 - learning_rate: 0.0010
Epoch 5/50
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 54ms/step - accuracy: 0.6602 - loss: 0.6371 - val_accuracy: 0.6773 - val_loss: 0.6343 - learning_rate: 0.0010
Epoch 6/50
[1m482/482[0m [32m━━━━━━━━━

In [233]:
from sklearn.model_selection import KFold
from tensorflow.keras.layers import Dropout, Dense, BatchNormalization, LSTM, Embedding, Bidirectional, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import numpy as np
import tensorflow as tf

# Define the Attention layer
class Attention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='att_weight', shape=(input_shape[-1], 1), initializer='random_normal', trainable=True)
        self.b = self.add_weight(name='att_bias', shape=(input_shape[1], 1), initializer='zeros', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = tf.keras.backend.dot(x, self.W) + self.b
        e = tf.keras.backend.tanh(e)
        a = tf.keras.backend.softmax(e, axis=1)
        output = x * a
        return tf.keras.backend.sum(output, axis=1)

# Define the model architecture with dropout, batch normalization, and weight regularization
def create_lstm_model_with_regularization(vocab_size, embedding_dim, max_seq_length, embedding_matrix):
    model = tf.keras.Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_seq_length,
                        trainable=False))
    model.add(SpatialDropout1D(0.2))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))  # Increased LSTM units
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Bidirectional(LSTM(128, return_sequences=True)))  # Added another LSTM layer
    model.add(Dropout(0.5))
    model.add(Attention())
    model.add(Dense(128, kernel_regularizer=regularizers.l2(0.01), activation='relu'))  # Increased Dense layer units
    model.add(Dense(1, activation='sigmoid'))
    return model

# Prepare your data
X = np.array(X_augmented)  # Assuming X_augmented is your augmented dataset
y = np.array(y_train)      # Assuming y_train is your training labels

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
fold_no = 1
val_accuracies = []
val_losses = []
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]
    
    # Create a new instance of the model for each fold
    model = create_lstm_model_with_regularization(len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)
    optimizer = Adam(learning_rate=0.001)  # Use Adam optimizer with learning rate
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    # Callbacks for early stopping and learning rate reduction
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001)
    
    # Train the model on the training data of the current fold
    print(f"Training fold {fold_no}...")
    history = model.fit(X_train_fold, y_train_fold,
                        validation_data=(X_val_fold, y_val_fold),
                        epochs=100,  # Increase epochs
                        batch_size=64,  # Increase batch size
                        callbacks=[early_stopping, reduce_lr],
                        verbose=1)
    
    # Evaluate the model on the validation data of the current fold
    loss, accuracy = model.evaluate(X_val_fold, y_val_fold)
    print(f"Fold {fold_no} validation accuracy: {accuracy}, validation loss: {loss}")
    val_accuracies.append(accuracy)
    val_losses.append(loss)
    fold_no += 1

print("Cross-validation completed.")
print(f"Average validation accuracy: {np.mean(val_accuracies)}")
print(f"Average validation loss: {np.mean(val_losses)}")


Training fold 1...
Epoch 1/100
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 570ms/step - accuracy: 0.6689 - loss: 1.3283 - val_accuracy: 0.6773 - val_loss: 0.6330 - learning_rate: 0.0010
Epoch 2/100
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 577ms/step - accuracy: 0.6779 - loss: 0.6327 - val_accuracy: 0.6773 - val_loss: 0.6362 - learning_rate: 0.0010
Epoch 3/100
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 592ms/step - accuracy: 0.6725 - loss: 0.6337 - val_accuracy: 0.6773 - val_loss: 0.6319 - learning_rate: 0.0010
Epoch 4/100
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 576ms/step - accuracy: 0.6715 - loss: 0.6342 - val_accuracy: 0.6773 - val_loss: 0.6317 - learning_rate: 0.0010
Epoch 5/100
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 592ms/step - accuracy: 0.6734 - loss: 0.6299 - val_accuracy: 0.6773 - val_loss: 0.6352 - learning_rate: 0.0010
Epoch 6/100
[1m 19/241[0

KeyboardInterrupt: 

In [263]:
from tensorflow.keras.layers import Dropout, Dense, BatchNormalization, LSTM, Embedding, Bidirectional, SpatialDropout1D
from tensorflow.keras import regularizers
import tensorflow as tf

# Define the Attention layer
class Attention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='att_weight', shape=(input_shape[-1], 1), initializer='random_normal', trainable=True)
        self.b = self.add_weight(name='att_bias', shape=(input_shape[1], 1), initializer='zeros', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = tf.keras.backend.dot(x, self.W) + self.b
        e = tf.keras.backend.tanh(e)
        a = tf.keras.backend.softmax(e, axis=1)
        output = x * a
        return tf.keras.backend.sum(output, axis=1)

# Define the model architecture with attention, dropout, batch normalization, and regularization
def create_lstm_model_with_regularization(vocab_size, embedding_dim, max_seq_length, embedding_matrix):
    model = tf.keras.Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_seq_length,
                        trainable=False))
    model.add(SpatialDropout1D(0.2))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))  # Increased LSTM units
    model.add(Attention())  # Assuming your custom Attention layer
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(64, kernel_regularizer=regularizers.l2(0.01), activation='relu'))  # Increased units in dense layer
    model.add(Dense(1, activation='sigmoid'))
    return model

# Assuming you already have X_train_fold, X_val_fold, y_train_fold, y_val_fold defined
model = create_lstm_model_with_regularization(len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_fold, y_train_fold,
                    validation_data=(X_val_fold, y_val_fold),
                    epochs=50,
                    batch_size=32,
                    callbacks=[early_stopping, reduce_lr],
                    verbose=1)


Epoch 1/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 83ms/step - accuracy: 0.8937 - loss: 0.7690 - val_accuracy: 0.9607 - val_loss: 0.1459 - learning_rate: 0.0010
Epoch 2/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 80ms/step - accuracy: 0.9537 - loss: 0.1540 - val_accuracy: 0.9417 - val_loss: 0.1657 - learning_rate: 0.0010
Epoch 3/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 81ms/step - accuracy: 0.9683 - loss: 0.1057 - val_accuracy: 0.9732 - val_loss: 0.0947 - learning_rate: 0.0010
Epoch 4/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 82ms/step - accuracy: 0.9730 - loss: 0.0870 - val_accuracy: 0.9739 - val_loss: 0.1008 - learning_rate: 0.0010
Epoch 5/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 80ms/step - accuracy: 0.9773 - loss: 0.0760 - val_accuracy: 0.9719 - val_loss: 0.0969 - learning_rate: 0.0010
Epoch 6/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [264]:
import numpy as np
from sklearn.model_selection import KFold
from tensorflow.keras.layers import Dropout, Dense, BatchNormalization, LSTM, Embedding, Bidirectional, SpatialDropout1D
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

# Assuming you have tokenizer, X_augmented, y_train, max_seq_length, and embedding_matrix_fasttext defined


In [265]:
from cleverhans.tf2.attacks.projected_gradient_descent import projected_gradient_descent

def adversarial_training(model, X_train, y_train, X_val, y_val, epochs=50, batch_size=32):
    for epoch in range(epochs):
        X_train_adv = projected_gradient_descent(model, X_train, eps=0.3, eps_iter=0.05)
        history = model.fit(X_train_adv, y_train, validation_data=(X_val, y_val),
                            epochs=1, batch_size=batch_size, verbose=1)
    return history


In [266]:
def augment_data(X_train, y_train, tokenizer, max_length, num_samples=1000):
    augmented_texts = []
    augmented_labels = []

    for text, label in zip(X_train, y_train):
        text_sequence = tokenizer.texts_to_sequences([text])[0]
        augmented_texts.append(text_sequence)
        augmented_labels.append(label)
        for _ in range(num_samples):
            permuted_text = np.random.permutation(text_sequence)
            augmented_texts.append(permuted_text)
            augmented_labels.append(label)

    X_augmented = pad_sequences(augmented_texts, maxlen=max_length)
    y_augmented = np.array(augmented_labels)

    return X_augmented, y_augmented


In [267]:
class Attention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='att_weight', shape=(input_shape[-1], 1),
                                 initializer='random_normal', trainable=True)
        self.b = self.add_weight(name='att_bias', shape=(input_shape[1], 1),
                                 initializer='zeros', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = tf.keras.backend.dot(x, self.W) + self.b
        e = tf.keras.backend.tanh(e)
        a = tf.keras.backend.softmax(e, axis=1)
        output = x * a
        return tf.keras.backend.sum(output, axis=1)

def create_lstm_model_with_regularization(vocab_size, embedding_dim, max_seq_length, embedding_matrix):
    model = tf.keras.Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_seq_length,
                        trainable=False))
    model.add(SpatialDropout1D(0.2))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Attention())
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(64, kernel_regularizer=regularizers.l2(0.01), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model


In [269]:
import numpy as np
from sklearn.model_selection import KFold

# Convert y_train to numpy array
y_train_np = y_train.to_numpy()

# Initialize k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform k-fold cross-validation
for train_index, val_index in kf.split(X_augmented):
    X_train_fold, X_val_fold = X_augmented[train_index], X_augmented[val_index]
    y_train_fold, y_val_fold = y_train_np[train_index], y_train_np[val_index]
    
    # Create model
    model = create_lstm_model_with_regularization(len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)
    
    # Compile model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Fit model
    history = model.fit(X_train_fold, y_train_fold, epochs=10, batch_size=32, validation_data=(X_val_fold, y_val_fold))
    

    # Evaluate model
    loss, accuracy = model.evaluate(X_val_fold, y_val_fold)
    print(f"Fold {fold_no} validation accuracy: {accuracy}, validation loss: {loss}")
    val_accuracies.append(accuracy)
    val_losses.append(loss)

    fold_no += 1

print("Cross-validation completed.")
print(f"Average validation accuracy: {np.mean(val_accuracies)}")
print(f"Average validation loss: {np.mean(val_losses)}")




Epoch 1/10
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 84ms/step - accuracy: 0.8901 - loss: 0.7746 - val_accuracy: 0.9578 - val_loss: 0.1654
Epoch 2/10
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 83ms/step - accuracy: 0.9594 - loss: 0.1430 - val_accuracy: 0.9660 - val_loss: 0.1128
Epoch 3/10
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 81ms/step - accuracy: 0.9662 - loss: 0.1102 - val_accuracy: 0.9685 - val_loss: 0.1066
Epoch 4/10
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 83ms/step - accuracy: 0.9780 - loss: 0.0815 - val_accuracy: 0.9639 - val_loss: 0.1178
Epoch 5/10
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 86ms/step - accuracy: 0.9798 - loss: 0.0725 - val_accuracy: 0.9744 - val_loss: 0.0925
Epoch 6/10
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 84ms/step - accuracy: 0.9839 - loss: 0.0590 - val_accuracy: 0.9739 - val_loss: 0.0842
Epoch 7/10
[1m5

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import KFold

# Assuming you have defined X_augmented, y_train, tokenizer, max_seq_length, and embedding_matrix_fasttext

# Define function to create individual models
def create_lstm_model(vocab_size, embedding_dim, max_seq_length, embedding_matrix):
    model = tf.keras.Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_seq_length,
                        trainable=False))
    model.add(SpatialDropout1D(0.2))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Attention())
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(64, kernel_regularizer=regularizers.l2(0.01), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define function to train and evaluate model on a fold
def train_and_evaluate_model(X_train, y_train, X_val, y_val, vocab_size, embedding_dim, max_seq_length, embedding_matrix):
    model = create_lstm_model(vocab_size, embedding_dim, max_seq_length, embedding_matrix)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32, verbose=1)
    y_pred = model.predict_classes(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    return model, accuracy

# Initialize variables
kf = KFold(n_splits=10, shuffle=True, random_state=42)

models = []
val_accuracies = []

# Train and evaluate models on each fold
fold_no = 1
for train_index, val_index in kf.split(X_augmented):
    print(f"Train indices: {train_index}")
    print(f"Validation indices: {val_index}")
    
    X_train_fold, X_val_fold = X_augmented[train_index], X_augmented[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]  # Adjusted indexing
    
    # Continue with your training and evaluation

    
    # Train and evaluate individual model
    model, accuracy = train_and_evaluate_model(X_train_fold, y_train_fold, X_val_fold, y_val_fold,
                                               len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)
    
    models.append(('model_fold{}'.format(fold_no), model))  # Append trained model to list
    val_accuracies.append(accuracy)
    
    print(f"Fold {fold_no} validation accuracy: {accuracy}")
    fold_no += 1

print("Cross-validation completed.")
print(f"Average validation accuracy: {np.mean(val_accuracies)}")

# Create VotingClassifier with individual models
voting_classifier = VotingClassifier(estimators=models, voting='hard')

# Train VotingClassifier on full training data
voting_classifier.fit(X_augmented, y_train)

# Evaluate VotingClassifier (optional)
y_pred = voting_classifier.predict(X_augmented)
ensemble_accuracy = accuracy_score(y_train, y_pred)
print(f"Ensemble (VotingClassifier) accuracy on training data: {ensemble_accuracy}")


Train indices: [    0     1     2 ... 22027 22028 22029]
Validation indices: [   34    44    54 ... 21971 21986 22001]




Epoch 1/50
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 81ms/step - accuracy: 0.8920 - loss: 0.7679 - val_accuracy: 0.9591 - val_loss: 0.1550
Epoch 2/50
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 79ms/step - accuracy: 0.9601 - loss: 0.1427 - val_accuracy: 0.9532 - val_loss: 0.1750
Epoch 3/50
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 79ms/step - accuracy: 0.9690 - loss: 0.1037 - val_accuracy: 0.9650 - val_loss: 0.1273
Epoch 4/50
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 81ms/step - accuracy: 0.9738 - loss: 0.0815 - val_accuracy: 0.9764 - val_loss: 0.0895
Epoch 5/50
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 79ms/step - accuracy: 0.9813 - loss: 0.0664 - val_accuracy: 0.9728 - val_loss: 0.0930
Epoch 6/50
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 80ms/step - accuracy: 0.9840 - loss: 0.0619 - val_accuracy: 0.9764 - val_loss: 0.0759
Epoch 7/50
[1m6

In [None]:
# Initialize variables
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models = []
val_accuracies = []
predictions = []

# Train and evaluate models on each fold
fold_no = 1
for train_index, val_index in kf.split(X_augmented):
    X_train_fold, X_val_fold = X_augmented[train_index], X_augmented[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    
    # Train and evaluate individual model
    model, accuracy = train_and_evaluate_model(X_train_fold, y_train_fold, X_val_fold, y_val_fold,
                                               len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)
    
    models.append(model)  # Append trained model to list
    val_accuracies.append(accuracy)
    
    # Predict probabilities for blending
    y_pred_proba = model.predict_proba(X_val_fold)
    predictions.append(y_pred_proba)
    
    print(f"Fold {fold_no} validation accuracy: {accuracy}")
    fold_no += 1

print("Cross-validation completed.")
print(f"Average validation accuracy: {np.mean(val_accuracies)}")

# Blend predictions (simple average)
predictions = np.mean(predictions, axis=0)
ensemble_predictions = np.where(predictions > 0.5, 1, 0)

# Evaluate ensemble predictions (optional)
ensemble_accuracy = accuracy_score(y_val_fold, ensemble_predictions)
print(f"Ensemble (manual blending) accuracy on validation data: {ensemble_accuracy}")


In [None]:
import random
from nltk.corpus import wordnet
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym)
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)

def synonym_replacement(sentence, n):
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set(words))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break

    sentence = ' '.join(new_words)
    return sentence

# Convert sequences back to text
def sequences_to_texts(sequences, tokenizer):
    return [' '.join([tokenizer.index_word.get(idx, '') for idx in seq if idx != 0]) for seq in sequences]

# Convert texts back to sequences
def texts_to_sequences(texts, tokenizer):
    return tokenizer.texts_to_sequences(texts)

# Assume tokenizer is already fitted on your data
X_train_texts = sequences_to_texts(X_train, tokenizer)

# Augment the training data
X_augmented_texts = []
for sentence in X_train_texts:
    augmented_sentence = synonym_replacement(sentence, 2)
    X_augmented_texts.append(augmented_sentence)

# Convert the augmented data back to sequences
X_augmented = texts_to_sequences(X_augmented_texts, tokenizer)

# Pad the sequences
X_augmented = pad_sequences(X_augmented, maxlen=max_seq_length, padding='post')

# Now you can proceed with the rest of your pipeline using X_augmented


In [243]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, Bidirectional, LSTM, Dense, GlobalAveragePooling1D
from tensorflow.keras.initializers import Constant
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import Precision, Recall, AUC
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from cleverhans.tf2.attacks.projected_gradient_descent import projected_gradient_descent
import random
from nltk.corpus import wordnet
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [245]:
# Synonym replacement for data augmentation
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym)
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)

def synonym_replacement(sentence, n):
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set(words))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    sentence = ' '.join(new_words)
    return sentence

# Example usage for data augmentation
X_train_texts = sequences_to_texts(X_train, tokenizer)
X_augmented_texts = [synonym_replacement(sentence, 2) for sentence in X_train_texts]
X_augmented = texts_to_sequences(X_augmented_texts, tokenizer)
X_augmented = pad_sequences(X_augmented, maxlen=max_seq_length, padding='post')


In [None]:
# Ensemble prediction
def ensemble_predict(models, X_test):
    predictions = [model.predict(X_test) for model in models]
    return np.mean(predictions, axis=0)

# Evaluate the ensemble on the test set
y_pred = ensemble_predict(models, X_test)
y_pred = (y_pred > 0.5).astype(int)
print(classification_report(y_test, y_pred))


In [None]:
import tensorflow as tf
from cleverhans.tf2.attacks.projected_gradient_descent import projected_gradient_descent

# Define adversarial training function
def adversarial_training(model, X_train, y_train, epsilon=0.1):
    # Create an instance of the Projected Gradient Descent (PGD) attack
    pgd = projected_gradient_descent.ProjectedGradientDescent(model, sess=None)
    
    # Train the model with adversarial examples
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    for epoch in range(10):
        # Generate adversarial examples
        X_train_adv = pgd.generate(X_train, eps=epsilon)
        
        # Train on both original and adversarial examples
        model.fit(X_train_adv, y_train, epochs=1, batch_size=32, verbose=1)
    
    return model

# Assuming you have a model instance `model`
# Call the adversarial_training function
model = adversarial_training(model, X_train, y_train)


In [None]:
from sklearn.ensemble import VotingClassifier
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Function to create model (required for KerasClassifier)
def create_model():
    return create_lstm_model_with_regularization(len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)

# Create the KerasClassifier
model1 = KerasClassifier(build_fn=create_model, epochs=40, batch_size=32, verbose=0)
model2 = KerasClassifier(build_fn=create_model, epochs=40, batch_size=32, verbose=0)

# Combine models into an ensemble
ensemble_model = VotingClassifier(estimators=[('model1', model1), ('model2', model2)], voting='soft')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
accuracy = ensemble_model.score(X_val, y_val)
print(f"Ensemble model validation accuracy: {accuracy}")


5. Monitoring and Logging
Implementing monitoring and logging helps track the model's performance over time.

In [228]:
import logging

# Set up logging
logging.basicConfig(filename='model.log', level=logging.INFO)

# Log model performance
logging.info(f"Training Accuracy: {history.history['accuracy'][-1]}, Validation Accuracy: {history.history['val_accuracy'][-1]}")


6. Ensemble Methods
Ensemble methods combine multiple models to improve prediction reliability.

In [229]:
from sklearn.ensemble import VotingClassifier

# Create and compile the models
model1 = create_lstm_model(len(tokenizer.word_index) + 1, 300, max_seq_length, embedding_matrix_fasttext)
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model2 = create_lstm_model(len(tokenizer.word_index) + 1, 100, max_seq_length, embedding_matrix_glove)
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the models
model1.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=40, batch_size=32)
model2.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=40, batch_size=32)

# Combine models into an ensemble

# Combine models into an ensemble
ensemble = VotingClassifier(estimators=[('lstm_fasttext', model1), ('lstm_glove', model2)], voting='soft')
ensemble.fit(X_train, y_train)

# Evaluate the ensemble
y_pred_ensemble = ensemble.predict(X_test)
print(f"Ensemble Test Accuracy: {accuracy_score(y_test, y_pred_ensemble):.2f}")
print(classification_report(y_test, y_pred_ensemble))
# Train the ensemble model


Epoch 1/40
[1m689/689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 55ms/step - accuracy: 0.8956 - loss: 0.2624 - val_accuracy: 0.9713 - val_loss: 0.0907
Epoch 2/40
[1m689/689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 55ms/step - accuracy: 0.9743 - loss: 0.0772 - val_accuracy: 0.9737 - val_loss: 0.0777
Epoch 3/40
[1m689/689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 55ms/step - accuracy: 0.9856 - loss: 0.0431 - val_accuracy: 0.9766 - val_loss: 0.0639
Epoch 4/40
[1m689/689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 54ms/step - accuracy: 0.9932 - loss: 0.0226 - val_accuracy: 0.9811 - val_loss: 0.0544
Epoch 5/40
[1m689/689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 55ms/step - accuracy: 0.9961 - loss: 0.0127 - val_accuracy: 0.9826 - val_loss: 0.0570
Epoch 6/40
[1m689/689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 55ms/step - accuracy: 0.9978 - loss: 0.0081 - val_accuracy: 0.9777 - val_loss: 0.0665
Epoch 7/40
[1m6

ValueError: The estimator Sequential should be a classifier.