In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/cleaned-amazon-reviews/cleaned_amazon_reviews.csv


In [15]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
from tensorflow.keras import mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

In [4]:
data = pd.read_csv('/kaggle/input/cleaned-amazon-reviews/cleaned_amazon_reviews.csv')
data = data.drop(columns=['Title', 'Review'])


In [5]:
data.head()

Unnamed: 0,Sentiment,Cleaned_Review
0,0,vietnam veteran one tour enlisted one tour off...
1,1,found book informative helpful however title b...
2,0,movie certainly good others however biggest pr...
3,0,looking forward film actually waited thanksgiv...
4,0,son read first book disappointed rude lots hat...


In [6]:
max_features = 5000     # Number of words to consider as features
maxlen = 150            # Cut texts after this number of words (reduced for memory efficiency)
embedding_dim = 50      # Dimension of the embedding vector
latent_dim = 100        # Latent space dimension for GAN

In [7]:
df=data

In [8]:
# data=data.drop(columns=['Title','Review'])

In [9]:
# data.head()

Unnamed: 0,Sentiment,Cleaned_Review
0,0,vietnam veteran one tour enlisted one tour off...
1,1,found book informative helpful however title b...
2,0,movie certainly good others however biggest pr...
3,0,looking forward film actually waited thanksgiv...
4,0,son read first book disappointed rude lots hat...


In [10]:
# max_features = 5000     # Number of words to consider as features
# maxlen = 154            # Cut texts after this number of words
# embedding_dim = 50      # Dimension of the embedding vector
# latent_dim = 100       

In [8]:
# Tokenizer and sequence preparation
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data.Cleaned_Review)
sequences = tokenizer.texts_to_sequences(data.Cleaned_Review)
word_index = tokenizer.word_index
int_to_word = {i: word for word, i in word_index.items()}

In [9]:
# Padding sequences
x_data = pad_sequences(sequences, maxlen=maxlen)
x_data = x_data[:1500]  # Limiting data for faster testing

In [10]:
# Pre-trained GloVe embeddings (optional)
def load_glove_embeddings(file_path, embedding_dim):
    embeddings_index = {}
    with open(file_path) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

In [12]:
embedding_layer = Embedding(input_dim=max_features, output_dim=embedding_dim)

In [13]:
input_sequences = Input(shape=(maxlen,))
embeddings = embedding_layer(input_sequences)
embedding_model = Model(inputs=input_sequences, outputs=embeddings)

In [16]:
# Generate embeddings for the input data
x_train_embed = embedding_model.predict(x_data)

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [18]:
# Flatten embeddings
x_train_embed_flat = x_train_embed.reshape((x_train_embed.shape[0], -1))

In [19]:
# Normalize the embeddings to the range [-1, 1]
min_val = x_train_embed_flat.min()
max_val = x_train_embed_flat.max()
x_train_embed_flat = 2 * (x_train_embed_flat - min_val) / (max_val - min_val) - 1

In [22]:
def build_generator(latent_dim, output_dim):
    input_layer = Input(shape=(latent_dim,))
    x = Dense(256, activation='relu')(input_layer)
    x = BatchNormalization(momentum=0.8)(x)
    x = Dense(512, activation='relu')(x)
    output_layer = Dense(output_dim, activation='tanh')(x)
    
    model = Model(input_layer, output_layer)
    return model

In [23]:
# Initialize generator
output_dim = x_train_embed_flat.shape[1]
generator = build_generator(latent_dim, output_dim)

In [24]:
def build_discriminator(input_dim):
    input_layer = Input(shape=(input_dim,))
    x = Dense(512, activation='relu')(input_layer)
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.4)(x)
    output_layer = Dense(1, activation='sigmoid')(x)
    
    model = Model(input_layer, output_layer)
    return model

In [25]:
# Initialize discriminator
discriminator = build_discriminator(output_dim)
discriminator.compile(optimizer=Adam(0.0002, 0.5, clipvalue=1.0), loss='binary_crossentropy', metrics=['accuracy'])

In [26]:
discriminator.trainable = False

In [27]:
# GAN model
gan_input = Input(shape=(latent_dim,))
fake_samples = generator(gan_input)
gan_output = discriminator(fake_samples)
gan = Model(gan_input, gan_output)
gan.compile(optimizer=Adam(0.0002, 0.5, clipvalue=1.0), loss='binary_crossentropy')

In [28]:
# Train GAN
def train_gan(generator, discriminator, gan, x_train_embed_flat, epochs=30, batch_size=32):
    batch_count = x_train_embed_flat.shape[0] // batch_size

    for epoch in range(epochs):
        for _ in range(batch_count):
            # Select a random batch of real samples
            idx = np.random.randint(0, x_train_embed_flat.shape[0], batch_size)
            real_samples = x_train_embed_flat[idx]

            # Generate fake samples
            noise = np.random.normal(0, 1, (batch_size, latent_dim))
            fake_samples = generator.predict(noise, verbose=0)

            # Combine real and fake samples
            x = np.vstack((real_samples, fake_samples))
            y = np.vstack((np.ones((batch_size, 1)), np.zeros((batch_size, 1))))

            # Train the discriminator
            discriminator.trainable = True
            d_loss = discriminator.train_on_batch(x, y)

            # Train the generator via the GAN
            noise = np.random.normal(0, 1, (batch_size, latent_dim))
            y_gen = np.ones((batch_size, 1))
            discriminator.trainable = False
            g_loss = gan.train_on_batch(noise, y_gen)

        print(f"Epoch {epoch+1}/{epochs}, Discriminator Loss: {d_loss[0]}, Generator Loss: {g_loss}")


In [29]:
# Start training
train_gan(generator, discriminator, gan, x_train_embed_flat, epochs=30, batch_size=32)

Epoch 1/30, Discriminator Loss: 0.3551475703716278, Generator Loss: [array(0.35514757, dtype=float32), array(0.35514757, dtype=float32), array(0.8206522, dtype=float32)]
Epoch 2/30, Discriminator Loss: 0.26035553216934204, Generator Loss: [array(0.26035553, dtype=float32), array(0.26035553, dtype=float32), array(0.88383156, dtype=float32)]
Epoch 3/30, Discriminator Loss: 0.29895466566085815, Generator Loss: [array(0.29895467, dtype=float32), array(0.29895467, dtype=float32), array(0.8633379, dtype=float32)]
Epoch 4/30, Discriminator Loss: 0.35960111021995544, Generator Loss: [array(0.3596011, dtype=float32), array(0.3596011, dtype=float32), array(0.8207371, dtype=float32)]
Epoch 5/30, Discriminator Loss: 0.4010169208049774, Generator Loss: [array(0.40101692, dtype=float32), array(0.40101692, dtype=float32), array(0.7908967, dtype=float32)]
Epoch 6/30, Discriminator Loss: 0.43032458424568176, Generator Loss: [array(0.43032458, dtype=float32), array(0.43032458, dtype=float32), array(0.77

In [30]:
batch_size = 32
fake_samples = generator.predict(np.random.normal(0, 1, (batch_size, latent_dim)))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


In [31]:
real_samples = x_train_embed_flat[:batch_size]

In [32]:
# Combine real and fake samples
x = np.vstack((real_samples, fake_samples))
y_true = np.vstack((np.ones((batch_size, 1)), np.zeros((batch_size, 1))))

In [33]:
# Get predictions from discriminator
y_pred = discriminator.predict(x)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step  


In [34]:
# Threshold predictions
y_pred_class = (y_pred > 0.5).astype(int)

In [35]:
# Calculate accuracy
accuracy = np.mean(y_pred_class == y_true)
print(f"Discriminator Accuracy: {accuracy * 100:.2f}%")

Discriminator Accuracy: 50.00%


In [36]:
# Generate synthetic text from the Generator's embeddings
def get_closest_token(embedding, word_index, embedding_matrix):
    similarities = np.dot(embedding_matrix, embedding) / (np.linalg.norm(embedding_matrix, axis=1) * np.linalg.norm(embedding))
    closest_token = np.argmax(similarities)
    return closest_token

In [37]:
# Generate synthetic text
noise = np.random.normal(0, 1, (1, latent_dim))
generated_sample = generator.predict(noise)
generated_sample = generated_sample.reshape((maxlen, embedding_dim))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step


In [39]:
# Define an embedding layer
embedding_layer = Embedding(input_dim=max_features, output_dim=embedding_dim)
embedding_layer.build((None,))  # This ensures the embedding matrix is created
embedding_matrix = embedding_layer.get_weights()[0]  # This is the actual matrix

In [40]:
generated_text = []
for embedding in generated_sample:
    token_index = get_closest_token(embedding, word_index, embedding_matrix)
    word = int_to_word.get(token_index, '?')
    generated_text.append(word)

In [41]:
# Join the generated words into a sentence
generated_sentence = ' '.join(generated_text)
print("Generated sentence: ", generated_sentence)

Generated sentence:  meditation meditation meditation zipper meditation bag meditation meditation meditation meditation zipper meditation meditation meditation meditation struggle meditation meditation bag bag bag bag meditation meditation inside bag bag meditation bag struggle struggle meditation beast bag meditation meditation meditation inside beast meditation zipper bag brutal meditation meditation meditation meditation zipper zipper meditation bag meditation struggle bag meditation meditation meditation bag zipper meditation bag meditation meditation bag meditation zipper meditation meditation meditation bag zipper meditation meditation meditation bag meditation zipper meditation meditation meditation meditation bag meditation meditation meditation meditation jess zipper inside meditation meditation inside struggle bag bag violence meditation bag bag struggle bag storage meditation inside bag meditation difficulty zipper bag meditation meditation struggle lacks meditation zipper b

In [21]:
# Define an embedding layer
embedding_layer = Embedding(input_dim=max_features, output_dim=embedding_dim, input_length=maxlen)

# Create a model to transform word indices to embeddings
input_sequences = Input(shape=(maxlen,))
embeddings = embedding_layer(input_sequences)
embedding_model = Model(inputs=input_sequences, outputs=embeddings)

# Get embeddings for the training data
x_train_embed = embedding_model.predict(x_data)


# Get the embedding matrix from the embedding layer
embedding_matrix = embedding_layer.get_weights()[0]  # This is the actual matrix

# Example usage: Get embedding vector for a specific word index
word_index = 5  # Example word index
embedding_vector = embedding_matrix[word_index]

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [22]:
# Flatten the embeddings to create a feature vector
x_train_embed_flat = x_train_embed.reshape((x_train_embed.shape[0], -1))

# Normalize the embeddings to the range [-1, 1]
min_val = x_train_embed_flat.min()
max_val = x_train_embed_flat.max()
if max_val - min_val == 0:
    max_val = min_val + 1  # Avoid division by zero
x_train_embed_flat = 2 * (x_train_embed_flat - min_val) / (max_val - min_val) - 1


In [23]:
def build_generator(latent_dim, output_dim):
    model = Sequential()
    model.add(Dense(256, activation='relu', input_dim=latent_dim))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(output_dim, activation='tanh'))
    return model
    
# Initialize the generator
output_dim = x_train_embed_flat.shape[1]
generator = build_generator(latent_dim, output_dim)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [24]:
def build_discriminator(input_dim):
    model = Sequential()
    model.add(Dense(512, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.4))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(1, activation='sigmoid'))
    return model

# Initialize the discriminator
discriminator = build_discriminator(output_dim)
discriminator.compile(optimizer=Adam(0.0002, 0.5), loss='binary_crossentropy', metrics=['accuracy'])


In [25]:
# Make the discriminator untrainable when training the GAN
discriminator.trainable = False

# Input for the GAN
gan_input = Input(shape=(latent_dim,))
# The generator produces fake samples
fake_samples = generator(gan_input)
# The discriminator evaluates the fake samples
gan_output = discriminator(fake_samples)
# Define the GAN model
gan = Model(gan_input, gan_output)
gan.compile(optimizer=Adam(0.0002, 0.5), loss='binary_crossentropy')

In [26]:
def train_gan(generator, discriminator, gan, x_train_embed_flat, epochs=10, batch_size=64):
    batch_count = x_train_embed_flat.shape[0] // batch_size
    for epoch in range(epochs):
        for _ in range(batch_count):
            # Select a random batch of real samples
            idx = np.random.randint(0, x_train_embed_flat.shape[0], batch_size)
            real_samples = x_train_embed_flat[idx]

            # Generate fake samples
            noise = np.random.normal(0, 1, (batch_size, latent_dim))
            fake_samples = generator.predict(noise ,verbose=0)

            # Combine real and fake samples
            x = np.vstack((real_samples, fake_samples))
            y = np.vstack((np.ones((batch_size, 1)), np.zeros((batch_size, 1))))

            # Train the discriminator
            discriminator.trainable = True
            d_loss = discriminator.train_on_batch(x, y)

            # Train the generator via the GAN
            noise = np.random.normal(0, 1, (batch_size, latent_dim))
            y_gen = np.ones((batch_size, 1))
            discriminator.trainable = False
            g_loss = gan.train_on_batch(noise, y_gen)

        # Print the progress
        print(f"Epoch {epoch+1}/{epochs}, Discriminator Loss: {d_loss[0]}, Generator Loss: {g_loss}")

# Start training
train_gan(generator, discriminator, gan, x_train_embed_flat, epochs=30, batch_size=64)


Epoch 1/30, Discriminator Loss: 0.5590204000473022, Generator Loss: [array(0.5590204, dtype=float32), array(0.5590204, dtype=float32), array(0.64504075, dtype=float32)]
Epoch 2/30, Discriminator Loss: 0.3605270981788635, Generator Loss: [array(0.3605271, dtype=float32), array(0.3605271, dtype=float32), array(0.8201427, dtype=float32)]
Epoch 3/30, Discriminator Loss: 0.26730671525001526, Generator Loss: [array(0.26730672, dtype=float32), array(0.26730672, dtype=float32), array(0.87975544, dtype=float32)]
Epoch 4/30, Discriminator Loss: 0.22021184861660004, Generator Loss: [array(0.22021185, dtype=float32), array(0.22021185, dtype=float32), array(0.90735394, dtype=float32)]
Epoch 5/30, Discriminator Loss: 0.19923366606235504, Generator Loss: [array(0.19923367, dtype=float32), array(0.19923367, dtype=float32), array(0.91813856, dtype=float32)]
Epoch 6/30, Discriminator Loss: 0.2524383068084717, Generator Loss: [array(0.2524383, dtype=float32), array(0.2524383, dtype=float32), array(0.8978

In [27]:
batch_size=64
# Generate fake samples
fake_samples = generator.predict(np.random.normal(0, 1, (batch_size, latent_dim)))

# Take some real samples
real_samples = x_train_embed_flat[:batch_size]

# Combine real and fake samples
x = np.vstack((real_samples, fake_samples))
y_true = np.vstack((np.ones((batch_size, 1)), np.zeros((batch_size, 1))))

# Get the predictions from the discriminator
y_pred = discriminator.predict(x)

# Threshold predictions (discriminator outputs probabilities)
y_pred_class = (y_pred > 0.5).astype(int)

# Calculate accuracy
accuracy = np.mean(y_pred_class == y_true)

print(f"Discriminator Accuracy: {accuracy * 100:.2f}%")


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  
Discriminator Accuracy: 65.62%


In [28]:
# Assuming you have a tokenizer and int_to_word dictionary ready
# tokenizer = your tokenizer used to tokenize the training data
# int_to_word = a dictionary mapping token indices back to words

# Function to map embedding back to nearest word/token index
def get_closest_token(embedding, word_index, embedding_matrix):
    # Calculate cosine similarity between generated embedding and actual embeddings
    similarities = np.dot(embedding_matrix, embedding) / (np.linalg.norm(embedding_matrix, axis=1) * np.linalg.norm(embedding))
    closest_token = np.argmax(similarities)  # Find the index of the most similar embedding
    return closest_token

# Generate synthetic movie review embeddings
noise = np.random.normal(0, 1, (1, latent_dim))
generated_sample = generator.predict(noise)

# Reshape the output to match the (maxlen, embedding_dim)
generated_sample = generated_sample.reshape((maxlen, embedding_dim))

# Load your word embedding matrix (e.g., from a pre-trained model or from training data)
# embedding_matrix is the matrix where each row corresponds to a word's embedding
# word_index is the tokenizer's word-to-index mapping (reverse of int_to_word)
# Assuming embedding_matrix and word_index are already loaded/available

generated_text = []
for embedding in generated_sample:
    token_index = get_closest_token(embedding, word_index, embedding_matrix)
    word = int_to_word.get(token_index, '?')  # Get the word corresponding to the token
    generated_text.append(word)

# Join the generated words into a sentence
generated_sentence = ' '.join(generated_text)
print("Generated sentence: ", generated_sentence)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step
Generated sentence:  ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? pretentious ? ? ? ? ? bored ? disappointment discovering ? pretentious challenged features furthermore mind small profession accessory ordinary collection enemy tedious techniques spiritual hip
