In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Input, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.initializers import RandomUniform
from tensorflow.keras import regularizers
from gensim.models.keyedvectors import KeyedVectors
import zipfile
import requests

# Step 1: Download and Prepare GloVe Embeddings

def download_glove_embeddings(glove_dir='glove'):
    """Ensure the GloVe 6B archive is present in *glove_dir* and extracted.

    The archive is downloaded only when ``glove.6B.zip`` is not already in
    *glove_dir*; its members are then extracted next to it.

    Args:
        glove_dir: Directory that holds the archive and the extracted files.
            It is created when missing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the archive on disk is not a valid zip file.
    """
    os.makedirs(glove_dir, exist_ok=True)
    glove_zip = os.path.join(glove_dir, 'glove.6B.zip')
    if not os.path.exists(glove_zip):
        print("Downloading GloVe embeddings...")
        url = 'http://nlp.stanford.edu/data/glove.6B.zip'
        # Stream into a temporary name and rename only after success, so an
        # interrupted or failed download is never mistaken for a complete
        # archive on the next run.
        partial_zip = glove_zip + '.part'
        try:
            with requests.get(url, stream=True, timeout=60) as r:
                # Fail loudly instead of saving an HTML error page as a zip.
                r.raise_for_status()
                with open(partial_zip, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_zip, glove_zip)
        finally:
            # Only present here when the download did not complete.
            if os.path.exists(partial_zip):
                os.remove(partial_zip)
        print("Download complete.")
    with zipfile.ZipFile(glove_zip, 'r') as zip_ref:
        for member in zip_ref.infolist():
            target = os.path.join(glove_dir, member.filename)
            # Skip members that were already fully extracted by a previous run.
            if os.path.isfile(target) and os.path.getsize(target) == member.file_size:
                continue
            zip_ref.extract(member, glove_dir)
    print("GloVe embeddings extracted.")

def load_glove_embeddings(glove_file_path, word_index, embedding_dim):
    """Build an embedding matrix for *word_index* from a GloVe text file.

    Each line of the file is expected to hold a token followed by
    *embedding_dim* numbers. The last *embedding_dim* fields are treated as
    the vector and everything before them as the token.

    Args:
        glove_file_path: Path to the UTF-8 GloVe text file.
        word_index: Mapping from word to integer row index.
        embedding_dim: Number of components per vector.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows of words
        found in the file hold their GloVe vector; all other rows keep a
        random value drawn uniformly from [-0.05, 0.05).
    """
    print("Loading GloVe embeddings...")
    embeddings_index = {}
    malformed_lines = 0
    with open(glove_file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            # A usable line needs at least one token field plus the vector;
            # shorter lines (e.g. a truncated file) cannot be parsed safely.
            if len(values) <= embedding_dim:
                malformed_lines += 1
                continue
            word = ' '.join(values[:-embedding_dim])
            # Only keep vectors that will actually be copied into the matrix,
            # instead of holding the whole GloVe vocabulary in memory.
            if word not in word_index:
                continue
            embeddings_index[word] = np.asarray(values[-embedding_dim:], dtype='float32')
    if malformed_lines:
        print(f"Skipped {malformed_lines} malformed line(s) in {glove_file_path}.")
    print("GloVe embeddings loaded.")
    embedding_matrix = np.random.uniform(-0.05, 0.05, (len(word_index) + 1, embedding_dim))
    for word, vector in embeddings_index.items():
        embedding_matrix[word_index[word]] = vector
    return embedding_matrix

# Step 2: Prepare IMDB Dataset

def load_imdb_dataset(num_words=20000, maxlen=200):
    """Load the Keras IMDB dataset and pad every review to *maxlen* tokens.

    Args:
        num_words: Passed to ``imdb.load_data`` as its ``num_words`` argument.
        maxlen: Length every sequence is padded (at the end) or truncated to.

    Returns:
        ``((x_train, y_train), (x_test, y_test), word_index)`` where
        ``word_index`` maps words to the indices used in the sequences. It is
        built from the full ``get_word_index()`` mapping (not limited to
        *num_words*) shifted by 3, plus the four reserved tokens.
    """
    print("Loading IMDB dataset...")
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=num_words)
    # Shift indices by 3 to match Keras's internal processing
    word_index = {
        word: rank + 3
        for word, rank in tf.keras.datasets.imdb.get_word_index().items()
    }
    # Indices 0-3 are reserved for the special tokens below.
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2
    word_index["<UNUSED>"] = 3
    x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, padding='post')
    print("IMDB dataset loaded and preprocessed.")
    return (x_train, y_train), (x_test, y_test), word_index

# Step 3: Define Custom Complex Embedding Layer

class ComplexEmbedding(Layer):
    """Embedding layer that stores an amplitude table and a phase table.

    For every input index the layer looks up an amplitude vector and a phase
    vector, forms ``amplitude * exp(i * phase)`` and returns its magnitude.

    NOTE: the magnitude of ``amplitude * exp(i * phase)`` is ``|amplitude|``,
    so the phase table does not influence the forward output of this layer as
    written; it only receives gradients from a regularizer attached to it.

    Args:
        input_dim: Number of rows in both lookup tables (vocabulary size).
        output_dim: Number of components per embedding vector.
        embeddings_initializer: Initializer for the amplitude table.
        embeddings_regularizer: Regularizer for the amplitude table.
        activity_regularizer: Regularizer applied to the layer output.
        embeddings_constraint: Constraint applied to both tables.
        phase_regularizer: Optional regularizer for the phase table. Defaults
            to ``None`` because phases are angles initialised in [0, 2*pi):
            a weight penalty such as L2 on them is large (mean square of
            about 13 per entry) and merely pulls the angles toward zero.
        **kwargs: Forwarded to ``Layer``.
    """

    def __init__(self, input_dim, output_dim, embeddings_initializer='uniform',
                 embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None,
                 phase_regularizer=None, **kwargs):
        super().__init__(**kwargs)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
        self.embeddings_regularizer = tf.keras.regularizers.get(embeddings_regularizer)
        self.activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
        self.embeddings_constraint = tf.keras.constraints.get(embeddings_constraint)
        self.phase_regularizer = tf.keras.regularizers.get(phase_regularizer)

    def build(self, input_shape):
        """Create the amplitude and phase lookup tables."""
        # Amplitude table, initialised from ``embeddings_initializer``.
        self.amplitude_embeddings = self.add_weight(shape=(self.input_dim, self.output_dim),
                                                    initializer=self.embeddings_initializer,
                                                    name='amplitude_embeddings',
                                                    regularizer=self.embeddings_regularizer,
                                                    constraint=self.embeddings_constraint)
        # Phase table (angles in radians). It deliberately does not reuse
        # ``embeddings_regularizer``: penalising angles in [0, 2*pi) swamps
        # the task loss with a term unrelated to the predictions.
        self.phase_embeddings = self.add_weight(shape=(self.input_dim, self.output_dim),
                                                initializer=RandomUniform(minval=0, maxval=2*np.pi),
                                                name='phase_embeddings',
                                                regularizer=self.phase_regularizer,
                                                constraint=self.embeddings_constraint)
        super().build(input_shape)

    def call(self, inputs):
        """Return the magnitude of the complex embedding of each input index."""
        amplitude = tf.nn.embedding_lookup(self.amplitude_embeddings, inputs)
        phase = tf.nn.embedding_lookup(self.phase_embeddings, inputs)
        real_part = amplitude * tf.math.cos(phase)
        imag_part = amplitude * tf.math.sin(phase)
        complex_embedding = tf.complex(real_part, imag_part)
        magnitude = tf.abs(complex_embedding)  # Calculate the magnitude inside the layer
        return magnitude

    def compute_output_shape(self, input_shape):
        """Return the input shape with ``output_dim`` appended."""
        # ``input_shape`` may arrive as a list or shape object; normalise it
        # so the concatenation with a tuple cannot fail.
        return tuple(input_shape) + (self.output_dim,)

# Step 4: Build and Compile Model

def build_model(input_length, vocab_size, embedding_dim, embedding_matrix):
    """Assemble and compile the sentiment classifier.

    The network is: integer token input -> ``ComplexEmbedding`` (named
    ``'complex_embedding'``) -> global average pooling over the sequence ->
    dropout (rate 0.5) -> single sigmoid unit.

    Args:
        input_length: Number of tokens per input sequence.
        vocab_size: Number of rows of the embedding tables.
        embedding_dim: Number of components per embedding vector.
        embedding_matrix: Initial values for the amplitude table.

    Returns:
        The model, compiled with Adam, binary cross-entropy and accuracy.
    """
    token_ids = Input(shape=(input_length,), dtype='int32')

    embedding = ComplexEmbedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        embeddings_regularizer=regularizers.l2(1e-6),
        name='complex_embedding',
    )
    magnitudes = embedding(token_ids)

    # Average the per-token magnitudes into one vector per review.
    pooled = GlobalAveragePooling1D()(magnitudes)
    pooled = Dropout(0.5)(pooled)
    prediction = Dense(1, activation='sigmoid')(pooled)

    classifier = Model(inputs=token_ids, outputs=prediction)
    classifier.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# Step 5: Training and Evaluation

def main():
    """Run the pipeline: fetch GloVe, load IMDB, train and evaluate."""
    # Parameters
    max_num_words = 20000
    max_sequence_length = 200
    embedding_dim = 100
    glove_dir = 'glove'
    glove_file = os.path.join(glove_dir, 'glove.6B.100d.txt')

    # Make sure the pretrained vectors are on disk before anything else.
    download_glove_embeddings(glove_dir)

    # Padded IMDB reviews plus the word -> index mapping they use.
    train_set, test_set, word_index = load_imdb_dataset(
        num_words=max_num_words, maxlen=max_sequence_length)
    x_train, y_train = train_set
    x_test, y_test = test_set

    # Initial amplitude table built from the GloVe vectors.
    embedding_matrix = load_glove_embeddings(glove_file, word_index, embedding_dim)

    model = build_model(
        input_length=max_sequence_length,
        vocab_size=len(word_index) + 1,
        embedding_dim=embedding_dim,
        embedding_matrix=embedding_matrix,
    )

    # Train, holding out 20% of the training data for validation.
    model.summary()
    model.fit(x_train, y_train, batch_size=128, epochs=10, validation_split=0.2)

    # Evaluate on the test split; the result is indexed as [loss, accuracy].
    results = model.evaluate(x_test, y_test)
    test_loss = results[0]
    test_accuracy = results[1]
    print(f'Test Loss: {test_loss:.4f}')
    print(f'Test Accuracy: {test_accuracy:.4f}')

# Run the full pipeline only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()


GloVe embeddings extracted.
Loading IMDB dataset...
IMDB dataset loaded and preprocessed.
Loading GloVe embeddings...
GloVe embeddings loaded.


Epoch 1/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 596ms/step - accuracy: 0.5090 - loss: 116.6303 - val_accuracy: 0.7026 - val_loss: 110.2769
Epoch 2/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 603ms/step - accuracy: 0.5506 - loss: 108.2666 - val_accuracy: 0.7752 - val_loss: 102.2874
Epoch 3/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 568ms/step - accuracy: 0.5961 - loss: 100.4061 - val_accuracy: 0.7682 - val_loss: 94.8436
Epoch 4/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 561ms/step - accuracy: 0.6450 - loss: 93.0924 - val_accuracy: 0.7878 - val_loss: 87.9059
Epoch 5/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 679ms/step - accuracy: 0.6828 - loss: 86.2745 - val_accuracy: 0.7946 - val_loss: 81.4353
Epoch 6/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 626ms/step - accuracy: 0.7115 - loss: 79.9202 - val_accuracy: 0.8068 - val_lo

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Input, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.initializers import RandomUniform
from tensorflow.keras import regularizers
from gensim.models.keyedvectors import KeyedVectors
import zipfile
import requests

# Step 1: Download and Prepare GloVe Embeddings

def download_glove_embeddings(glove_dir='glove'):
    """Ensure the GloVe 6B archive is present in *glove_dir* and extracted.

    The archive is downloaded only when ``glove.6B.zip`` is not already in
    *glove_dir*; its members are then extracted next to it.

    Args:
        glove_dir: Directory that holds the archive and the extracted files.
            It is created when missing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the archive on disk is not a valid zip file.
    """
    os.makedirs(glove_dir, exist_ok=True)
    glove_zip = os.path.join(glove_dir, 'glove.6B.zip')
    if not os.path.exists(glove_zip):
        print("Downloading GloVe embeddings...")
        url = 'http://nlp.stanford.edu/data/glove.6B.zip'
        # Stream into a temporary name and rename only after success, so an
        # interrupted or failed download is never mistaken for a complete
        # archive on the next run.
        partial_zip = glove_zip + '.part'
        try:
            with requests.get(url, stream=True, timeout=60) as r:
                # Fail loudly instead of saving an HTML error page as a zip.
                r.raise_for_status()
                with open(partial_zip, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_zip, glove_zip)
        finally:
            # Only present here when the download did not complete.
            if os.path.exists(partial_zip):
                os.remove(partial_zip)
        print("Download complete.")
    with zipfile.ZipFile(glove_zip, 'r') as zip_ref:
        for member in zip_ref.infolist():
            target = os.path.join(glove_dir, member.filename)
            # Skip members that were already fully extracted by a previous run.
            if os.path.isfile(target) and os.path.getsize(target) == member.file_size:
                continue
            zip_ref.extract(member, glove_dir)
    print("GloVe embeddings extracted.")

def load_glove_embeddings(glove_file_path, word_index, embedding_dim):
    """Build an embedding matrix for *word_index* from a GloVe text file.

    Each line of the file is expected to hold a token followed by
    *embedding_dim* numbers. The last *embedding_dim* fields are treated as
    the vector and everything before them as the token.

    Args:
        glove_file_path: Path to the UTF-8 GloVe text file.
        word_index: Mapping from word to integer row index.
        embedding_dim: Number of components per vector.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows of words
        found in the file hold their GloVe vector; all other rows keep a
        random value drawn uniformly from [-0.05, 0.05).
    """
    print("Loading GloVe embeddings...")
    embeddings_index = {}
    malformed_lines = 0
    with open(glove_file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            # A usable line needs at least one token field plus the vector;
            # shorter lines (e.g. a truncated file) cannot be parsed safely.
            if len(values) <= embedding_dim:
                malformed_lines += 1
                continue
            word = ' '.join(values[:-embedding_dim])
            # Only keep vectors that will actually be copied into the matrix,
            # instead of holding the whole GloVe vocabulary in memory.
            if word not in word_index:
                continue
            embeddings_index[word] = np.asarray(values[-embedding_dim:], dtype='float32')
    if malformed_lines:
        print(f"Skipped {malformed_lines} malformed line(s) in {glove_file_path}.")
    print("GloVe embeddings loaded.")
    embedding_matrix = np.random.uniform(-0.05, 0.05, (len(word_index) + 1, embedding_dim))
    for word, vector in embeddings_index.items():
        embedding_matrix[word_index[word]] = vector
    return embedding_matrix

# Step 2: Prepare IMDB Dataset

def load_imdb_dataset(num_words=20000, maxlen=200):
    """Load the Keras IMDB dataset and pad every review to *maxlen* tokens.

    Args:
        num_words: Passed to ``imdb.load_data`` as its ``num_words`` argument.
        maxlen: Length every sequence is padded (at the end) or truncated to.

    Returns:
        ``((x_train, y_train), (x_test, y_test), word_index)`` where
        ``word_index`` maps words to the indices used in the sequences. It is
        built from the full ``get_word_index()`` mapping (not limited to
        *num_words*) shifted by 3, plus the four reserved tokens.
    """
    print("Loading IMDB dataset...")
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=num_words)
    # Shift indices by 3 to match Keras's internal processing
    word_index = {
        word: rank + 3
        for word, rank in tf.keras.datasets.imdb.get_word_index().items()
    }
    # Indices 0-3 are reserved for the special tokens below.
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2
    word_index["<UNUSED>"] = 3
    x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, padding='post')
    print("IMDB dataset loaded and preprocessed.")
    return (x_train, y_train), (x_test, y_test), word_index

# Step 3: Define Custom Complex Embedding Layer

class ComplexEmbedding(Layer):
    """Embedding layer that stores an amplitude table and a phase table.

    For every input index the layer looks up an amplitude vector and a phase
    vector, forms ``amplitude * exp(i * phase)`` and returns its magnitude.

    NOTE: the magnitude of ``amplitude * exp(i * phase)`` is ``|amplitude|``,
    so the phase table does not influence the forward output of this layer as
    written; it only receives gradients from a regularizer attached to it.

    Args:
        input_dim: Number of rows in both lookup tables (vocabulary size).
        output_dim: Number of components per embedding vector.
        embeddings_initializer: Initializer for the amplitude table.
        embeddings_regularizer: Regularizer for the amplitude table.
        activity_regularizer: Regularizer applied to the layer output.
        embeddings_constraint: Constraint applied to both tables.
        phase_regularizer: Optional regularizer for the phase table. Defaults
            to ``None`` because phases are angles initialised in [0, 2*pi):
            a weight penalty such as L2 on them is large (mean square of
            about 13 per entry) and merely pulls the angles toward zero.
        **kwargs: Forwarded to ``Layer``.
    """

    def __init__(self, input_dim, output_dim, embeddings_initializer='uniform',
                 embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None,
                 phase_regularizer=None, **kwargs):
        super().__init__(**kwargs)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
        self.embeddings_regularizer = tf.keras.regularizers.get(embeddings_regularizer)
        self.activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
        self.embeddings_constraint = tf.keras.constraints.get(embeddings_constraint)
        self.phase_regularizer = tf.keras.regularizers.get(phase_regularizer)

    def build(self, input_shape):
        """Create the amplitude and phase lookup tables."""
        # Amplitude table, initialised from ``embeddings_initializer``.
        self.amplitude_embeddings = self.add_weight(shape=(self.input_dim, self.output_dim),
                                                    initializer=self.embeddings_initializer,
                                                    name='amplitude_embeddings',
                                                    regularizer=self.embeddings_regularizer,
                                                    constraint=self.embeddings_constraint)
        # Phase table (angles in radians). It deliberately does not reuse
        # ``embeddings_regularizer``: penalising angles in [0, 2*pi) swamps
        # the task loss with a term unrelated to the predictions.
        self.phase_embeddings = self.add_weight(shape=(self.input_dim, self.output_dim),
                                                initializer=RandomUniform(minval=0, maxval=2*np.pi),
                                                name='phase_embeddings',
                                                regularizer=self.phase_regularizer,
                                                constraint=self.embeddings_constraint)
        super().build(input_shape)

    def call(self, inputs):
        """Return the magnitude of the complex embedding of each input index."""
        amplitude = tf.nn.embedding_lookup(self.amplitude_embeddings, inputs)
        phase = tf.nn.embedding_lookup(self.phase_embeddings, inputs)
        real_part = amplitude * tf.math.cos(phase)
        imag_part = amplitude * tf.math.sin(phase)
        complex_embedding = tf.complex(real_part, imag_part)
        magnitude = tf.abs(complex_embedding)  # Calculate the magnitude inside the layer
        return magnitude

    def compute_output_shape(self, input_shape):
        """Return the input shape with ``output_dim`` appended."""
        # ``input_shape`` may arrive as a list or shape object; normalise it
        # so the concatenation with a tuple cannot fail.
        return tuple(input_shape) + (self.output_dim,)

# Step 4: Build and Compile Model

def build_model(input_length, vocab_size, embedding_dim, embedding_matrix):
    """Assemble and compile the sentiment classifier.

    The network is: integer token input -> ``ComplexEmbedding`` (named
    ``'complex_embedding'``) -> global average pooling over the sequence ->
    dropout (rate 0.5) -> single sigmoid unit.

    Args:
        input_length: Number of tokens per input sequence.
        vocab_size: Number of rows of the embedding tables.
        embedding_dim: Number of components per embedding vector.
        embedding_matrix: Initial values for the amplitude table.

    Returns:
        The model, compiled with Adam, binary cross-entropy and accuracy.
    """
    token_ids = Input(shape=(input_length,), dtype='int32')

    embedding = ComplexEmbedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        embeddings_regularizer=regularizers.l2(1e-6),
        name='complex_embedding',
    )
    magnitudes = embedding(token_ids)

    # Average the per-token magnitudes into one vector per review.
    pooled = GlobalAveragePooling1D()(magnitudes)
    pooled = Dropout(0.5)(pooled)
    prediction = Dense(1, activation='sigmoid')(pooled)

    classifier = Model(inputs=token_ids, outputs=prediction)
    classifier.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# Step 5: Process and Analyze a Test Sentence

def process_sentence(sentence, model, tokenizer, maxlen):
    """Trace one sentence through the model's layers, printing each stage.

    The sentence is tokenized, padded at the end to *maxlen*, passed through
    the layer named ``'complex_embedding'``, averaged over the sequence with
    a fresh ``GlobalAveragePooling1D`` and finally fed to the model's last
    layer. The model's dropout layer is not called here.

    Args:
        sentence: Raw text to analyse.
        model: Model containing a ``'complex_embedding'`` layer.
        tokenizer: Object whose ``texts_to_sequences`` turns text into ids.
        maxlen: Sequence length expected by the model.

    Returns:
        The output of the model's last layer for the sentence.
    """
    print(f"Original Sentence: {sentence}")

    # Text -> list with one list of token ids.
    token_ids = tokenizer.texts_to_sequences([sentence])
    print(f"Tokenized Sentence: {token_ids}")

    model_input = pad_sequences(token_ids, maxlen=maxlen, padding='post')
    print(f"Padded Sentence: {model_input}")

    # Call the trained embedding layer directly on the padded ids.
    embedding_layer = model.get_layer('complex_embedding')
    magnitudes = embedding_layer(model_input)
    print(f"Complex Embedding Magnitude (Amplitude):\n{magnitudes.numpy()}")

    # Collapse the sequence axis by averaging.
    averaged = GlobalAveragePooling1D()(magnitudes)
    print(f"Global Average Pooled Output:\n{averaged.numpy()}")

    # The last layer of the model produces the final score.
    output_layer = model.layers[-1]
    prediction = output_layer(averaged)
    print(f"Final Output (after Dense Layer):\n{prediction.numpy()}")

    return prediction

# Step 6: Training, Evaluation, and Sentence Processing

def main():
    """Train and evaluate the classifier, then trace a sample sentence.

    Steps: fetch GloVe, load and pad IMDB, build the embedding matrix, train
    for 10 epochs, evaluate on the test split, and finally print the
    intermediate layer outputs for one hand-written sentence.
    """
    # Parameters
    MAX_NUM_WORDS = 20000
    MAX_SEQUENCE_LENGTH = 200
    EMBEDDING_DIM = 100
    GLOVE_DIR = 'glove'
    GLOVE_FILE = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')

    # Sample test sentence
    test_sentence = "This subject is called quantum natural language."

    # Download and prepare GloVe embeddings
    download_glove_embeddings(GLOVE_DIR)

    # Load IMDB dataset
    (x_train, y_train), (x_test, y_test), word_index = load_imdb_dataset(num_words=MAX_NUM_WORDS, maxlen=MAX_SEQUENCE_LENGTH)

    # Prepare embedding matrix
    embedding_matrix = load_glove_embeddings(GLOVE_FILE, word_index, EMBEDDING_DIM)

    # Build model
    model = build_model(input_length=MAX_SEQUENCE_LENGTH,
                        vocab_size=len(word_index)+1,
                        embedding_dim=EMBEDDING_DIM,
                        embedding_matrix=embedding_matrix)

    # Train model (optional, you may skip this if you don't need training)
    model.summary()
    model.fit(x_train, y_train,
              batch_size=128,
              epochs=10,
              validation_split=0.2)

    # Evaluate model
    results = model.evaluate(x_test, y_test)
    print(f'Test Loss: {results[0]:.4f}')
    print(f'Test Accuracy: {results[1]:.4f}')

    # Tokenizer setup for the test sentence.
    # The tokenizer must use the same word -> index mapping as the training
    # data. Fitting it on the test sentence alone would number its words
    # 1, 2, 3, ... and look up unrelated rows of the embedding table.
    # Words missing from the mapping, or ranked at/above MAX_NUM_WORDS, are
    # mapped to the "<UNK>" index.
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<UNK>")
    tokenizer.word_index = word_index

    # Process the test sentence
    process_sentence(test_sentence, model, tokenizer, MAX_SEQUENCE_LENGTH)

# Run the full pipeline only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()



Downloading GloVe embeddings...
Download complete.
GloVe embeddings extracted.
Loading IMDB dataset...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
IMDB dataset loaded and preprocessed.
Loading GloVe embeddings...
GloVe embeddings loaded.


Epoch 1/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 564ms/step - accuracy: 0.5085 - loss: 116.6536 - val_accuracy: 0.6778 - val_loss: 110.3025
Epoch 2/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 553ms/step - accuracy: 0.5405 - loss: 108.2898 - val_accuracy: 0.7652 - val_loss: 102.3113
Epoch 3/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 559ms/step - accuracy: 0.5964 - loss: 100.4271 - val_accuracy: 0.7506 - val_loss: 94.8675
Epoch 4/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 518ms/step - accuracy: 0.6405 - loss: 93.1114 - val_accuracy: 0.7870 - val_loss: 87.9300
Epoch 5/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 520ms/step - accuracy: 0.6828 - loss: 86.2958 - val_accuracy: 0.7946 - val_loss: 81.4579
Epoch 6/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 500ms/step - accuracy: 0.7180 - loss: 79.9324 - val_accuracy: 0.8020 - val_loss: