In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
import numpy as np
from tqdm import tqdm
from itertools import islice

## Load and prepare data

In [2]:
# Load your text data
with open('movie.txt', 'r', encoding="utf-8") as f:
    text_data = f.readlines()

text_data = [line.strip() for line in text_data]
text_data = [line + ' eol' for line in text_data]

# Define the maximum sequence length and vocabulary size
print("Define the maximum sequence length and vocabulary size")
max_seq_length = 100  # for example
vocab_size = 3000  # for example

# Convert the text to a sequence of integer IDs
print("Convert the text to a sequence of integer IDs")
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(text_data)
seqs = tokenizer.texts_to_sequences(text_data)

# Create input/output pairs for the language model
print("Create input/output pairs for the language model")
input_seqs = []
output_seqs = []
for seq in seqs:
    for i in range(1, len(seq)):
        input_seq = seq[:i]
        output_seq = seq[i]
        input_seqs.append(input_seq)
        output_seqs.append(output_seq)

# Pad the input sequences to a fixed length
print("Pad the input sequences to a fixed length")
max_input_length = max(len(seq) for seq in input_seqs)
input_seqs = pad_sequences(input_seqs, maxlen=max_input_length)

Define the maximum sequence length and vocabulary size
Convert the text to a sequence of integer IDs
Create input/output pairs for the language model
Pad the input sequences to a fixed length


MemoryError: Unable to allocate 29.5 GiB for an array with shape (19993135, 396) and data type int32

In [4]:
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Train the model

In [27]:
# Set up CPU and GPU usage
#physical_devices = tf.config.list_physical_devices('GPU')
#print(physical_devices)
#tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Define the RNN language model architecture
print("Define the RNN language model architecture")
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_input_length))
#model.add(LSTM(100))
model.add(LSTM(200, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(200))
model.add(Dropout(0.3))
model.add(Dense(vocab_size, activation='softmax'))
optimizer = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

#model = tf.keras.models.load_model('model_movie.h5')

# Train the language model
print("Train the language model")
#output_seqs_categorical = tf.keras.utils.to_categorical(output_seqs, num_classes=vocab_size)
#model.fit(input_seqs, output_seqs_categorical, epochs=10, batch_size=128)

# Set the batch size and number of epochs
batch_size = 1280
num_epochs = 5

# Calculate the number of batches
num_batches = int(np.ceil(len(output_seqs) / batch_size))

# Train the language model in batches
for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch+1, num_epochs))
    epoch_losses = []
    epoch_accs = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(output_seqs))
        input_batch = input_seqs[start_idx:end_idx]
        output_batch = output_seqs[start_idx:end_idx]

        # Convert output to categorical
        output_batch_categorical = tf.keras.utils.to_categorical(output_batch, num_classes=vocab_size)

        # Train the model on the current batch
        batch_loss, batch_acc = model.train_on_batch(input_batch, output_batch_categorical)
        epoch_losses.append(batch_loss)
        epoch_accs.append(batch_acc)
        if (i % 200 == 0):
            print('Batch {}/{} - loss: {:.4f} - acc: {:.4f}'.format(i+1, num_batches, batch_loss, batch_acc))

    # Calculate and print the epoch loss and accuracy
    epoch_loss = np.mean(epoch_losses)
    epoch_acc = np.mean(epoch_accs)
    print('Epoch loss: {:.4f} - epoch acc: {:.4f}'.format(epoch_loss, epoch_acc))
    model.save('model_literature_2.h5')

model.save('model_literature_2.h5')


# Generate text using the language model
print("Generate text using the language model")
seed_text = "The quick brown fox"
for i in range(10):
    input_seq = tokenizer.texts_to_sequences([seed_text])[0]
    input_seq = pad_sequences([input_seq], maxlen=max_input_length)
    output_probs = model.predict(input_seq)[0]
    next_word_id = np.random.choice(vocab_size, p=output_probs)
    next_word = tokenizer.index_word[next_word_id]
    seed_text += " " + next_word
print(seed_text)


Define the RNN language model architecture
Train the language model
Epoch 1/5
Batch 1/13946 - loss: 8.0062 - acc: 0.0000
Batch 201/13946 - loss: 5.7960 - acc: 0.1305
Batch 401/13946 - loss: 5.6860 - acc: 0.1383
Batch 601/13946 - loss: 5.4640 - acc: 0.1469
Batch 801/13946 - loss: 5.3639 - acc: 0.1531
Batch 1001/13946 - loss: 5.3963 - acc: 0.1633
Batch 1201/13946 - loss: 5.0524 - acc: 0.1875
Batch 1401/13946 - loss: 5.1272 - acc: 0.1750
Batch 1601/13946 - loss: 5.0117 - acc: 0.1914
Batch 1801/13946 - loss: 4.9222 - acc: 0.1898
Batch 2001/13946 - loss: 4.9589 - acc: 0.1898
Batch 2201/13946 - loss: 4.8706 - acc: 0.1813
Batch 2401/13946 - loss: 4.7460 - acc: 0.2055
Batch 2601/13946 - loss: 4.9985 - acc: 0.1664
Batch 2801/13946 - loss: 4.7661 - acc: 0.1844
Batch 3001/13946 - loss: 4.8235 - acc: 0.1984
Batch 3201/13946 - loss: 4.7286 - acc: 0.1977
Batch 3401/13946 - loss: 4.5682 - acc: 0.2250
Batch 3601/13946 - loss: 4.5681 - acc: 0.2156
Batch 3801/13946 - loss: 4.6953 - acc: 0.2125
Batch 400

Batch 6801/13946 - loss: 4.3370 - acc: 0.2305
Batch 7001/13946 - loss: 4.3541 - acc: 0.2266
Batch 7201/13946 - loss: 4.4164 - acc: 0.2352
Batch 7401/13946 - loss: 4.4323 - acc: 0.2336
Batch 7601/13946 - loss: 4.3352 - acc: 0.2719
Batch 7801/13946 - loss: 4.5367 - acc: 0.2227
Batch 8001/13946 - loss: 4.3668 - acc: 0.2461
Batch 8201/13946 - loss: 4.4924 - acc: 0.2211
Batch 8401/13946 - loss: 4.3972 - acc: 0.2438
Batch 8601/13946 - loss: 4.4098 - acc: 0.2430
Batch 8801/13946 - loss: 4.3637 - acc: 0.2336
Batch 9001/13946 - loss: 4.3430 - acc: 0.2578
Batch 9201/13946 - loss: 4.4216 - acc: 0.2313
Batch 9401/13946 - loss: 4.4691 - acc: 0.2352
Batch 9601/13946 - loss: 4.2985 - acc: 0.2523
Batch 9801/13946 - loss: 4.4496 - acc: 0.2398
Batch 10001/13946 - loss: 4.2688 - acc: 0.2641
Batch 10201/13946 - loss: 4.1860 - acc: 0.2602
Batch 10401/13946 - loss: 4.3021 - acc: 0.2438
Batch 10601/13946 - loss: 4.3292 - acc: 0.2422
Batch 10801/13946 - loss: 4.3419 - acc: 0.2422
Batch 11001/13946 - loss: 4.4

Batch 13801/13946 - loss: 4.5652 - acc: 0.2422
Epoch loss: 4.3209 - epoch acc: 0.2407
Generate text using the language model
The quick brown fox is at sun unknown eol eol to eat fried it


In [6]:
model.save('model_movie_2.h5')

## Upload model 

In [5]:
model2 = tf.keras.models.load_model('model_movie_2.h5')

## Get top words for first word

In [6]:
import collections

# Open the file and read in the data
with open('movie.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Split the data into individual sentences
sentences = data.split('\n')

# Create a dictionary to store the word frequencies
word_freq = collections.defaultdict(int)

# Loop through each sentence and count the frequency of the first word
for sentence in sentences:
    # Check if the sentence is empty
    if sentence.strip():
        first_word = sentence.split()[0]
        word_freq[first_word] += 1

# Sort the dictionary by frequency in descending order
sorted_freq = sorted(word_freq.items(), key=lambda x: x[1] if x[0] not in ["'", 'eol', 'EOL'] else -1, reverse=True)

# Extract the 100 most frequent words
first_top_words = [word for word, freq in sorted_freq[:100]]

# Print the keyword list
print(first_top_words)

['the', 'i', 'this', 'it', 'and', 'but', "it's", 'in', 'if', 'a', 'there', 'he', 'unknown', 'as', 'they', 'what', 'so', 'you', 'when', 'she', 'for', 'one', 'not', 'however', 'all', "i'm", 'we', 'that', 'my', 'well', 'even', 'at', 'after', 'no', 'to', 'then', 'while', 'also', 'his', 'some', 'now', "there's", 'why', 'how', 'with', 'just', "don't", 'although', 'of', "i've", 'oh', 'on', 'maybe', "that's", 'unfortunately', 'its', 'or', 'first', 'is', 'yes', 'like', 'overall', 'from', 'most', 'these', 'an', 'her', 'by', "he's", 'instead', 'another', 'who', 'perhaps', 'here', 'because', 'though', 'still', 'very', 'anyway', 'only', 'do', 'every', 'despite', 'yet', 'watch', 'many', 'great', "i'd", 'where', 'which', 'nothing', 'sure', 'both', 'once', 'ok', 'good', 'other', 'everything', 'too', 'john']


## IDX + Ziphian

In [7]:
import math
from collections import Counter

# Tokenize the text data
with open('movie.txt', 'r', encoding="utf-8") as f:
    text = f.readlines()
    
text = [line.strip() for line in text]
text = [line + ' eol' for line in text]
words = ' '.join(text).split()

print('open')
# Build a dictionary of document frequencies
doc_freq = Counter(words)
print('freq')
# Calculate the IDF weights
num_docs = len(words)
idf = {}
for word in doc_freq:
    idf[word] = math.log(num_docs / (1 + doc_freq[word]))
print('idf')
# Store the IDF weights
# You can print or use this dictionary to perform further processing or analysis
print(idf['eol'])

open
freq
idf
2.9386061893283184


In [8]:
def idx_ziphian_tree(output_probs, tokenizer, n):
    index_to_prob = {idx: output_probs[idx] for idx in range(len(output_probs))}
    word_to_prob = {tokenizer.index_word[idx + 1]: prob for idx, prob in index_to_prob.items()}

    keys = word_to_prob.keys()
    for key in keys:
        word_to_prob[key] *= idf[key]
    
    # Sort the words in descending order of probability
    sorted_word_prob = sorted(word_to_prob.items(), key=lambda x: x[1], reverse=True)

    # Get the sorted probabilities
    sorted_probs = [p for w, p in sorted_word_prob]

    # Define the value of alpha
    alpha = 1.0

    # Calculate the Zipfian distribution
    zipf_probs = [1 / (i + 1) ** alpha for i in range(len(sorted_probs))]
    zipf_probs = np.array(zipf_probs) / np.sum(zipf_probs)

    # Apply the Zipfian distribution to the sorted probabilities
    weighted_probs = zipf_probs * sorted_probs

    # Assign the new probabilities to the original words
    for i, (word, prob) in enumerate(sorted_word_prob):
        word_to_prob[word] = weighted_probs[i]

    sorted_cp_final = sorted(word_to_prob.items(), key=lambda x: x[1], reverse=True)

        # Get the top N keys
    top_n = [sorted_cp_final[i][0] for i in range(min(n, len(sorted_cp_final)))]
    
    return top_n


## Binary tree

In [9]:
def apply_pbt(lst, N, input_string):
    # build dictionary
    dictionary = {}
    for i in range(len(lst)):
        bit_code = format(i, f'0{N}b')
        dictionary[lst[i]] = bit_code

    # find key
    output=""
    for key, value in dictionary.items():
        if value == input_string:
            output = key
            break

    return output

## PROCESSING TEXT

In [10]:
import heapq

input_text = "My secret code is: 10284324."
bitstream = ""
for c in input_text:
    bitstream += bin(ord(c))[2:].zfill(8)
print(bitstream)

k = 4
size_pool = 16
gen_output = ""
gen_sentence = ""

while bitstream:
    while len(bitstream) < k:
        bitstream = bitstream + '0'
    print(bitstream)
    if len(gen_sentence) == 0 or gen_sentence.split()[-1] == 'eol':
        if len(gen_sentence) != 0:
            gen_output += gen_sentence + " "
        gen_sentece = ""
        gen_sentence = np.random.choice(first_top_words)
        #print(gen_sentence)
        # get random sec word
        input_seq = tokenizer.texts_to_sequences([gen_sentence])[0]
        input_seq = pad_sequences([input_seq], maxlen=max_input_length)
        output_probs = model2.predict(input_seq)[0]
        next_word_id = np.random.choice(vocab_size, p=output_probs)
        next_word = tokenizer.index_word[next_word_id]
        gen_sentence += " " + next_word
        #print(gen_sentence)
    else:
        input_seq = tokenizer.texts_to_sequences([gen_sentence])[0]
        input_seq = pad_sequences([input_seq], maxlen=max_input_length)
        output_probs = model2.predict(input_seq)[0]
        '''top_indices = np.argsort(output_probs)[::-1][:8]
        top_probs = output_probs[top_indices]
        next_word_id = np.random.choice(top_indices, p=top_probs/sum(top_probs))
        next_word = tokenizer.index_word[next_word_id]'''
        top_words = idx_ziphian_tree(output_probs, tokenizer, size_pool)
        next_word = apply_pbt(top_words, k, bitstream[:k])
        gen_sentence += " " + next_word
        
        bitstream = bitstream[k:]

if len(gen_sentence) != 0:
    gen_output += gen_sentence
print('output')
print(gen_output)

sentences = gen_output.split(' eol')

# Capitalize the first letter of each sentence and replace 'i' with 'I'
output_sentences = []
for sentence in sentences:
    sentence = sentence.strip()  # remove any leading or trailing white space
    sentence = sentence.capitalize()
    sentence = sentence.replace(' i ', ' I ')
    output_sentences.append(sentence)

# Join the sentences back into a single string with '. ' as the separator
output_string = '. '.join(output_sentences)

print('final')
print(output_string)

01001101011110010010000001110011011001010110001101110010011001010111010000100000011000110110111101100100011001010010000001101001011100110011101000100000001100010011000000110010001110000011010000110011001100100011010000101110
01001101011110010010000001110011011001010110001101110010011001010111010000100000011000110110111101100100011001010010000001101001011100110011101000100000001100010011000000110010001110000011010000110011001100100011010000101110
01001101011110010010000001110011011001010110001101110010011001010111010000100000011000110110111101100100011001010010000001101001011100110011101000100000001100010011000000110010001110000011010000110011001100100011010000101110
1101011110010010000001110011011001010110001101110010011001010111010000100000011000110110111101100100011001010010000001101001011100110011101000100000001100010011000000110010001110000011010000110011001100100011010000101110
01111001001000000111001101100101011000110111001001100101011101000010000001100011011011110110010001100101

In [None]:
model.save('movies_model_2.h5')

In [13]:
import tensorflow as tf

print('1: ', tf.config.list_physical_devices('GPU'))
print('2: ', tf.test.is_built_with_cuda)
print('3: ', tf.test.gpu_device_name())
print('4: ', tf.config.get_visible_devices())

1:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
2:  <function is_built_with_cuda at 0x000002A1328629E0>
3:  /device:GPU:0
4:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## DECODE

In [11]:
stegotext = output_string
original_stego = gen_output
print(original_stego)
print('-----')
print(stegotext)
print('-----')
stegotext = stegotext.replace(".", " eol").lower()
print(stegotext)
print('-----')

sentences = []
sentence = ""
for word in stegotext.split():
    if "eol" in word:
        sentence += word
        sentences.append(sentence)
        sentence = ""
    else:
        sentence += word + " "

# Add any remaining text as a final sentence
if sentence:
    sentences.append(sentence.strip())

    
print(sentences)

there was at she one it out to can bad in about about have my soon it is they bad is of they bad watch than in have script of and and no where of of of by in eol every scene it unknown a few to he were like they had my thought was eol great lack a did be they
-----
There was at she one it out to can bad in about about have my soon it is they bad is of they bad watch than in have script of and and no where of of of by in. Every scene it unknown a few to he were like they had my thought was. Great lack a did be they
-----
there was at she one it out to can bad in about about have my soon it is they bad is of they bad watch than in have script of and and no where of of of by in eol every scene it unknown a few to he were like they had my thought was eol great lack a did be they
-----
['there was at she one it out to can bad in about about have my soon it is they bad is of they bad watch than in have script of and and no where of of of by in eol', 'every scene it unknown a few to he were l

In [14]:
output_bitstream = ""
i = 0
for sentence in sentences:
    words = sentence.split()
    sentence_random = ' '.join(words[:2])
    sentence_main = ' '.join(words[2:])
    #print(sentence_random)
    #print(sentence_main)
    
    words_main = sentence_main.split()
    stego_sentence = sentence_random
    for word in words_main:
        input_seq = tokenizer.texts_to_sequences([stego_sentence])[0]
        input_seq = pad_sequences([input_seq], maxlen=max_input_length)
        output_probs = model2.predict(input_seq)[0]
        top_words = idx_ziphian_tree(output_probs, tokenizer, size_pool)
        print(top_words)
        print(word)
        idx_target = get_index(top_words, word)
        #print(idx_target)
        word_bits = int_to_binstr(idx_target, k)
        print(word_bits)
        output_bitstream += word_bits
        #print(output_bitstream)
        stego_sentence += ' ' + word
        i += 1

print(output_bitstream)
print(i)

# Split bitstream into 8-bit chunks
chunks = [output_bitstream[i:i+8] for i in range(0, len(output_bitstream), 8)]

# Convert each chunk to its corresponding ASCII character
string = ""
for chunk in chunks:
    string += chr(int(chunk, 2))

print(string)

['my', 'of', 'going', 'there', 'at', 'are', 'into', 'eol', 'they', 'like', 'finally', 'brings', 'such', 'really', 'her', 'modern']
at
0100
['almost', 'eol', 'by', 'of', 'must', "don't", 'know', 'that', 'there', 'was', 'at', 'have', 'in', 'she', 'they', 'then']
she
1101
['unknown', 'as', 'out', 'eol', 'were', 'is', 'and', 'one', 'it', 'a', '2', 'of', 'effects', 'least', 'in', 'her']
one
0111
['to', 'and', 'left', 'so', 'goes', "doesn't", "couldn't", "he's", 'a', 'it', 'over', 'was', 'is', 'in', 'will', 'unknown']
it
1001
['as', 'unknown', 'out', 'time', 'least', 'and', "didn't", 'world', 'place', 'sense', '2', 'better', 'too', 'were', 'her', 'actor']
out
0010
['to', 'and', 'it', 'a', 'eol', 'you', 'is', 'movie', 'for', 'unknown', 'was', 'i', 'one', 'what', 'all', 'with']
to
0000
['he', 'eol', 'could', 'bad', 'other', 'of', 'have', 'can', 'should', 'and', 'be', "they're", 'about', 'look', 'obvious', 'in']
can
0111
['he', 'could', 'be', 'bad', 'other', 'should', 'look', 'eol', 'killer', '

['of', 'it', 'sort', 'are', 'eol', 'new', 'and', 'woman', 'like', 'is', 'definitely', 'her', 'up', 'a', 'an', 'in']
like
1000
['of', 'eol', 'and', 'they', 'have', 'is', 'about', 'one', 'in', 'a', 'it', 'i', 'was', 'would', 'from', 'at']
they
0011
['his', 'me', 'be', 'bad', 'had', 'other', 'were', 'by', 'them', 'too', 'acting', 'pretty', 'should', 'people', 'and', 'look']
had
0100
['is', 'of', 'get', 'my', 'eol', 'they', 'there', 'and', 'have', 'it', 'line', 'going', 'script', 'about', 'make', 'which']
my
0011
['next', 'soon', 'plays', 'thought', 'their', 'and', 'live', 'show', 'is', 'high', 'in', 'shots', 'shoot', 'myself', 'a', 'to']
thought
0011
['and', 'is', 'to', 'was', 'movie', 'a', 'good', 'it', 'eol', 'from', 'film', 'i', 'you', 'one', 'just', 'if']
was
0011
['of', 'and', 'eol', 'new', 'are', 'was', 'her', 'like', 'is', 'it', 'they', 'if', 'sort', 'a', 'in', 'an']
eol
0010
['to', 'and', 'is', 'a', 'it', 'movie', 'unknown', 'for', 'eol', 'you', 'what', 'big', 'man', 'was', 'as', 

In [13]:
def get_index(lst, val):
    try:
        return lst.index(val)
    except ValueError:
        return -1

def int_to_binstr(a, k):
    binstr = bin(a)[2:]
    return "0" * (k - len(binstr)) + binstr

print(int_to_binstr(2, 5))

00010
