<a href="https://colab.research.google.com/github/mohamedamr13/Machine-Learning-/blob/main/NLP_session.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import keras
import numpy as np

**Text Understanding**

In [None]:
# Retrieve the training and test sequences (lists of integer word ids) and their labels.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data()
# Retrieve the word index file mapping words to indices
word_index = keras.datasets.imdb.get_word_index()
# `load_data` shifts every index by `index_from` (default 3) because ids 0, 1 and 2
# are reserved for padding, start-of-sequence and out-of-vocabulary markers.
# The reverse lookup has to apply the same shift, otherwise every decoded word is wrong.
index_from = 3
inverted_word_index = {rank + index_from: word for word, rank in word_index.items()}
inverted_word_index[0] = "[PAD]"
inverted_word_index[1] = "[START]"
inverted_word_index[2] = "[OOV]"
# Decode the first sequence in the dataset
decoded_sequence = " ".join(inverted_word_index[i] for i in x_train[0])

In [None]:
# Inspect the shape of the training set (one entry per review).
x_train.shape

(25000,)

In [None]:
# Token ids returned by `load_data()` are the word ranks shifted by its default
# `index_from=3` (ids 0-2 are reserved), so the largest possible id is
# len(word_index) + 3. Sizing the vocabulary from the rank count alone would leave
# the rarest words outside every vocab_size-based vector and layer built below.
vocab_size = len(word_index) + 3


In [None]:
# Bag-of-words count vectors for the first 2000 training reviews.
# Rows are written straight into a preallocated integer matrix: building a Python
# list of vocab_size+1 ints per review first costs far more memory and time than
# the final array itself.
bow_width = vocab_size + 1
train_reviews = x_train[:2000]
bag_of_words_x_train = np.zeros((len(train_reviews), bow_width), dtype=np.int64)
for row, sentence in enumerate(train_reviews):
    # bincount gives, for every word id, how many times it occurs in this review.
    word_ids = np.asarray(sentence, dtype=np.int64)
    bag_of_words_x_train[row] = np.bincount(word_ids, minlength=bow_width)

In [None]:
# Bag-of-words count vectors for the first 100 test reviews.
# Rows are written straight into a preallocated integer matrix: building a Python
# list of vocab_size+1 ints per review first costs far more memory and time than
# the final array itself.
bow_width = vocab_size + 1
test_reviews = x_test[:100]
bag_of_words_x_test = np.zeros((len(test_reviews), bow_width), dtype=np.int64)
for row, sentence in enumerate(test_reviews):
    # bincount gives, for every word id, how many times it occurs in this review.
    word_ids = np.asarray(sentence, dtype=np.int64)
    bag_of_words_x_test[row] = np.bincount(word_ids, minlength=bow_width)

In [None]:
# Sanity check: (number of reviews, bag-of-words vector width) for both splits.
bag_of_words_x_train.shape, bag_of_words_x_test.shape

((2000, 88585), (100, 88585))

In [None]:
# Longest and shortest review (in tokens) among the first 2000 training samples.
review_lengths = [len(review) for review in x_train[:2000]]
# 0 and 1000000 are the starting bounds of the running maximum / minimum.
max_seq = max([0, *review_lengths])
min_seq = min([1000000, *review_lengths])
print(max_seq, min_seq)

1038 19


In [None]:
# Right-pad the first 2000 training reviews to a common length of max_seq.
# The padded copy is written into a fresh array: appending to the review lists
# themselves would modify x_train in place, so every other cell reading x_train
# (and any re-run of this one) would see already-padded data.
pad_character = vocab_size + 1
train_reviews = x_train[:2000]
padded_x_train = np.full((len(train_reviews), max_seq), pad_character, dtype=np.int64)
for row, sentence in enumerate(train_reviews):
    # Leading positions hold the review; the remainder keeps the padding id.
    padded_x_train[row, :len(sentence)] = sentence
padded_y_train = y_train[:2000]

In [None]:
# Fully connected sentiment classifier over bag-of-words count vectors.
model = keras.models.Sequential()

model.add(keras.layers.Dense(1024, input_shape=(vocab_size+1,), activation='relu'))
model.add(keras.layers.Dense(512, activation='relu'))
model.add(keras.layers.Dense(256, activation='relu'))
# Sigmoid rather than tanh: its (0, 1) output can be read as the probability of
# the positive class, which is what binary cross-entropy expects.
model.add(keras.layers.Dense(1, activation='sigmoid'))

# A model has to be compiled before `fit` can be called; same loss / optimizer /
# metric as the LSTM classifier further down.
model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])




In [None]:
# Train the dense classifier. Labels are sliced to the number of rows actually
# present in each feature matrix; a fixed slice of 2000 test labels does not match
# the smaller validation matrix and makes Keras reject the validation data.
model.fit(
    bag_of_words_x_train,
    y_train[:len(bag_of_words_x_train)],
    validation_data=(bag_of_words_x_test, y_test[:len(bag_of_words_x_test)]),
    epochs=10,
    batch_size=16,
)

Epoch 1/10

In [None]:
# LSTM sentiment classifier over padded sequences of word ids.
input_layer = keras.layers.Input((max_seq,))
# The padded sequences contain ids from 0 up to the padding id (vocab_size + 1),
# so the embedding table needs vocab_size + 2 rows. With only vocab_size + 1 rows
# the padding id is out of range for the lookup.
embedding = keras.layers.Embedding(vocab_size+2, 64)(input_layer)

# First LSTM returns the full sequence so the second one can consume it;
# the second returns only its final state, which feeds the classifier head.
lstm_layer = keras.layers.LSTM(16, return_sequences=True)(embedding)
lstm_layer_2 = keras.layers.LSTM(16, return_sequences=False)(lstm_layer)

output_layer = keras.layers.Dense(1, activation='sigmoid')(lstm_layer_2)
model_lstm = keras.models.Model(input_layer, output_layer)

model_lstm.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])


In [None]:
# Train the LSTM classifier on the padded reviews and their labels.
model_lstm.fit(
    x=padded_x_train,
    y=padded_y_train,
    batch_size=16,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efeebfecc90>

**Text Generation**

In [None]:
# Run the current model_lstm on the first padded review (slice keeps the batch axis).
pred = model_lstm.predict(padded_x_train[:1])

In [None]:
# Inspect the shape of the prediction produced above.
pred.shape

(1, 1038, 16)

In [None]:
# Download (or reuse the cached copy of) the Shakespeare corpus.
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
# Read through a context manager so the file handle is closed deterministically.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

Stripping: replace newlines with spaces, remove `:`, `;`, `.` and `,`, and lowercase the text.

In [None]:
# Newlines become spaces; ':', ';', '.' and ',' are dropped; everything is lowercased.
strip_table = str.maketrans({"\n": " ", ":": None, ";": None, ".": None, ",": None})
text = text.translate(strip_table).lower()


Tokenization: split the cleaned text into word-level tokens, then into character-level tokens.

In [None]:
# Split on runs of whitespace. The cleaned text contains consecutive spaces, and
# splitting on a single " " would turn each of those into an empty-string token
# that then enters the vocabulary and the training windows.
word_tokenized_text = text.split()
# Preview only the first tokens; printing the whole corpus floods the notebook.
print(word_tokenized_text[:50])



In [None]:
# Character-level tokens: one entry per character of the first 100,000 characters.
# Iterating a str always yields length-1 strings, so no emptiness filter is needed.
char_tokenized_text = list(text[:100_000])
# Preview only the first tokens; printing all of them floods the notebook.
print(char_tokenized_text[:200])

['f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ' ', 'b', 'e', 'f', 'o', 'r', 'e', ' ', 'w', 'e', ' ', 'p', 'r', 'o', 'c', 'e', 'e', 'd', ' ', 'a', 'n', 'y', ' ', 'f', 'u', 'r', 't', 'h', 'e', 'r', ' ', 'h', 'e', 'a', 'r', ' ', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', ' ', ' ', 'a', 'l', 'l', ' ', 's', 'p', 'e', 'a', 'k', ' ', 's', 'p', 'e', 'a', 'k', ' ', ' ', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ' ', 'y', 'o', 'u', ' ', 'a', 'r', 'e', ' ', 'a', 'l', 'l', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', ' ', 'r', 'a', 't', 'h', 'e', 'r', ' ', 't', 'o', ' ', 'd', 'i', 'e', ' ', 't', 'h', 'a', 'n', ' ', 't', 'o', ' ', 'f', 'a', 'm', 'i', 's', 'h', '?', ' ', ' ', 'a', 'l', 'l', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', ' ', ' ', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ' ', 'f', 'i', 'r', 's', 't', ' ', 'y', 'o', 'u', ' ', 'k', 'n', 'o', 'w', ' ', 'c', 'a', 'i', 'u', 's', ' ',

In [None]:
vectorized_char_text = []
# Distinct characters in order of first appearance; a character's position is its id.
unique_chars = list(dict.fromkeys(char_tokenized_text))
char2idx = {char: idx for idx, char in enumerate(unique_chars)}
idx2char = dict(enumerate(unique_chars))
char2idx

{' ': 5,
 '!': 26,
 '&': 31,
 "'": 24,
 '-': 28,
 '?': 23,
 'a': 15,
 'b': 10,
 'c': 6,
 'd': 14,
 'e': 8,
 'f': 0,
 'g': 25,
 'h': 18,
 'i': 1,
 'j': 27,
 'k': 20,
 'l': 21,
 'm': 19,
 'n': 9,
 'o': 11,
 'p': 13,
 'q': 29,
 'r': 2,
 's': 3,
 't': 4,
 'u': 17,
 'v': 22,
 'w': 12,
 'x': 30,
 'y': 16,
 'z': 7}

In [None]:
vectorized_word_text = []
word2idx = {}
for token in word_tokenized_text:
    # setdefault only inserts unseen tokens; the current size is the next free id.
    word2idx.setdefault(token, len(word2idx))
# Reverse lookup from id back to word.
idx2word = {idx: token for token, idx in word2idx.items()}
word2idx

In [None]:
# Build (context, next word) training pairs from non-overlapping windows:
# each sample is `prev_words` consecutive word ids, the label is the id that follows.
word_x_train = []
word_y_train = []

prev_words = 5
# The window size is taken from `prev_words` everywhere instead of repeating the
# literal. A window starting at `start` is valid while start + prev_words is still
# an index into the token list, i.e. start < len - prev_words; stopping one earlier
# would drop the last valid window.
for start in range(0, len(word_tokenized_text) - prev_words, prev_words):
    context = word_tokenized_text[start:start + prev_words]
    target = word_tokenized_text[start + prev_words]
    word_x_train.append([word2idx[w] for w in context])
    word_y_train.append(word2idx[target])


In [None]:
# Turn the collected id lists into NumPy arrays and report their shapes.
word_x_train, word_y_train = map(np.array, (word_x_train, word_y_train))
print(f"{word_x_train.shape} {word_y_train.shape}")

(41978, 5) (41978,)


In [None]:
# Build (context, next character) training pairs from non-overlapping windows:
# each sample is `prev_chars` consecutive character ids, the label is the id that follows.
char_x_train = []
char_y_train = []

prev_chars = 10
# A window starting at `start` is valid while start + prev_chars is still an index
# into the token list, i.e. start < len - prev_chars; stopping one earlier would
# drop the last valid window for some corpus lengths.
for start in range(0, len(char_tokenized_text) - prev_chars, prev_chars):
    context = char_tokenized_text[start:start + prev_chars]
    target = char_tokenized_text[start + prev_chars]
    char_x_train.append([char2idx[c] for c in context])
    char_y_train.append(char2idx[target])
char_x_train = np.array(char_x_train)
char_y_train = np.array(char_y_train)

In [None]:
# Sanity check: (number of windows, window length) and (number of labels,).
print(char_x_train.shape, char_y_train.shape)

(9999, 10) (9999,)


In [None]:
# NOTE: `vocab_size` is reassigned here; from this point on it is the number of
# distinct word tokens in word2idx, not the IMDB vocabulary size used earlier.
vocab_size = len(word2idx)
vocab_size

15456

In [None]:
# Word-level next-word model: `prev_words` word ids in, a probability
# distribution over the word vocabulary out.
input_layer = keras.layers.Input((prev_words,))
# Sized from `vocab_size` (defined in the previous cell); the names
# in_vocab_size / out_vocab_size are not defined anywhere in this notebook.
embedding = keras.layers.Embedding(vocab_size+1, 64)(input_layer)

lstm_layer = keras.layers.LSTM(16, return_sequences=True)(embedding)
lstm_layer_2 = keras.layers.LSTM(16, return_sequences=False)(lstm_layer)

# The softmax head must be built in this cell: without it `output_layer` would be
# whatever tensor an earlier cell left behind, which belongs to a different graph,
# and sparse categorical cross-entropy needs one probability per vocabulary entry.
output_layer = keras.layers.Dense(vocab_size, activation='softmax')(lstm_layer_2)
model_lstm = keras.models.Model(input_layer, output_layer)

model_lstm.compile(loss="sparse_categorical_crossentropy", optimizer='adam', metrics=["accuracy"])


In [None]:
# Predict from the first training window and look up the word whose output score is highest.
pred = model_lstm.predict(word_x_train[:1])
idx2word[np.argmax(pred)]


'spectatorship'

In [None]:
def get_sentence(vectorized_words, index_to_word=None):
  """Turn a sequence of token ids back into text.

  Args:
    vectorized_words: iterable of token ids.
    index_to_word: optional mapping from id to token. When omitted, the
      notebook-level `idx2word` dictionary is used, so existing calls keep
      working; passing e.g. `idx2char` decodes other vocabularies.

  Returns:
    The tokens joined into one string, each followed by a single space
    (so a non-empty result ends with a trailing space).

  Raises:
    KeyError: if an id is missing from the mapping.
  """
  if index_to_word is None:
    index_to_word = idx2word
  # join builds the string in one pass instead of repeated concatenation.
  return "".join(index_to_word[vector] + ' ' for vector in vectorized_words)

In [None]:
# Decode the first training window back into words as a sanity check.
get_sentence(word_x_train[0])

'first citizen before we proceed '

In [None]:
# Train the word-level model on the (context window, next word) pairs.
model_lstm.fit(
    x=word_x_train,
    y=word_y_train,
    batch_size=16,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efe61b5c8d0>

In [None]:
# NOTE: `vocab_size` is reassigned again; it is now the number of distinct
# characters in char2idx.
vocab_size = len(char2idx)
vocab_size

32

In [None]:
# Character-level next-character model: `prev_chars` character ids in,
# a softmax over the character vocabulary out.
input_layer = keras.layers.Input(shape=(prev_chars,))
embedding = keras.layers.Embedding(input_dim=vocab_size + 1, output_dim=64)(input_layer)

# Two stacked LSTMs: the first hands its full sequence to the second,
# the second emits only its final state.
lstm_layer = keras.layers.LSTM(units=16, return_sequences=True)(embedding)
lstm_layer_2 = keras.layers.LSTM(units=16, return_sequences=False)(lstm_layer)

output_layer = keras.layers.Dense(units=vocab_size, activation='softmax')(lstm_layer_2)
model_lstm = keras.models.Model(inputs=input_layer, outputs=output_layer)

model_lstm.compile(
    optimizer='adam',
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)


In [None]:
# Train the character-level model on the (context window, next character) pairs.
model_lstm.fit(
    x=char_x_train,
    y=char_y_train,
    batch_size=16,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efe60d2cdd0>