In [1]:
# Word-level one-hot encoding
import numpy as np

samples = ['The cat sat on the  mat', 'The dog ate may homework.']

# Build a vocabulary: every distinct whitespace-separated token gets the next
# free integer, starting at 1 (index 0 is left unassigned). Tokens are taken
# verbatim, so 'The' and 'the' are different entries and 'homework.' keeps
# its trailing period.
token_index = {}
for sample in samples:
    for word in sample.split():
        # setdefault only inserts when the word has not been seen before.
        token_index.setdefault(word, len(token_index) + 1)


In [2]:
# Inspect the word -> index mapping built by the loop in the previous cell.
token_index

{'The': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'the': 5,
 'mat': 6,
 'dog': 7,
 'ate': 8,
 'may': 9,
 'homework.': 10}

In [4]:
max_length = 10  # only the first max_length tokens of each sample are encoded

# results[i, j, k] == 1 means: token j of sample i has index k in token_index.
# The last axis has (largest index + 1) slots because token indices start at 1,
# so slot 0 is never set.
results = np.zeros(shape=(len(samples),
                          max_length,
                          max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    # Truncate before enumerating instead of materialising the full
    # enumerate() list and slicing it afterwards.
    for j, word in enumerate(sample.split()[:max_length]):
        index = token_index.get(word)
        if index is None:
            # Out-of-vocabulary word. NumPy treats a None index as np.newaxis,
            # so assigning through it would set the whole row to 1; leave the
            # row all-zero instead.
            continue
        results[i, j, index] = 1

In [7]:
from tensorflow import keras
from keras.preprocessing.text import Tokenizer

In [13]:
# Create a Keras Tokenizer configured with num_words=1000 and let it build its
# vocabulary from the sample sentences.
tokenizer = Tokenizer(num_words = 1000)
tokenizer.fit_on_texts(samples)

In [14]:
# Inspect the vocabulary the Tokenizer learned from samples.
tokenizer.word_index

{'the': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'mat': 5,
 'dog': 6,
 'ate': 7,
 'may': 8,
 'homework': 9}

In [15]:
# Turn each sample sentence into a list of integer word indices (one list per
# sample), then display the result.
sequences = tokenizer.texts_to_sequences(samples)
sequences

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]

In [16]:
# Encode the samples as a 2-D array with one row per sample, using the
# tokenizer's 'binary' mode, then display it.
# NOTE(review): the row width is presumably the tokenizer's num_words -- confirm.
one_hot_results = tokenizer.texts_to_matrix(samples,
                                           mode = 'binary')
one_hot_results

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [17]:
from keras.datasets import imdb
from keras import preprocessing

max_features = 10000  # passed to imdb.load_data as num_words (vocabulary cap)
maxlen = 20           # passed to pad_sequences as maxlen (sequence width)

# Load the IMDB dataset bundled with Keras as (data, labels) pairs for the
# train and test splits.
(x_train, y_train), (x_test, y_test) = imdb.load_data(
num_words=max_features)

# Convert both splits to fixed-width 2-D arrays with maxlen columns.
# Note: x_train / x_test are rebound here, so the un-padded data is no longer
# reachable under these names after this cell runs.
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test  = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

In [19]:
# Check the shape of the padded training array: (number of samples, maxlen).
x_train.shape

(25000, 20)

In [21]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

# Small binary classifier over the padded IMDB sequences:
# Embedding -> Flatten -> Dense(1, sigmoid).
model = Sequential()

# The embedding's vocabulary size must agree with the num_words cap used when
# loading the data, so reuse max_features instead of repeating the literal.
# Each of the maxlen word indices is mapped to an 8-dimensional vector.
model.add(Embedding(max_features, 8, input_length=maxlen))

# Collapse the (maxlen, 8) embedding output into one flat vector per sample.
model.add(Flatten())

# Single sigmoid unit for the binary label.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 8)             80000     
                                                                 
 flatten (Flatten)           (None, 160)               0         
                                                                 
 dense (Dense)               (None, 1)                 161       
                                                                 
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Train the model for 10 epochs with mini-batches of 32 samples, holding out
# 20% of x_train / y_train for validation. The returned object is kept in
# `history`.
history = model.fit(x_train, y_train,
                   epochs = 10,
                   batch_size=32,
                    validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
import os
  
# Root of the extracted aclImdb dataset. Defaults to the original
# machine-specific location; set the IMDB_DIR environment variable to point
# the notebook at a different copy without editing this cell.
imdb_dir = os.environ.get('IMDB_DIR', 'C:\\Users\\pk\\Desktop\\aclImdb')
train_dir = os.path.join(imdb_dir, 'train')

labels = []  # 0 for a review under 'neg', 1 for a review under 'pos'
texts = []   # raw review text, aligned index-for-index with labels

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns names in arbitrary order; sort so the order of
    # texts/labels is the same on every run and every machine.
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            # The context manager closes the file even if read() raises.
            with open(os.path.join(dir_name, fname), encoding="utf8") as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

In [41]:
# Simple NumPy implementation of an RNN
import numpy as np

timesteps = 100        # number of steps in the input sequence
input_features = 32    # length of each per-step input vector
output_features = 64   # length of the state / per-step output vector

# Seeded generator so the random inputs and weights (and therefore every
# output below) are identical on each Restart & Run All.
rng = np.random.default_rng(0)

inputs = rng.random((timesteps, input_features))
state_t = np.zeros((output_features,))  # initial state: all zeros

W = rng.random((output_features, input_features))   # input -> output weights
U = rng.random((output_features, output_features))  # state -> output weights
b = rng.random((output_features,))                  # bias

successive_outputs = []

for input_t in inputs:
    # Combine the current input with the previous state to get this step's output.
    output_t = np.tanh(np.dot(W, input_t) + np.dot(U, state_t) + b)

    successive_outputs.append(output_t)

    # This step's output becomes the state fed into the next step.
    state_t = output_t

# Joins the per-step vectors end to end into one 1-D array of length
# timesteps * output_features.
# NOTE(review): this discards the timestep axis; np.stack(successive_outputs,
# axis=0) would give (timesteps, output_features) instead -- confirm which
# shape is intended before changing it.
final_output = np.concatenate(successive_outputs, axis=0)

In [43]:
# Shape of the input sequence: (timesteps, input_features).
inputs.shape

(100, 32)

In [45]:
# Shape of the state vector: (output_features,). After the loop, state_t holds
# the output of the last timestep.
state_t.shape

(64,)

In [47]:
# Shape of the input weight matrix: (output_features, input_features).
W.shape

(64, 32)

In [51]:
# Display the concatenated per-timestep outputs.
final_output

array([0.99999992, 0.99999927, 0.99999976, ..., 1.        , 1.        ,
       1.        ])

In [52]:
# Shape of the concatenated outputs: one flat axis of length
# timesteps * output_features.
final_output.shape

(6400,)