In [11]:
import pandas as pd
import numpy as np

In [4]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [5]:
# Fetch the IMDB reviews dataset through TensorFlow Datasets.
# as_supervised=True makes every split yield (text, label) pairs, which is
# the shape the extraction loops further down unpack.
imdb = tfds.load('imdb_reviews', as_supervised=True)

[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete3N3V2K/imdb_reviews-train.tfrecord*...…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete3N3V2K/imdb_reviews-test.tfrecord*...:…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete3N3V2K/imdb_reviews-unsupervised.tfrec…

[1mDataset imdb_reviews downloaded and prepared to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [6]:
# Four independent, empty containers: raw review texts and their labels,
# one pair for the training split and one pair for the test split.
train_sent, test_sent = [], []
train_labels, test_labels = [], []

In [7]:
# Materialise the training split as plain Python objects.
# The containers are rebuilt inside this cell so that re-running it cannot
# append the same reviews a second time, and so that it still works after
# train_labels has been converted to a NumPy array further down.
train_sent = []
train_labels = []
for sent, label in imdb['train']:
  train_sent.append(sent.numpy().decode('utf8'))  # bytes tensor -> str
  train_labels.append(label.numpy())              # scalar tensor -> NumPy scalar

In [8]:
# Materialise the test split as plain Python objects.
# The containers are rebuilt inside this cell so that re-running it cannot
# append the same reviews a second time, and so that it still works after
# test_labels has been converted to a NumPy array further down.
test_sent = []
test_labels = []
for sent1, label1 in imdb['test']:
  test_sent.append(sent1.numpy().decode('utf8'))  # bytes tensor -> str
  test_labels.append(label1.numpy())              # scalar tensor -> NumPy scalar

In [12]:
# Turn both label collections into NumPy arrays; they are handed to
# model.fit() in this form later on.
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

In [13]:
# Sanity check: which distinct label values occur in the training labels?
np.unique(train_labels)

array([0, 1])

In [14]:
# Sanity check: which distinct label values occur in the test labels?
np.unique(test_labels)

array([0, 1])

In [15]:
# Number of training reviews collected.
len(train_sent)

25000

In [16]:
# Number of test labels collected (note: labels here, texts in the cell above).
len(test_labels)

25000

In [17]:
# Hyper-parameters shared by the tokenizer, the padding step and both models:
#   vocab_size    -> Tokenizer num_words and Embedding input dimension
#   embedding_dim -> Embedding output dimension
#   max_len       -> length every review is padded/truncated to
vocab_size, embedding_dim, max_len = 1000, 32, 100

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
# Word-level tokenizer whose encoding vocabulary is capped via
# num_words=vocab_size; '<UKW>' is the stand-in token for words outside it.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<UKW>')

In [20]:
# Build the vocabulary from the training reviews only; the test reviews are
# later encoded with this same fitted tokenizer.
tokenizer.fit_on_texts(train_sent)

In [21]:
# word -> integer id mapping learned by the tokenizer; it is inverted later
# to label the exported embedding vectors.
word_index = tokenizer.word_index

In [23]:
# Training reviews -> lists of integer ids -> fixed-width matrix.
# Both padding and truncation act on the tail of each review ('post').
tr_sequence = tokenizer.texts_to_sequences(train_sent)
tr_padded_seq = pad_sequences(
    tr_sequence,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

In [24]:
# Test reviews -> lists of integer ids -> fixed-width matrix, using the
# tokenizer fitted on the training reviews and the same 'post' settings.
ts_sequence = tokenizer.texts_to_sequences(test_sent)
ts_padded_seq = pad_sequences(
    ts_sequence,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

## Simple model with Embedding layer

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

In [26]:
# Empty Sequential container for the baseline; layers are appended one cell
# at a time below.
model = Sequential()

In [27]:
# Trainable lookup table: one embedding_dim-sized vector per token id,
# applied to inputs of max_len ids.
model.add(
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_len,
    )
)

In [29]:
# Collapse the per-review (max_len, embedding_dim) embedding matrix into a
# single flat vector so it can feed the dense layers.
# NOTE(review): every model.add cell appends a layer, so re-running one of
# them adds a duplicate; recreate `model` first when re-running.
model.add(Flatten())

In [30]:
# Hidden fully connected layer with 128 ReLU units.
model.add(Dense(128, activation='relu'))

In [31]:
# Single sigmoid unit: the model emits one value in (0, 1) per review.
model.add(Dense(1, activation='sigmoid'))

In [32]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           32000     
                                                                 
 flatten (Flatten)           (None, 3200)              0         
                                                                 
 dense (Dense)               (None, 128)               409728    
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 441,857
Trainable params: 441,857
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Binary classification set-up: cross-entropy loss on the sigmoid output,
# Adam optimiser, accuracy reported during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [34]:
# Train the dense baseline for 25 epochs on the padded training reviews.
# NOTE(review): the test split doubles as the validation data here, so the
# validation metrics are the only held-out numbers this notebook produces.
model.fit(tr_padded_seq, train_labels,
          validation_data=(ts_padded_seq, test_labels),
          epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fa189bd3910>

In [35]:
# Hand-written favourable review used as a smoke test for the trained model.
# Note that it describes a music book, not a film, unlike the training data.
new_reviews = ["I bought this for my husband who plays the piano.  He is having a wonderful time playing these old hymns.  The music  is at times hard to read because we think the book was published for singing from more than playing from.  Great purchase though!"]

In [36]:
# Encode the sample with the tokenizer fitted on the training reviews.
# NOTE(review): new_reviews is rebound from raw text to integer sequences,
# so re-running this cell alone would feed already-encoded data back in.
new_reviews = tokenizer.texts_to_sequences(new_reviews)

In [37]:
# Bring the encoded sample to the fixed length the model was trained on,
# with the same tail-side padding/truncation as the training data.
new_reviews = pad_sequences(new_reviews, maxlen=max_len, padding='post', truncating='post')

In [38]:
# Sigmoid output of the dense baseline for the sample above.
# Presumably values near 1 mean "positive" - confirm against the dataset's
# label encoding.
model.predict(new_reviews)

array([[0.98876584]], dtype=float32)

In [39]:
# Hand-written unfavourable review (about a product, not a film) used as the
# second smoke test; it replaces the previous sample held in new_reviews.
new_reviews = ["I am not happy with the product. It stopped working after 2 days. Customer service is bad."]

In [40]:
# Encode the sample with the tokenizer fitted on the training reviews.
# NOTE(review): new_reviews is rebound from raw text to integer sequences,
# so re-running this cell alone would feed already-encoded data back in.
new_reviews = tokenizer.texts_to_sequences(new_reviews)

In [41]:
# Bring the encoded sample to the fixed length the model was trained on,
# with the same tail-side padding/truncation as the training data.
new_reviews = pad_sequences(new_reviews, maxlen=max_len, padding='post', truncating='post')

In [42]:
# Sigmoid output of the dense baseline for the unfavourable sample.
# Presumably values near 0 mean "negative" - confirm against the dataset's
# label encoding.
model.predict(new_reviews)

array([[0.00058785]], dtype=float32)

## Bidirectional LSTM network

In [43]:
from tensorflow.keras.layers import LSTM, Bidirectional

In [44]:
# Empty Sequential container for the recurrent model; layers are appended in
# the cells below.
bidi_model = Sequential()

In [45]:
# Same embedding configuration as the dense baseline, but a fresh layer whose
# weights are trained together with the recurrent model.
bidi_model.add(
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_len,
    )
)

In [46]:
# Recurrent encoder: a 128-unit LSTM wrapped in Bidirectional.
bidi_model.add(Bidirectional(LSTM(units=128)))

# Hidden dense layer with an L2 weight penalty (library-default strength).
bidi_model.add(
    Dense(
        units=128,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.L2(),
    )
)

# Single sigmoid unit, matching the binary cross-entropy loss used to compile.
bidi_model.add(Dense(units=1, activation='sigmoid'))

In [47]:
# Same training configuration as the dense baseline: binary cross-entropy,
# Adam, accuracy as the reported metric.
bidi_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [48]:
# Print the layer stack with output shapes and parameter counts.
bidi_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 32)           32000     
                                                                 
 bidirectional (Bidirectiona  (None, 256)              164864    
 l)                                                              
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 229,889
Trainable params: 229,889
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Train the bidirectional LSTM for 5 epochs on the padded training reviews.
# NOTE(review): as with the dense baseline, the test split is used as the
# validation data.
bidi_model.fit(tr_padded_seq, train_labels,
          validation_data=(ts_padded_seq, test_labels),
          epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa188faacd0>

## Visualising the word embeddings with the TensorFlow Embedding Projector

In [50]:
# The embedding layer is the first layer of the recurrent model; its first
# weight array is the embedding matrix, one row per token id.
emb_layer = bidi_model.get_layer(index=0)
emb_weights = emb_layer.get_weights()[0]
print(emb_weights.shape)

(1000, 32)


In [51]:
# Invert the tokenizer vocabulary: integer id -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

In [52]:
import io

In [53]:
# Export the trained embedding matrix in the two-file TSV layout:
#   vectors_01Oct_01.tsv  - one tab-separated embedding vector per line
#   metadata_01Oct_01.tsv - the word belonging to each vector, one per line
# Opening both files in a `with` block guarantees they are flushed and closed
# even if a lookup fails part-way through the loop.
with open('vectors_01Oct_01.tsv', 'w', encoding='utf-8') as vectors, \
     open('metadata_01Oct_01.tsv', 'w', encoding='utf-8') as metadata:
  # Ids start at 1; id 0 is deliberately skipped (pad_sequences uses 0 as its
  # default padding value, so that row does not stand for a word).
  for word_id in range(1, vocab_size):
    word = reverse_word_index[word_id]
    embeddings = emb_weights[word_id]
    metadata.write(word + "\n")
    vectors.write('\t'.join(str(value) for value in embeddings) + "\n")

# Download the two files written above and upload them to
# https://projector.tensorflow.org/ to visualise the embeddings there.

## Text generation using a Transformer model

In [1]:
# Pinned to the release this notebook was executed with, so a fresh run
# resolves to the same version; %pip installs into the running kernel's
# environment.
%pip install transformers==4.22.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 42.6 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 65.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


In [2]:
from transformers import pipeline
# Text-generation pipeline backed by the EleutherAI/gpt-neo-125M checkpoint
# (downloaded on first use).
generator = pipeline(
    task='text-generation',
    model='EleutherAI/gpt-neo-125M',
)

Downloading:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/526M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/560 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [3]:
# Continue the prompt with sampled text. Because do_sample=True and no seed
# is set, the continuation differs from run to run.
res = generator("The movie is good", max_length=20, do_sample=True, temperature=0.9)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [4]:
# Show the text of the first (here: only requested) generated sequence.
print(res[0]['generated_text'])

The movie is good: it gives us a rough estimate of the quality of the movie. Sure of
