## Sentiment Analysis  
The Model consists of two phases:
#### Embedding Layer
Pretrained embedding layer using byte-pair encoding; BPEmb toolkit is used.
#### CNN
Complex features from sentences are extracted using a convolutional neural network. Classification is done using a softmax layer directly after the CNN.  
  
The dataset comes from Kaggle's "Sentiment Analysis on Movie Reviews" challenge:  
https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data  

In [0]:
!kaggle competitions download -c sentiment-analysis-on-movie-reviews

Downloading sentiment-analysis-on-movie-reviews.zip to /content
  0% 0.00/1.90M [00:00<?, ?B/s]
100% 1.90M/1.90M [00:00<00:00, 126MB/s]


In [0]:
!unzip sentiment-analysis-on-movie-reviews.zip -d sentiment-analysis-on-movie-reviews/

Archive:  sentiment-analysis-on-movie-reviews.zip
  inflating: sentiment-analysis-on-movie-reviews/sampleSubmission.csv  
  inflating: sentiment-analysis-on-movie-reviews/test.tsv.zip  
  inflating: sentiment-analysis-on-movie-reviews/train.tsv.zip  


In [0]:
!unzip sentiment-analysis-on-movie-reviews/train.tsv.zip

Archive:  sentiment-analysis-on-movie-reviews/train.tsv.zip
  inflating: train.tsv               


In [0]:
!unzip sentiment-analysis-on-movie-reviews/test.tsv.zip

Archive:  sentiment-analysis-on-movie-reviews/test.tsv.zip
  inflating: test.tsv                


In [0]:
!pip install bpemb

Collecting bpemb
  Downloading bpemb-0.3.0-py3-none-any.whl (19 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 8.0 MB/s 
Installing collected packages: sentencepiece, bpemb
Successfully installed bpemb-0.3.0 sentencepiece-0.1.85


In [0]:
!pip install tensorflow-gpu==2.0.0

Collecting tensorflow-gpu==2.0.0
  Downloading tensorflow_gpu-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (380.8 MB)
[K     |████████████████████████████████| 380.8 MB 31 kB/s 
Collecting tensorflow-estimator<2.1.0,>=2.0.0
  Downloading tensorflow_estimator-2.0.1-py2.py3-none-any.whl (449 kB)
[K     |████████████████████████████████| 449 kB 52.9 MB/s 
Collecting tensorboard<2.1.0,>=2.0.0
  Downloading tensorboard-2.0.2-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 61.0 MB/s 
[31mERROR: tensorflow 1.15.0 has requirement tensorboard<1.16.0,>=1.15.0, but you'll have tensorboard 2.0.2 which is incompatible.[0m
[31mERROR: tensorflow 1.15.0 has requirement tensorflow-estimator==1.15.1, but you'll have tensorflow-estimator 2.0.1 which is incompatible.[0m
Installing collected packages: tensorflow-estimator, tensorboard, tensorflow-gpu
  Attempting uninstall: tensorflow-estimator
    Found existing installation: tensorflow-estimator 1.15.1
    Uninstalling tens

In [0]:
import os
import pandas as pd
from bpemb import BPEmb
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

In [0]:
# Both competition files are tab-separated; load them and report their shapes.
train_df, test_df = (
    pd.read_csv(tsv_path, sep="\t") for tsv_path in ("train.tsv", "test.tsv")
)
for frame in (train_df, test_df):
    print(frame.shape)

In [0]:
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [0]:
test_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [0]:
# Pretrained English byte-pair-encoding model with 50-dimensional subword
# vectors. No vocabulary size is passed, so the library default is used
# (the download log below fetches the `vs10000` model files).
bpemb_en = BPEmb(lang="en", dim=50)

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.model


100%|██████████| 400869/400869 [00:00<00:00, 564973.01B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.d50.w2v.bin.tar.gz


100%|██████████| 1924908/1924908 [00:01<00:00, 1885182.70B/s]
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Raw phrases, integer sentiment labels, and the phrases re-written as
# space-separated BPE subwords (one string per phrase).
phrases_list = train_df['Phrase'].to_list()
print(len(phrases_list))
labels_list = list(map(int, train_df['Sentiment'].to_list()))
subword_list = []
for phrase in phrases_list:
    pieces = bpemb_en.encode(phrase)
    subword_list.append(" ".join(pieces))

156060


In [0]:
def caster(example, label):
  """Return the example unchanged together with its label cast to float32.

  Args:
    example: the input element; passed through untouched.
    label: integer class label (a tensor when called from `Dataset.map`).

  Returns:
    A `(example, label_as_float32)` tuple.
  """
  # Cast the tensor directly. The previous `int(label)` only worked inside
  # `Dataset.map` because AutoGraph rewrites the `int` builtin into a cast;
  # on a plain symbolic tensor `int()` raises a TypeError.
  return example, tf.cast(label, tf.float32)

Create a lookup table that maps each subword to its integer id and can be used inside the TensorFlow graph.

In [0]:
# Subword -> id lookup table: each BPEmb vocabulary entry maps to its index,
# and unknown strings are hashed into 3 extra out-of-vocabulary buckets.
vocab_ids = tf.range(len(bpemb_en.words), dtype=tf.int64)
vocab_initializer = tf.lookup.KeyValueTensorInitializer(
    keys=bpemb_en.words, values=vocab_ids)
encoder = tf.lookup.StaticVocabularyTable(vocab_initializer, num_oov_buckets=3)

In [0]:
# Pair every subword string with its label and cast the labels.
lines_dataset = tf.data.Dataset.from_tensor_slices(subword_list)
labels_dataset = tf.data.Dataset.from_tensor_slices(labels_list)
lines_labels_ds = tf.data.Dataset.zip((lines_dataset, labels_dataset))
lines_labels_ds = lines_labels_ds.map(caster)

# Shuffle ONCE, deterministically. This dataset is later split into training
# and validation parts with take()/skip(); with the default
# `reshuffle_each_iteration=True` every new iteration draws a different
# permutation, so the two parts would overlap and validation examples would
# leak into training. A fixed seed plus `reshuffle_each_iteration=False`
# gives every iterator the same order, which keeps the split disjoint.
# (Per-epoch reshuffling, if wanted, belongs on the training part after the
# split.) The buffer is larger than the dataset, so the shuffle is global.
lines_labels_ds = lines_labels_ds.shuffle(
    200000, seed=42, reshuffle_each_iteration=False)

# Split each string on whitespace and map the subwords to integer ids.
ids_labels_ds = lines_labels_ds.map(lambda a, b: (encoder.lookup(tf.strings.split(a)), b))

In [0]:
# Pretrained BPEmb subword vectors as a tensor.
bpe_tensor = tf.convert_to_tensor(bpemb_en.vectors)

In [0]:
class PretrainedEmbedding(tf.keras.layers.Layer):
    """Embedding lookup over a fixed (constant) matrix, followed by dropout.

    The matrix is stored as a constant rather than a variable, so the layer
    contributes no trainable weights.
    """

    def __init__(self, embeddings, rate=0.1, **kwargs):
        """Build the layer.

        Args:
            embeddings: the pre-defined embedding matrix, one row per id.
            rate: dropout rate applied to the looked-up vectors.
            **kwargs: forwarded to the base `Layer` constructor.
        """
        super(PretrainedEmbedding, self).__init__(**kwargs)
        self.embeddings = tf.constant(embeddings)
        # Dropout is the only post-processing applied to the embeddings.
        self.dropout = tf.keras.layers.Dropout(rate=rate)

    def call(self, inputs, training=None):
        """Look up the vectors for `inputs` (ids) and pass them through dropout."""
        vectors = tf.nn.embedding_lookup(params=self.embeddings, ids=inputs)
        return self.dropout(vectors, training=training)

In [0]:
def conv_block(x, width, growth_rate, name):
  """One densely-connected block: bottleneck conv, then two parallel convs.

  The input goes through BN -> ReLU6 -> dropout and a bottleneck convolution
  with `4 * growth_rate` filters spanning `width` positions of axis -2.
  Two branches (kernel heights 3 and 5 along axis 1, padded so that axis
  keeps its length) each produce `growth_rate` features. The original input
  and both branch outputs are concatenated along axis -2.

  Args:
    x: 4-D input tensor; the squeeze after the bottleneck conv requires the
      conv to collapse axis -2 to size 1 (presumably `width` equals the size
      of that axis - confirm against the caller).
    width: kernel extent of the bottleneck convolution along axis -2.
    growth_rate: number of features added by each of the two branches.
    name: prefix for the names of the layers created here.

  Returns:
    The concatenation of `x` with the two branch outputs along axis -2.
  """
  bottleneck_filters = 4 * growth_rate

  def filters_to_axis_2(t):
    # Drop the size-1 axis -2 and re-append a trailing singleton axis, so the
    # filter outputs move onto axis -2.
    return tf.expand_dims(tf.squeeze(t, axis=-2), -1)

  def pad_axis_1(t, amount):
    # Zero-pad `amount` steps on both ends of axis 1.
    return tf.pad(t, [[0, 0], [amount, amount], [0, 0], [0, 0]])

  # Pre-activation + bottleneck convolution.
  h = layers.BatchNormalization(epsilon=1.001e-5, name=name + '_0_bn')(x)
  h = layers.ReLU(max_value=6., name=name + '_0_relu')(h)
  h = layers.Dropout(0.2)(h)
  h = layers.Conv2D(bottleneck_filters, (1, width),
                    use_bias=False,
                    name=name + '_1_conv')(h)
  h = filters_to_axis_2(h)

  # Second pre-activation, shared by both branches.
  h = layers.BatchNormalization(epsilon=1.001e-5, name=name + '_1_bn')(h)
  h = layers.ReLU(max_value=6., name=name + '_1_relu')(h)
  h = layers.Dropout(0.2)(h)

  # Branch convolutions: kernel heights 3 and 5, padded by half the kernel.
  pieces = [x]
  for conv_suffix, kernel_height in (('_2_conv', 3), ('_3_conv', 5)):
    branch = pad_axis_1(h, kernel_height // 2)
    branch = layers.Conv2D(growth_rate, (kernel_height, bottleneck_filters),
                           use_bias=False,
                           name=name + conv_suffix)(branch)
    pieces.append(filters_to_axis_2(branch))

  return layers.Concatenate(name=name + '_concat', axis=-2)(pieces)

def dense_block(x, inp_depth, blocks, name, growth=64):
  """Chain `blocks` conv_blocks, tracking the growing feature width.

  Each conv_block concatenates two branches of `growth` features onto its
  input, so the width passed to the next block increases by `2 * growth`.

  Args:
    x: input tensor handed to the first conv_block.
    inp_depth: feature width of `x` expected by the first conv_block.
    blocks: number of conv_blocks to stack.
    name: prefix for the block names (`<name>_block1`, `<name>_block2`, ...).
    growth: growth rate forwarded to every conv_block.

  Returns:
    The output of the last conv_block (or `x` itself when `blocks` is 0).
  """
  for block_no in range(1, blocks + 1):
    current_width = inp_depth + (block_no - 1) * 2 * growth
    x = conv_block(x, current_width, growth,
                   name=name + '_block' + str(block_no))
  return x

In [0]:
# Variable-length sequences of int64 subword ids.
inputs = keras.layers.Input(shape=(None,), dtype=tf.int64)

# Append three zero rows to the pretrained matrix so that the ids of the
# lookup table's 3 out-of-vocabulary buckets have a row to index.
embedding_matrix = tf.pad(bpe_tensor, paddings=[[0, 3], [0, 0]])
embedded = PretrainedEmbedding(embedding_matrix)(inputs)

# Stem: treat the embedding axis as "width" with a single channel, convolve
# across the full 50-d embedding into 32 features, then move those features
# back onto the width axis.
stem = tf.expand_dims(embedded, -1)
stem = layers.Conv2D(filters=32, kernel_size=(1, 50), use_bias=False)(stem)
stem = tf.squeeze(stem, axis=-2)
stem = tf.expand_dims(stem, -1)

# Single dense stage: 3 blocks with growth rate 12, starting from width 32.
features = dense_block(stem, 32, 3, "dense_1", 12)
features = layers.BatchNormalization(epsilon=1.001e-5)(features)
features = layers.ReLU(max_value=6.)(features)
features = layers.Dropout(0.2)(features)

# Two deeper stages were tried and are currently disabled:
#   Conv2D(64, (1, 80))  -> dense_block(x, 64, 2, "dense_2", 12) -> BN/ReLU6/Dropout(0.2)
#   Conv2D(96, (1, 224)) -> dense_block(x, 96, 5, "dense_3", 32) -> BN/ReLU6/Dropout(0.2)
# (each Conv2D followed by the same squeeze(axis=-2) / expand_dims(-1) pair as the stem).

# Move the feature axis into the channel position and max-pool over the rest.
pooled = tf.squeeze(features, axis=-1)
pooled = tf.expand_dims(pooled, -2)
pooled = keras.layers.GlobalMaxPooling2D()(pooled)

# NOTE(review): 6 output units, matching the depth-6 one-hot labels used for
# training - confirm the number of sentiment classes in the dataset.
x = keras.layers.Dense(6, activation='softmax')(pooled)

model = keras.Model(inputs=inputs, outputs=x)
model.summary()


Model: "model_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
pretrained_embedding_16 (Pretra (None, None, 50)     0           input_17[0][0]                   
__________________________________________________________________________________________________
tf_op_layer_ExpandDims_316 (Ten [(None, None, 50, 1) 0           pretrained_embedding_16[0][0]    
__________________________________________________________________________________________________
conv2d_25 (Conv2D)              (None, None, 1, 32)  1600        tf_op_layer_ExpandDims_316[0][0] 
___________________________________________________________________________________________

In [0]:
BATCH_SIZE = 96
# Whole batches in 80% of the 156060 training phrases (kept as a float,
# because the floor division has a float operand).
steps_per_epoch = (0.8 * 156060) // BATCH_SIZE

In [0]:
def _with_one_hot_label(ids, label):
  """Keep the ids and replace the label with a depth-6 one-hot vector."""
  class_index = tf.cast(label, tf.int32)
  return ids, tf.one_hot(class_index, 6)


ids_one_hot_ds = ids_labels_ds.map(_with_one_hot_label)

In [0]:
# Batch the examples: id sequences are padded to the longest sequence in each
# batch (shape `(None,)`), labels keep their fixed length of 6.
ds = ids_one_hot_ds.padded_batch(BATCH_SIZE, padded_shapes=((None,), [6]))

In [0]:
# Sanity check: print the shapes of the ids and labels of one batch.
for a, b in ds.take(1):
  tf.print(tf.shape(a), tf.shape(b))

[96 51] [96 6]


In [0]:
# 80/20 split measured in whole batches; 156060 is the number of training
# phrases. The first `train_batches` batches are used for training, the
# remaining ones for validation; both are repeated indefinitely, so the fit
# call has to bound them with explicit step counts.
train_batches = int((156060 * 0.8) // BATCH_SIZE)
valid_batches = int(156060 // BATCH_SIZE) - train_batches
train_ds, valid_ds = (
    ds.take(train_batches).repeat(),
    ds.skip(train_batches).repeat(),
)

In [0]:
# Inverse-time decay: lr = 0.001 / (1 + step / (3 * train_batches)).
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=3 * train_batches,
    decay_rate=1,
    staircase=False,
)

adam = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
model.compile(
    optimizer=adam,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [0]:
# Loss and accuracy on 20 training batches (this cell sits before the fit
# call, so it serves as a pre-training baseline).
model.evaluate(train_ds, steps=20)



[1.5820520162582397, 0.29895833]

In [0]:
# Early stopping watches the validation loss. Monitoring the training loss
# (the previous setting) keeps training while the model overfits and makes
# `restore_best_weights` restore the most over-fitted weights, even though
# validation data is passed to fit() below.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)

# Both datasets repeat indefinitely, so the step counts bound each epoch.
history = model.fit(train_ds,
                    callbacks=[callback],
                    steps_per_epoch=train_batches,
                    epochs=100,
                    validation_data=valid_ds,
                    validation_steps=valid_batches)
model.save("./drive/My Drive/Models/bpe_cnn_sentiment_dense_25k")

Train for 1300 steps, validate for 325 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

In [0]:
# Class probabilities for one validation batch. Each dataset element is an
# (ids, one-hot labels) pair; pass only the ids to predict() instead of the
# whole tuple, which would be interpreted as two model inputs.
sample_ids, _ = next(iter(valid_ds))
model.predict(sample_ids)

array([[5.71481764e-01, 3.49923372e-01, 6.03016578e-02, 1.50448903e-02,
        3.24834231e-03, 1.82212845e-09],
       [4.34644185e-02, 2.61657894e-01, 4.84695137e-01, 1.84816062e-01,
        2.53661871e-02, 2.94708030e-07],
       [2.28226017e-02, 2.08390594e-01, 6.27331972e-01, 1.30020961e-01,
        1.14337737e-02, 6.14191293e-08],
       [1.15798097e-02, 4.53853607e-02, 1.01600423e-01, 5.16200960e-01,
        3.25233519e-01, 1.03121742e-08],
       [1.39030023e-02, 1.28053069e-01, 7.41781831e-01, 1.06276020e-01,
        9.98499244e-03, 1.13110684e-06],
       [6.25831401e-03, 9.43449736e-02, 7.07537949e-01, 1.77843809e-01,
        1.40145561e-02, 4.22808938e-07],
       [1.79886833e-01, 3.89868408e-01, 1.41829178e-01, 2.31314451e-01,
        5.71011640e-02, 4.72720663e-09],
       [3.41352122e-03, 2.74295788e-02, 1.69493884e-01, 4.84207720e-01,
        3.15455347e-01, 5.69518050e-08],
       [1.26760276e-02, 1.32148415e-01, 6.39648676e-01, 1.90029860e-01,
        2.54964549e-02, 