In [1]:
import pandas as pd
import numpy as np
import os,sys
import re
pwd = os.getcwd()

In [2]:
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

In [3]:
# reading dataset
kb_df = pd.read_csv(pwd+"//kabitakitchen.csv")
kb_df

Unnamed: 0,id,commentText,Labels
0,Ugy_CBm-_CKA3YqrzcB4AaABAg,Pudina ptta nhi dalu to,7
1,Ugy9mx9nuTWJu4dRac14AaABAg,Chiken kacha tu ni rhy ga sis,7
2,Ugz8T2MKLYucL3dM9nh4AaABAg,"Hello mam, I love your all recipes.... 😋😋😋\nAl...",4
3,Ugx_1cCjRbCaDgL0FLF4AaABAg,Its awesome recipe plzz make handi chicken in ...,2
4,UgzLhKVAJ6NN3nZXyjN4AaABAg,Yeh jo measurement hai.........kitne logon ke ...,7
...,...,...,...
4895,UgjFXyC0Qhzk5ngCoAEC,i love chole...thank you kabitaji for sharing ...,1
4896,UghP3bitlJuM13gCoAEC,thnakyou mm,1
4897,UghztLZOqvedfXgCoAEC,thanks mam,1
4898,UggX5Fi2Y430zXgCoAEC,u r fabulous,4


In [4]:
kb_df.shape

(4900, 3)

In [5]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [6]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [7]:
comments = []
sentences = list(kb_df['commentText'])
for sen in sentences:
    comments.append(preprocess_text(sen))

In [8]:
comments

['Pudina ptta nhi dalu to',
 'Chiken kacha tu ni rhy ga sis',
 'Hello mam love your all recipes All the ingredients are easily available and your way of explaining is too good ',
 'Its awesome recipe plzz make handi chicken in handi ',
 'Yeh jo measurement hai kitne logon ke liye hai ',
 'Kabita mam tried ur egg biryani everyone in my house just loved it thank so much that was so delicious it was all because of ur recipe',
 'cooker me kar sakte he na',
 'Mujhe bhot ache lagi apki respi mene subscribe kardia bhot ache he',
 'Mam dahi jgh kuch or use kr skte kya',
 'Wooooooo it very yummmmmm love it',
 'This is perfect biryani recipe Apko follow kar banaya acchi bani biryani ',
 'Hi Didi was always curious that How Biryani Made Thank you so much for putting this detailed video This Weekend will try and serve it to family Really Motivatied ',
 'thanx respect from Madam appne tel nahy dala ',
 'I made this it taste awesome thank you kabita ji ',
 'You re amazing ',
 'nice video',
 'Aur kya

In [9]:
print(kb_df.columns.values)

['id' 'commentText' 'Labels']


In [10]:
kb_df['Labels'].unique()

array([7, 4, 2, 5, 1, 3, 6], dtype=int64)

### Creating a BERT Tokenizer

In [13]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
print("1")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=False)
print("2")
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
print("3")
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
print("4")
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)
print("5")

1
2
3
4
5


In [14]:
# checking tokenization
tokenizer.tokenize("don't be so judgmental")

['don', "'", 't', 'be', 'so', 'judgment', '##al']

In [15]:
# checking tokens assigning with ids
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("dont be so judgmental"))

[2123, 2102, 2022, 2061, 8689, 2389]

In [16]:
# function to create tokens for comments
def tokenize_reviews(comment_reviews):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(comment_reviews))

In [17]:
# applying function of tokenize_reviews on comments
tokenized_comments = [tokenize_reviews(comment) for comment in comments]

### Prerparing Data For Training

In [20]:
comments_with_len = [[comment, kb_df['Labels'][i], len(comment)] for i, comment in enumerate(tokenized_comments)]

In [21]:
comments_with_len

[[[16405, 18979, 13866, 2696, 18699, 2072, 17488, 2226, 2000], 7, 9],
 [[9610, 7520, 10556, 7507, 10722, 9152, 1054, 10536, 11721, 24761], 7, 10],
 [[7592,
   5003,
   2213,
   2293,
   2115,
   2035,
   19328,
   2035,
   1996,
   12760,
   2024,
   4089,
   2800,
   1998,
   2115,
   2126,
   1997,
   9990,
   2003,
   2205,
   2204],
  4,
  21],
 [[2049, 12476, 17974, 20228, 13213, 2191, 2192, 2072, 7975, 1999, 2192, 2072],
  2,
  12],
 [[6300,
   2232,
   8183,
   10903,
   15030,
   8934,
   2638,
   8154,
   2078,
   17710,
   5622,
   6672,
   15030],
  7,
  13],
 [[10556,
   16313,
   2050,
   5003,
   2213,
   2699,
   24471,
   8288,
   12170,
   20444,
   3490,
   3071,
   1999,
   2026,
   2160,
   2074,
   3866,
   2009,
   4067,
   2061,
   2172,
   2008,
   2001,
   2061,
   12090,
   2009,
   2001,
   2035,
   2138,
   1997,
   24471,
   17974],
  5,
  32],
 [[16546, 2099, 2033, 10556, 2099, 7842, 25509, 2063, 2002, 6583], 7, 10],
 [[14163,
   3501,
   5369,
   1038,
  

In [22]:
# sorting the dataset with incresing order of tokenized length
comments_with_len.sort(key=lambda x: x[2])

In [23]:
comments_with_len

[[[15429], 3, 1],
 [[4283], 1, 1],
 [[4283], 1, 1],
 [[21688], 3, 1],
 [[3835], 3, 1],
 [[6919], 3, 1],
 [[21688], 3, 1],
 [[2190], 3, 1],
 [[10166], 3, 1],
 [[2204], 3, 1],
 [[3835], 3, 1],
 [[10166], 3, 1],
 [[3835], 3, 1],
 [[3835], 3, 1],
 [[16392], 3, 1],
 [[3565], 3, 1],
 [[2190], 3, 1],
 [[3835], 3, 1],
 [[3565], 3, 1],
 [[3835], 3, 1],
 [[3835], 3, 1],
 [[10166], 3, 1],
 [[10166], 3, 1],
 [[21688], 3, 1],
 [[7929], 6, 1],
 [[12476], 3, 1],
 [[3835], 3, 1],
 [[10166], 3, 1],
 [[2200], 6, 1],
 [[3835], 3, 1],
 [[2204], 3, 1],
 [[21688], 3, 1],
 [[2204], 3, 1],
 [[7286], 1, 1],
 [[10166], 3, 1],
 [[21688], 3, 1],
 [[3835], 3, 1],
 [[2204], 3, 1],
 [[3835], 3, 1],
 [[10166], 3, 1],
 [[3819], 3, 1],
 [[9212], 6, 1],
 [[3835], 3, 1],
 [[3835], 3, 1],
 [[3835], 3, 1],
 [[7632], 6, 1],
 [[3835], 3, 1],
 [[27969], 3, 1],
 [[2204], 3, 1],
 [[3835], 3, 1],
 [[3835], 3, 1],
 [[3835], 3, 1],
 [[3835], 3, 1],
 [[2204], 3, 1],
 [[12476], 3, 1],
 [[15030], 6, 1],
 [[21688], 3, 1],
 [[2204], 3,

In [25]:
# taking only tokenized comments and label of comment
sorted_comments_labels = [(comment_lab[0], comment_lab[1]) for comment_lab in comments_with_len]

In [26]:
sorted_comments_labels

[([15429], 3),
 ([4283], 1),
 ([4283], 1),
 ([21688], 3),
 ([3835], 3),
 ([6919], 3),
 ([21688], 3),
 ([2190], 3),
 ([10166], 3),
 ([2204], 3),
 ([3835], 3),
 ([10166], 3),
 ([3835], 3),
 ([3835], 3),
 ([16392], 3),
 ([3565], 3),
 ([2190], 3),
 ([3835], 3),
 ([3565], 3),
 ([3835], 3),
 ([3835], 3),
 ([10166], 3),
 ([10166], 3),
 ([21688], 3),
 ([7929], 6),
 ([12476], 3),
 ([3835], 3),
 ([10166], 3),
 ([2200], 6),
 ([3835], 3),
 ([2204], 3),
 ([21688], 3),
 ([2204], 3),
 ([7286], 1),
 ([10166], 3),
 ([21688], 3),
 ([3835], 3),
 ([2204], 3),
 ([3835], 3),
 ([10166], 3),
 ([3819], 3),
 ([9212], 6),
 ([3835], 3),
 ([3835], 3),
 ([3835], 3),
 ([7632], 6),
 ([3835], 3),
 ([27969], 3),
 ([2204], 3),
 ([3835], 3),
 ([3835], 3),
 ([3835], 3),
 ([3835], 3),
 ([2204], 3),
 ([12476], 3),
 ([15030], 6),
 ([21688], 3),
 ([2204], 3),
 ([2307], 4),
 ([2204], 3),
 ([16392], 3),
 ([3835], 3),
 ([7632], 6),
 ([21688], 3),
 ([3835], 3),
 ([3835], 3),
 ([3835], 3),
 ([9850], 6),
 ([21688], 3),
 ([10166], 3

Once the reviews are sorted we will convert thed dataset so that it can be used to train TensorFlow 2.0 models. we are running the following code to convert the sorted dataset into a TensorFlow 2.0-compliant input dataset shape.

In [27]:
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_comments_labels, output_types=(tf.int32, tf.int32))

In [28]:
# checking for 32 size batch
BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [29]:
next(iter(batched_dataset))

(<tf.Tensor: shape=(32, 1), dtype=int32, numpy=
 array([[15429],
        [ 4283],
        [ 4283],
        [21688],
        [ 3835],
        [ 6919],
        [21688],
        [ 2190],
        [10166],
        [ 2204],
        [ 3835],
        [10166],
        [ 3835],
        [ 3835],
        [16392],
        [ 3565],
        [ 2190],
        [ 3835],
        [ 3565],
        [ 3835],
        [ 3835],
        [10166],
        [10166],
        [21688],
        [ 7929],
        [12476],
        [ 3835],
        [10166],
        [ 2200],
        [ 3835],
        [ 2204],
        [21688]])>,
 <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 6, 3, 3, 3, 6, 3, 3, 3])>)

Once we have applied padding to our dataset, the next step is to divide the dataset into test and training sets. We can do that with the help of following code:

In [38]:
# dividing 80 for training and 20 for testing model
import math
TOTAL_BATCHES = math.ceil(len(sorted_comments_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 20
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

In [39]:
print(test_data)
print(train_data)

<TakeDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>
<SkipDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>


### Creating the Model

In [61]:
# creating class for model
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [62]:
# defining the values for hyper parameters
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 7
DROPOUT_RATE = 0.2
NB_EPOCHS = 5

In [63]:
# creating object for TEXT_MODEL class
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

In [64]:
# compiling the model
if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])

In [65]:
# training the model
text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/5


InvalidArgumentError: Graph execution error:

Detected at node 'text_model/conv1d_14/Conv1D' defined at (most recent call last):
    File "C:\Users\murth\anaconda3\lib\runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\murth\anaconda3\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\murth\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\Users\murth\anaconda3\lib\site-packages\traitlets\config\application.py", line 845, in launch_instance
      app.start()
    File "C:\Users\murth\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
      self.io_loop.start()
    File "C:\Users\murth\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\murth\anaconda3\lib\asyncio\base_events.py", line 570, in run_forever
      self._run_once()
    File "C:\Users\murth\anaconda3\lib\asyncio\base_events.py", line 1859, in _run_once
      handle._run()
    File "C:\Users\murth\anaconda3\lib\asyncio\events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\murth\anaconda3\lib\site-packages\tornado\ioloop.py", line 688, in <lambda>
      lambda f: self._run_callback(functools.partial(callback, future))
    File "C:\Users\murth\anaconda3\lib\site-packages\tornado\ioloop.py", line 741, in _run_callback
      ret = callback()
    File "C:\Users\murth\anaconda3\lib\site-packages\tornado\gen.py", line 814, in inner
      self.ctx_run(self.run)
    File "C:\Users\murth\anaconda3\lib\site-packages\tornado\gen.py", line 775, in run
      yielded = self.gen.send(value)
    File "C:\Users\murth\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
      yield gen.maybe_future(dispatch(*args))
    File "C:\Users\murth\anaconda3\lib\site-packages\tornado\gen.py", line 234, in wrapper
      yielded = ctx_run(next, result)
    File "C:\Users\murth\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
      yield gen.maybe_future(handler(stream, idents, msg))
    File "C:\Users\murth\anaconda3\lib\site-packages\tornado\gen.py", line 234, in wrapper
      yielded = ctx_run(next, result)
    File "C:\Users\murth\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 543, in execute_request
      self.do_execute(
    File "C:\Users\murth\anaconda3\lib\site-packages\tornado\gen.py", line 234, in wrapper
      yielded = ctx_run(next, result)
    File "C:\Users\murth\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 306, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\murth\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "C:\Users\murth\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2894, in run_cell
      result = self._run_cell(
    File "C:\Users\murth\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2940, in _run_cell
      return runner(coro)
    File "C:\Users\murth\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\murth\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3165, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\murth\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3357, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "C:\Users\murth\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3437, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "<ipython-input-65-9b1c984d5184>", line 2, in <module>
      text_model.fit(train_data, epochs=NB_EPOCHS)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\engine\training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\engine\training.py", line 859, in train_step
      y_pred = self(x, training=True)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "<ipython-input-45-d107befed150>", line 46, in call
      l_3 = self.cnn_layer3(l)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\layers\convolutional.py", line 248, in call
      outputs = self.convolution_op(inputs, self.kernel)
    File "C:\Users\murth\anaconda3\lib\site-packages\keras\layers\convolutional.py", line 233, in convolution_op
      return tf.nn.convolution(
Node: 'text_model/conv1d_14/Conv1D'
Computed output size would be negative: -2 [input_size: 1, effective_filter_size: 4, stride: 1]
	 [[{{node text_model/conv1d_14/Conv1D}}]] [Op:__inference_train_function_27377]

## 