In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk import word_tokenize
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import random, math

Using TensorFlow backend.


Before you can go and use the BERT text representation, you need to install BERT for TensorFlow 2.0. Execute the following pip commands on your terminal to install BERT for TensorFlow 2.0.

In [2]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/35/5c/6439134ecd17b33fe0396fb0b7d6ce3c5a120c42a4516ba0e9a2d6e43b25/bert-for-tf2-0.14.4.tar.gz (40kB)
[K     |████████                        | 10kB 26.3MB/s eta 0:00:01[K     |████████████████▏               | 20kB 6.2MB/s eta 0:00:01[K     |████████████████████████▎       | 30kB 8.7MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 4.8MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/ac/0d/615c0d4aea541b4f47c761263809a02e160e7a2babd175f0ddd804776cf4/params-flow-0.8.0.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

Next, you need to make sure that you are running TensorFlow 2.0. Therefore, to make sure that you are running your script via TensorFlow 2.0, execute the following script:

In [0]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

In the above script, in addition to TensorFlow 2.0, we also import tensorflow_hub, which basically is a place where you can find all the prebuilt and pretrained models developed in TensorFlow. We will be importing and using a built-in BERT model from TF hub.

### **Importing and Preprocessing the Dataset**
The following script imports the dataset using the read_csv() method of the Pandas dataframe. The script also prints the shape of the dataset.

In [4]:
data = pd.read_csv("dataset.csv")

print(data.head())

data.isnull().values.any()

data.shape

   type                                              posts
0  INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1  ENTP  'I'm finding the lack of me in these posts ver...
2  INTP  'Good one  _____   https://www.youtube.com/wat...
3  INTJ  'Dear INTP,   I enjoyed our conversation the o...
4  ENTJ  'You're fired.|||That's another silly misconce...


(8675, 2)

In [5]:
!pip install emoji
import emoji

def char_is_emoji(character):
    return character in emoji.UNICODE_EMOJI

def text_has_emoji(text):
    return(bool(emoji.get_emoji_regexp().search(text)))

def returnEmojis(text):
    listEmojis=[]
    for c in text:
        if (char_is_emoji(c)):
            listEmojis.append(c)
    return(listEmojis)

Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/40/8d/521be7f0091fe0f2ae690cc044faf43e3445e0ff33c574eae752dd7e39fa/emoji-0.5.4.tar.gz (43kB)
[K     |███████▌                        | 10kB 23.1MB/s eta 0:00:01[K     |███████████████                 | 20kB 5.5MB/s eta 0:00:01[K     |██████████████████████▋         | 30kB 7.0MB/s eta 0:00:01[K     |██████████████████████████████▏ | 40kB 7.9MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 3.6MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.5.4-cp36-none-any.whl size=42176 sha256=dd3e6e3f8e8df102b4469e3362722ee210bfa5cea448ef12b5e7379519f1a083
  Stored in directory: /root/.cache/pip/wheels/2a/a9/0a/4f8e8cce8074232aba240caca3fade315bb49fac68808d1a9c
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.5.4


Now we are going to extract some features from the posts and add them to the dataset. For example we will use the number of words used in the user posts, the number of emojis ...

In [7]:
data['words_per_comment'] = data['posts'].apply(lambda x: len(x.split())/50)
data['question_per_comment'] = data['posts'].apply(lambda x: x.count('?')/50)
data['excl_per_comment'] = data['posts'].apply(lambda x: x.count('!')/50)
data['ellipsis_per_comment'] = data['posts'].apply(lambda x: x.count('...')/50)
data['@_per_comment'] = data['posts'].apply(lambda x: x.count('@')/50)
data['#_per_comment'] = data['posts'].apply(lambda x: x.count('#')/50)
data['emojis_per_comment'] = data['posts'].apply(lambda x: (len(returnEmojis(x)))/50)
print(data.head())
data.shape

   type  ... emojis_per_comment
0  INFJ  ...                0.0
1  ENTP  ...                0.0
2  INTP  ...                0.0
3  INTJ  ...                0.0
4  ENTJ  ...                0.0

[5 rows x 9 columns]


(8675, 9)

In [8]:
full_Pers_list = {'INFP' :0 ,'INTJ' :1 ,'INFJ' :2, 'INTP' :3 ,'ENFP' :4 ,'ENTJ' :5, 'ENTP' :6 ,'ENFJ' :7, 'ISFJ' :8 ,'ISFP' :9 ,'ISTJ' :10 ,'ISTP' :11 ,'ESFJ' :12,'ESFP' :13 ,'ESTJ' :14 ,'ESTP' :15}
def type_to_16(typeList):
    labels=[]
    for t in typeList:
        labels.append(full_Pers_list[t])
    return(labels)

def type16_to_vector_label(data):
    for i,label in enumerate(data):
        translation=np.zeros((16))
        translation[int(label)]=1
        data[i]=translation
        
labels=type_to_16(data["type"])
data["labels"]=labels
data.tail()

Unnamed: 0,type,posts,words_per_comment,question_per_comment,excl_per_comment,ellipsis_per_comment,@_per_comment,#_per_comment,emojis_per_comment,labels
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,15.92,0.18,0.12,0.14,0.0,0.0,0.0,9
8671,ENFP,'So...if this thread already exists someplace ...,26.18,0.2,0.66,0.82,0.04,0.02,0.0,4
8672,INTP,'So many questions when i do these things. I ...,18.96,0.18,0.02,0.38,0.02,0.0,0.0,3
8673,INFP,'I am very conflicted right now when it comes ...,34.1,0.18,0.06,0.94,0.0,0.0,0.0,0
8674,INFP,'It has been too long since I have been on per...,27.22,0.12,0.1,0.48,0.02,0.0,0.0,0


In [9]:
data.head()

Unnamed: 0,type,posts,words_per_comment,question_per_comment,excl_per_comment,ellipsis_per_comment,@_per_comment,#_per_comment,emojis_per_comment,labels
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,11.12,0.36,0.06,0.3,0.0,0.0,0.0,2
1,ENTP,'I'm finding the lack of me in these posts ver...,23.4,0.1,0.0,0.38,0.0,0.0,0.0,6
2,INTP,'Good one _____ https://www.youtube.com/wat...,16.72,0.24,0.08,0.26,0.0,0.0,0.0,3
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",21.28,0.22,0.06,0.52,0.0,0.0,0.0,1
4,ENTJ,'You're fired.|||That's another silly misconce...,19.34,0.2,0.02,0.42,0.04,0.0,0.0,5


In [0]:
b_Pers = {'I':0, 'E':1, 'N':0, 'S':1, 'F':0, 'T':1, 'J':0, 'P':1}
b_Pers_list = [{0:'I', 1:'E'}, {0:'N', 1:'S'}, {0:'F', 1:'T'}, {0:'J', 1:'P'}]

def translate_personality(personality):
    # transform mbti to binary vector
    
    return [b_Pers[l] for l in personality]

In [11]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
# We want to remove these from the posts
unique_type_list = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
       'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']
  
unique_type_list = [x.lower() for x in unique_type_list]


# Lemmatize
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

# Cache the stop words for speed 
cachedStopWords = stopwords.words("english")

def pre_process_data(data, remove_stop_words=True, remove_mbti_profiles=True):

    list_personality = []
    list_posts = []
    len_data = len(data)
    i=0
    
    for row in data.iterrows():
        i+=1
        if (i % 500 == 0 or i == 1 or i == len_data):
            print("%s of %s rows" % (i, len_data))

        ##### Remove and clean comments using regular expressions
        posts = row[1].posts
        temp = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', posts)
        temp = re.sub("[^a-zA-Z]", " ", temp)
        temp = re.sub(' +', ' ', temp).lower()
        if remove_stop_words:
            temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ') if w not in cachedStopWords])
        else:
            temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ')])
            
        if remove_mbti_profiles:
            for t in unique_type_list:
                temp = temp.replace(t,"")
        ##### Remove single-charactered words that remain from the cleaning
        words=temp.split()
        finalTemp=""
        for word in words:
            if(len(word)>1):
                finalTemp=finalTemp+" "+word
        type_labelized = translate_personality(row[1].type)
        list_personality.append(type_labelized)
        list_posts.append(finalTemp)

    list_posts = np.array(list_posts)
    list_personality = np.array(list_personality)
    return list_posts, list_personality

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [12]:
list_posts, list_personality  = pre_process_data(data, remove_stop_words=True)

1 of 8675 rows
500 of 8675 rows
1000 of 8675 rows
1500 of 8675 rows
2000 of 8675 rows
2500 of 8675 rows
3000 of 8675 rows
3500 of 8675 rows
4000 of 8675 rows
4500 of 8675 rows
5000 of 8675 rows
5500 of 8675 rows
6000 of 8675 rows
6500 of 8675 rows
7000 of 8675 rows
7500 of 8675 rows
8000 of 8675 rows
8500 of 8675 rows
8675 of 8675 rows


In [13]:
print(list_posts[1])
print(list_personality[1])
print(data["labels"][1])

 finding lack post alarming sex boring position often example girlfriend currently environment creatively use cowgirl missionary enough giving new meaning game theory hello grin take converse flirting acknowledge presence return word smooth wordplay cheeky grin lack balance hand eye coordination real iq test score internet iq test funny score higher like former response thread mention believe iq test banish know vanish site year half return find people still commenting post liking idea thought know think thing sometimes go old sherlock holmes quote perhaps man special knowledge special power like rather encourages seek complex cheshirewolf tumblr com post really never thought real function judge use use ne ti dominates fe emotion rarely si also use ni due strength know though ingenious saying really want try see happens playing first person shooter back drive around want see look rock paper one best make lol guy lucky really high tumblr system hear new first person shooter game rocking

Our dataset contains ten columns, as can be verified from the following script:

In [14]:
print(data.columns.values)

['type' 'posts' 'words_per_comment' 'question_per_comment'
 'excl_per_comment' 'ellipsis_per_comment' '@_per_comment' '#_per_comment'
 'emojis_per_comment' 'labels']


In [15]:
Y = list_personality[:,1]
print(Y)

[0 0 0 ... 0 0 0]


In [16]:
print(list_posts[10])

 one time parent fighting dad affair dad pushed mom fall broke finger pointed gun made get knee beg life gonna talk piece shit dad alcoholic kind serious mental problem come complying irs word law apply omg woman center lived run catholic charity fat bully program manager took upon change policy tenant forced attend christmas party work calling committed vacation day ever kundalini mystic oh get paid either one destined thing art teacher high school stack art school catalog saw one school ended going immediately knew one without research like communication design nope much execution mystic got degree one best school world field actually career unsustainable engaged inferior function directly case even absolutely know tf think day age familiar employment planet living ever pas kindergarten somebody actually employ issue taking responsibility sexual response called self discipline start mind internalize sexuality mercy oh think also say imply say make sense always healthy allowed develop

In [17]:
print(Y[10])

0


The output 0 confirms that this is an Introvert. We have now preprocessed our data and we are now ready to create BERT representations from our text data.

### ***Creating a BERT Tokenizer***
In order to use BERT text embeddings as input to train text classification model, we need to tokenize our posts. Tokenization refers to dividing a sentence into individual words. To tokenize our text, we will be using the BERT tokenizer. Look at the following script:

In [0]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In the script above we first create an object of the FullTokenizer class from the bert.bert_tokenization module. Next, we create a BERT embedding layer by importing the BERT model from hub.KerasLayer. The trainable parameter is set to False, which means that we will not be training the BERT embedding. In the next line, we create a BERT vocabulary file in the form a numpy array. We then set the text to lowercase and finally we pass our vocabulary_file and to_lower_case variables to the BertTokenizer object.


Let's now see if our BERT tokenizer is actually working. To do so, we will tokenize a random sentence, as shown below:

In [19]:
tokenizer.tokenize("This is a personality classifier")

['this', 'is', 'a', 'personality', 'class', '##ifier']

You can see that the text has been successfully tokenized. You can also get the ids of the tokens using the convert_tokens_to_ids() of the tokenizer object. Look at the following script:

In [20]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("This is a personality classifier"))

[2023, 2003, 1037, 6180, 2465, 18095]

Now will define a function that accepts a text and returns the ids of the tokenized words in the text. Execute the following script:

In [0]:
def tokenize_text(text):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

And execute the following script to actually tokenize all the posts in the input dataset:

In [0]:
tokenized_posts = [tokenize_text(post) for post in list_posts]

In [23]:
print(tokenized_posts[0])

[2617, 2998, 13013, 2121, 2327, 2702, 2377, 26418, 2166, 5278, 3325, 2166, 9377, 2651, 2089, 2566, 2278, 3325, 10047, 16862, 2063, 2197, 2518, 2767, 6866, 9130, 16873, 5920, 2279, 2154, 2717, 3521, 7592, 3374, 2963, 12893, 3019, 3276, 15401, 2051, 2296, 2617, 4598, 3046, 3275, 2524, 2051, 2051, 3930, 6160, 4933, 2208, 2275, 2674, 4013, 4143, 2278, 2092, 19892, 21823, 2078, 2560, 4228, 3371, 3048, 4190, 2812, 3048, 3564, 4624, 3242, 17901, 5549, 8156, 2672, 3046, 21006, 2740, 3771, 4522, 10468, 2272, 2093, 8875, 4340, 2828, 29221, 2828, 2215, 2052, 3497, 2224, 2445, 2828, 10699, 3853, 2054, 17048, 2187, 2518, 5549, 8156, 18135, 5262, 2678, 2208, 2204, 2028, 3602, 2204, 2028, 5399, 20714, 3294, 7694, 2331, 2445, 21934, 6203, 5440, 2678, 2208, 3652, 2783, 5440, 2678, 2208, 4658, 3544, 2397, 6517, 2619, 3071, 3524, 2245, 7023, 2204, 2518, 24188, 4509, 2051, 22560, 7065, 2884, 2306, 5110, 2088, 6168, 2051, 2147, 2378, 5959, 2051, 4737, 2111, 2467, 2105, 10930, 3203, 19394, 5649, 6180, 2092,

Next we will add the extracted features to the end of each tokenized user posts

In [0]:
waste=[post.append(data['words_per_comment'][i]) for i,post in enumerate(tokenized_posts)]
waste=[post.append(data['question_per_comment'][i]) for i,post in enumerate(tokenized_posts)]
waste=[post.append(data['excl_per_comment'][i]) for i,post in enumerate(tokenized_posts)]
waste=[post.append(data['ellipsis_per_comment'][i]) for i,post in enumerate(tokenized_posts)]
waste=[post.append(data['@_per_comment'][i]) for i,post in enumerate(tokenized_posts)]
waste=[post.append(data['#_per_comment'][i]) for i,post in enumerate(tokenized_posts)]
waste=[post.append(data['emojis_per_comment'][i]) for i,post in enumerate(tokenized_posts)]

In [25]:
print(tokenized_posts[0])

[2617, 2998, 13013, 2121, 2327, 2702, 2377, 26418, 2166, 5278, 3325, 2166, 9377, 2651, 2089, 2566, 2278, 3325, 10047, 16862, 2063, 2197, 2518, 2767, 6866, 9130, 16873, 5920, 2279, 2154, 2717, 3521, 7592, 3374, 2963, 12893, 3019, 3276, 15401, 2051, 2296, 2617, 4598, 3046, 3275, 2524, 2051, 2051, 3930, 6160, 4933, 2208, 2275, 2674, 4013, 4143, 2278, 2092, 19892, 21823, 2078, 2560, 4228, 3371, 3048, 4190, 2812, 3048, 3564, 4624, 3242, 17901, 5549, 8156, 2672, 3046, 21006, 2740, 3771, 4522, 10468, 2272, 2093, 8875, 4340, 2828, 29221, 2828, 2215, 2052, 3497, 2224, 2445, 2828, 10699, 3853, 2054, 17048, 2187, 2518, 5549, 8156, 18135, 5262, 2678, 2208, 2204, 2028, 3602, 2204, 2028, 5399, 20714, 3294, 7694, 2331, 2445, 21934, 6203, 5440, 2678, 2208, 3652, 2783, 5440, 2678, 2208, 4658, 3544, 2397, 6517, 2619, 3071, 3524, 2245, 7023, 2204, 2518, 24188, 4509, 2051, 22560, 7065, 2884, 2306, 5110, 2088, 6168, 2051, 2147, 2378, 5959, 2051, 4737, 2111, 2467, 2105, 10930, 3203, 19394, 5649, 6180, 2092,

### ***Prerparing Data For Training***
The posts in our dataset have varying lengths. Some posts are very small while others are very long. To train the model, the input sentences should be of equal length. To create sentences of equal length, one way is to pad the shorter sentences by 0s. However, this can result in a sparse matrix contain large number of 0s. The other way is to pad sentences within each batch. Since we will be training the model in batches, we can pad the sentences within the training batch locally depending upon the length of the longest sentence. To do so, we first need to find the length of each sentence.

The following script creates a list of lists where each sublist contains tokenized user's posts, the label of the posts and the length of the posts:

In [0]:
posts_with_len = [[post, Y[i], len(post)]
                 for i, post in enumerate(tokenized_posts)]

The following script shuffles the data randomly:

In [0]:
random.shuffle(posts_with_len)

In [28]:
print(posts_with_len[0])

[[2941, 3047, 4067, 7098, 2295, 7619, 6904, 15088, 6866, 3041, 11689, 4033, 2102, 2464, 3087, 5254, 2295, 2428, 2204, 3689, 2019, 11631, 5162, 5522, 10194, 1051, 12352, 4328, 12849, 9527, 2080, 2572, 2063, 24924, 4038, 3689, 23066, 6499, 2226, 9004, 6583, 22827, 29147, 2080, 11895, 3217, 3676, 3683, 2941, 2036, 4083, 2653, 7098, 5121, 2524, 3849, 2295, 4083, 3811, 16755, 3225, 22827, 2050, 2568, 2478, 2529, 2887, 5791, 12943, 28199, 2146, 4676, 3228, 4752, 2111, 2342, 2344, 5959, 2166, 2986, 2469, 2553, 3722, 2109, 5223, 5791, 2498, 3308, 2204, 3266, 2272, 2744, 2092, 2196, 2245, 3037, 5995, 2590, 2126, 2488, 2500, 4792, 2367, 2903, 12157, 9398, 3067, 9544, 4512, 2177, 2485, 2767, 6684, 26616, 6594, 2672, 2428, 2066, 3233, 11281, 3116, 3507, 5588, 4011, 4678, 4658, 7910, 2213, 2812, 6160, 7098, 6289, 2156, 5580, 2963, 4283, 8430, 2092, 2228, 25652, 2296, 2828, 6082, 6211, 2500, 2092, 2036, 2191, 10427, 3432, 2183, 2111, 7302, 2518, 2066, 1999, 15549, 2890, 5448, 2242, 2467, 2507, 7481,

Once the data is shuffled, we will sort the data by the length of the posts. To do so, we will use the sort() function of the list and will tell it that we want to sort the list with respect to the third item in the sublist i.e. the length of the posts.

In [0]:
posts_with_len.sort(key=lambda x: x[2])

In [30]:
print(posts_with_len[1])

[[5931, 3119, 2051, 12383, 6151, 8586, 14097, 1.02, 1.82, 0.0, 0.0, 0.0, 0.0, 0.0], 1, 14]


Once the posts are sorted by length, we can remove the length attribute . Execute the following script to do so:

In [0]:
sorted_posts_labels = [(post_lab[0], post_lab[1]) for post_lab in posts_with_len]

In [32]:
print(sorted_posts_labels[1])

([5931, 3119, 2051, 12383, 6151, 8586, 14097, 1.02, 1.82, 0.0, 0.0, 0.0, 0.0, 0.0], 1)


Once the posts are sorted we will convert the dataset so that it can be used to train TensorFlow 2.0 models. Run the following code to convert the sorted dataset into a TensorFlow 2.0-compliant input dataset shape.

In [0]:
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_posts_labels, output_types=(tf.int32, tf.int32))

Finally, we can now pad our dataset for each batch. The batch size we are going to use is 32 which means that after processing the posts of 32 users, the weights of the neural network will be updated. To pad the posts locally with respect to batches, execute the following:

In [0]:
BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

Let's print the first batch and see how padding has been applied to it:

In [35]:
next(iter(batched_dataset))

(<tf.Tensor: shape=(32, 131), dtype=int32, numpy=
 array([[    0,     0,     0, ...,     0,     0,     0],
        [ 5931,  3119,  2051, ...,     0,     0,     0],
        [ 2559,  2830,  3116, ...,     0,     0,     0],
        ...,
        [21505,  3944,  3626, ...,     0,     0,     0],
        [ 2525,  2411,  2224, ...,     0,     0,     0],
        [ 2296,  2309,  3412, ...,     0,     0,     0]], dtype=int32)>,
 <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 1, 0, 1, 0, 0, 1, 0, 0], dtype=int32)>)

The above output shows the first and last few padded user's posts. From the last users, you can see that the total number of words in the largest sentence were 131. Therefore, in the first users the 0s are added at the end of the sentences so that their total length is also 131. The padding for the next batch will be different depending upon the size of the largest sentence in the batch.

 *ps.For the last users the 0 are the features added not the padding zeros

Once we have applied padding to our dataset, the next step is to divide the dataset into test and training sets. We can do that with the help of following code:

In [0]:
TOTAL_BATCHES = math.ceil(len(sorted_posts_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 10
VALID_BATCHES = TOTAL_BATCHES // 10
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)
valid_data = batched_dataset.take(VALID_BATCHES)
train_data = batched_dataset.skip(VALID_BATCHES)

In the code above we first find the total number of batches by dividing the total records by 32. Next, 10% of the data is left aside for testing and another 10% for validation. To do so, we use the take() method of batched_dataset() object to store 10% of the data in the test_data variable. The remaining data is stored in the train_data object for training using the skip() method.
The same is done for the validation data.

The dataset has been prepared and now we are ready to create our text classification model.

### ***Creating the Model***
Now we are all set to create our model. To do so, we will create a class named TEXT_MODEL that inherits from the tf.keras.Model class. Inside the class we will define our model layers. Our model will consist of three convolutional neural network layers. You can use LSTM layers instead and can also increase or decrease the number of layers.

Let's now create out model class:

In [0]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

The above script is pretty straightforward. In the constructor of the class, we initialze some attributes with default values. These values will be replaced later on by the values passed when the object of the TEXT_MODEL class is created.

Next, three convolutional neural network layers have been initialized with the kernel or filter values of 2, 3, and 4, respectively. Again, you can change the filter sizes if you want.

Next, inside the call() function, global max pooling is applied to the output of each of the convolutional neural network layer. Finally, the three convolutional neural network layers are concatenated together and their output is fed to the first densely connected neural network. The second densely connected neural network is used to predict the output sentiment since it only contains 2 classes. In case you have more classes in the output, you can updated the output_classes variable accordingly.

Let's now define the values for the hyper parameters of our model.

In [0]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

Next, we need to create an object of the TEXT_MODEL class and pass the hyper paramters values that we defined in the last step to the constructor of the TEXT_MODEL class.

In [0]:
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

Before we can actually train the model we need to compile it. The following script compiles the model:

In [0]:
if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])

Finally to train our model, we can use the fit method of the model class.

In [42]:
text_model.fit(train_data, validation_data=valid_data, epochs=NB_EPOCHS)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f53febaa630>

Let's now evaluate our model's performance on the test set:

In [43]:
results = text_model.evaluate(test_data)
print(results)

[0.9411980509757996, 0.6747685074806213]
