# Importing library and data

In [None]:
pip install tensorflow_text

In [1]:
import tensorflow as tf
tf.__version__

'2.13.0'

In [14]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [1]:
import pandas as pd

dt = pd.read_csv('./datasets/spam.csv')
dt.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


## **Data preprocessing**
### *Getting all categories*

In [2]:
categories = dt['Category'].unique()
print(categories)

['ham' 'spam']


In [4]:
dt.groupby('Category').describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
Labels,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Books,11820,6256,Think & Grow Rich About the Author NAPOLEON HI...,30
Clothing & Accessories,8670,5674,Diverse Men's Formal Shirt Diverse is a wester...,23
Electronics,10621,5308,HP 680 Original Ink Advantage Cartridge (Black...,26
Household,19313,10564,Nilkamal Series-24 Chest of Drawers (Cream Tra...,13


### *Handling dataframes*

In [5]:
df_ham = dt[dt['Category'] == 'ham']
df_ham.head(10)
df_ham.shape


(4825, 2)

In [6]:
df_spam = dt[dt['Category'] == 'spam']
df_spam.shape

(747, 2)

### ***Balancing dataframes***

In [7]:
df_ham_downsample = df_ham.head(df_spam.shape[0])
df_ham_downsample.shape

(747, 2)

### ***Merging dataframes***

In [8]:
dfs_array = [df_ham_downsample, df_spam]
df_merged = pd.concat(dfs_array)
df_merged[df_merged['Category'] == 'ham']

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...
...,...,...
883,ham,I love to give massages. I use lots of baby oi...
884,ham,Dude we should go sup again
885,ham,Yoyyooo u know how to change permissions for a...
886,ham,Gibbs unsold.mike hussey


### ***Adding 'type' column***

In [9]:
df_merged['Type'] = df_merged['Category'].apply(
    lambda x: 0 if x == 'ham' else 1
)
df_merged.sample(50)

Unnamed: 0,Category,Message,Type
1017,spam,FREE for 1st week! No1 Nokia tone 4 ur mob eve...,1
241,ham,Tomarrow final hearing on my laptop case so i ...,0
380,ham,I taught that Ranjith sir called me. So only i...,0
5068,spam,83039 62735=£450 UK Break AccommodationVoucher...,1
240,spam,U 447801259231 have a secret admirer who is lo...,1
814,ham,I borrow ur bag ok.,0
472,ham,"How long has it been since you screamed, princ...",0
1414,spam,Dear U've been invited to XCHAT. This is our f...,1
246,ham,Too late. I said i have the website. I didn't ...,0
596,ham,I am great! How are you?,0


### ***Splitting dataframe***

In [10]:
cols = df_merged.select_dtypes(include=['object'])
for col in cols.columns.values:
    df_merged[col] = df_merged[col].fillna('')

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_merged['Message'], df_merged['Type'], stratify=df_merged['Type'])
X_train.head()

2791    U’ve Bin Awarded £50 to Play 4 Instant Cash. C...
362                                       Oh ok no prob..
469          Yup, no need. I'll jus wait 4 e rain 2 stop.
457     LOOK AT AMY URE A BEAUTIFUL, INTELLIGENT WOMAN...
112                       Going for dinner.msg you after.
Name: Message, dtype: object

### ***Importing BERT and getting embeding vectors for data***

In [15]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

**Example getting embeding of sentence**

In [16]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embeding([
    "500$ discount. hurry up", 
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84351695, -0.5132727 , -0.88845736, ..., -0.74748874,
        -0.75314736,  0.91964495],
       [-0.87208354, -0.50543964, -0.94446677, ..., -0.8584749 ,
        -0.7174534 ,  0.88082975]], dtype=float32)>

**Building model**

In [17]:
#BERT Layer
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name = "text")
preprocessed_inputs = bert_preprocess(text_input)
encoded_outputs = bert_encoder(preprocessed_inputs)

#Neural network
layer = tf.keras.layers.Dropout(0.1, name='dropout') (encoded_outputs['pooled_output'])
layer= tf.keras.layers.Dense(1, activation='sigmoid', name="output")(layer)

#Construct the final model
model = tf.keras.Model(inputs=[text_input], outputs=[layer])

In [18]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [19]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=[tf.keras.metrics.BinaryAccuracy(name = 'Accuracy'),
                       tf.keras.metrics.Precision(name = 'Precision'), 
                       tf.keras.metrics.Recall(name = 'Recall')])

In [21]:
import numpy as np
y_train = np.asarray(y_train).astype(np.int64)
model.fit(X_train, y_train, epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x22f9a7adde0>