In [1]:
import tensorflow as tf
import tensorflow_hub as hub

In [2]:
import tensorflow_text as text

In [3]:
import pandas as pd

In [4]:
# Load the spam dataset; each row carries a Category label and a Message text.
spam_csv_path = 'spam.csv'
df = pd.read_csv(spam_csv_path)

In [5]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Summary statistics of the messages within each category.
messages_by_category = df.groupby('Category')
messages_by_category.describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [7]:
# Number of messages per category (shows the class imbalance).
df.loc[:, 'Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [8]:
# Spam-to-ham ratio, computed from the data instead of hard-coded counts so it
# stays correct if the dataset changes.
category_counts = df['Category'].value_counts()
category_counts['spam'] / category_counts['ham']

0.15481865284974095

In [9]:
# Keep only the rows labelled as spam.
is_spam = df['Category'] == 'spam'
df_spam = df.loc[is_spam]
df_spam.shape

(747, 2)

In [10]:
# Keep only the rows labelled as ham.
is_ham = df['Category'] == 'ham'
df_ham = df.loc[is_ham]
df_ham.shape

(4825, 2)

In [11]:
# Downsample ham to the spam row count so both classes are equally represented.
# A fixed random_state makes the drawn sample reproducible on Restart & Run All.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)

In [12]:
# Row/column counts of the downsampled ham subset.
df_ham_downsampled.shape

(747, 2)

In [13]:
# Stack the spam rows and the downsampled ham rows into one balanced frame.
balanced_parts = [df_spam, df_ham_downsampled]
df_balanced = pd.concat(balanced_parts)
df_balanced.shape

(1494, 2)

In [14]:
# Confirm both categories now have the same number of rows.
df_balanced.loc[:, 'Category'].value_counts()

Category
spam    747
ham     747
Name: count, dtype: int64

In [15]:
# Binary target: 1 for spam, 0 for ham. Derived from df_balanced itself (not the
# full df) so the labels never depend on index alignment with another frame.
df_balanced['spam'] = (df_balanced['Category'] == 'spam').astype('int64')

In [16]:
# Spot-check five random rows to verify the new 'spam' column.
df_balanced.sample(n=5)

Unnamed: 0,Category,Message,spam
2556,spam,FreeMSG You have been awarded a FREE mini DIGI...,1
931,ham,I'm really sorry i won't b able 2 do this frid...,0
2941,spam,You have 1 new message. Please call 08712400200.,1
1699,spam,"Free msg. Sorry, a service you ordered from 81...",1
4249,spam,"accordingly. I repeat, just text the word ok o...",1


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Stratified split keeps the spam/ham proportion equal in train and test;
# a fixed random_state makes the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

In [19]:
# Preview the first five training messages.
x_train.head(n=5)

4236    FREEMSG: Our records indicate you may be entit...
2547    Company is very good.environment is terrific a...
3128    Thats cool. i liked your photos. You are very ...
1260    We have sent JD for Customer Service cum Accou...
4741    I keep seeing weird shit and bein all "woah" t...
Name: Message, dtype: object

In [20]:
# TF Hub handles for the BERT preprocessing model and the matching encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [25]:
def get_sentence_embedding(sentence):
    """Return BERT pooled-output embeddings for the given text input.

    Args:
        sentence: Text passed straight to ``bert_preprocess``; the calls in
            this notebook pass a list of strings.

    Returns:
        The ``'pooled_output'`` entry of the ``bert_encoder`` result.
    """
    encoder_inputs = bert_preprocess(sentence)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']


# Quick check on two sample sentences.
sample_sentences = [
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleybal game tomorrow?",
]
get_sentence_embedding(sample_sentences)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84351707, -0.51327276, -0.88845736, ..., -0.7474889 ,
        -0.75314736,  0.91964495],
       [-0.8720836 , -0.5054399 , -0.94446695, ..., -0.85847527,
        -0.71745366,  0.8808299 ]], dtype=float32)>

In [26]:
# Embed a few fruit names and person names to compare their vectors.
sample_phrases = ["banana", "mango", "grapes", "jeff bezos", "elon musk", "SRK"]
e = get_sentence_embedding(sample_phrases)

In [27]:
# Import cosine_similarity from sklearn.metrics.pairwise
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Cosine similarity measures how closely two vectors point in the same
# direction: 1 for identical direction, -1 for opposite direction.
# Compare the first embedding ("banana") with the last one ("SRK").
banana_vec = e[0]
srk_vec = e[5]
cosine_similarity([banana_vec], [srk_vec])

array([[0.77495563]], dtype=float32)

In [31]:
# BERT layers: raw string input -> preprocessing -> encoder outputs.
input_text = tf.keras.layers.Input(
    name='text',
    shape=(),
    dtype=tf.string,
)
preprocessed_text = bert_preprocess(input_text)
outputs = bert_encoder(preprocessed_text)

In [36]:
# Neural network layers: dropout on the pooled BERT output, followed by a
# single sigmoid unit for the binary prediction.
pooled_output = outputs['pooled_output']
dropped_out = tf.keras.layers.Dropout(0.1, name='dropout')(pooled_output)
# The final tensor keeps the name `dropout_layer` because the model-construction
# cell passes that name as the model output.
dropout_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(dropped_out)

In [40]:
# Assemble the final model from the text input and the sigmoid output tensor.
model_inputs = [input_text]
model_outputs = [dropout_layer]
model = tf.keras.Model(inputs=model_inputs, outputs=model_outputs)

In [41]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [43]:
# Metrics tracked during training: accuracy, precision and recall.
accuracy_metric = tf.keras.metrics.BinaryAccuracy(name='accuracy')
precision_metric = tf.keras.metrics.Precision(name='precision')
recall_metric = tf.keras.metrics.Recall(name='recall')

METRICS = [accuracy_metric, precision_metric, recall_metric]

In [44]:
# Configure training: Adam optimizer, binary cross-entropy loss, and the
# metrics listed in METRICS.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)

In [45]:
# Train on the training split for a fixed number of epochs.
NUM_EPOCHS = 10
model.fit(x=x_train, y=y_train, epochs=NUM_EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x171cf1b7970>