In [44]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [23]:
df = pd.read_csv("22_spam.csv")
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
df.groupby("Category").describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [25]:
df["spam"] = df["Category"] == "spam"
df["spam"] = df["spam"].apply(lambda x: 0 if not x else 1)
df.head(5)

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df["Message"], df["spam"], test_size=0.2, stratify=df["spam"]) # stratify adds balance

In [27]:
y_train.value_counts(), y_test.value_counts() # pretty even

(0    3859
 1     598
 Name: spam, dtype: int64,
 0    966
 1    149
 Name: spam, dtype: int64)

In [28]:
# download models
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [29]:
def get_sentence_embeding(sentences):
    preprocess_text = bert_preprocess(sentences)
    return bert_encoder(preprocess_text)["pooled_output"]

In [30]:
get_sentence_embeding([
    "500$ discount. hurry up", 
    "AJ, are you up for a volleybal game tomorrow?"
])

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84369963, -0.51361525, -0.8888222 , ..., -0.7479082 ,
        -0.7532988 ,  0.91979617],
       [-0.9269273 , -0.58069086, -0.9709319 , ..., -0.9227075 ,
        -0.7643288 ,  0.9226816 ]], dtype=float32)>

In [33]:
e = get_sentence_embeding([
    "banana", 
    "grapes",
    "mango",
    "jeff bezos",
    "elon musk",
    "bill gates"
])

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([e[0]], [e[5]])

array([[0.9571785]], dtype=float32)

In [38]:
# build a functional model instead of sequential

text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs["pooled_output"])
l = tf.keras.layers.Dense(1, activation="sigmoid", name="outputs")(l)

model = tf.keras.Model(inputs=[text_input], outputs=[l])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        {'input_mask': (None 0           text[0][0]                       
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      {'default': (None, 7 109482241   keras_layer[1][0]                
                                                                 keras_layer[1][1]                
                                                                 keras_layer[1][2]                
____________________________________________________________________________________________

In [39]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [47]:
model.fit(X_train, y_train, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x1ee2f221f70>

In [48]:
model.evaluate(X_test, y_test)



[0.11996053159236908, 0.9542601108551025]

In [51]:
reviews = [
    'Reply to win Â£100 weekly! Where will the 2006 FIFA World Cup be held? Send STOP to 87239 to end service',
    'You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p pÂ£3.99',
    'it to 80488. Your 500 free text messages are valid until 31 December 2005.',
    'Hey Sam, Are you coming for a cricket game tomorrow',
    "Why don't you wait 'til at least wednesday to see if you get your ."
]

model.predict(reviews)

array([[0.55474955],
       [0.6199408 ],
       [0.3863582 ],
       [0.0286942 ],
       [0.00938411]], dtype=float32)