In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd

In [2]:
# Load the raw SMS dataset from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='spam.csv')

In [3]:
# Peek at the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Per-category descriptive summary (count / unique / top / freq) of the data.
(
    df.groupby('Category')
    .describe()
    .reset_index()
)

Unnamed: 0_level_0,Category,Message,Message,Message,Message
Unnamed: 0_level_1,Unnamed: 1_level_1,count,unique,top,freq
0,ham,4825,4516,"Sorry, I'll call later",30
1,spam,747,641,Please call our customer service representativ...,4


In [5]:
## Imbalance in the dataset: ham messages (4825) far outnumber spam messages (747)
## How do we deal with an imbalanced dataset?
## Undersampling, oversampling, SMOTE, etc. are the common methods

In [6]:
# Keep only the rows labelled 'spam' and show the resulting (rows, columns).
spam_mask = df['Category'] == 'spam'
df_spam = df[spam_mask]
df_spam.shape

(747, 2)

In [7]:
# Keep only the rows labelled 'ham' and show the resulting (rows, columns).
ham_mask = df['Category'] == 'ham'
df_ham = df[ham_mask]
df_ham.shape

(4825, 2)

In [8]:
# Undersample ham: draw as many ham rows as there are spam rows.
# A fixed random_state makes the draw repeatable across kernel restarts,
# so every downstream result is reproducible.
df_ham_downsampled = df_ham.sample(n=len(df_spam), random_state=42)
df_ham_downsampled.shape

(747, 2)

In [9]:
# Stack the spam rows on top of the downsampled ham rows into one balanced frame.
df_balanced = pd.concat(objs=[df_spam, df_ham_downsampled], axis=0)
df_balanced.shape

(1494, 2)

In [10]:
# Sanity check: both categories should now have the same number of rows.
df_balanced.Category.value_counts()

spam    747
ham     747
Name: Category, dtype: int64

In [11]:
# Encode the label as an integer target: 1 for 'spam', 0 for anything else.
# A vectorized comparison replaces the per-row Python lambda; casting to
# int64 keeps the same dtype the lambda-based version produced.
df_balanced['spam'] = (df_balanced['Category'] == 'spam').astype('int64')

In [12]:
df_balanced['spam'].value_counts()

1    747
0    747
Name: spam, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

# Split messages and labels into train/test sets, stratified on the label so
# both sets keep the same spam/ham ratio. random_state pins the shuffle so a
# rerun produces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)


In [14]:
# TF-Hub handles for the uncased English BERT preprocessor and the
# L-12 / H-768 / A-12 encoder, wrapped as Keras layers.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)




In [15]:
def get_sentence_embedding(sentences):
    """Return the BERT pooled-output embedding for each input sentence.

    Args:
        sentences: Batch of raw text sentences, passed as-is to the
            ``bert_preprocess`` layer.

    Returns:
        The ``'pooled_output'`` entry of the ``bert_encoder`` result for the
        preprocessed batch.
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']
    

In [16]:
# Smoke test: embed two short sentences and inspect the resulting tensor.
get_sentence_embedding([
    'rohan is great man',
    'upen is also a great man',
])

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.88975793, -0.5032988 , -0.6860507 , ..., -0.5176246 ,
        -0.68693817,  0.9016982 ],
       [-0.76988846, -0.40970317, -0.6731869 , ..., -0.27667814,
        -0.5837108 ,  0.7789192 ]], dtype=float32)>

In [17]:
# Embed a few words/names so their pairwise similarity can be compared below.
sample_lst = get_sentence_embedding([
    'banana',
    'mango',
    'fruit',
    'flower',
    'elon musk',
    'jeff bezos',
])



In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# Cosine similarity between the 'banana' (index 0) and 'mango' (index 1) embeddings.
cosine_similarity(X=[sample_lst[0]], Y=[sample_lst[1]])

array([[0.99540985]], dtype=float32)

In [20]:
# Cosine similarity between the 'elon musk' (index 4) and 'jeff bezos' (index 5) embeddings.
cosine_similarity(X=[sample_lst[4]], Y=[sample_lst[5]])

array([[0.9872035]], dtype=float32)

In [21]:
# Removed a stray `text_input` lookup: the name is only created in the
# model-building cell below, so evaluating it here raised NameError on a
# fresh kernel and broke Restart & Run All.

In [None]:
## BERT layers
# Scalar string input (shape=()): each example is one raw text message.
text_input = tf.keras.layers.Input(shape=(),dtype=tf.string,name='string')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

## Neural network layers
# Dropout on the pooled BERT output, then a single sigmoid unit as the
# binary classification head.
l = tf.keras.layers.Dropout(0.1,name='dropout')(outputs['pooled_output'])
l = tf.keras.layers.Dense(1,activation='sigmoid',name='output')(l)

## Construct the final model
# The model output must be the tensor `l` produced by the Dense layer;
# the previous code passed the integer literal 1 by mistake.
model = tf.keras.Model(inputs=[text_input],outputs=[l])