# Import Libraries

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv('https://raw.githubusercontent.com/krishnaladdha/sequence-classification-using-bert/main/spamham.csv', encoding='ISO-8859-1')

In [7]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.columns

Index(['Category', 'Message'], dtype='object')

In [9]:
df['Message'] = df['Message'].str.lower()

In [10]:
df['Category'].unique()

array(['ham', 'spam', '{"mode":"full"'], dtype=object)

In [11]:
df

Unnamed: 0,Category,Message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."
...,...,...
5568,ham,will ü b going to esplanade fr home?
5569,ham,"pity, * was in mood for that. so...any other s..."
5570,ham,the guy did some bitching but i acted like i'd...
5571,ham,rofl. its true to its name


In [12]:
df['Category'].value_counts()

ham               4825
spam               747
{"mode":"full"       1
Name: Category, dtype: int64

# Data Preparation

Encode spam = 1 and ham = 0

In [13]:
df['spam'] = df['Category'].apply(lambda X:1 if X=='spam' else 0)

In [14]:
df

Unnamed: 0,Category,Message,spam
0,ham,"go until jurong point, crazy.. available only ...",0
1,ham,ok lar... joking wif u oni...,0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,1
3,ham,u dun say so early hor... u c already then say...,0
4,ham,"nah i don't think he goes to usf, he lives aro...",0
...,...,...,...
5568,ham,will ü b going to esplanade fr home?,0
5569,ham,"pity, * was in mood for that. so...any other s...",0
5570,ham,the guy did some bitching but i acted like i'd...,0
5571,ham,rofl. its true to its name,0


In [15]:
# remove punctuation

import string
puncts = string.punctuation
def remove_punctuation(text):
  return text.translate(str.maketrans('', '', puncts))

In [16]:
# remove emoji

import re
def remove_emoji(string):
  emoji_pattern = re.compile("["
                        u"\U0001F600-\U0001F64F"  
                        u"\U0001F300-\U0001F5FF"  
                        u"\U0001F680-\U0001F6FF"  
                        u"\U0001F1E0-\U0001F1FF"  
                        u"\U00002702-\U000027B0"
                        u"\U000024C2-\U0001F251"
                        "]+", flags = re.UNICODE)
  return emoji_pattern.sub(r'', string)

In [17]:
df["Message"] = df["Message"].apply(lambda text: remove_punctuation(text))

In [18]:
df["Message"] = df["Message"].apply(lambda text: remove_emoji(text))

In [19]:
df

Unnamed: 0,Category,Message,spam
0,ham,go until jurong point crazy available only in ...,0
1,ham,ok lar joking wif u oni,0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,1
3,ham,u dun say so early hor u c already then say,0
4,ham,nah i dont think he goes to usf he lives aroun...,0
...,...,...,...
5568,ham,will ü b going to esplanade fr home,0
5569,ham,pity was in mood for that soany other suggest...,0
5570,ham,the guy did some bitching but i acted like id ...,0
5571,ham,rofl its true to its name,0


In [20]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 6.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 70.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 58.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2


In [21]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 9.1 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


# BERT (embedding) model

In [22]:
from transformers import BertTokenizer
from transformers import TFBertModel

In [23]:
t = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X = np.array(df['Message'])
y = np.array(df['spam'])

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2)

In [27]:
X_train

array(['hey what time is your driving on fri we go for evaluation on fri',
       'cheers lou yeah was a goodnite shame u neva came c ya gailxx',
       '8 at the latest gs still there if you can scrounge up some ammo and want to give the new ak a try',
       ...,
       'hurt me tease me make me cry but in the end of my life when i die plz keep one rose on my grave and say stupid i miss u have a nice day bslvyl',
       'nothing smsing u n xy lor sorry lor da guys neva c u in person but they sort of know u lor so u wan 2 meet them xy ask me 2 bring u along 4 our next meeting',
       'kate jackson rec center before 7ish right'], dtype=object)

In [28]:
d = t(X_train.tolist(),
      max_length = 100,
      padding = 'max_length',
      truncation = True)

In [29]:
d.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [30]:
len(d['input_ids'][0])

100

In [31]:
# Features or BERT encodings
q = np.array(d['input_ids'])
q1 = np.array(d['attention_mask'])

In [34]:
model=TFBertModel.from_pretrained('bert-base-uncased')

Downloading tf_model.h5:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [35]:
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout, LSTM, Bidirectional, Concatenate
from tensorflow.keras.models import Model

inp1=Input(shape=(100,),dtype='int32')
inp2=Input(shape=(100,),dtype='int32')
emb=model(inp1, attention_mask=inp2)[0]
print(emb)
l=Bidirectional(LSTM(64,return_sequences=True))(emb)
l=Bidirectional(LSTM(32,return_sequences=True))(l)
la=GlobalMaxPooling1D()(l)
l=GlobalAveragePooling1D()(l)
l=Concatenate()([l,la])
l=Dense(300,activation='relu')(l)
l=Dense(64,activation='relu')(l)
l=Dropout(0.5)(l)
l=Dense(2,activation='softmax')(l)
m=Model(inputs=[inp1,inp2],outputs=l)

KerasTensor(type_spec=TensorSpec(shape=(None, 100, 768), dtype=tf.float32, name=None), name='tf_bert_model/bert/encoder/layer_._11/output/LayerNorm/batchnorm/add_1:0', description="created by layer 'tf_bert_model'")


In [36]:
model(inp1,attention_mask=inp2)[0]

<KerasTensor: shape=(None, 100, 768) dtype=float32 (created by layer 'tf_bert_model')>

In [37]:
model.trainable=False

In [38]:
m.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 100,                                           

In [39]:
d1 = t(X_test.tolist(),max_length=100,padding='max_length',truncation=True)

In [40]:
q2=np.array(d1['input_ids'])
q3=np.array(d1['attention_mask'])

# Compile

In [41]:
import keras
m.compile(optimizer=tf.keras.optimizers.Adam(),
          loss='sparse_categorical_crossentropy',
          metrics=['accuracy'],
          steps_per_execution=32)

In [45]:
m.fit([q,q1],y_train,epochs=1,batch_size=64)



<keras.callbacks.History at 0x7f2faefc6650>

# Prediction

In [46]:
res1 = m.predict([q2,q3])

In [47]:
res1

array([[9.9998868e-01, 1.1343566e-05],
       [9.9997377e-01, 2.6224008e-05],
       [9.9996889e-01, 3.1166168e-05],
       ...,
       [9.9995279e-01, 4.7207759e-05],
       [9.9997759e-01, 2.2382546e-05],
       [9.9999070e-01, 9.2713917e-06]], dtype=float32)

In [48]:
p = np.argmax(res1,axis=1)

In [49]:
p

array([0, 0, 0, ..., 0, 0, 0])

# Evaluations

In [50]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,p,digits=3))

              precision    recall  f1-score   support

           0      0.995     0.998     0.996       970
           1      0.986     0.966     0.976       145

    accuracy                          0.994      1115
   macro avg      0.990     0.982     0.986      1115
weighted avg      0.994     0.994     0.994      1115



In [51]:
cm = confusion_matrix(y_test,p)

In [52]:
cm

array([[968,   2],
       [  5, 140]])

# References

https://github.com/krishnaladdha/sequence-classification-using-bert