In [None]:
# Absolute input paths under /content: the GloVe vector file and the spam CSV.
glove_file = "/content/glove.6B.50d.txt"
data_file = "/content/spam.csv"

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the pretrained GloVe vectors into a dict: word -> float32 coefficient array.
# The path is taken from `glove_file` (set in the first cell) instead of a second,
# hard-coded relative file name, so the loader cannot drift from the configuration.

embeddings_index = {}

with open(glove_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline) instead of raising IndexError.
            continue
        # First token is the word; the remaining tokens are its vector components.
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

print("Loaded %s word vectors" % len(embeddings_index))

Loaded 198917 word vectors


In [None]:
# Read only the first two CSV columns, skip the first row of the file,
# and name the columns label/text ourselves.
df = pd.read_csv(
    data_file,
    encoding='latin-1',
    usecols=[0, 1],
    skiprows=1,
    names=['label', 'text'],
)

In [None]:
def remove_nonascii(s):
  """Return `s` with every character whose code point is 128 or higher dropped."""
  kept = []
  for ch in s:
    if ord(ch) < 128:
      kept.append(ch)
  return "".join(kept)

In [None]:
# Strip non-ASCII characters from every message (the function is passed directly).
df['text'] = df['text'].map(remove_nonascii)

In [None]:
# Encode the labels as integers: 'ham' -> 1, anything else -> 0.
df['label'] = df['label'].apply(lambda lbl: int(lbl == 'ham'))

In [None]:
# Pull the encoded labels and the cleaned messages out of the frame as arrays.
labels = df['label'].values
docs = df['text'].values

## Preprocessing

In [None]:
# Fit a tokenizer on the messages and convert each message to a list of word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
encoded_docs = tokenizer.texts_to_sequences(docs)
vocab_size = len(tokenizer.word_index)

# Bring every sequence to length 20; shorter ones are filled at the end (padding='post').
padded_sequence = pad_sequences(encoded_docs, padding='post', maxlen=20)
print(len(encoded_docs))

5572


We only need the GloVe embeddings for words that also appear in our docs.

In [None]:
# Show the first two messages, first as raw id lists and then after padding.
for sample in (encoded_docs[:2], padded_sequence[:2]):
  print(sample)

[[50, 468, 4379, 837, 749, 656, 64, 8, 1314, 89, 120, 350, 1315, 147, 2961, 1316, 67, 58, 4380, 144], [46, 337, 1485, 469, 6, 1915]]
[[  50  468 4379  837  749  656   64    8 1314   89  120  350 1315  147
  2961 1316   67   58 4380  144]
 [  46  337 1485  469    6 1915    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]]


In [None]:
# Print the first five (word, index) pairs of the tokenizer vocabulary.
for count, (k, v) in enumerate(tokenizer.word_index.items(), start=1):
  print(k, v)
  if count >= 5:
    break

i 1
to 2
you 3
a 4
the 5


In [None]:
# Embedding matrix for the words that are present in our docs.
# Row r receives the GloVe vector of the word whose tokenizer index is r;
# rows for words that have no GloVe vector stay all-zero.

embedding_matrix = np.zeros((vocab_size + 1, 50))

for token, row in tokenizer.word_index.items():
  vector = embeddings_index.get(token)
  if vector is None:
    continue
  embedding_matrix[row] = vector

In [None]:
# Sanity check: (vocab_size + 1) rows by 50 columns.
print(np.shape(embedding_matrix))

(8838, 50)


In [None]:
# keras network for classification
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten
from tensorflow.keras.layers import Embedding


# Classifier: frozen pretrained embedding -> flatten -> single sigmoid unit.
model = Sequential()
# trainable=False keeps the pretrained vectors fixed; only the Dense layer is learned.
model.add(Embedding(vocab_size + 1, 50, weights=[embedding_matrix], input_length=20, trainable=False))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# summary must be *called*: printing the bare attribute only shows the bound-method repr.
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

<bound method Model.summary of <tensorflow.python.keras.engine.sequential.Sequential object at 0x7fd9f74732d0>>


In [None]:
# Training and Evaluation
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequence,
    labels,
    random_state=43,
    test_size=0.3,
)


In [None]:
# Train silently (verbose=0) for 10 epochs on the training split.
model.fit(x=X_train, y=y_train, verbose=0, epochs=10)

<tensorflow.python.keras.callbacks.History at 0x7fd9eb7a57d0>

In [None]:
# summary() prints the layer table itself and returns None; wrapping it in print()
# only adds a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 50)            441900    
_________________________________________________________________
flatten (Flatten)            (None, 1000)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 1001      
Total params: 442,901
Trainable params: 1,001
Non-trainable params: 441,900
_________________________________________________________________
None


In [None]:
# Evaluate on the held-out split; the result unpacks into the loss and the accuracy metric.
scores = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy = scores
print("Accuracy: %f" % (accuracy * 100))

Accuracy: 95.873207


In [None]:
# Inputs for the second walkthrough: the spam CSV (relative path) and the GloVe vector file.
data_file = "spam.csv"
emb_file = "/content/glove.6B.50d.txt"

In [None]:
# Parse the GloVe text file into a dict: word -> float32 vector.
embeddings_index = {}

with open(emb_file, 'r', encoding='utf-8') as f:
  for line in f:
    # First field is the word, the rest are its vector components.
    fields = line.split()
    embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print("loaded {} word vectors".format(len(embeddings_index)))


loaded 198917 word vectors


In [None]:
# Load the first two CSV columns as label/text (the first row of the file is skipped),
# then show the column dtypes and non-null counts.
data = pd.read_csv(
    data_file,
    names=['label', 'text'],
    encoding='latin-1',
    skiprows=1,
    usecols=[0, 1],
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# removing the non ascii characters

def remove_nonascii(s):
  """Keep only the characters of `s` whose code point is below 128 and join them back."""
  return "".join(filter(lambda ch: ord(ch) < 128, s))


# Apply the ASCII filter to every message (the function is passed directly).
data['text'] = data['text'].map(remove_nonascii)

In [None]:
# Encode the labels as integers: 'ham' -> 1, anything else -> 0.
data['label'] = data['label'].map(lambda lbl: int(lbl == 'ham'))

In [None]:
# Extract the encoded labels and the cleaned messages as arrays.
labels = data['label'].values
docs = data['text'].values

In [None]:
# Fit the tokenizer on the messages, then map each message to its list of word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
encoded_docs = tokenizer.texts_to_sequences(docs)
vocab_size = len(tokenizer.word_index)

In [None]:
# One 50-wide row per tokenizer index (plus one extra row); rows for words that have
# no GloVe vector are left as zeros.
embedding_matrix = np.zeros((vocab_size + 1, 50))

for token, row in tokenizer.word_index.items():
  vector = embeddings_index.get(token)
  if vector is None:
    continue
  embedding_matrix[row] = vector

embedding_matrix.shape

(8838, 50)

In [None]:
# First two messages as lists of word ids.
print(encoded_docs[:2])

[[50, 468, 4379, 837, 749, 656, 64, 8, 1314, 89, 120, 350, 1315, 147, 2961, 1316, 67, 58, 4380, 144], [46, 337, 1485, 469, 6, 1915]]


In [None]:
# Bring every sequence to length 20, filling shorter ones at the end (padding='post').
padded_docs = pad_sequences(encoded_docs, padding='post', maxlen=20)

In [None]:
# First two messages after padding.
print(padded_docs[:2])

[[  50  468 4379  837  749  656   64    8 1314   89  120  350 1315  147
  2961 1316   67   58 4380  144]
 [  46  337 1485  469    6 1915    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]]


In [None]:
# Same architecture as before: frozen pretrained embedding -> flatten -> sigmoid unit.
embedding_layer = Embedding(
    vocab_size + 1,
    50,
    weights=[embedding_matrix],
    trainable=False,
    input_length=20,
)

model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 50)            441900    
_________________________________________________________________
flatten_2 (Flatten)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1001      
Total params: 442,901
Trainable params: 1,001
Non-trainable params: 441,900
_________________________________________________________________


In [None]:
# Binary cross-entropy loss, RMSprop optimizer, accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Hold out 20% of the samples for testing. random_state is fixed (same seed as the
# earlier split in this notebook) so the split, and therefore the reported accuracy,
# is reproducible across kernel restarts instead of changing on every run.
X_train, X_test, y_train, y_test = train_test_split(
    padded_docs,
    labels,
    test_size=0.2,
    random_state=43,
)


In [None]:
# Train silently (verbose=0) for 10 epochs on the training split.
model.fit(x=X_train, y=y_train, verbose=0, epochs=10)

<tensorflow.python.keras.callbacks.History at 0x7fd9f771fbd0>

In [None]:
# Evaluate on the held-out split; the result unpacks into the loss and the accuracy metric.
scores = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy = scores

print("Accuracy: ", accuracy * 100)

Accuracy:  96.50224447250366
