In [11]:
import nltk
import re
import pandas as pd
import numpy as np
import random

pd.set_option('display.max_colwidth', 100)

messages = pd.read_csv('../data/SMSSpamCollection.tsv', sep= '\t')
messages.columns = ["labels", "text"]

messages.head()

Unnamed: 0,labels,text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
1,ham,"Nah I don't think he goes to usf, he lives around here though"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...


In [12]:
import string


In [13]:
def remove_punct(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

messages['text_clean'] = messages['text'].apply(lambda x: remove_punct(x))

messages.head()

Unnamed: 0,labels,text,text_clean
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
1,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,As per your request Melle Melle Oru Minnaminunginte Nurungu Vettam has been set as your callertu...


In [14]:
import re 

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens


messages['text_tokenized'] = messages['text_clean'].apply(lambda x: tokenize(x.lower()))

messages.head()

Unnamed: 0,labels,text,text_clean,text_tokenized
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
1,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,As per your request Melle Melle Oru Minnaminunginte Nurungu Vettam has been set as your callertu...,"[as, per, your, request, melle, melle, oru, minnaminunginte, nurungu, vettam, has, been, set, as..."


In [15]:
stopwords = nltk.corpus.stopwords.words('english')
 

In [16]:
def remove_stopwords(tokenized_text):
    text = [word for word in tokenized_text if word not in stopwords]
    return text

messages['text_nostop'] = messages['text_tokenized'].apply(lambda x: remove_stopwords(x))

messages.head()

Unnamed: 0,labels,text,text_clean,text_tokenized,text_nostop
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
1,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,As per your request Melle Melle Oru Minnaminunginte Nurungu Vettam has been set as your callertu...,"[as, per, your, request, melle, melle, oru, minnaminunginte, nurungu, vettam, has, been, set, as...","[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr..."


In [17]:
 def clean_text(text):
     text = "".join([word.lower() for word in text if word not in string.punctuation])
     tokens = re.split('\W+', text)
     text = [word for word in tokens if word not in stopwords]
     return text

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfid_vect = TfidfVectorizer(analyzer = clean_text)
X_tfidf = tfid_vect.fit_transform(messages['text'])
print(X_tfidf.shape)
print(tfid_vect.get_feature_names())



In [19]:
X_features = pd.DataFrame(X_tfidf.toarray())

In [20]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_features, messages['labels'], test_size = .2)

In [22]:
rf = RandomForestClassifier()
rf_model = rf.fit(X_train, y_train)

In [23]:
y_pred = rf_model.predict(X_test)


In [24]:
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
print('Precision: {} / Recall: {}'.format(round(precision, 3), round(recall, 3)))

Precision: 1.0 / Recall: 0.784


In [25]:
!pip install -U gensim

Requirement already up-to-date: gensim in /home/mishkasistrunk/anaconda3/lib/python3.8/site-packages (3.8.3)
You should consider upgrading via the '/home/mishkasistrunk/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [26]:
import gensim
import gensim.downloader as api

wiki_embeddings = api.load('glove-wiki-gigaword-100')

In [27]:
wiki_embeddings['king']

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

In [28]:
wiki_embeddings.most_similar('king')


[('prince', 0.7682329416275024),
 ('queen', 0.7507690191268921),
 ('son', 0.7020887136459351),
 ('brother', 0.6985775232315063),
 ('monarch', 0.6977890729904175),
 ('throne', 0.6919990181922913),
 ('kingdom', 0.6811410188674927),
 ('father', 0.6802029013633728),
 ('emperor', 0.6712857484817505),
 ('ii', 0.6676074266433716)]

In [29]:
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
messages.head()

Unnamed: 0,labels,text,text_clean,text_tokenized,text_nostop
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,...","[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
1,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]","[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[have, date, on, sunday, with, will]","[i, have, a, date, on, sunday, with, will]","[date, sunday]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[as, per, your, request, melle, melle, oru, minnaminunginte, nurungu, vettam, has, been, set, as...","[as, per, your, request, melle, melle, oru, minnaminunginte, nurungu, vettam, has, been, set, as...","[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr..."


In [30]:
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'], messages['labels'], test_size = .2)

In [31]:
w2v_model = gensim.models.Word2Vec(X_train,
                                size = 100,
                                window = 5,
                                min_count=2)

In [32]:
w2v_model.wv['king']

array([-0.03659741,  0.00378942,  0.04019789, -0.04122544,  0.00822724,
        0.00144896,  0.04241052,  0.07885942,  0.00112525, -0.06457522,
       -0.01579792,  0.10299546, -0.01080722,  0.06168618,  0.06558908,
        0.00037669,  0.00160752,  0.08220106,  0.01494096, -0.00476334,
        0.03413939, -0.05005289, -0.11994803,  0.0815511 ,  0.01683928,
        0.05170585,  0.00240765, -0.02263567,  0.00142983,  0.01853479,
        0.09707622, -0.01423792,  0.02561693,  0.02417313, -0.0686046 ,
        0.04545865, -0.06563912,  0.08197162,  0.04144864,  0.00457984,
        0.01736996, -0.00045873,  0.02600063,  0.04484327,  0.0250183 ,
       -0.00246564, -0.07010023, -0.05816971, -0.03435051,  0.00579066,
        0.03105334, -0.03695688, -0.04108509, -0.06643796,  0.03066527,
        0.10531388,  0.0198595 , -0.05341019,  0.05785987,  0.09499303,
        0.0332849 ,  0.0031771 , -0.07002787,  0.10921766, -0.05016376,
        0.04323463,  0.03324573, -0.00911642, -0.02313892,  0.05

In [33]:
w2v_model.wv.most_similar('king')

[('money', 0.9982311129570007),
 ('hl', 0.9982223510742188),
 ('change', 0.9982131719589233),
 ('re', 0.9982094764709473),
 ('gift', 0.9982085227966309),
 ('world', 0.9982009530067444),
 ('is', 0.9981915950775146),
 ('old', 0.9981810450553894),
 ('hope', 0.998174786567688),
 ('was', 0.9981711506843567)]

In [34]:
w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index2word]) for ls in X_test])

In [35]:
for i, v in enumerate(w2v_vect):
    print(len(X_test.iloc[i]), len(v))

24 24
11 11
14 14
40 34
12 10
4 4
5 4
30 30
25 24
4 4
3 2
19 18
11 9
12 12
23 19
2 1
31 29
6 6
15 14
47 46
30 26
21 13
5 5
24 20
12 12
7 5
6 6
21 21
9 8
7 4
7 7
4 4
18 18
46 38
15 14
15 15
27 25
2 2
6 5
10 9
13 12
4 3
23 22
5 5
17 15
10 9
12 11
5 2
10 9
5 5
6 6
23 23
8 8
9 9
5 4
31 26
12 11
27 23
6 6
27 27
4 4
6 6
13 12
7 6
13 13
3 3
8 7
12 12
9 9
25 24
18 18
6 5
11 11
4 3
6 5
8 7
6 5
9 9
11 11
7 7
17 17
21 18
7 7
8 8
19 19
7 5
4 4
5 5
9 7
28 27
13 12
3 2
10 4
24 24
8 7
14 14
7 6
15 14
5 5
6 5
1 1
9 9
60 54
22 22
10 9
25 24
22 22
7 6
9 9
29 24
7 2
13 12
16 15
8 8
5 5
6 6
17 13
23 21
22 20
4 3
24 23
13 12
5 5
24 23
25 25
6 6
7 6
13 11
15 15
9 9
5 4
18 16
22 19
1 1
7 7
5 5
4 3
19 16
14 11
28 28
9 8
19 13
8 7
11 10
6 6
27 24
26 25
11 11
11 11
7 3
14 12
7 6
8 7
32 32
5 5
5 5
17 15
8 8
22 20
5 5
25 19
24 19
6 6
12 11
22 21
30 30
12 12
16 16
9 9
8 5
19 18
10 10
9 8
14 12
16 16
11 8
8 7
5 5
31 29
22 22
21 19
6 6
17 15
24 24
7 6
8 8
16 15
11 10
11 9
17 17
3 2
21 18
21 21
16 16
6 5
25 24
14 14


In [36]:
w2v_vect_avg = []

#returns word vector averages across each post
for vect in w2v_vect:
    if len(vect)!= 0:
        w2v_vect_avg.append(vect.mean(axis=0))
    else:
            w2v_vect_avg.append(np.zeros(100))

In [37]:
for i, v in enumerate(w2v_vect_avg):
    print(len(X_test.iloc[i]), len(v))

24 100
11 100
14 100
40 100
12 100
4 100
5 100
30 100
25 100
4 100
3 100
19 100
11 100
12 100
23 100
2 100
31 100
6 100
15 100
47 100
30 100
21 100
5 100
24 100
12 100
7 100
6 100
21 100
9 100
7 100
7 100
4 100
18 100
46 100
15 100
15 100
27 100
2 100
6 100
10 100
13 100
4 100
23 100
5 100
17 100
10 100
12 100
5 100
10 100
5 100
6 100
23 100
8 100
9 100
5 100
31 100
12 100
27 100
6 100
27 100
4 100
6 100
13 100
7 100
13 100
3 100
8 100
12 100
9 100
25 100
18 100
6 100
11 100
4 100
6 100
8 100
6 100
9 100
11 100
7 100
17 100
21 100
7 100
8 100
19 100
7 100
4 100
5 100
9 100
28 100
13 100
3 100
10 100
24 100
8 100
14 100
7 100
15 100
5 100
6 100
1 100
9 100
60 100
22 100
10 100
25 100
22 100
7 100
9 100
29 100
7 100
13 100
16 100
8 100
5 100
6 100
17 100
23 100
22 100
4 100
24 100
13 100
5 100
24 100
25 100
6 100
7 100
13 100
15 100
9 100
5 100
18 100
22 100
1 100
7 100
5 100
4 100
19 100
14 100
28 100
9 100
19 100
8 100
11 100
6 100
27 100
26 100
11 100
11 100
7 100
14 100
7 100
8 100
3

In [38]:
messages = pd.read_csv('../data/spam.csv', encoding='latin-1')

messages = messages.drop(labels = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)
messages.columns = ['label', 'text']

messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))

X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'], messages['label'], test_size = .2)


In [39]:
tagged_docs = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(X_train)]

In [40]:
tagged_docs[0]

TaggedDocument(words=['thinking', 'that', 'chennai', 'forgot', 'to', 'come', 'for', 'auction'], tags=[0])

In [41]:
d2v_model = gensim.models.Doc2Vec(tagged_docs, vector_size = 100, window = 5, min_count = 2)

In [42]:
d2v_model.infer_vector(['i', 'am', 'fucking', 'gay'])

array([-0.00311202,  0.00254173,  0.01018597, -0.01221487,  0.00599265,
        0.00142286,  0.01190174,  0.01657795,  0.00257349, -0.01043487,
       -0.00654005,  0.00965022, -0.0030847 ,  0.0065184 ,  0.01152291,
        0.00221782, -0.00519627,  0.01768096,  0.0003648 ,  0.0072893 ,
        0.00616315, -0.01838581, -0.02199402,  0.01260206, -0.00199672,
        0.00984463,  0.00186168, -0.0060292 ,  0.00152921,  0.00297292,
        0.01276561, -0.01422401,  0.00868168,  0.01341219, -0.02217923,
       -0.00056132, -0.01923588,  0.01407235,  0.0131375 , -0.00230066,
        0.00391089,  0.00503323,  0.00433684,  0.01452693,  0.00391087,
        0.00586373, -0.01397752, -0.01509378, -0.00935687,  0.00367777,
        0.0075454 , -0.00692635, -0.00797189, -0.00907508,  0.01051117,
        0.01968142,  0.00602168, -0.01344074,  0.01272639,  0.02195982,
        0.00281681, -0.00621549, -0.01551903,  0.02294644, -0.00480211,
        0.00910127, -0.00160695, -0.00264233, -0.00614894, -0.00

In [48]:
labels = np.where(messages['label'] == 'spam', 1, 0)

In [None]:
!pip install -U Keras

In [None]:
!pip install -U tensorflow

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [49]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [50]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [51]:
X_test_seq[0]

[88, 2854, 58, 2927, 3043]

In [53]:
import keras.backend as K 
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential

In [54]:
X_train_seq_padded = pad_sequences(X_train_seq, 50)
X_test_seq_padded = pad_sequences(X_test_seq, 50)


In [55]:
X_train_seq_padded[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0, 628,  15, 592, 358,   2,  58,  10, 550], dtype=int32)

In [58]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip)(y_true * y_pred, 0 , 1))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


In [60]:
model = Sequential()

#Embedding/vector for each text message 
#what are the dimensions of the input?
model.add(Embedding(len(tokenizer.index_word)+1, 32))
#Prevents overfitting through regulazation
model.add(LSTM(32, dropout = 0, recurrent_dropout = 0))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          218720    
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 228,129
Trainable params: 228,129
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile()