In [96]:
import nltk
import re
import pandas as pd
import numpy as np
import random

pd.set_option('display.max_colwidth', 100)

messages = pd.read_csv('../data/SMSSpamCollection.tsv', sep= '\t')
messages.columns = ["labels", "text"]

messages.head()

Unnamed: 0,labels,text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
1,ham,"Nah I don't think he goes to usf, he lives around here though"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...


In [4]:
import string


In [7]:
def remove_punct(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

messages['text_clean'] = messages['text'].apply(lambda x: remove_punct(x))

messages.head()

Unnamed: 0,labels,text,text_clean
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
1,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,As per your request Melle Melle Oru Minnaminunginte Nurungu Vettam has been set as your callertu...


In [10]:
import re 

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens


messages['text_tokenized'] = messages['text_clean'].apply(lambda x: tokenize(x.lower()))

messages.head()

Unnamed: 0,labels,text,text_clean,text_tokenized
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
1,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,As per your request Melle Melle Oru Minnaminunginte Nurungu Vettam has been set as your callertu...,"[as, per, your, request, melle, melle, oru, minnaminunginte, nurungu, vettam, has, been, set, as..."


In [11]:
stopwords = nltk.corpus.stopwords.words('english')
 

In [12]:
def remove_stopwords(tokenized_text):
    text = [word for word in tokenized_text if word not in stopwords]
    return text

messages['text_nostop'] = messages['text_tokenized'].apply(lambda x: remove_stopwords(x))

messages.head()

Unnamed: 0,labels,text,text_clean,text_tokenized,text_nostop
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
1,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,As per your request Melle Melle Oru Minnaminunginte Nurungu Vettam has been set as your callertu...,"[as, per, your, request, melle, melle, oru, minnaminunginte, nurungu, vettam, has, been, set, as...","[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr..."


In [13]:
 def clean_text(text):
     text = "".join([word.lower() for word in text if word not in string.punctuation])
     tokens = re.split('\W+', text)
     text = [word for word in tokens if word not in stopwords]
     return text

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfid_vect = TfidfVectorizer(analyzer = clean_text)
X_tfidf = tfid_vect.fit_transform(messages['text'])
print(X_tfidf.shape)
print(tfid_vect.get_feature_names())



In [25]:
X_features = pd.DataFrame(X_tfidf.toarray())

In [26]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_features, messages['labels'], test_size = .2)

In [31]:
rf = RandomForestClassifier()
rf_model = rf.fit(X_train, y_train)

In [32]:
y_pred = rf_model.predict(X_test)


In [33]:
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
print('Precision: {} / Recall: {}'.format(round(precision, 3), round(recall, 3)))

Precision: 1.0 / Recall: 0.81


In [34]:
!pip install -U gensim

Collecting gensim
  Downloading gensim-3.8.3-cp38-cp38-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 3.6 MB/s 
Collecting smart-open>=1.8.1
  Downloading smart_open-4.1.0.tar.gz (116 kB)
[K     |████████████████████████████████| 116 kB 10.9 MB/s 
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py) ... [?25ldone
[?25h  Created wheel for smart-open: filename=smart_open-4.1.0-py3-none-any.whl size=106204 sha256=3107ff4318ab09bf6c498e86632f14fb3ed2203a3b8a4c4ba34c416e81ad6d18
  Stored in directory: /home/mishkasistrunk/.cache/pip/wheels/74/33/8e/37e22f50ce94856f37b3e23a93c648c194aa8d5a546083d09a
Successfully built smart-open
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.8.3 smart-open-4.1.0
You should consider upgrading via the '/home/mishkasistrunk/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [54]:
import gensim
import gensim.downloader as api

wiki_embeddings = api.load('glove-wiki-gigaword-100')

In [55]:
wiki_embeddings['king']

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

In [56]:
wiki_embeddings.most_similar('king')


[('prince', 0.7682329416275024),
 ('queen', 0.7507690191268921),
 ('son', 0.7020887136459351),
 ('brother', 0.6985775232315063),
 ('monarch', 0.6977890729904175),
 ('throne', 0.6919990181922913),
 ('kingdom', 0.6811410188674927),
 ('father', 0.6802029013633728),
 ('emperor', 0.6712857484817505),
 ('ii', 0.6676074266433716)]

In [58]:
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
messages.head()

Unnamed: 0,labels,text,text_clean,text_tokenized,text_nostop
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,...","[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
1,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]","[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[have, date, on, sunday, with, will]","[i, have, a, date, on, sunday, with, will]","[date, sunday]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[as, per, your, request, melle, melle, oru, minnaminunginte, nurungu, vettam, has, been, set, as...","[as, per, your, request, melle, melle, oru, minnaminunginte, nurungu, vettam, has, been, set, as...","[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr..."


In [62]:
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'], messages['labels'], test_size = .2)

In [63]:
w2v_model = gensim.models.Word2Vec(X_train,
                                size = 100,
                                window = 5,
                                min_count=2)

In [65]:
w2v_model.wv['king']

array([-0.0027289 , -0.01344087, -0.00340273,  0.01170433, -0.09169315,
        0.03231518,  0.04601553, -0.01869143,  0.00514376,  0.01020341,
       -0.05863426,  0.01493125,  0.03770757,  0.0087204 , -0.0922171 ,
       -0.02992693, -0.03780801, -0.08308336, -0.07739293, -0.02217375,
       -0.0727206 , -0.00615059, -0.00844011, -0.00017599,  0.02643425,
        0.03070591,  0.00876151, -0.0268494 ,  0.03228621,  0.04342809,
       -0.01244085,  0.03023546, -0.00804392,  0.02696636,  0.03505045,
        0.02275912, -0.0086491 , -0.07379606,  0.02198456,  0.01213531,
        0.03617238,  0.00607156,  0.03259821,  0.03730531, -0.04976666,
        0.0140271 ,  0.02522767,  0.02650619, -0.04305142,  0.01218043,
        0.01614249,  0.02814395,  0.00253435,  0.03243604,  0.01847801,
       -0.04655145,  0.0021002 ,  0.03138082,  0.02909566, -0.0325501 ,
       -0.03871831, -0.04268863, -0.04270626, -0.00037604, -0.08208974,
        0.04902735, -0.05325377, -0.00378029, -0.05680966,  0.01

In [66]:
w2v_model.wv.most_similar('king')

[('co', 0.9977072477340698),
 ('sweet', 0.9976955652236938),
 ('once', 0.9976819753646851),
 ('love', 0.9976751208305359),
 ('haha', 0.9976654052734375),
 ('tot', 0.9976487159729004),
 ('thanks', 0.9976344704627991),
 ('long', 0.9976239204406738),
 ('half', 0.997617244720459),
 ('heart', 0.9976092576980591)]

In [70]:
w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index2word]) for ls in X_test])

In [71]:
for i, v in enumerate(w2v_vect):
    print(len(X_test.iloc[i]), len(v))

20 20
13 13
15 15
5 3
16 14
24 21
13 12
12 11
5 4
7 6
6 5
1 1
25 22
8 7
19 19
8 7
30 30
6 6
21 21
1 1
4 4
22 21
7 6
37 34
19 19
9 8
25 25
26 20
10 10
24 21
2 2
10 7
2 2
23 16
7 6
16 15
27 25
31 29
6 6
15 12
7 5
27 6
4 4
11 11
11 9
12 10
10 8
6 6
6 5
21 19
18 16
15 15
18 18
12 11
12 10
19 12
24 18
30 26
10 8
7 6
18 16
23 21
0 0
6 6
24 20
25 22
25 24
24 20
4 4
7 5
6 6
27 27
31 29
29 25
32 32
16 16
17 17
11 9
15 14
8 8
25 20
43 40
11 11
6 6
20 19
1 1
9 8
19 18
6 6
5 5
21 19
5 5
11 10
30 25
5 5
11 11
9 8
19 18
3 3
8 8
6 6
5 5
12 12
6 6
7 7
12 9
32 28
5 4
5 5
19 17
30 26
11 7
24 20
11 10
2 0
14 13
13 11
12 12
11 11
27 23
16 14
8 7
25 22
28 26
9 9
10 9
24 24
3 1
26 24
12 12
12 11
14 14
8 8
9 9
3 3
4 4
44 41
8 7
13 11
20 18
9 5
22 21
52 46
13 13
17 15
5 4
21 20
17 14
31 26
4 4
23 23
12 12
17 16
17 17
5 5
27 24
26 22
5 5
79 75
7 7
20 17
4 4
26 26
9 9
16 13
5 5
27 24
8 7
12 12
7 6
22 22
23 23
11 10
25 23
29 26
7 7
15 14
14 12
26 19
6 6
5 4
19 16
4 4
28 18
9 9
20 18
47 40
20 19
4 4
18 16
26 25
1

In [86]:
w2v_vect_avg = []

#returns word vector averages across each post
for vect in w2v_vect:
    if len(vect)!= 0:
        w2v_vect_avg.append(vect.mean(axis=0))
    else:
            w2v_vect_avg.append(np.zeros(100))

In [87]:
for i, v in enumerate(w2v_vect_avg):
    print(len(X_test.iloc[i]), len(v))

20 100
13 100
15 100
5 100
16 100
24 100
13 100
12 100
5 100
7 100
6 100
1 100
25 100
8 100
19 100
8 100
30 100
6 100
21 100
1 100
4 100
22 100
7 100
37 100
19 100
9 100
25 100
26 100
10 100
24 100
2 100
10 100
2 100
23 100
7 100
16 100
27 100
31 100
6 100
15 100
7 100
27 100
4 100
11 100
11 100
12 100
10 100
6 100
6 100
21 100
18 100
15 100
18 100
12 100
12 100
19 100
24 100
30 100
10 100
7 100
18 100
23 100
0 100
6 100
24 100
25 100
25 100
24 100
4 100
7 100
6 100
27 100
31 100
29 100
32 100
16 100
17 100
11 100
15 100
8 100
25 100
43 100
11 100
6 100
20 100
1 100
9 100
19 100
6 100
5 100
21 100
5 100
11 100
30 100
5 100
11 100
9 100
19 100
3 100
8 100
6 100
5 100
12 100
6 100
7 100
12 100
32 100
5 100
5 100
19 100
30 100
11 100
24 100
11 100
2 100
14 100
13 100
12 100
11 100
27 100
16 100
8 100
25 100
28 100
9 100
10 100
24 100
3 100
26 100
12 100
12 100
14 100
8 100
9 100
3 100
4 100
44 100
8 100
13 100
20 100
9 100
22 100
52 100
13 100
17 100
5 100
21 100
17 100
31 100
4 100
23 10

In [134]:
messages = pd.read_csv('../data/spam.csv', encoding='latin-1')

messages = messages.drop(labels = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)
messages.columns = ['label', 'text']

messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))

X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'], messages['label'], test_size = .2)


In [142]:
tagged_docs = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(X_train)]

In [143]:
tagged_docs[0]

TaggedDocument(words=['ranjith', 'cal', 'drpd', 'deeraj', 'and', 'deepak', 'min', 'hold'], tags=[0])

In [145]:
d2v_model = gensim.models.Doc2Vec(tagged_docs, vector_size = 100, window = 5, min_count = 2)

In [146]:
d2v_model.infer_vector(['i', 'am', 'fucking', 'gay'])

array([ 0.0024029 , -0.0058388 , -0.0005007 ,  0.00502572, -0.0274045 ,
        0.00422417,  0.01087392, -0.00486382,  0.00400582,  0.00317663,
       -0.0155356 ,  0.00870859,  0.00249349,  0.00909022, -0.01894418,
       -0.00112502, -0.00797662, -0.01180502, -0.01751181, -0.00992527,
       -0.01334384,  0.00404176, -0.00517214,  0.00636412,  0.00445412,
        0.0078181 ,  0.00393844, -0.00156435,  0.00239625,  0.01151866,
        0.00182755,  0.00425971,  0.00045227,  0.00509931,  0.00833823,
        0.0094913 , -0.00058024, -0.00243734,  0.00861443,  0.0120497 ,
        0.00595455, -0.00070964,  0.00733312,  0.00763612, -0.01426243,
        0.01128078,  0.01086403,  0.00777117, -0.00795457,  0.00180267,
        0.00369564,  0.00364714,  0.00615027,  0.01482986,  0.00682704,
       -0.00803831,  0.00515813,  0.01098441,  0.00509766, -0.01267549,
       -0.00838568, -0.01197765, -0.00565846, -0.00513034, -0.01967524,
        0.01613936, -0.0191185 , -0.00195147, -0.00921478, -0.00