In [1]:
import pandas as pd

messages = pd.read_csv('SMSSpamCollection', sep='\t',
                           names=["label", "message"])

In [2]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
## data cleaning and preprocessing
import re
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [5]:
ps = PorterStemmer()

In [9]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z0-9]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)


In [10]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 2500, binary = True, ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()

In [17]:
X[1]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [18]:
y = pd.get_dummies(messages['label'])
y = y.iloc[:, 1].values

In [20]:
## train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=0)

In [21]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [22]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [23]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [24]:
from sklearn.metrics import accuracy_score,classification_report

In [25]:
score=accuracy_score(y_test,y_pred)
print(score)

from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

0.9721973094170404
              precision    recall  f1-score   support

       False       1.00      0.97      0.98       986
        True       0.81      1.00      0.89       129

    accuracy                           0.97      1115
   macro avg       0.90      0.98      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [26]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
X = tv.fit_transform(corpus).toarray()

In [27]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [28]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [29]:
#prediction
y_pred=spam_detect_model.predict(X_test)
score=accuracy_score(y_test,y_pred)
print(score)

0.9847533632286996


In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99       972
        True       0.89      1.00      0.94       143

    accuracy                           0.98      1115
   macro avg       0.95      0.99      0.97      1115
weighted avg       0.99      0.98      0.99      1115



### Word2Vec Implementation

In [31]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [33]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [34]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess ## converts a document into a list of lowercase tokens

In [36]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [37]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [39]:
import gensim
model=gensim.models.Word2Vec(words,window=5,min_count=2)
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'day',
 'ok',
 'free',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'love',
 'text',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'please',
 'later',
 'pls',
 'co',
 'msg',
 'min',
 'make',
 'night',
 'dear',
 'message',
 'well',
 'say',
 'thing',
 'much',
 'great',
 'oh',
 'hope',
 'claim',
 'hey',
 'number',
 'give',
 'happy',
 'work',
 'wat',
 'friend',
 'yes',
 'way',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'tone',
 'win',
 'ask',
 'said',
 'cash',
 'life',
 'amp',
 'im',
 'yeah',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'service',
 'year',
 'thanks',
 'uk',
 'last',
 'would',
 'anything',
 'com',
 'care',
 'lol',
 'nokia',
 'also',
 'feel',
 'every',
 'keep',
 'pick',
 'sure',
 'sent',
 'urgent',
 'contact',


In [40]:
model.corpus_count

5564

In [41]:
model.epochs

5

In [42]:
model.wv.similar_by_word('kid')

[('enjoy', 0.9972427487373352),
 ('work', 0.9972149729728699),
 ('happy', 0.9970395565032959),
 ('life', 0.9970388412475586),
 ('thing', 0.9969795346260071),
 ('amp', 0.9969726800918579),
 ('hello', 0.9969626069068909),
 ('even', 0.9969567656517029),
 ('great', 0.9969395995140076),
 ('person', 0.9969336986541748)]

In [43]:
model.wv['kid'].shape

(100,)

In [53]:
import numpy as np

In [44]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)

In [45]:
!pip install tqdm




[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [49]:
from tqdm import tqdm

In [50]:
words[73]

['performed']

In [51]:
type(model.wv.index_to_key)

list

In [54]:
#apply for the entire sentences
X=[]
for i in tqdm(range(len(words))):
    print("Hello",i)
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 17%|█▋        | 964/5564 [00:00<00:00, 5343.00it/s]

Hello 0
Hello 1
Hello 2
Hello 3
Hello 4
Hello 5
Hello 6
Hello 7
Hello 8
Hello 9
Hello 10
Hello 11
Hello 12
Hello 13
Hello 14
Hello 15
Hello 16
Hello 17
Hello 18
Hello 19
Hello 20
Hello 21
Hello 22
Hello 23
Hello 24
Hello 25
Hello 26
Hello 27
Hello 28
Hello 29
Hello 30
Hello 31
Hello 32
Hello 33
Hello 34
Hello 35
Hello 36
Hello 37
Hello 38
Hello 39
Hello 40
Hello 41
Hello 42
Hello 43
Hello 44
Hello 45
Hello 46
Hello 47
Hello 48
Hello 49
Hello 50
Hello 51
Hello 52
Hello 53
Hello 54
Hello 55
Hello 56
Hello 57
Hello 58
Hello 59
Hello 60
Hello 61
Hello 62
Hello 63
Hello 64
Hello 65
Hello 66
Hello 67
Hello 68
Hello 69
Hello 70
Hello 71
Hello 72
Hello 73
Hello 74
Hello 75
Hello 76
Hello 77
Hello 78
Hello 79
Hello 80
Hello 81
Hello 82
Hello 83
Hello 84
Hello 85
Hello 86
Hello 87
Hello 88
Hello 89
Hello 90
Hello 91
Hello 92
Hello 93
Hello 94
Hello 95
Hello 96
Hello 97
Hello 98
Hello 99
Hello 100
Hello 101
Hello 102
Hello 103
Hello 104
Hello 105
Hello 106
Hello 107
Hello 108
Hello 109
Hello 110


 47%|████▋     | 2639/5564 [00:00<00:00, 7286.84it/s]

Hello 964
Hello 965
Hello 966
Hello 967
Hello 968
Hello 969
Hello 970
Hello 971
Hello 972
Hello 973
Hello 974
Hello 975
Hello 976
Hello 977
Hello 978
Hello 979
Hello 980
Hello 981
Hello 982
Hello 983
Hello 984
Hello 985
Hello 986
Hello 987
Hello 988
Hello 989
Hello 990
Hello 991
Hello 992
Hello 993
Hello 994
Hello 995
Hello 996
Hello 997
Hello 998
Hello 999
Hello 1000
Hello 1001
Hello 1002
Hello 1003
Hello 1004
Hello 1005
Hello 1006
Hello 1007
Hello 1008
Hello 1009
Hello 1010
Hello 1011
Hello 1012
Hello 1013
Hello 1014
Hello 1015
Hello 1016
Hello 1017
Hello 1018
Hello 1019
Hello 1020
Hello 1021
Hello 1022
Hello 1023
Hello 1024
Hello 1025
Hello 1026
Hello 1027
Hello 1028
Hello 1029
Hello 1030
Hello 1031
Hello 1032
Hello 1033
Hello 1034
Hello 1035
Hello 1036
Hello 1037
Hello 1038
Hello 1039
Hello 1040
Hello 1041
Hello 1042
Hello 1043
Hello 1044
Hello 1045
Hello 1046
Hello 1047
Hello 1048
Hello 1049
Hello 1050
Hello 1051
Hello 1052
Hello 1053
Hello 1054
Hello 1055
Hello 1056
Hello 1057
He

 64%|██████▍   | 3560/5564 [00:00<00:00, 7960.48it/s]

Hello 2639
Hello 2640
Hello 2641
Hello 2642
Hello 2643
Hello 2644
Hello 2645
Hello 2646
Hello 2647
Hello 2648
Hello 2649
Hello 2650
Hello 2651
Hello 2652
Hello 2653
Hello 2654
Hello 2655
Hello 2656
Hello 2657
Hello 2658
Hello 2659
Hello 2660
Hello 2661
Hello 2662
Hello 2663
Hello 2664
Hello 2665
Hello 2666
Hello 2667
Hello 2668
Hello 2669
Hello 2670
Hello 2671
Hello 2672
Hello 2673
Hello 2674
Hello 2675
Hello 2676
Hello 2677
Hello 2678
Hello 2679
Hello 2680
Hello 2681
Hello 2682
Hello 2683
Hello 2684
Hello 2685
Hello 2686
Hello 2687
Hello 2688
Hello 2689
Hello 2690
Hello 2691
Hello 2692
Hello 2693
Hello 2694
Hello 2695
Hello 2696
Hello 2697
Hello 2698
Hello 2699
Hello 2700
Hello 2701
Hello 2702
Hello 2703
Hello 2704
Hello 2705
Hello 2706
Hello 2707
Hello 2708
Hello 2709
Hello 2710
Hello 2711
Hello 2712
Hello 2713
Hello 2714
Hello 2715
Hello 2716
Hello 2717
Hello 2718
Hello 2719
Hello 2720
Hello 2721
Hello 2722
Hello 2723
Hello 2724
Hello 2725
Hello 2726
Hello 2727
Hello 2728
Hello 2729

100%|██████████| 5564/5564 [00:00<00:00, 7419.48it/s]

Hello 4349
Hello 4350
Hello 4351
Hello 4352
Hello 4353
Hello 4354
Hello 4355
Hello 4356
Hello 4357
Hello 4358
Hello 4359
Hello 4360
Hello 4361
Hello 4362
Hello 4363
Hello 4364
Hello 4365
Hello 4366
Hello 4367
Hello 4368
Hello 4369
Hello 4370
Hello 4371
Hello 4372
Hello 4373
Hello 4374
Hello 4375
Hello 4376
Hello 4377
Hello 4378
Hello 4379
Hello 4380
Hello 4381
Hello 4382
Hello 4383
Hello 4384
Hello 4385
Hello 4386
Hello 4387
Hello 4388
Hello 4389
Hello 4390
Hello 4391
Hello 4392
Hello 4393
Hello 4394
Hello 4395
Hello 4396
Hello 4397
Hello 4398
Hello 4399
Hello 4400
Hello 4401
Hello 4402
Hello 4403
Hello 4404
Hello 4405
Hello 4406
Hello 4407
Hello 4408
Hello 4409
Hello 4410
Hello 4411
Hello 4412
Hello 4413
Hello 4414
Hello 4415
Hello 4416
Hello 4417
Hello 4418
Hello 4419
Hello 4420
Hello 4421
Hello 4422
Hello 4423
Hello 4424
Hello 4425
Hello 4426
Hello 4427
Hello 4428
Hello 4429
Hello 4430
Hello 4431
Hello 4432
Hello 4433
Hello 4434
Hello 4435
Hello 4436
Hello 4437
Hello 4438
Hello 4439




In [55]:
type(X)

list

In [60]:
X

[array([-0.07702301,  0.27317804,  0.03883897,  0.03220059,  0.07138968,
        -0.3350257 ,  0.08346338,  0.51656216, -0.21499842, -0.17022914,
        -0.11942366, -0.32899052,  0.02961339,  0.11216795,  0.07828018,
        -0.2459569 ,  0.00395849, -0.3149604 ,  0.02363428, -0.5350649 ,
         0.16631378,  0.10691584,  0.07751425, -0.12092187, -0.20374398,
         0.01515141, -0.1804804 , -0.17136179, -0.24245636, -0.00403432,
         0.30121353,  0.03284432,  0.03705793, -0.09991414, -0.12167209,
         0.23532493,  0.07133946, -0.1984918 , -0.1700417 , -0.4673659 ,
         0.00783141, -0.22779624, -0.04556705,  0.00582094,  0.22323366,
        -0.02870124, -0.1691882 , -0.04101219,  0.08981562,  0.17481406,
         0.10179742, -0.18272024, -0.03158615, -0.04556122, -0.10898448,
         0.10959811,  0.17451614, -0.04127948, -0.3226637 ,  0.0942885 ,
         0.09391312,  0.18636692, -0.17040087, -0.0108919 , -0.29013273,
         0.13136682,  0.09495463,  0.14326142, -0.3

In [61]:
X[0].shape

(100,)

In [62]:
words[1]

['ok', 'lar', 'joking', 'wif', 'oni']

In [None]:
## train test split
## apply a model