In [179]:
import pandas as pd 
import nltk 
import re 
from nltk import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
## Word2Vec 
import gensim 
from gensim.models import Word2Vec,KeyedVectors
import gensim.downloader as api
from gensim.utils import simple_preprocess
## Numpy 
import numpy as np 
## tqdm 
from tqdm import tqdm
## Train Test Split 
from sklearn.model_selection import train_test_split 
## Model
from sklearn.ensemble import RandomForestClassifier
## Metrix 
from sklearn.metrics import accuracy_score,classification_report


In [145]:
message = pd.read_csv("spam.csv", encoding='latin1')
message.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [146]:
df = message.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1)

In [147]:
df.rename({
    "v1":"label",
    "v2":"text"} ,axis=1,inplace=True)

In [148]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [149]:
lim=WordNetLemmatizer()
corpus=[] 
for i in range(0,len(df)):
    review=re.sub('[^a-zA-Z]',' ',df['text'][i]) 
    review=review.lower()
    review=review.split()
    sentance=[lim.lemmatize(word) for word in review ]#if not word in stopwords.words("english")]
    review=" ".join(sentance)
    corpus.append(review)

In [150]:
[[i,j,k]for i,j,k in zip(list(map(len,corpus)),corpus,df['text'] )if i <1]

[[0, '', '645'], [0, '', ':) '], [0, '', ':-) :-)']]

In [151]:
## Simple_process (preprocessing)
words=[]
for sent in corpus:
  sent_token=sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))

In [152]:
words

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'in',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'to',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'go',
  'to',
  'usf',
  'he',
  'life',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'to',
  'send',
  'to',
  'rcv'],
 ['even',
  'my',
  'brother',
  'is',
  'not',
  'like',
  'to',
  'spea

In [153]:
## Lets Train word2vec from scratch 
gensim_model=Word2Vec(words)

In [154]:
gensim_model.wv.most_similar("free")

[('txt', 0.9976161122322083),
 ('mobile', 0.996535062789917),
 ('call', 0.9953421950340271),
 ('text', 0.9947730898857117),
 ('claim', 0.9945932626724243),
 ('stop', 0.9944311380386353),
 ('reply', 0.9941173791885376),
 ('camcorder', 0.9927505850791931),
 ('prize', 0.9925094246864319),
 ('tone', 0.9924424886703491)]

In [155]:
## Get All Vocab 
gansim_vocab=gensim_model.wv.key_to_index

In [156]:
gansim_vocab

{'you': 0,
 'to': 1,
 'the': 2,
 'and': 3,
 'it': 4,
 'in': 5,
 'is': 6,
 'me': 7,
 'my': 8,
 'for': 9,
 'your': 10,
 'call': 11,
 'of': 12,
 'that': 13,
 'have': 14,
 'on': 15,
 'now': 16,
 'are': 17,
 'can': 18,
 'so': 19,
 'but': 20,
 'not': 21,
 'or': 22,
 'we': 23,
 'do': 24,
 'get': 25,
 'at': 26,
 'be': 27,
 'if': 28,
 'will': 29,
 'ur': 30,
 'with': 31,
 'no': 32,
 'just': 33,
 'this': 34,
 'gt': 35,
 'lt': 36,
 'go': 37,
 'how': 38,
 'up': 39,
 'when': 40,
 'ok': 41,
 'day': 42,
 'what': 43,
 'free': 44,
 'from': 45,
 'all': 46,
 'out': 47,
 'know': 48,
 'll': 49,
 'come': 50,
 'like': 51,
 'time': 52,
 'good': 53,
 'then': 54,
 'am': 55,
 'got': 56,
 'wa': 57,
 'there': 58,
 'he': 59,
 'text': 60,
 'only': 61,
 'love': 62,
 'want': 63,
 'send': 64,
 'txt': 65,
 'need': 66,
 'one': 67,
 'today': 68,
 'going': 69,
 'by': 70,
 'home': 71,
 'don': 72,
 'about': 73,
 'stop': 74,
 'she': 75,
 'lor': 76,
 'sorry': 77,
 'see': 78,
 'still': 79,
 'mobile': 80,
 'take': 81,
 'back': 82

In [157]:
##vocab Size 
gensim_model.corpus_count

5569

In [158]:
gensim_model.epochs

5

In [159]:
gensim_model.wv.similar_by_word("good")

[('great', 0.9986478686332703),
 ('where', 0.9986290335655212),
 ('happy', 0.9986037611961365),
 ('day', 0.9985918402671814),
 ('morning', 0.9983978867530823),
 ('well', 0.9983742237091064),
 ('my', 0.9983150959014893),
 ('should', 0.9982749819755554),
 ('about', 0.9981889128684998),
 ('why', 0.9981850385665894)]

In [160]:
gensim_model.wv.similar_by_word("bad")

[('getting', 0.9992855191230774),
 ('ya', 0.9992836713790894),
 ('smile', 0.9992781281471252),
 ('lol', 0.9992572069168091),
 ('over', 0.9992571473121643),
 ('went', 0.9992548823356628),
 ('cant', 0.9992358088493347),
 ('keep', 0.9992266893386841),
 ('other', 0.9992257952690125),
 ('them', 0.9992215633392334)]

In [161]:
gensim_model.wv['good'].shape

(100,)

In [162]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [163]:
def avg_word2vec(sentence, model):
    word_vecs = []
    for word in sentence:
        if word in model.wv:
            word_vecs.append(model.wv[word])
    if len(word_vecs) == 0:
        return np.zeros(model.vector_size)
    avg_vec = np.mean(word_vecs, axis=0)
    return avg_vec

In [164]:
X=[] 
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i],gensim_model))

100%|██████████| 5569/5569 [00:00<00:00, 29073.89it/s]


In [165]:
X[0]

array([-1.77090839e-01,  1.90020978e-01,  2.43463516e-01,  1.81241915e-01,
        1.17535710e-01, -4.42511171e-01,  1.90050453e-01,  6.30419374e-01,
       -3.11757714e-01, -2.16452554e-01, -1.41514212e-01, -4.58324075e-01,
       -7.62474239e-02,  1.36093751e-01,  1.32978156e-01, -1.66797355e-01,
        4.19571027e-02, -2.81315356e-01, -3.56526201e-04, -5.05069733e-01,
        1.66421309e-01,  2.58190215e-01,  1.85558110e-01, -2.20789149e-01,
       -4.26825956e-02,  8.07523206e-02, -2.03518122e-01, -2.00875238e-01,
       -3.08222175e-01,  9.82440412e-02,  2.45993406e-01, -8.98177102e-02,
        1.21047594e-01, -1.16943903e-01, -1.63548037e-01,  2.73095489e-01,
        8.87268409e-02, -2.29871273e-01, -3.14822868e-02, -4.30222452e-01,
        3.54695953e-02, -1.24750063e-01, -1.67820543e-01, -1.54468687e-02,
        2.34656766e-01, -9.03220400e-02, -1.81775481e-01, -8.68857205e-02,
        6.97857961e-02,  1.25593156e-01,  2.55087037e-02, -1.65538788e-01,
       -1.35667294e-01,  

In [166]:
len(X)

5569

In [167]:
##Independent Features 
X_new=np.array(X)
X_new.shape


(5569, 100)

In [168]:
X_new[0].shape

(100,)

In [169]:
filtered_df = df[list(map(lambda x: len(x) > 0, corpus))]
y=pd.get_dummies(filtered_df['label']).astype(int) 
y=y.iloc[:,1].values

In [170]:
y.shape

(5569,)

In [174]:
X[0].reshape(1,-1).shape

(1, 100)

In [172]:
## Change the X to Row Based 
df = pd.DataFrame()

# Create a list to hold the dataframes before concatenation
df_list = []

for i in range(len(X)):
    df_list.append(pd.DataFrame(X[i].reshape(1, -1)))

# Concatenate all the dataframes at once
df = pd.concat(df_list, ignore_index=True)

In [173]:
df.shape

(5569, 100)

In [175]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.177091,0.190021,0.243464,0.181242,0.117536,-0.442511,0.19005,0.630419,-0.311758,-0.216453,...,0.318288,0.221121,5.2e-05,0.091292,0.528296,0.302924,0.079768,-0.213813,0.099094,-0.081937
1,-0.15659,0.170345,0.207031,0.149557,0.100732,-0.378608,0.161512,0.536044,-0.261785,-0.1846,...,0.277363,0.176543,-0.003841,0.083248,0.441862,0.252923,0.05906,-0.194173,0.083384,-0.068606
2,-0.185809,0.198499,0.295697,0.221436,0.133674,-0.481435,0.161603,0.689289,-0.324825,-0.243247,...,0.337462,0.242405,-0.006171,0.10566,0.529739,0.330596,0.082236,-0.218212,0.065901,-0.09228
3,-0.253982,0.258094,0.310867,0.237481,0.155035,-0.599704,0.259732,0.846034,-0.419186,-0.290883,...,0.427328,0.277694,-0.004057,0.135663,0.715601,0.402673,0.099681,-0.305062,0.142214,-0.093619
4,-0.216835,0.211784,0.273108,0.198914,0.144103,-0.507051,0.221388,0.713926,-0.359952,-0.247977,...,0.364427,0.238979,0.000126,0.115686,0.601779,0.344123,0.082742,-0.26246,0.120333,-0.084377


In [176]:
## Independent Feature 
X=df 


In [177]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [178]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
4257,-0.222647,0.212236,0.287569,0.20308,0.150512,-0.53871,0.22795,0.739166,-0.373242,-0.266767,...,0.38493,0.259261,-0.012682,0.115585,0.617058,0.359098,0.07834,-0.285184,0.141676,-0.080375
2931,-0.195362,0.239945,0.311338,0.231097,0.116359,-0.551483,0.25451,0.792099,-0.404552,-0.277304,...,0.383973,0.283169,0.011719,0.119893,0.69178,0.386315,0.119036,-0.225868,0.123593,-0.123348
5509,-0.199001,0.186593,0.246286,0.188234,0.130322,-0.474884,0.209309,0.659227,-0.334946,-0.241876,...,0.338289,0.217796,-0.01115,0.108298,0.546781,0.321808,0.055154,-0.265728,0.130443,-0.065061
3969,-0.210677,0.215281,0.250538,0.190036,0.135342,-0.506392,0.21692,0.693497,-0.356092,-0.242745,...,0.360287,0.236148,-0.007813,0.105859,0.59043,0.340031,0.069853,-0.254919,0.132891,-0.077478
1898,-0.211142,0.22832,0.294912,0.216426,0.16142,-0.546517,0.225692,0.769891,-0.374909,-0.262423,...,0.388093,0.26337,-0.009042,0.108871,0.623695,0.361043,0.09714,-0.266984,0.123335,-0.09615


In [180]:
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)

In [181]:
classifier.fit(X_train,y_train)

In [182]:
predict=classifier.predict(X_test)

In [183]:
accuracy_score(y_test,predict)

0.9703770197486535

In [185]:
print(classification_report(y_test,predict))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       978
           1       0.89      0.86      0.88       136

    accuracy                           0.97      1114
   macro avg       0.94      0.92      0.93      1114
weighted avg       0.97      0.97      0.97      1114

