### Text preprocessing - 
- Lowercase letters.
- Removing HTML tags.
- Removing URLs.
- Removing punctuation.
- Chat Words Treatment.
- Spelling Correction.
- Removing stop words
- Handling Emojies
- Tokenization
- Stemming
- Lemmatization

In [1]:
!pip uninstall -y nltk

Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1


In [2]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1



[notice] A new release of pip is available: 23.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [109]:
# import sys
# sys.modules.pop('nltk', None)




In [110]:
import nltk

In [111]:
import pandas as pd

messages = pd.read_csv('SMSSpamCollection', sep='\t',
                           names=["label", "message"])

print(messages)

     label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [112]:
import re 
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [113]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
stop_words = set(stopwords.words("english")) 

In [114]:
corpus = []
for i in range(len(messages)):
    sent = re.sub('[^a-zA-Z0-9]', ' ' ,messages['message'][i])
    sent = sent.lower()
    sent = sent.split()
    ## Stemming
    tokens = [ps.stem(word) for word in sent if not word in stop_words]
    sent = " ".join(tokens)
    corpus.append(sent)

corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

In [115]:
## creating BOW model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()

In [116]:

X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [117]:
X[1]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [118]:
y = pd.get_dummies(messages['label'])
y = y.iloc[:,1].values
y = y.astype(int)
y
len(y)

5572

In [119]:
## Train test split - 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=42)

In [120]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [121]:
y_train

array([1, 0, 0, ..., 0, 0, 0])

In [122]:
from sklearn.naive_bayes import MultinomialNB
spam_model = MultinomialNB().fit(X_train, y_train)

In [123]:
## predict
y_pred = spam_model.predict(X_test)

In [124]:
from sklearn.metrics import accuracy_score, classification_report

In [125]:
score = accuracy_score(y_test, y_pred)
print(score)

0.9739910313901345


In [126]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99       995
           1       0.81      1.00      0.89       120

    accuracy                           0.97      1115
   macro avg       0.90      0.99      0.94      1115
weighted avg       0.98      0.97      0.98      1115



In [127]:
## TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500, ngram_range=(2,2))
X = tv.fit_transform(corpus).toarray()

In [135]:
print(len(X), len(y))

5572 5572


In [136]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=42)

In [137]:
from sklearn.naive_bayes import MultinomialNB
spam_model = MultinomialNB().fit(X_train, y_train)

In [138]:
# prediction
pred = spam_model.predict(X_test)

In [139]:
from sklearn.metrics import accuracy_score, classification_report

In [140]:
score = accuracy_score(y_test, y_pred)
print(score)

0.9739910313901345


In [141]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99       995
           1       0.81      1.00      0.89       120

    accuracy                           0.97      1115
   macro avg       0.90      0.99      0.94      1115
weighted avg       0.98      0.97      0.98      1115



In [142]:
## Word 2 Vec implementation - 

In [143]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
messages = pd.read_csv('SMSSpamCollection', sep='\t',
                           names=["label", "message"])

In [144]:
corpus = []
for i in range(len(messages)):
    review = re.sub('[^a-zA-Z0-9]', ' ', messages['message'][i])
    review = review.lower().split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in stop_words]
    review = " ".join(review)

    corpus.append(review)

In [145]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate c apply 08452810075over18',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling 3 week word back like fun still tb ok xxx std chgs send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune',
 'winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobile 11 month u r entitled update latest colour mobile camera free call mobile update co free 08002986030',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6days 16 tsandcs apply reply

In [146]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [154]:
len(corpus)

5572

In [None]:
words = []
for sent in corpus:
    sent_token = sent_tokenize(sent)
#     for token in sent_token:
    words.append(simple_preprocess(sent))

In [None]:
# len(words)

5565

In [150]:
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  're

In [151]:
len(words)

5565

In [88]:
import gensim

In [89]:
model = gensim.models.Word2Vec(words, window=5, min_count=2)

In [90]:
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'text',
 'love',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'take',
 'mobile',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'dear',
 'make',
 'night',
 'message',
 'say',
 'well',
 'min',
 'thing',
 'much',
 'claim',
 'great',
 'hope',
 'oh',
 'hey',
 'give',
 'number',
 'happy',
 'work',
 'friend',
 'wat',
 'yes',
 'way',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'ask',
 'said',
 'win',
 'life',
 'amp',
 'cash',
 'yeah',
 'im',
 'really',
 'tone',
 'babe',
 'meet',
 'find',
 'miss',
 'morning',
 'service',
 'last',
 'uk',
 'thanks',
 'care',
 'would',
 'com',
 'anything',
 'year',
 'lol',
 'also',
 'nokia',
 'every',
 'feel',
 'keep',
 'sure',
 'pick',
 'sent',
 'urgent',
 'contact',


In [91]:
model.corpus_count

5565

In [92]:
model.epochs

5

In [93]:
model.wv.similar_by_word('prize')

[('claim', 0.9994779229164124),
 ('line', 0.9991785883903503),
 ('call', 0.9991661906242371),
 ('cash', 0.9990310072898865),
 ('free', 0.998967707157135),
 ('guaranteed', 0.9989489316940308),
 ('show', 0.9989310503005981),
 ('mobile', 0.9988924264907837),
 ('number', 0.9988689422607422),
 ('please', 0.9988331198692322)]

In [94]:
model.wv['kid'].shape

(100,)

In [95]:
import numpy as np
## Avg word to vec
def avg_word2vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis = 0)


In [101]:
# words

In [96]:
from tqdm import tqdm
## Apply for entire sentences
X = []
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5565/5565 [00:01<00:00, 4424.71it/s]


In [99]:
X

[array([-0.13653092,  0.2961965 ,  0.08335153, -0.00340718,  0.05822485,
        -0.37937787,  0.03833861,  0.47236434, -0.17791195, -0.08511233,
        -0.1933409 , -0.38249397, -0.07862533,  0.10763077,  0.04167753,
        -0.24112695,  0.03236302, -0.35371843,  0.03192625, -0.483258  ,
         0.09876203,  0.04697311,  0.00405644, -0.16306843, -0.10824565,
        -0.00093871, -0.14894138, -0.15775944, -0.24695288,  0.04099234,
         0.3116148 ,  0.03740793,  0.07819542, -0.13411318, -0.085664  ,
         0.2678902 ,  0.05045183, -0.21634984, -0.19614537, -0.49187097,
        -0.08469525, -0.22900762, -0.08172423,  0.04548724,  0.14445342,
        -0.10884432, -0.15848128, -0.08442006,  0.14906035,  0.16150497,
         0.14354569, -0.23382467, -0.03196058, -0.00628451, -0.21268739,
         0.0769529 ,  0.13664024, -0.06063765, -0.36252376,  0.01759229,
         0.06874911,  0.0834626 , -0.1285434 , -0.0154714 , -0.30028585,
         0.22769701,  0.06622037,  0.16003694, -0.2

In [None]:
# X_new = np.array(X)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5565,) + inhomogeneous part.

In [None]:
# X[0].shape

(100,)

In [108]:
len(X)

5565

In [105]:
## Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [5565, 5572]