In [None]:
import pandas as pd

# The file is tab-separated; the column names are supplied explicitly, so
# pandas treats the first line of the file as data rather than as a header.
dataset = pd.read_csv(
    '/content/SMSSpamCollection.txt',
    sep='\t',
    names=['label', 'message'],
)

In [None]:
# Preview the loaded frame; pandas shortens the display of long frames.
dataset

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
!pip install nltk



In [None]:
## Data Cleaning & Preprocessing

import re
import nltk
# Fetch the NLTK stop-word lists (the download log is printed below the cell).
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
stemmer = PorterStemmer()

# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension rebuilt the list for every token and searched it linearly;
# a set built up front gives the same filtering with constant-time lookups.
english_stopwords = set(stopwords.words('english'))

# One cleaned, stemmed, space-joined string per message.
corpus = []
for message in dataset['message']:
  # Replace everything except ASCII letters and digits with a space, then
  # lowercase and split on whitespace.
  tokens = re.sub('[^A-Za-z0-9]', ' ', message).lower().split()
  stems = [stemmer.stem(word) for word in tokens if word not in english_stopwords]
  corpus.append(' '.join(stems))

In [None]:
# Spot-check the first five cleaned documents.
corpus[:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

In [None]:
!pip install scikit-learn



# Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary (presence/absence) bag-of-words over unigrams and bigrams, with the
# vocabulary capped at 2800 terms.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so test-set vocabulary leaks into the features — confirm
# whether that is acceptable for this experiment.
cv = CountVectorizer(
    ngram_range=(1, 2),
    max_features=2800,
    binary=True,
)
x = cv.fit_transform(corpus)

In [None]:
# Densify only the first five rows for inspection.
x[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integer class ids. The recorded preview of `y`
# shows 'ham' encoded as 0 and 'spam' as 1.
encoder = LabelEncoder()
encoder.fit(dataset['label'])
y = encoder.transform(dataset['label'])

In [None]:
# First five encoded labels.
y[:5]

array([0, 0, 1, 0, 0])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. The seed is written as the integer
# 0 instead of the bool `False`: a bool only works here because it is an int
# subclass (presumably yielding the same seed-0 split), and it reads like a
# request to disable seeding.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated runs build the same model; without a seed the
# random tie-breaking between candidate splits can change the fitted tree
# (and the reported metrics) from run to run.
model = DecisionTreeClassifier(random_state=0)
model.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out rows with the decision tree.
y_pred = model.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but classification_report is not: with the arguments swapped,
# precision and recall trade places and "support" counts predicted labels
# instead of true labels.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.9755922469490309
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1224
           1       0.86      0.95      0.90       169

    accuracy                           0.98      1393
   macro avg       0.93      0.96      0.94      1393
weighted avg       0.98      0.98      0.98      1393



# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over unigrams, bigrams and trigrams, with no vocabulary cap.
# This rebinds `x`, replacing the bag-of-words matrix.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
)
x = tfidf.fit_transform(corpus)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the TF-IDF rows for evaluation. The seed is written as the
# integer 0 instead of the bool `False`: a bool only works here because it is
# an int subclass (presumably yielding the same seed-0 split), and it reads
# like a request to disable seeding.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the training split, then
# predict labels for the held-out rows.
nb_model = MultinomialNB()
nb_model.fit(x_train, y_train)
y_pred = nb_model.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but classification_report is not: with the arguments swapped,
# precision and recall trade places and "support" counts predicted labels
# instead of true labels.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.9454414931801867
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      1284
           1       0.59      1.00      0.74       109

    accuracy                           0.95      1393
   macro avg       0.79      0.97      0.86      1393
weighted avg       0.97      0.95      0.95      1393



# Word2Vec

In [None]:
!pip install gensim



In [None]:
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet data (the download log is printed below the cell).
nltk.download('wordnet')

# Single lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
## Method 1
# Clean and lemmatize every message, keeping each document as a list of
# tokens rather than a joined string.
# NOTE: this rebinds `corpus`, replacing the stemmed strings built earlier.

# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension rebuilt the list for every token and searched it linearly;
# a set built up front gives the same filtering with constant-time lookups.
english_stopwords = set(stopwords.words('english'))

corpus = []
for message in dataset['message']:
  # Replace everything except ASCII letters and digits with a space, then
  # lowercase and split on whitespace.
  tokens = re.sub('[^A-Za-z0-9]',' ', message).lower().split()
  corpus.append([lemmatizer.lemmatize(word) for word in tokens if word not in english_stopwords])


In [None]:
# Token list of the first message after lemmatization.
corpus[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [None]:
# NOTE(review): sent_tokenize and simple_preprocess are not used in the cells
# visible here — confirm they are still needed.
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
# Fetch the 'punkt' tokenizer data; the call's return value is what the cell displays.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import gensim
from gensim.models import Word2Vec

# Train word embeddings on the tokenized messages: a context window of 7
# tokens, and min_count=2 so that words seen only once are left out of the
# vocabulary. Every other hyper-parameter is left at gensim's default.
model_w2v = Word2Vec(
    sentences=corpus,
    window=7,
    min_count=2,
)


In [None]:
# Show only the first 20 vocabulary entries; displaying the whole list floods
# the notebook with output.
model_w2v.wv.index_to_key[:20]

['u',
 'call',
 '2',
 'get',
 'ur',
 '4',
 'gt',
 'lt',
 'go',
 'ok',
 'free',
 'day',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'love',
 'text',
 'want',
 'send',
 'need',
 'one',
 'today',
 'r',
 'txt',
 '1',
 'going',
 'home',
 'c',
 'stop',
 'lor',
 'sorry',
 'see',
 'still',
 'n',
 'take',
 'mobile',
 'back',
 'da',
 'reply',
 'dont',
 'k',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'new',
 'please',
 'later',
 'pls',
 'co',
 'msg',
 'make',
 'night',
 'dear',
 'message',
 'well',
 'say',
 'much',
 'thing',
 'oh',
 'claim',
 'hope',
 'great',
 'min',
 'hey',
 'number',
 'give',
 '3',
 'happy',
 'friend',
 'wat',
 'work',
 'yes',
 'way',
 'www',
 'let',
 'e',
 'prize',
 'right',
 'tomorrow',
 'already',
 'ask',
 'said',
 'amp',
 'b',
 'cash',
 'life',
 'yeah',
 'tone',
 'really',
 'im',
 'babe',
 'meet',
 'find',
 'miss',
 'win',
 'morning',
 'uk',
 'last',
 'service',
 'thanks',
 'care',
 'anything',
 'would',
 'year',
 'lol',
 'also',
 'feel',
 'every',
 'keep',
 '

In [None]:
# Number of documents (token lists) the model was given.
model_w2v.corpus_count

5572

In [None]:
# Number of training epochs configured on the model; none was passed when the
# model was built, so this is gensim's default.
model_w2v.epochs

5

In [None]:
# Nearest neighbours of 'happy' in the embedding space, as (word, similarity) pairs.
# NOTE(review): every score in the recorded output is ~0.999, which suggests
# the vectors are barely differentiated — presumably the model is undertrained
# on this small corpus; confirm before relying on these neighbours.
model_w2v.wv.similar_by_word('happy')

[('day', 0.9995433688163757),
 ('dear', 0.9994513392448425),
 ('many', 0.9993752837181091),
 ('hope', 0.999374270439148),
 ('love', 0.9993639588356018),
 ('wish', 0.9993499517440796),
 ('year', 0.9992948174476624),
 ('b', 0.9992784261703491),
 ('said', 0.9992659687995911),
 ('really', 0.9992550611495972)]