In [3]:
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from nltk import FreqDist
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [4]:
# Load the SMS dataset from a CSV file located in the working directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# suggests the CSV carries a saved index column - consider index_col=0 once it
# is confirmed that nothing downstream relies on that column.
df = pd.read_csv("Spam_SMS.csv")

In [5]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Collect every whitespace-separated word longer than two characters from the
# raw messages (lowercased after the length check), then show the 50 most
# frequent ones.
all_words = [
    word.lower()
    for line in df["Message"]
    for word in line.split()
    if len(word) > 2
]

Counter(all_words).most_common(50)

[('you', 1921),
 ('the', 1328),
 ('and', 968),
 ('for', 703),
 ('your', 677),
 ('have', 571),
 ('call', 559),
 ('are', 486),
 ('that', 470),
 ('but', 422),
 ('not', 410),
 ('can', 385),
 ('with', 379),
 ('will', 379),
 ("i'm", 377),
 ('get', 375),
 ('just', 365),
 ('this', 312),
 ('when', 283),
 ('from', 277),
 ('&lt;#&gt;', 276),
 ('all', 261),
 ('how', 254),
 ('what', 251),
 ('now', 247),
 ('like', 236),
 ('got', 235),
 ('know', 230),
 ('was', 230),
 ('free', 228),
 ('out', 220),
 ('come', 220),
 ('its', 208),
 ('then', 205),
 ('good', 201),
 ('send', 187),
 ('only', 184),
 ('want', 183),
 ('text', 175),
 ('time', 169),
 ("i'll", 168),
 ('love', 163),
 ('...', 163),
 ('going', 161),
 ('need', 157),
 ('about', 156),
 ('still', 151),
 ('one', 150),
 ('txt', 149),
 ('see', 145)]

In [7]:
stop_words = stopwords.words("english")


def text_preprocessing(text):
    """Normalise one raw message into a space-joined string of lemmas.

    Steps: lowercase, replace every non-letter with a space, drop isolated
    single letters, collapse whitespace, tokenize, remove English stop words
    and tokens of two characters or fewer, then lemmatize what is left.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (e.g. ``None`` or the float
        NaN pandas uses for missing cells) is treated as an empty message.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives the filters.
    """
    # Missing values have no text to clean; return an empty document instead
    # of failing on ``.lower()``.
    if not isinstance(text, str):
        return ""

    # Make all words lowercase
    text = text.lower()

    # Remove punctuation, numbers & special characters.
    # The range must be A-Z: the range A-z also covers the ASCII characters
    # between "Z" and "a" ([ \ ] ^ _ `), which would then be kept as letters.
    text = re.sub(r"[^a-zA-Z]", " ", text)

    # Single character removal
    text = re.sub(r"\s+[a-zA-Z]\s+", " ", text)

    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text)

    # Split the sentence into tokens
    tokens = word_tokenize(text)

    # A set gives constant-time membership tests; it is rebuilt per call so
    # that later edits to the notebook-level ``stop_words`` list are honoured.
    stop_set = set(stop_words)

    # Drop stop words and very short tokens, then lemmatize the remainder
    lemma = WordNetLemmatizer()
    final_words = [
        lemma.lemmatize(token)
        for token in tokens
        if token not in stop_set and len(token) > 2
    ]
    return " ".join(final_words)

In [8]:
# Run the cleaning pipeline on every message and store the result in a new
# column; the function is passed directly since it already takes one argument.
df["Clean_Text"] = df["Message"].apply(text_preprocessing)
print("Text Preprocessing is done")

Text Preprocessing is done


In [9]:
# Keep the cleaned messages as `x` and build a small sample corpus by joining
# the first five of them with spaces.
x = df["Clean_Text"]
first_five_messages = x.head(5)
text = " ".join(first_five_messages)

In [10]:
def bigram_counts(txt):
    """Return a FreqDist counting each adjacent token pair found in ``txt``.

    ``txt`` is tokenized with ``nltk.word_tokenize`` and the pairs come from
    ``nltk.bigrams``.
    """
    return FreqDist(nltk.bigrams(nltk.word_tokenize(txt)))


bigrams = bigram_counts(text)
bigrams

FreqDist({('jurong', 'point'): 1, ('point', 'crazy'): 1, ('crazy', 'available'): 1, ('available', 'bugis'): 1, ('bugis', 'great'): 1, ('great', 'world'): 1, ('world', 'buffet'): 1, ('buffet', 'cine'): 1, ('cine', 'got'): 1, ('got', 'amore'): 1, ...})

In [11]:
def unigram_counts(txt):
    """Return a FreqDist counting every token produced by tokenizing ``txt``."""
    word_list = nltk.word_tokenize(txt)
    token_freq = nltk.FreqDist(word_list)
    return token_freq


unigrams = unigram_counts(text)
unigrams

FreqDist({'entry': 2, 'say': 2, 'jurong': 1, 'point': 1, 'crazy': 1, 'available': 1, 'bugis': 1, 'great': 1, 'world': 1, 'buffet': 1, ...})

In [12]:
def bigram_probability(txt):
    """Print and return a bigram-based probability estimate for ``txt``.

    For each distinct bigram ``(w1, w2)`` in ``txt`` the conditional estimate
    ``count(w1, w2) / count(w1)`` is printed, and the product of those
    estimates is printed as the overall probability.

    The counts are computed from ``txt`` itself. Reading the notebook-level
    ``bigrams`` / ``unigrams`` variables instead would silently ignore the
    argument and could divide by zero for a text they were not built from.

    Parameters
    ----------
    txt : str
        Text to score; tokenized with ``nltk.word_tokenize``.

    Returns
    -------
    float
        The product of the per-bigram estimates (``1.0`` when ``txt`` has
        fewer than two tokens).

    Notes
    -----
    Each distinct bigram contributes to the product once, regardless of how
    many times it occurs in ``txt``.
    """
    tokens = nltk.word_tokenize(txt)
    unigram_freq = FreqDist(tokens)
    bigram_freq = FreqDist(nltk.bigrams(tokens))

    total_prop = 1.0
    for bigram, count in bigram_freq.items():
        # The first word of any bigram is a token of txt, so its unigram
        # count is always at least 1 here.
        bigram_prop = count / unigram_freq[bigram[0]]
        print(bigram, ": ", bigram_prop)
        total_prop = total_prop * bigram_prop
    print("probability = ", total_prop)
    return total_prop


# Trailing semicolon: the value is already printed above, so suppress the
# duplicate rich display of the return value.
bigram_probability(text);

('jurong', 'point') :  1.0
('point', 'crazy') :  1.0
('crazy', 'available') :  1.0
('available', 'bugis') :  1.0
('bugis', 'great') :  1.0
('great', 'world') :  1.0
('world', 'buffet') :  1.0
('buffet', 'cine') :  1.0
('cine', 'got') :  1.0
('got', 'amore') :  1.0
('amore', 'wat') :  1.0
('wat', 'lar') :  1.0
('lar', 'joking') :  1.0
('joking', 'wif') :  1.0
('wif', 'oni') :  1.0
('oni', 'free') :  1.0
('free', 'entry') :  1.0
('entry', 'wkly') :  0.5
('wkly', 'comp') :  1.0
('comp', 'win') :  1.0
('win', 'cup') :  1.0
('cup', 'final') :  1.0
('final', 'tkts') :  1.0
('tkts', 'may') :  1.0
('may', 'text') :  1.0
('text', 'receive') :  1.0
('receive', 'entry') :  1.0
('entry', 'question') :  0.5
('question', 'std') :  1.0
('std', 'txt') :  1.0
('txt', 'rate') :  1.0
('rate', 'apply') :  1.0
('apply', 'dun') :  1.0
('dun', 'say') :  1.0
('say', 'early') :  0.5
('early', 'hor') :  1.0
('hor', 'already') :  1.0
('already', 'say') :  1.0
('say', 'nah') :  0.5
('nah', 'think') :  1.0
('think