# NLTK: Introduction

In [1]:
import nltk

In [2]:
# tokenize example
from nltk.tokenize import word_tokenize
input_txt = "I am learning NLP and using NLTK"
word_tokens = word_tokenize(input_txt)
print(input_txt)
print(word_tokens)

I am learning NLP and using NLTK
['I', 'am', 'learning', 'NLP', 'and', 'using', 'NLTK']


# Reading Text

In [3]:
# method 1: using open()
raw_data = open("ds\SMSSpamCollection").read()
raw_data[0:500]

"ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\nham\tOk lar... Joking wif u oni...\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tU dun say so early hor... U c already then say...\nham\tNah I don't think he goes to usf, he lives around here though\nspam\tFreeMsg Hey there darling it's been 3 week's now and no word bac"

In [4]:
parsed_data = raw_data.replace("\t", "\n").split("\n")
parsed_data[0:10]

['ham',
 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'ham',
 'Ok lar... Joking wif u oni...',
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham',
 'U dun say so early hor... U c already then say...',
 'ham',
 "Nah I don't think he goes to usf, he lives around here though"]

In [5]:
label_list = parsed_data[0::2]
msg_list = parsed_data[1::2]
print(label_list[0:5])
print(label_list[0:5])
# print(msg_list[0:5])

['ham', 'ham', 'spam', 'ham', 'ham']
['ham', 'ham', 'spam', 'ham', 'ham']


In [6]:
import pandas as pd
print(len(label_list))
print(len(msg_list))

print(label_list[-3:])

combined_df = pd.DataFrame({
    "label": label_list[:-1],
    "sms": msg_list
})

combined_df.head()

5575
5574
['ham', 'ham', '']


Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# method 2: using pandas.read_csv()
dataset = pd.read_csv("ds\SMSSpamCollection", sep="\t", header=None)
dataset.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Exploring dataset

In [8]:
# Read the dataset
dataset = pd.read_csv("ds\SMSSpamCollection", sep="\t", header=None)
dataset.columns = ["label", "sms"]
dataset.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
print(f"Input data has {len(dataset)} rows, {len(dataset.columns)} columns")

Input data has 5572 rows, 2 columns


In [10]:
# ham/spam
print(f"ham = {len(dataset[dataset['label'] == 'ham'])}")
print(f"ham = {len(dataset[dataset['label'] == 'spam'])}")

ham = 4825
ham = 747


In [11]:
# missing data
print(f"Number of missing label = {dataset['label'].isnull().sum()}")
print(f"Number of missing msg = {dataset['sms'].isnull().sum()}")

Number of missing label = 0
Number of missing msg = 0


# Removing Punctuation

In [12]:
# increase using pd.set_option for longer message
pd.set_option("display.max_colwidth", 100)
data = pd.read_csv("ds\SMSSpamCollection", sep="\t", header=None)
data.columns = ["label", "msg"]
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [13]:
"I am teaching NLP" == "I am teaching NLP."

False

In [14]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
def remove_punctuation(txt):
    txt_nopunct = [c for c in txt if c not in string.punctuation]
    return txt_nopunct

In [16]:
data["msg_clean"] = data["msg"].apply(lambda x: remove_punctuation(x))
data.head()

Unnamed: 0,label,msg,msg_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[G, o, , u, n, t, i, l, , j, u, r, o, n, g, , p, o, i, n, t, , c, r, a, z, y, , A, v, a, i,..."
1,ham,Ok lar... Joking wif u oni...,"[O, k, , l, a, r, , J, o, k, i, n, g, , w, i, f, , u, , o, n, i]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[F, r, e, e, , e, n, t, r, y, , i, n, , 2, , a, , w, k, l, y, , c, o, m, p, , t, o, , w,..."
3,ham,U dun say so early hor... U c already then say...,"[U, , d, u, n, , s, a, y, , s, o, , e, a, r, l, y, , h, o, r, , U, , c, , a, l, r, e, a,..."
4,ham,"Nah I don't think he goes to usf, he lives around here though","[N, a, h, , I, , d, o, n, t, , t, h, i, n, k, , h, e, , g, o, e, s, , t, o, , u, s, f, ,..."


In [17]:
# add "".join()
def remove_punctuation(txt):
    txt_nopunct = "".join([c for c in txt if c not in string.punctuation])
    return txt_nopunct

In [18]:
data["msg_clean"] = data["msg"].apply(lambda x: remove_punctuation(x))
data.head()

Unnamed: 0,label,msg,msg_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though


# Tokenization

In [19]:
import re

def tokenize(txt):
    tokens = re.split("\W+", txt)
    return tokens

data["msg_clean_tokenized"] = data["msg_clean"].apply(lambda x: tokenize(x.lower()))
data.head()

Unnamed: 0,label,msg,msg_clean,msg_clean_tokenized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"


# Remove Stop Words

In [20]:
stopwords = nltk.corpus.stopwords.words("english")
stopwords[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [21]:
def remove_stopwords(txt_tokenized):
    txt_clean = [word for word in txt_tokenized if word not in stopwords]
    return txt_clean

data["msg_no_sw"] = data["msg_clean_tokenized"].apply(lambda x: remove_stopwords(x))
data.head()

Unnamed: 0,label,msg,msg_clean,msg_clean_tokenized,msg_no_sw
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"


# Stemming: Introduction

# Porter Stemmer

In [22]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
dir(ps)

['MARTIN_EXTENSIONS',
 'NLTK_EXTENSIONS',
 'ORIGINAL_ALGORITHM',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_apply_rule_list',
 '_contains_vowel',
 '_ends_cvc',
 '_ends_double_consonant',
 '_has_positive_measure',
 '_is_consonant',
 '_measure',
 '_replace_suffix',
 '_step1a',
 '_step1b',
 '_step1c',
 '_step2',
 '_step3',
 '_step4',
 '_step5a',
 '_step5b',
 'mode',
 'pool',
 'stem',
 'vowels']

In [23]:
print(ps.stem("coder"))
print(ps.stem("coding"))
print(ps.stem("code"))

coder
code
code


In [24]:
print(ps.stem("data"))
print(ps.stem("datum"))

data
datum


In [25]:
print(ps.stem("bowl"))
print(ps.stem("bowling"))
print(ps.stem("bowler"))

bowl
bowl
bowler


In [26]:
import pandas as pd
import re
import string

pd.set_option("display.max_colwidth", 100)
stopwords = nltk.corpus.stopwords.words("english")
data = pd.read_csv("ds\SMSSpamCollection", sep="\t", header=None)

data.columns = ["label", "msg"]
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


# Clean text

In [27]:
def clean_text(text):
    text = "".join([c for c in text if c not in string.punctuation])
    tokens = re.split("\W+", text)
    text = [word for word in tokens if word not in stopwords]
    return text

In [28]:
data["msg_nostop"] = data["msg"].apply(lambda x: clean_text(x.lower()))
data.head()

Unnamed: 0,label,msg,msg_nostop
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"


# Stem the text

In [29]:
def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

In [30]:
data["msg_stemmed"] = data["msg_nostop"].apply(lambda x: stemming(x))
data.head()

Unnamed: 0,label,msg,msg_nostop,msg_stemmed
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"


# Lemmatization

In [31]:
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
dir(wn)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'lemmatize']

In [32]:
# Stemming
# uses just heuristics and is only concerned with the string it is given and it
# chops off suffixes from a word
print(ps.stem("goose"))
print(ps.stem("geese"))

goos
gees


In [33]:
# Lemmatization
# searches the corpus, finds related words, and reduces it down to the root word,
# or lemma
print(wn.lemmatize("goose"))
print(wn.lemmatize("geese"))

goose
goose


In [34]:
print(wn.lemmatize("cactus"))
print(wn.lemmatize("cacti"))

cactus
cactus


In [35]:
print(ps.stem("cactus"))
print(ps.stem("cacti"))

cactu
cacti


In [36]:
# Read raw text
import pandas as pd
import re
import string

pd.set_option("display.max_colwidth", 100)
stopwords = nltk.corpus.stopwords.words("english")
data = pd.read_csv("ds\SMSSpamCollection", sep="\t", header=None)
data.columns = ["label", "msg"]
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [37]:
# Cleaning
def clean_text(txt):
    txt = "".join([c for c in txt if c not in string.punctuation])
    tokens = re.split("\W+", txt)
    txt = [word for word in tokens if word not in stopwords]
    return txt

In [38]:
data["msg_nostop"] = data["msg"].apply(lambda x: clean_text(x))
data.head()

Unnamed: 0,label,msg,msg_nostop
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, early, hor, U, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[Nah, I, dont, think, goes, usf, lives, around, though]"


In [39]:
# Lemmatization
def lemmatization(token_txt):
    text = [wn.lemmatize(word) for word in token_txt]
    return text

In [40]:
data["msg_lemmatized"] = data["msg_nostop"].apply(lambda x: lemmatization(x))
data.head()

Unnamed: 0,label,msg,msg_nostop,msg_lemmatized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]","[Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv...","[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, early, hor, U, c, already, say]","[U, dun, say, early, hor, U, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[Nah, I, dont, think, goes, usf, lives, around, though]","[Nah, I, dont, think, go, usf, life, around, though]"


# Vectorization

# Count Vectorization

In [41]:
# Read raw text
import pandas as pd
import re
import string
import nltk

stopwords = nltk.corpus.stopwords.words("english")
ps = nltk.PorterStemmer()
data = pd.read_csv("ds\SMSSpamCollection", sep="\t", header=None)
data.columns = ["label", "msg"]
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [42]:
def clean_text(txt):
    txt = "".join([c for c in txt if c not in string.punctuation])
    tokens = re.split("\W+", txt)
    txt = [word for word in tokens if word not in stopwords]
    return txt

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

corpus = ["This is a sentence is",
         "This is another sentence",
         "third document is here"]

X = cv.fit(corpus)
print(X.vocabulary_)
# print(cv.get_feature_names()) # verbatim on ppt, doesn't work
print(cv.get_feature_names_out())

X = cv.transform(corpus)
# X = cv.fit_transform(corpus)
print(X.shape)
print(X)
print(X.toarray())

df = pd.DataFrame(X.toarray(), columns=cv.get_feature_names_out())
print(df)

{'this': 6, 'is': 3, 'sentence': 4, 'another': 0, 'third': 5, 'document': 1, 'here': 2}
['another' 'document' 'here' 'is' 'sentence' 'third' 'this']
(3, 7)
  (0, 3)	2
  (0, 4)	1
  (0, 6)	1
  (1, 0)	1
  (1, 3)	1
  (1, 4)	1
  (1, 6)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1
[[0 0 0 2 1 0 1]
 [1 0 0 1 1 0 1]
 [0 1 1 1 0 1 0]]
   another  document  here  is  sentence  third  this
0        0         0     0   2         1      0     1
1        1         0     0   1         1      0     1
2        0         1     1   1         0      1     0


# CountVectorization on SMSSpamCollection

In [44]:
cv1 = CountVectorizer(analyzer=clean_text)

X = cv1.fit_transform(data["msg"])
print(X.shape)

(5572, 11525)


In [45]:
data_sample = data[0:10]
cv2 = CountVectorizer(analyzer=clean_text)

X = cv2.fit_transform(data_sample["msg"])
print(X.shape)

(10, 137)


In [46]:
df = pd.DataFrame(X.toarray(), columns=cv2.get_feature_names_out())
df.head()

Unnamed: 0,08002986030,08452810075over18s,09061701461,11,12,150,2,2005,21st,3,...,u,usf,valued,wat,weeks,wif,win,wkly,word,world
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,1,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


# N-Grams Vectorization

In [47]:
import pandas as pd
import re
import string
import nltk

stopwords = nltk.corpus.stopwords.words("english")
ps = nltk.PorterStemmer()
data = pd.read_csv("ds\SMSSpamCollection", sep="\t", header=None)
data.columns = ["label", "msg"]
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [48]:
def clean_text(txt):
    txt = "".join([c for c in txt if c not in string.punctuation])
    tokens = re.split("\W+", txt)
    txt = " ".join([ps.stem(word) for word in tokens if word not in stopwords])
    return txt

data["msg_clean"] = data["msg"].apply(lambda x: clean_text(x))
data.head()

Unnamed: 0,label,msg,msg_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",go jurong point crazi avail bugi n great world la e buffet cine got amor wat
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives around here though",nah i dont think goe usf live around though


In [1]:
# 1. Explain the result of the data.
# 2. Try to change by bi gram and tri gram (2, 3) observe and explain the results.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2, 2)) # changed from (2, 2) to (2, 3)

corpus = ["This is a sentence is",
         "This is another sentence",
         "third document is here"]

X = cv.fit_transform(corpus)
print("X.shape\n", X.shape)
print("X\n", X)
print("X.toarray()\n", X.toarray())

df = pd.DataFrame(X.toarray(), columns=cv.get_feature_names_out()) # get_feature_names() doesn't work
print("df\n", df)

"""
Explanation:
1. X is the 
X.shape prints the shape of the corpus
"""

X.shape
 (3, 8)
X
   (0, 7)	1
  (0, 4)	1
  (0, 5)	1
  (1, 7)	1
  (1, 2)	1
  (1, 0)	1
  (2, 6)	1
  (2, 1)	1
  (2, 3)	1
X.toarray()
 [[0 0 0 0 1 1 0 1]
 [1 0 1 0 0 0 0 1]
 [0 1 0 1 0 0 1 0]]
df
    another sentence  document is  is another  is here  is sentence  \
0                 0            0           0        0            1   
1                 1            0           1        0            0   
2                 0            1           0        1            0   

   sentence is  third document  this is  
0            1               0        1  
1            0               0        1  
2            0               1        0  


In [50]:
cv1 = CountVectorizer(ngram_range=(2,2))
X = cv1.fit_transform(data["msg_clean"])
print(X.shape)

(5572, 34162)


In [51]:
print(cv1.get_feature_names_out())

['008704050406 sp' '0089mi last' '0121 2025050' ... 'üll submit'
 'üll take' '〨ud even']


In [52]:
data_sample = data[0:10]
cv2 = CountVectorizer(ngram_range=(2,2))

X = cv2.fit_transform(data_sample["msg_clean"])
print(X.shape)

(10, 126)


In [53]:
df = pd.DataFrame(X.toarray(), columns=cv2.get_feature_names_out())
df.head(10)

Unnamed: 0,09061701461 claim,11 month,12 hour,150 rcv,2005 text,21st may,87121 receiv,900 prize,aid patent,alreadi say,...,valu network,vettam set,week word,wif oni,win fa,winner as,wkli comp,word back,world la,xxx std
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
6,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,1,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
