## Importing the Dataset
#### [data url](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection)

In [None]:
#if nltk not installed
!pip install -U nltk

In [2]:
import nltk

In [None]:
#to download all the functions/packages of nltk
nltk.download()
#to download all the packages
#d all

In [None]:
#list all the various functions in this package
dir(nltk)

## tokenize example

In [5]:
from nltk.tokenize import word_tokenize

In [6]:
input_txt = "I am learning NLP and using NLTK"
word_tokens = word_tokenize(input_txt)
print(input_txt)
print(word_tokens)

I am learning NLP and using NLTK
['I', 'am', 'learning', 'NLP', 'and', 'using', 'NLTK']


# Reading text data
#### [data url](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection)

In [7]:
#method1 -> using open()
#method2 -> pandas read_csv()

### Method 1: reading data using open()

In [9]:
raw_data = open('SMSSpamCollection').read()
raw_data[:500]

"ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\nham\tOk lar... Joking wif u oni...\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tU dun say so early hor... U c already then say...\nham\tNah I don't think he goes to usf, he lives around here though\nspam\tFreeMsg Hey there darling it's been 3 week's now and no word bac"

In [10]:
#replacing tab with new line characters and split it based on new line
parsed_data = raw_data.replace('\t', '\n').split('\n')
#print first 10 lines of parsed_data
parsed_data[:10]

['ham',
 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'ham',
 'Ok lar... Joking wif u oni...',
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham',
 'U dun say so early hor... U c already then say...',
 'ham',
 "Nah I don't think he goes to usf, he lives around here though"]

In [11]:
#separate into 2 lists which contains label and message
#start from 0 index, second empty -> go till the end, third -> advance to 2nd position
label_list = parsed_data[0::2]
msg_list = parsed_data[1::2]
print(label_list[0:5])
print(msg_list[0:5])

['ham', 'ham', 'spam', 'ham', 'ham']
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though"]


In [12]:
import pandas as pd
combined_df = pd.DataFrame({
    'label': label_list[:-1],
    'sms': msg_list
})

print(len(label_list))
print(len(msg_list))
combined_df.head()

5575
5574


Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Method 2: reading data Using pandas read_csv() 

In [13]:
dataset = pd.read_csv('SMSSpamCollection', sep='\t', header=None)
dataset.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
#renaming the columns name
dataset.columns= ['label', 'msg']
dataset.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Exploring the Dataset

In [15]:
#shape of the data
dataset.shape

(5572, 2)

In [16]:
print(f'Input data has {len(dataset)} rows and {len(dataset.columns)} columns')

Input data has 5572 rows and 2 columns


In [17]:
#ham/spam 
print(f'ham = {len(dataset[dataset["label"] == "ham"])}')
print(f'spam = {len(dataset[dataset["label"] == "spam"])}')

ham = 4825
spam = 747


In [18]:
#missing data

In [19]:
print(f'Numbers of missing label = {dataset["label"].isnull().sum()}')
print(f'Numbers of missing msg = {dataset["msg"].isnull().sum()}')

Numbers of missing label = 0
Numbers of missing msg = 0


## NLP PipeLine for Text Data
#### Raw Text -> Tokenization(example above) -> Text Cleaning(removing stop words, stemming,etc) -> Vectorization(text to number)(word2vec, bag of words, TF-IDF -> ML Algorithm -> Spam Filter
#### Pre-processing -> Tokenization and Text Cleaning
#### TF - IDF -> Term Frequency, Inverse Document Frequency

### Text Pre-processing
    1. Remove Punctuation
    2. Tokenization
    3. Remove stop words
    4. Stemming
    5. Lemmatizing

### 1. Remove Punctuation

In [20]:
#increasing the lenght of columns from default 50 to 100
pd.set_option('display.max_colwidth', 100)
dataset.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [21]:
#checking punctuation example
'I am learning NLP' == 'I am learning NLP.'

False

In [22]:
#list all the punctuations
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [23]:
def remove_punctuation(txt):
    #txt_nopunc = [c for c in txt if c not in string.punctuation] ->removes punctuation and the letter are separated from word
    txt_nopunc = "".join([c for c in txt if c not in string.punctuation]) #joins the character back to word
    return txt_nopunc

In [24]:
dataset['msg_clean'] = dataset['msg'].apply(lambda x: remove_punctuation(x))
dataset.head()

Unnamed: 0,label,msg,msg_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though


### 2. Tokenization (using regex)

In [25]:
import re

In [26]:
def tokenize(txt):
    tokens = re.split('\W+', txt) #small w is word characters, capital W is non-words characters, + means one or more
    return tokens
#dataset['msg_clean_tokenized'] = dataset['msg_clean'].apply(lambda x: tokenize(x))
dataset['msg_clean_tokenized'] = dataset['msg_clean'].apply(lambda x: tokenize(x.lower())) #changing all characters to lower -> not necessary but easier to read

dataset.head()

Unnamed: 0,label,msg,msg_clean,msg_clean_tokenized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"


### 3. Removing Stop Words (get rid of commonly used words which do not add much value or meaning in the dataset)

In [27]:
import nltk
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [28]:
#custom function to remove stopwords
def remove_stopwords(txt_tokenized):
    #list comphrension to create new list based on old list available
    txt_clean = [word for word in txt_tokenized if word not in stopwords] #if the word is not in stopword add it in txt_clean, if it is in stopword don't add it in txt_clean
    return txt_clean

dataset['msg_no_sw'] = dataset['msg_clean_tokenized'].apply(lambda x: remove_stopwords(x))
dataset.head()

Unnamed: 0,label,msg,msg_clean,msg_clean_tokenized,msg_no_sw
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"


### 4. Stemming
##### Process of reducing inflected(or derived) words to their root word or word stem
##### Example, code, coder, coders, coding -> code
##### Errors in Steeming -> OverStemming and Understemming
##### Stemming Algorithmss -> Porter(most popular), Snowball, Lancaster, Regex-based Stemmer -> included in nltk 

In [29]:
#porter stemmer

In [30]:
import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()
#list of function in porterStemmer
dir(ps)

['MARTIN_EXTENSIONS',
 'NLTK_EXTENSIONS',
 'ORIGINAL_ALGORITHM',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_abc_impl',
 '_apply_rule_list',
 '_contains_vowel',
 '_ends_cvc',
 '_ends_double_consonant',
 '_has_positive_measure',
 '_is_consonant',
 '_measure',
 '_replace_suffix',
 '_step1a',
 '_step1b',
 '_step1c',
 '_step2',
 '_step3',
 '_step4',
 '_step5a',
 '_step5b',
 'mode',
 'pool',
 'stem',
 'unicode_repr',
 'vowels']

In [31]:
#some examples with stem function

In [32]:
print(ps.stem('coder'))
print(ps.stem('coding'))
print(ps.stem('code'))

coder
code
code


In [33]:
print(ps.stem('data'))
print(ps.stem('datum'))

data
datum


In [34]:
print(ps.stem('bowl'))
print(ps.stem('bowling'))
print(ps.stem('bowler'))

bowl
bowl
bowler


In [35]:
dataset.head()

Unnamed: 0,label,msg,msg_clean,msg_clean_tokenized,msg_no_sw
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"


In [36]:
def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

In [37]:
dataset['msg_stemmed'] = dataset['msg_no_sw'].apply(lambda x: stemming(x))
dataset.head()

Unnamed: 0,label,msg,msg_clean,msg_clean_tokenized,msg_no_sw,msg_stemmed
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"


### 5. Lemmatization 
##### more powerful than stemming as it does vocabulary analysis of words (generally slower(speed) than stemming but works good(accuracy))
##### always reduces a word to a dictionary word (base word)
##### more accurate but computationally expensive

In [38]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
dir(wn)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 'lemmatize',
 'unicode_repr']

In [39]:
dir(pos_tag)

['__annotations__',
 '__call__',
 '__class__',
 '__closure__',
 '__code__',
 '__defaults__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__get__',
 '__getattribute__',
 '__globals__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__kwdefaults__',
 '__le__',
 '__lt__',
 '__module__',
 '__name__',
 '__ne__',
 '__new__',
 '__qualname__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

In [40]:
print(ps.stem('goose'))
print(ps.stem('geese'))

goos
gees


In [41]:
print(wn.lemmatize('goose'))
print(wn.lemmatize('geese'))

goose
goose


In [42]:
#can implement part of speech (pos) tagging for better result
def lemmatization(token_txt):
    text = [wn.lemmatize(word) for word in token_txt]
    return text

In [43]:
dataset['msg_lemmatized'] = dataset['msg_no_sw'].apply(lambda x: lemmatization(x))
dataset.head()

Unnamed: 0,label,msg,msg_clean,msg_clean_tokenized,msg_no_sw,msg_stemmed,msg_lemmatized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]"


## Vectorization (Count Vectorization, N-grams, TF-IDF (Term Frequency, Inverse Document Frequency)
#### Process of encoding text as integers to create Feature Vectors
#### Feature Vector -> vector of numerical features that represent an object -> Count Vectorization
#### The matrix is called Document Matrix or Document term matrix

## 1. Count Vectorization

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(analyzer=lemmatization)
X = cv.fit_transform(dataset['msg_lemmatized'])
print(X.shape)

(5572, 8915)


In [None]:
#print(cv.get_feature_names())

In [46]:
data_sample = dataset[0:10]
cv1 = CountVectorizer(analyzer=lemmatization)
X = cv1.fit_transform(data_sample['msg_lemmatized'])
print(X.shape)

(10, 122)


In [47]:
df = pd.DataFrame(X.toarray(), columns= cv1.get_feature_names())
df.head(10)

Unnamed: 0,08002986030,08452810075over18s,09061701461,11,12,150,2,2005,21st,3,...,vettam,wat,week,wif,win,winner,wkly,word,world,xxx
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,1,1,1,0,...,0,0,0,0,1,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,1,...,0,0,1,0,0,0,0,1,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Feature Engineering -> generally done before Cleaning
##### creating new features of transforming existing features using domain knowlede of data, makes ML algos work better
##### Creating features 
    #length of documents
    #average word size within a document
    #use of punctuation in the text
    #capitalization of words in a document
    #stop words usage
##### Transformations (Example: Box-Cox Transformations, Tukey Transformations)
    #power transformation(x2, root(x), x3, etc)
    #standardizing data -> using log transfer    
    #normalization:bringing different features to similar scale