# IMPORT LIBRARY

In [None]:
import pandas as pd
import re
import nltk
# Download the WordNet and stopword corpora that the lemmatizer, the stopword
# list and the synonym lookup below read from.
nltk.download('wordnet')
nltk.download('stopwords')
# Stemming: one shared Porter stemmer instance for the whole notebook
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
# Lemmatisation: one shared WordNet lemmatizer instance for the whole notebook
from nltk.stem.wordnet import WordNetLemmatizer
wordnet_lemma = WordNetLemmatizer()
# Stopwords: NLTK's English stopword list
stopwords = nltk.corpus.stopwords.words('english')
# Text Enrichment / Augmentation: WordNet corpus reader used for synonym lookup
from nltk.corpus import wordnet


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Text Preprocessing
Text preprocessing is the process of cleaning and standardising raw text before it is analysed. The steps covered here are:
- Lowercasing
- Stemming
- Lemmatisation
- Stopword Removal
- Normalization
- Noise Removal
- Text Enrichment / Augmentation

## Lowercasing
Convert all uppercase letters to lowercase.

In [None]:
# Sample sentence; its lowercased form is reused by the stemming and
# lemmatisation cells further down.
text = (
    "Thailand know formerly as Siam and officially as the Kingdom of Thailand, "
    "is a country in Southeast Asia"
)
# Map every cased character to its lowercase form.
lower_case = str.lower(text)
lower_case

'thailand know formerly as siam and officially as the kingdom of thailand, is a country in southeast asia'

## Stemming
Stemming is the process of crudely chopping off the end of a word using heuristics. It works quite well for most, but not all, English words.

such as connected, connecting, connections ==> connect

In [None]:
# The stemmer is applied to one word at a time, so split the lowercased
# sentence on single spaces and stem each token individually.
# Punctuation is not stripped here, so a token such as "thailand," keeps
# its trailing comma.
list_word = lower_case.split(" ")
stemming_word = [porter_stemmer.stem(word) for word in list_word]
stemming_word

['thailand',
 'know',
 'formerli',
 'as',
 'siam',
 'and',
 'offici',
 'as',
 'the',
 'kingdom',
 'of',
 'thailand,',
 'is',
 'a',
 'countri',
 'in',
 'southeast',
 'asia']

## Lemmatisation
Lemmatisation removes the inflection of a word (gender, tense, voice, mood, number) and returns its dictionary form (lemma).

such as am, are, is ==> be

better ==> good

In [None]:
# Lemmatise every token of the sentence. lemmatize() is called without a
# part-of-speech argument, so its default is used for each token.
lemmatisation_word = [wordnet_lemma.lemmatize(word) for word in list_word]
lemmatisation_word

['thailand',
 'know',
 'formerly',
 'a',
 'siam',
 'and',
 'officially',
 'a',
 'the',
 'kingdom',
 'of',
 'thailand,',
 'is',
 'a',
 'country',
 'in',
 'southeast',
 'asia']

## Stemming vs Lemmatisation

In [None]:
def display_lemma_porter(text):
    """Print a three-column table comparing lemmatisation and stemming.

    Args:
        text: iterable of words. Each word is printed next to its WordNet
            lemma and its Porter stem, one row per word.
    """
    header = "\t".join(f"{title:<12}" for title in ("word", "lemma", "stem"))
    print(header)
    print("-" * 50)
    for word in text:
        # Lemma first, then stem: same column order as the header.
        row = (word, wordnet_lemma.lemmatize(word), porter_stemmer.stem(word))
        print("\t".join(f"{cell:12}" for cell in row))

In [None]:
# Case where both techniques do reasonably well: the lemmatizer maps
# "feet"/"foots" to "foot", and the stemmer maps "foots"/"footing" to "foot".
word_list = ["foot","feet","foots","footing"]
display_lemma_porter(word_list)

word        	lemma       	stem        
--------------------------------------------------
foot        	foot        	foot        
feet        	foot        	feet        
foots       	foot        	foot        
footing     	footing     	foot        


In [None]:
# Stemming is not OK here: it produces the non-word "fli" for "fly"/"flying"
# and leaves the irregular forms "flew"/"flown" unchanged.
# NOTE(review): "files" may be a typo for "flies" — confirm.
word_list = ["fly","files","flying","flew","flown"]
display_lemma_porter(word_list)

word        	lemma       	stem        
--------------------------------------------------
fly         	fly         	fli         
files       	file        	file        
flying      	flying      	fli         
flew        	flew        	flew        
flown       	flown       	flown       


In [None]:
# Stemming is not OK here: "universe" and "university" are different words
# but both collapse to the same stem "univers".
# NOTE(review): "univalsal" looks like a typo for "universal" — confirm.
word_list = ["universe","university","univalsal"]
display_lemma_porter(word_list)

word        	lemma       	stem        
--------------------------------------------------
universe    	universe    	univers     
university  	university  	univers     
univalsal   	univalsal   	univals     


In [None]:
# Stemming is not OK here: on a full sentence it yields truncated non-words
# such as "oper", "describ" and "varieti".
word_list = "The formatting operations described here exhibit a variety of quirks that lead to a number of common errors".split(" ")
display_lemma_porter(word_list)

word        	lemma       	stem        
--------------------------------------------------
The         	The         	the         
formatting  	formatting  	format      
operations  	operation   	oper        
described   	described   	describ     
here        	here        	here        
exhibit     	exhibit     	exhibit     
a           	a           	a           
variety     	variety     	varieti     
of          	of          	of          
quirks      	quirk       	quirk       
that        	that        	that        
lead        	lead        	lead        
to          	to          	to          
a           	a           	a           
number      	number      	number      
of          	of          	of          
common      	common      	common      
errors      	error       	error       


## Stopword Removal
Stop Words are common words that we often encounter in sentences or documents but rarely help convey meaning.

such as a, an, the

In [None]:
def remove_stopwords(text, stop_words=None):
    """Return the tokens of *text* that are not stopwords.

    The comparison is case-insensitive, so capitalised tokens such as
    "The" are removed as well. Tokens that are kept retain their
    original casing and order.

    Args:
        text: iterable of string tokens.
        stop_words: optional iterable of stopwords. When omitted, the
            notebook-level ``stopwords`` list is used.

    Returns:
        A new list with the stopwords filtered out.
    """
    if stop_words is None:
        stop_words = stopwords
    # Build the set once per call so each membership test is O(1).
    stop_set = {word.lower() for word in stop_words}
    return [token for token in text if token.lower() not in stop_set]
# Show the current token list before and after stopword filtering.
print("Original text:", word_list)
print("Remove_stopword:", remove_stopwords(word_list))

Original text: ['The', 'formatting', 'operations', 'described', 'here', 'exhibit', 'a', 'variety', 'of', 'quirks', 'that', 'lead', 'to', 'a', 'number', 'of', 'common', 'errors']
Remove_stopword: ['The', 'formatting', 'operations', 'described', 'exhibit', 'variety', 'quirks', 'lead', 'number', 'common', 'errors']


## Normalization
Normalization converts words that have the same meaning but are written in different ways into a single standard form.

such as 2moro, 2mrrw, 2morrow, 2mrw, tomrw ==> tomorrow

:) :-) ;-) ==> smile

In [None]:
# Lookup table mapping informal spellings and emoticons to a canonical form.
# All keys are lowercase; normalise() lowercases each token before lookup.
norm_dict = {
    '2moro':"tomorrow",
    '2mrrw':"tomorrow",
    '2morrow':"tomorrow",
    '2mrw':"tomorrow",
    'tomrw':"tomorrow",
    'b4':"before",
    'otw':"on the way",
    ':)':"smile",
    ':-)':"smile",
    ';-)':"smile",
}


def normalise(text):
    """Replace informal tokens with their canonical form from ``norm_dict``.

    The lookup is case-insensitive ("B4" and "b4" both become "before").
    Tokens without an entry are returned unchanged, including their casing.

    Args:
        text: iterable of string tokens.

    Returns:
        A new list with one output token per input token.
    """
    # dict.get does a single lookup and falls back to the original token.
    return [norm_dict.get(w.lower(), w) for w in text]


word_list = ["2moro","2mrrw","2morrow","2mrw","tomrw","b4"]
normalise(word_list)

['tomorrow', 'tomorrow', 'tomorrow', 'tomorrow', 'tomorrow', 'before']

## Noise Removal
The process of deleting characters, digits, and text fragments that are not part of the words in a sentence.

such as "..test..", "test<", "<a>test</a>" ==> test

In [None]:
import pandas as pd
import re
def scrub_words(text):
    """Strip markup, punctuation and digits from a string.

    Substrings of the form ``<...>`` are deleted first. Every remaining
    non-word character or digit is then replaced by a single space, and
    finally leading and trailing whitespace is trimmed. Spaces introduced
    in the middle of the string are kept.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    # Non-greedy match so each tag is removed separately.
    without_tags = re.sub(r"(<.*?>)", "", text)
    # Blank out anything that is not a word character, plus all digits.
    spaced = re.sub(r"(\W|\d)", " ", without_tags)
    return spaced.strip()
# Run the scrubber over several noisy variants of the same word and show the
# raw and cleaned values side by side.
raw_words = ["..trouble..", "trouble<","trouble!","<a>trouble</a>","1.trouble"]
cleaned_words = list(map(scrub_words, raw_words))
# `columns=` fixes the column order explicitly.
stemdf = pd.DataFrame(
    {'raw_word': raw_words, 'cleaned_word': cleaned_words},
    columns=['raw_word', 'cleaned_word'],
)
stemdf

Unnamed: 0,raw_word,cleaned_word
0,..trouble..,trouble
1,trouble<,trouble
2,trouble!,trouble
3,<a>trouble</a>,trouble
4,1.trouble,trouble


## Text Enrichment / Augmentation
Look up synonyms in WordNet, or add related words, to improve the model.

In [None]:
# Look up every WordNet synset registered for the word "program".
syns = wordnet.synsets("program")
syns

[Synset('plan.n.01'),
 Synset('program.n.02'),
 Synset('broadcast.n.02'),
 Synset('platform.n.02'),
 Synset('program.n.05'),
 Synset('course_of_study.n.01'),
 Synset('program.n.07'),
 Synset('program.n.08'),
 Synset('program.v.01'),
 Synset('program.v.02')]

In [None]:
# Take the name of the first lemma of each synset; duplicates are kept.
[s.lemmas()[0].name() for s in syns]

['plan',
 'program',
 'broadcast',
 'platform',
 'program',
 'course_of_study',
 'program',
 'program',
 'program',
 'program']