In [None]:
import nltk

# Fetch every NLTK resource the notebook loads, so that a fresh
# "Restart Kernel -> Run All" succeeds without manual setup.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# The stop-word lists are read through nltk.corpus.stopwords further down;
# without this download that cell raises LookupError on a clean install.
nltk.download('stopwords')

import warnings
# Silences all warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Converting Text to Lower Case

In [None]:
# Mixed-case sample; the cell displays its all-lowercase form.
sentence = 'HALO everyONe, todaY IS MONday'
str.lower(sentence)

'halo everyone, today is monday'

## Removing Stop Words

In [None]:
# Load NLTK's English stop-word list (needs the 'stopwords' corpus to be installed).
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')

In [None]:
# Preview the first ten entries of the stop-word list.
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [None]:
sentence = 'Halo everyone, what are you doing right now ? now i am playing my favorite movies'

# Keep only the whitespace-separated tokens that are absent from the stop-word list,
# then glue the survivors back together with single spaces.
kept_tokens = [token for token in sentence.split() if token not in stop_words]
text = ' '.join(kept_tokens)
text

'Halo everyone, right ? playing favorite movies'

## Removing Symbols and Special Characters

In [None]:
import re

# Character class of punctuation/bracket characters to strip: / ( ) { } [ ] | @ , ;
# Written as a raw string so the backslash escapes (\[, \], \|) reach the regex
# engine untouched instead of being parsed as (invalid) Python string escapes.
clean_spcl = re.compile(r'[/(){}\[\]\|@,;]')
clean_spcl

re.compile(r'[/(){}\[\]\|@,;]', re.UNICODE)

In [None]:
import re

# Negated character class: matches every character that is NOT a digit,
# a lowercase ASCII letter, a space, '#', '+' or '_'.
clean_symbol = re.compile(pattern='[^0-9a-z #+_]')
clean_symbol

re.compile(r'[^0-9a-z #+_]', re.UNICODE)

In [None]:
sentence = '||{{halo this is my pen !!!'

# Apply the two cleaners in sequence: the second substitution must run on the
# result of the first (`text`), not on the untouched `sentence`, otherwise the
# first step is silently thrown away.
text = clean_spcl.sub('', sentence)
text = clean_symbol.sub('', text)

text

'halo this is my pen '

## Combined

In [None]:
# Raw strings keep the regex backslash escapes intact; in a plain string literal
# '\[', '\]' and '\|' are invalid Python escapes and trigger a warning.
clean_spcl = re.compile(r'[/(){}\[\]\|@,;]')
clean_symbol = re.compile(r'[^0-9a-z #+_]')
stopworda = stop_words
# Set copy of the stop words so each membership test in clean_text is a hash
# lookup rather than a scan of the whole list.
_stopword_set = frozenset(stopworda)

def clean_text(text):
    """Lowercase ``text``, strip unwanted characters and drop stop words.

    Steps, in order:
      1. lowercase the input;
      2. delete every character matched by ``clean_spcl``;
      3. delete every character matched by ``clean_symbol``;
      4. split on whitespace and discard tokens found in the stop-word list.

    Matched characters are deleted, not replaced by a space, so ``'a,b'``
    becomes ``'ab'``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    text = text.lower()
    text = clean_spcl.sub('', text)
    text = clean_symbol.sub('', text)
    return ' '.join(word for word in text.split() if word not in _stopword_set)

clean_text('Good Morning EveryOne !!!!!, "MY name is NURUL alimah"')

'good morning everyone name nurul alimah'

## Word Tokenize

In [None]:
sentence = 'Good Morning EveryOne, MY name is NURUL alimah'

# Break the sentence into individual tokens and display them.
word_tokens = nltk.word_tokenize(sentence)
word_tokens

['Good', 'Morning', 'EveryOne', ',', 'MY', 'name', 'is', 'NURUL', 'alimah']

In [None]:
sentence = 'Good Morning EveryOne, MY name is NURUL alimah'

# Break the text into sentences and display them.
sentence_list = nltk.sent_tokenize(sentence)
sentence_list

['Good Morning EveryOne, MY name is NURUL alimah']

# Stemming and Lemmatization

### Stemming

In [None]:
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer

In [None]:
# One instance of each stemmer so their outputs can be compared side by side.
lancaster, porter, snowball = (
    LancasterStemmer(),
    PorterStemmer(),
    SnowballStemmer('english'),
)

In [None]:
words = ['playing', 'computing', 'haters', 'hobbies', 'likes', 'starting', 'running', 'loading']

# Four left-aligned columns, each 30 characters wide.
stem_row = "{0:30}{1:30}{2:30}{3:30}"

print(stem_row.format('Words', 'Lancaster Stemmer', 'Porter Stemmer', 'Snowball Stemmer'))
print('-' * 106)
for word in words:
    # Stem the same word with every stemmer, in column order.
    stems = (stemmer.stem(word) for stemmer in (lancaster, porter, snowball))
    print(stem_row.format(word, *stems))

Words                         Lancaster Stemmer             Porter Stemmer                Snowball Stemmer              
----------------------------------------------------------------------------------------------------------
playing                       play                          play                          play                          
computing                     comput                        comput                        comput                        
haters                        hat                           hater                         hater                         
hobbies                       hobby                         hobbi                         hobbi                         
likes                         lik                           like                          like                          
starting                      start                         start                         start                         
running                       run             

In [None]:
sentence = "right now i am texting my best friend to ask her if she is not busy we gonna go to cinema to watching a new movie"

# Tokenize once, then print the sentence as stemmed by each stemmer in turn.
teks = nltk.word_tokenize(sentence)
for stemmer in (lancaster, porter, snowball):
    stemming = list(map(stemmer.stem, teks))
    print(' '.join(stemming))

right now i am text my best friend to ask her if she is not busy we gon na go to cinem to watch a new movy
right now i am text my best friend to ask her if she is not busi we gon na go to cinema to watch a new movi
right now i am text my best friend to ask her if she is not busi we gon na go to cinema to watch a new movi


### Lemmatization

In [None]:
# Create the WordNet-based lemmatizer instance reused by the cells below.
from nltk.stem import WordNetLemmatizer
lemme = WordNetLemmatizer()

In [None]:
words = ['playing', 'computing', 'haters', 'hobbies', 'likes', 'starting', 'running', 'loading']

# Two left-aligned columns, each 30 characters wide.
lemma_row = "{0:30}{1:30}"

print(lemma_row.format('Words', 'Lemmetization'))
print('-' * 106)
for word in words:
    # No pos argument is passed, so the lemmatizer's default part of speech applies.
    lemma = lemme.lemmatize(word)
    print(lemma_row.format(word, lemma))

Words                         Lemmetization                 
----------------------------------------------------------------------------------------------------------
playing                       playing                       
computing                     computing                     
haters                        hater                         
hobbies                       hobby                         
likes                         like                          
starting                      starting                      
running                       running                       
loading                       loading                       


In [None]:
# (word, part-of-speech tag) pairs; each word is lemmatized with the tag given next to it.
pos_examples = [
    ('jumping', 'v'),
    ('better', 'a'),
    ('computing', 'v'),
    ('tables', 'n'),
    ('shortest', 'a'),
]
for token, tag in pos_examples:
    print(lemme.lemmatize(token, pos=tag))

jump
good
compute
table
short


## Word Vectorization 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd 

sentence = ['the apps really good', 'the apps is so bad']

# Build TF-IDF features, letting the vectorizer drop its built-in English stop words.
feature_extraction = TfidfVectorizer(stop_words='english')
matrix = feature_extraction.fit_transform(sentence)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() converts the returned array to a plain Python
# list so the printed output keeps the familiar list form.
feature_names = feature_extraction.get_feature_names_out().tolist()
print(feature_names)

# toarray() yields a regular ndarray (todense() returns the discouraged np.matrix).
df = pd.DataFrame(matrix.toarray(), columns=feature_names)

# Printing the sparse matrix lists each stored (row, column) entry with its weight.
print(matrix)

['apps', 'bad', 'good', 'really']
  (0, 2)	0.6316672017376245
  (0, 3)	0.6316672017376245
  (0, 0)	0.4494364165239821
  (1, 1)	0.8148024746671689
  (1, 0)	0.5797386715376657
