## Tokenize

In [1]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

#### Sentence Tokenizer

In [2]:
# Sample text: five short sentences ending in '.', '?' and '!'.
text = "this's a sent tokenize test. this is sent two. is this sent three? sent 4 is cool! Now it is your turn."

# sent_tokenize splits a text into sentences. According to the original note it
# relies on a pre-trained model (PunktSentenceTokenizer) that has learned the
# patterns marking sentence boundaries.
# The language is spelled out below; "english" is also the default, and any
# other available language can be passed instead.
sent_tokenize_list = sent_tokenize(text, language="english")

In [3]:
len(sent_tokenize_list)  # number of sentences found; expected 5

5

In [4]:
# Display the individual sentences returned by sent_tokenize.
sent_tokenize_list

["this's a sent tokenize test.",
 'this is sent two.',
 'is this sent three?',
 'sent 4 is cool!',
 'Now it is your turn.']

#### Word Tokenizer

In [5]:
# Counterpart of sent_tokenize: word_tokenize builds tokens per word (and
# punctuation mark) instead of per sentence. The default language is English.
# As the output shows, "this's" is split into 'this' and "'s".
# The original note names the underlying tokenizer as TreebankWordTokenizer and
# describes it as trained on patterns.
# NOTE(review): Treebank-style tokenization is presumably rule-based rather
# than trained - confirm against the installed NLTK version.

word_tokenize(text)

['this',
 "'s",
 'a',
 'sent',
 'tokenize',
 'test',
 '.',
 'this',
 'is',
 'sent',
 'two',
 '.',
 'is',
 'this',
 'sent',
 'three',
 '?',
 'sent',
 '4',
 'is',
 'cool',
 '!',
 'Now',
 'it',
 'is',
 'your',
 'turn',
 '.']

## POS Tagger

In [6]:
import nltk

In [7]:
# Sample sentence for part-of-speech tagging.
text = "This is python training by Shiv for the Analytics team happening at Bangalore . he is felicitating and helping us ."

# Tokenize with word_tokenize (imported in the Tokenize section) instead of
# str.split(): a plain whitespace split only works for this sample because its
# punctuation is already space-separated; on ordinary text it would leave the
# '.' glued to the preceding word and hand the tagger a malformed token.
# pos_tag then pairs every token with a tag describing its word class.
nltk.pos_tag(word_tokenize(text))

[('This', 'DT'),
 ('is', 'VBZ'),
 ('python', 'JJ'),
 ('training', 'NN'),
 ('by', 'IN'),
 ('Shiv', 'NNP'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('Analytics', 'NNP'),
 ('team', 'NN'),
 ('happening', 'VBG'),
 ('at', 'IN'),
 ('Bangalore', 'NNP'),
 ('.', '.'),
 ('he', 'PRP'),
 ('is', 'VBZ'),
 ('felicitating', 'VBG'),
 ('and', 'CC'),
 ('helping', 'VBG'),
 ('us', 'PRP'),
 ('.', '.')]

In [8]:
# Look up what a tag means, using the noun tags as an example.
# The argument is a pattern: "NN*" lists NN together with the related
# NNP, NNPS and NNS tags (see the output below).

nltk.help.upenn_tagset("NN*")

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
NNPS: noun, proper, plural
    Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists
    Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques
    Apache Apaches Apocrypha ...
NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...


## Stemming

#### Porter Stemmer

In [9]:
from nltk.stem.porter import PorterStemmer

In [10]:
# Create the Porter stemmer instance used by the following cells.

porter_stemmer = PorterStemmer()

In [11]:
# Example: stem the base form of the word.
porter_stemmer.stem("study")

'studi'

In [12]:
# Plural form of the same word.
porter_stemmer.stem("studies")

'studi'

In [13]:
# "-ing" form of the same word.
porter_stemmer.stem("studying")

'studi'

#### Snowball Stemmer

In [14]:
from nltk.stem import SnowballStemmer

In [15]:
# Create the Snowball stemmer instance.
# Unlike the other stemmers in this notebook, SnowballStemmer must be told
# which language to use when it is created.

snowball_stemmer = SnowballStemmer("english")

In [16]:
# Example: stem the base form of the word.
snowball_stemmer.stem("study")

'studi'

In [17]:
# Plural form of the same word.
snowball_stemmer.stem("studies")

'studi'

In [18]:
# "-ing" form of the same word.
snowball_stemmer.stem("studying")

'studi'

#### Lancaster Stemmer

In [19]:
from nltk.stem.lancaster import LancasterStemmer

In [20]:
# Create the Lancaster stemmer instance used by the following cells.

lancaster_stemmer = LancasterStemmer()

In [21]:
# Example: stem the base form of the word.
lancaster_stemmer.stem("study")

'study'

In [22]:
# Plural form of the same word.
lancaster_stemmer.stem("studies")

'study'

In [23]:
# "-ing" form of the same word.
lancaster_stemmer.stem("studying")

'study'

## Word Sense Disambiguation

In [24]:
# Word sense disambiguation (WSD).
# A single word can carry several meanings. In Indonesian, for example,
# "apel" can mean the fruit (apple) or a ceremony.
# Here the word 'bank' is checked in different sentence contexts to see
# whether a different meaning is selected.

from nltk.wsd import lesk
sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.']

In [25]:
# Disambiguate 'bank' with pos = 'n'. The definition of the returned synset
# can be looked up in the synset list at the bottom of this section.
print(lesk(sent, 'bank', pos = 'n'))

Synset('savings_bank.n.02')


In [26]:
# Same call without the pos argument.
print(lesk(sent, 'bank'))

# Meaning of the returned synset (savings_bank.n.02):
# a container (usually with a slot in the top) for keeping money at home

Synset('savings_bank.n.02')


In [27]:
# Second context: the same word 'bank' in a sentence about catching a ship.
# Note that this rebinds `sent`, replacing the first example sentence.
sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'catch', 'ship', '.']

In [28]:
# Disambiguate 'bank' in the new sentence with pos = 'n'. The recorded result
# is bank.n.07, a different synset from the one returned for the deposit
# sentence; its definition appears in the synset list at the end of the section.
print(lesk(sent, 'bank', pos = 'n'))

Synset('bank.n.07')


In [29]:
# Same call without the pos argument; the recorded result is a verb synset
# (bank.v.07) rather than a noun.
print(lesk(sent, 'bank'))

# Meaning of the returned synset:
# cover with ashes so to control the rate of burning

Synset('bank.v.07')


In [30]:
# Print every WordNet synset for 'bank' together with its definition, so the
# synsets returned by lesk in the cells above can be looked up.
from nltk.corpus import wordnet as wn
for ss in wn.synsets('bank'):
    print(ss, ss.definition())

Synset('bank.n.01') sloping land (especially the slope beside a body of water)
Synset('depository_financial_institution.n.01') a financial institution that accepts deposits and channels the money into lending activities
Synset('bank.n.03') a long ridge or pile
Synset('bank.n.04') an arrangement of similar objects in a row or in tiers
Synset('bank.n.05') a supply or stock held in reserve for future use (especially in emergencies)
Synset('bank.n.06') the funds held by a gambling house or the dealer in some gambling games
Synset('bank.n.07') a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
Synset('savings_bank.n.02') a container (usually with a slot in the top) for keeping money at home
Synset('bank.n.09') a building in which the business of banking transacted
Synset('bank.n.10') a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning)
Synset('bank.v.01') tip laterally
Sy