# Introduction to the NLTK library
For more information, see the NLTK book: http://www.nltk.org/book

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

## Tokenization

In [2]:
# Sample sentence used for the tokenization demo; the literal is split
# across lines purely for readability (adjacent literals are concatenated).
text = (
    'Data science is a multi-disciplinary field that uses scientific methods, '
    'processes, algorithms and systems to extract knowledge and insights '
    'from structured and unstructured data.'
)

In [3]:
# Split the sentence into tokens; as the output shows, punctuation marks
# (',' and '.') become tokens of their own and 'multi-disciplinary' stays whole.
words = word_tokenize(text)
# Bare expression so the notebook displays the token list.
words

['Data',
 'science',
 'is',
 'a',
 'multi-disciplinary',
 'field',
 'that',
 'uses',
 'scientific',
 'methods',
 ',',
 'processes',
 ',',
 'algorithms',
 'and',
 'systems',
 'to',
 'extract',
 'knowledge',
 'and',
 'insights',
 'from',
 'structured',
 'and',
 'unstructured',
 'data',
 '.']

## Stop-word filtering

In [4]:
# Load NLTK's English stop-word list.
stop_words = stopwords.words('english')
# Peek at the first five entries.
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [5]:
text = "This is a sample sentence, showing off the stop words filtration."
text_tokens = word_tokenize(text)
# The stop-word entries are lowercase, so compare the lowercased token;
# otherwise a capitalized stop word such as 'This' slips through the filter.
# A set makes each membership test O(1) instead of scanning the whole list.
stop_word_set = set(stop_words)
# Keep the original token (with its casing) when it is not a stop word.
filtered_text = [w for w in text_tokens if w.lower() not in stop_word_set]
print('text: ', text_tokens)
print('filtered: ', filtered_text)

text:  ['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
filtered:  ['sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


## Stemming and Lemmatization

In [6]:
# Several inflected forms of "list" to compare stemming with lemmatization.
text = 'list lists listed listing listings'
tokens = word_tokenize(text)

In [7]:
# Display the tokens produced by word_tokenize.
tokens 

['list', 'lists', 'listed', 'listing', 'listings']

In [8]:
# Create one lemmatizer and one stemmer to apply to the same tokens below.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [9]:
# Apply the Porter stemmer to every token; the result is displayed as a list.
list(map(stemmer.stem, tokens))

['list', 'list', 'list', 'list', 'list']

In [10]:
# Lemmatize every token. No pos argument is passed, so NLTK's default
# (noun) applies; that is why 'listed' and 'listing' are returned unchanged
# in the output while the plural forms are reduced.
list(map(lemmatizer.lemmatize, tokens))

['list', 'list', 'listed', 'listing', 'listing']

## WordNet

In [11]:
# WordNet corpus reader, aliased as wn for the lookups in this section.
from nltk.corpus import wordnet as wn
# List the synsets that contain the word 'motorcar'.
wn.synsets('motorcar')

[Synset('car.n.01')]

In [12]:
# Fetch the synset by its identifier, then display the lemma names it groups.
car_synset = wn.synset('car.n.01')
car_synset.lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [13]:
# Print the lemma names of every hypernym of car.n.01.
# Iterate over the hypernyms directly so the synset lookup and the
# hypernyms() call run once rather than on every loop iteration.
for hypernym in wn.synset('car.n.01').hypernyms():
    print(hypernym.lemma_names())

['motor_vehicle', 'automotive_vehicle']


In [14]:
# Print the lemma names of every hyponym of car.n.01.
# Iterate over the hyponyms directly so the synset lookup and the
# hyponyms() call run once rather than on every loop iteration.
for hyponym in wn.synset('car.n.01').hyponyms():
    print(hyponym.lemma_names())

['ambulance']
['beach_wagon', 'station_wagon', 'wagon', 'estate_car', 'beach_waggon', 'station_waggon', 'waggon']
['bus', 'jalopy', 'heap']
['cab', 'hack', 'taxi', 'taxicab']
['compact', 'compact_car']
['convertible']
['coupe']
['cruiser', 'police_cruiser', 'patrol_car', 'police_car', 'prowl_car', 'squad_car']
['electric', 'electric_automobile', 'electric_car']
['gas_guzzler']
['hardtop']
['hatchback']
['horseless_carriage']
['hot_rod', 'hot-rod']
['jeep', 'landrover']
['limousine', 'limo']
['loaner']
['minicar']
['minivan']
['Model_T']
['pace_car']
['racer', 'race_car', 'racing_car']
['roadster', 'runabout', 'two-seater']
['sedan', 'saloon']
['sport_utility', 'sport_utility_vehicle', 'S.U.V.', 'SUV']
['sports_car', 'sport_car']
['Stanley_Steamer']
['stock_car']
['subcompact', 'subcompact_car']
['touring_car', 'phaeton', 'tourer']
['used-car', 'secondhand_car']


## Part-of-speech tagging

In [15]:
# Short sentence to tag with parts of speech.
text = 'Bob is great'

In [17]:
# pos_tag below is given this list of tokens rather than the raw string.
words = word_tokenize(text)

In [18]:
from nltk import pos_tag
# Tag each token; the result is a list of (token, tag) pairs.
pos_tag(words)

[('Bob', 'NNP'), ('is', 'VBZ'), ('great', 'JJ')]

In [19]:
from nltk.help import upenn_tagset
# Print the description and example words for the 'NNP' tag seen in the
# pos_tag output.
upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
