# Tokenization Practice

In [48]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import nltk
from nltk.chunk import RegexpParser
import collections

In [35]:
text = '''Joe waited for the train. The train was late. Mary and Samantha took the bus. 
I looked for Mary and Samantha at the bus stations. '''

### Task 1. Split into sentence

In [36]:
token_text = sent_tokenize(text)
print(token_text)

['Joe waited for the train.', 'The train was late.', 'Mary and Samantha took the bus.', 'I looked for Mary and Samantha at the bus stations.']


### Task 2. Non-English sentence split

In [4]:
text = '''NLTK ist Open Source Software. Der Quellcode wird unter den Bedingungen der Apache License Version 2.0 vertrieben.  
Die Dokumentation wird unter den Bedingungen der Creative Commons-Lizenz Namensnennung - Nicht kommerziell - Keine abgeleiteten Werke 3.0 in den Vereinigten Staaten verteilt.
'''

In [8]:
token_text_german = sent_tokenize(text, language="german")
print(token_text_german)

['NLTK ist Open Source Software.', 'Der Quellcode wird unter den Bedingungen der Apache License Version 2.0 vertrieben.', 'Die Dokumentation wird unter den Bedingungen der Creative Commons-Lizenz Namensnennung - Nicht kommerziell - Keine abgeleiteten Werke 3.0 in den Vereinigten Staaten verteilt.']


### Task 3. Split into words

In [37]:
words = word_tokenize(text)
print(words)

['Joe', 'waited', 'for', 'the', 'train', '.', 'The', 'train', 'was', 'late', '.', 'Mary', 'and', 'Samantha', 'took', 'the', 'bus', '.', 'I', 'looked', 'for', 'Mary', 'and', 'Samantha', 'at', 'the', 'bus', 'stations', '.']


### Task 4. Tokenize - split in sentence and words

In [38]:
tokens = [word_tokenize(t) for t in sent_tokenize(text)]
print(tokens)

[['Joe', 'waited', 'for', 'the', 'train', '.'], ['The', 'train', 'was', 'late', '.'], ['Mary', 'and', 'Samantha', 'took', 'the', 'bus', '.'], ['I', 'looked', 'for', 'Mary', 'and', 'Samantha', 'at', 'the', 'bus', 'stations', '.']]


### Task 5. Tokenize Tweeter Text

In [14]:
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
tweet_text = "NoSQL introduction - w3resource http://bit.ly/1ngHC5F  #nosql #database #webdev"
result = tknzr.tokenize(tweet_text)
print(result)

['NoSQL', 'introduction', '-', 'w3resource', 'http://bit.ly/1ngHC5F', '#nosql', '#database', '#webdev']


### Task 6. Stopwords - how many stopwords are in French

In [15]:
result = stopwords.words('french')
len(result)

157

### Task 7. Remove and Add stopwords


In [24]:
#sorted(list(stopwords.words('english')))

remove some words from the list

In [25]:
stop_words = list(set(stopwords.words('english')) - set(['again', 'once', 'from']))

In [26]:
#sorted(stop_words)

Add could, would, should

In [39]:
stop_words = stopwords.words('english')
custom_list = ['would','could','should']
stop_words.extend(custom_list)

Remove stop words from your tokens

In [40]:
filtered_words = [w for w in words if not w in stop_words] 

In [41]:
word_counts = collections.Counter(filtered_words)
word_counts.most_common(4)

[('.', 4), ('train', 2), ('Mary', 2), ('Samantha', 2)]

### Task 8. Lemmatize and Stem

In [42]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [43]:
stems = [ps.stem(w) for line in tokens for w in line]
print(stems)

['joe', 'wait', 'for', 'the', 'train', '.', 'the', 'train', 'wa', 'late', '.', 'mari', 'and', 'samantha', 'took', 'the', 'bu', '.', 'I', 'look', 'for', 'mari', 'and', 'samantha', 'at', 'the', 'bu', 'station', '.']


In [44]:
lemmas = [lemmatizer.lemmatize(w) for line in tokens for w in line]
print(lemmas)

['Joe', 'waited', 'for', 'the', 'train', '.', 'The', 'train', 'wa', 'late', '.', 'Mary', 'and', 'Samantha', 'took', 'the', 'bus', '.', 'I', 'looked', 'for', 'Mary', 'and', 'Samantha', 'at', 'the', 'bus', 'station', '.']


### Task 9. POS Tagging

In [45]:
nltk_pos_tagged = nltk.pos_tag(words)
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
Word,Joe,waited,for,the,train,.,The,train,was,late,...,looked,for,Mary,and,Samantha,at,the,bus,stations,.
POS tag,NNP,VBD,IN,DT,NN,.,DT,NN,VBD,JJ,...,VBD,IN,NNP,CC,NNP,IN,DT,NN,NNS,.


### Task 10. NP-Chunking

In [46]:
grammar = """
NP: {<DT>?<JJ>?<NN.*>}
"""

In [47]:
rc = RegexpParser(grammar)
c = rc.parse(nltk_pos_tagged)
print(c)

(S
  (NP Joe/NNP)
  waited/VBD
  for/IN
  (NP the/DT train/NN)
  ./.
  (NP The/DT train/NN)
  was/VBD
  late/JJ
  ./.
  (NP Mary/NNP)
  and/CC
  (NP Samantha/NNP)
  took/VBD
  (NP the/DT bus/NN)
  ./.
  I/PRP
  looked/VBD
  for/IN
  (NP Mary/NNP)
  and/CC
  (NP Samantha/NNP)
  at/IN
  (NP the/DT bus/NN)
  (NP stations/NNS)
  ./.)
