# L3: Tokenization

**Instructor: Haiqin Yang**: yanghaiqin@sztu.edu.cn

---
# Setup

In [1]:
# @title Imports
!pip install tokenizers
from urllib.request import urlopen
import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
import re
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
nltk.download('punkt')
nltk.download('stopwords')
!pip install SentencePiece
from transformers import MT5Tokenizer, GPT2LMHeadModel, TextGenerationPipeline



[nltk_data] Downloading package punkt to /Users/hqyang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hqyang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.




---
# Read text


## From webpage

In [2]:
# Download comments from webpage
html = urlopen('https://www.hplovecraft.com/writings/texts/fiction/cc.aspx').read()

In [None]:
html

In [3]:
# Remove html markup
raw = BeautifulSoup(html).get_text()

In [4]:
raw



In [5]:
# trim the text to where you want it to begin
ind_start = re.search('“Of such great powers', raw).start()
raw = raw[ind_start:]

In [6]:
ind_start

318

In [7]:
raw



In [8]:
# save contents to text file
with open ('cthulhu.txt', 'w') as f:
  f.write(raw)

## From text file

In [9]:
# read the file we just saved
with open ('cthulhu.txt', 'r') as f:
  raw = f.read()

In [10]:
raw



---
# Tokenization

## Using whitespace

In [11]:
# segment words using whitespace
tokens = raw.split() # Python's built-in function, default seperaotr is whitespace; but you can change it to other seperators
sent_tokens = raw.split('.')

In [12]:
tokens

['“Of',
 'such',
 'great',
 'powers',
 'or',
 'beings',
 'there',
 'may',
 'be',
 'conceivably',
 'a',
 'survival',
 '.',
 '.',
 '.',
 'a',
 'survival',
 'of',
 'a',
 'hugely',
 'remote',
 'period',
 'when',
 '.',
 '.',
 '.',
 'consciousness',
 'was',
 'manifested,',
 'perhaps,',
 'in',
 'shapes',
 'and',
 'forms',
 'long',
 'since',
 'withdrawn',
 'before',
 'the',
 'tide',
 'of',
 'advancing',
 'humanity',
 '.',
 '.',
 '.',
 'forms',
 'of',
 'which',
 'poetry',
 'and',
 'legend',
 'alone',
 'have',
 'caught',
 'a',
 'flying',
 'memory',
 'and',
 'called',
 'them',
 'gods,',
 'monsters,',
 'mythical',
 'beings',
 'of',
 'all',
 'sorts',
 'and',
 'kinds.',
 '.',
 '.',
 '.”',
 '—Algernon',
 'Blackwood.',
 'I.The',
 'Horror',
 'in',
 'Clay.',
 'The',
 'most',
 'merciful',
 'thing',
 'in',
 'the',
 'world,',
 'I',
 'think,',
 'is',
 'the',
 'inability',
 'of',
 'the',
 'human',
 'mind',
 'to',
 'correlate',
 'all',
 'its',
 'contents.',
 'We',
 'live',
 'on',
 'a',
 'placid',
 'island',
 

In [13]:
sent_tokens

['“Of such great powers or beings there may be conceivably a survival\xa0',
 '\xa0',
 '\xa0',
 '\na survival of a hugely remote period when\xa0',
 '\xa0',
 '\xa0',
 ' consciousness was manifested,\nperhaps, in shapes and forms long since withdrawn before the tide of advancing humanity\xa0',
 '\xa0',
 '\xa0',
 '\nforms of which poetry and legend alone have caught a flying memory and called them gods, monsters,\nmythical beings of all sorts and kinds',
 '\xa0',
 '\xa0',
 '\xa0',
 '”\n—Algernon Blackwood',
 '\nI',
 'The Horror in Clay',
 '\n\nThe most merciful thing in the world, I think, is the inability of the human mind to correlate\nall its contents',
 ' We live on a placid island of ignorance in the midst of black seas of infinity,\nand it was not meant that we should voyage far',
 ' The sciences, each straining in its own direction,\nhave hitherto harmed us little; but some day the piecing together of dissociated knowledge will\nopen up such terrifying vistas of reality, and of our 

## Using NLTK's tokenizer

In [14]:
tokens = word_tokenize(raw)

In [15]:
tokens

['“',
 'Of',
 'such',
 'great',
 'powers',
 'or',
 'beings',
 'there',
 'may',
 'be',
 'conceivably',
 'a',
 'survival',
 '.',
 '.',
 '.',
 'a',
 'survival',
 'of',
 'a',
 'hugely',
 'remote',
 'period',
 'when',
 '.',
 '.',
 '.',
 'consciousness',
 'was',
 'manifested',
 ',',
 'perhaps',
 ',',
 'in',
 'shapes',
 'and',
 'forms',
 'long',
 'since',
 'withdrawn',
 'before',
 'the',
 'tide',
 'of',
 'advancing',
 'humanity',
 '.',
 '.',
 '.',
 'forms',
 'of',
 'which',
 'poetry',
 'and',
 'legend',
 'alone',
 'have',
 'caught',
 'a',
 'flying',
 'memory',
 'and',
 'called',
 'them',
 'gods',
 ',',
 'monsters',
 ',',
 'mythical',
 'beings',
 'of',
 'all',
 'sorts',
 'and',
 'kinds.',
 '.',
 '.',
 '.',
 '”',
 '—Algernon',
 'Blackwood',
 '.',
 'I.The',
 'Horror',
 'in',
 'Clay',
 '.',
 'The',
 'most',
 'merciful',
 'thing',
 'in',
 'the',
 'world',
 ',',
 'I',
 'think',
 ',',
 'is',
 'the',
 'inability',
 'of',
 'the',
 'human',
 'mind',
 'to',
 'correlate',
 'all',
 'its',
 'contents',
 '.

In [16]:
# Show frequncy of each word
token_freq = FreqDist(tokens) #nltk's function to find how many times each token occurs in the text

In [17]:
token_freq.most_common(50)

[('the', 738),
 (',', 692),
 ('of', 523),
 ('and', 476),
 ('.', 416),
 ('a', 236),
 ('in', 212),
 ('to', 206),
 ('was', 170),
 ('had', 157),
 ('I', 112),
 ('that', 107),
 ('which', 94),
 (';', 94),
 ('his', 89),
 ('with', 85),
 ('he', 84),
 ('it', 82),
 ('from', 79),
 ('’', 79),
 ('for', 73),
 ('on', 72),
 ('at', 70),
 ('by', 69),
 ('as', 69),
 ('s', 61),
 ('The', 60),
 ('not', 59),
 ('but', 53),
 ('were', 50),
 ('my', 48),
 ('an', 47),
 ('all', 44),
 ('be', 43),
 ('or', 40),
 ('this', 40),
 ('have', 38),
 ('some', 37),
 ('could', 37),
 ('its', 35),
 ('men', 34),
 ('been', 32),
 ('no', 31),
 ('one', 31),
 ('him', 30),
 ('“', 29),
 ('”', 29),
 ('so', 29),
 ('only', 29),
 ('is', 28)]

# Text normalization

### Remove punctutation

In [18]:
tokens = [t for t in tokens if t.isalpha()]

In [19]:
tokens

['Of',
 'such',
 'great',
 'powers',
 'or',
 'beings',
 'there',
 'may',
 'be',
 'conceivably',
 'a',
 'survival',
 'a',
 'survival',
 'of',
 'a',
 'hugely',
 'remote',
 'period',
 'when',
 'consciousness',
 'was',
 'manifested',
 'perhaps',
 'in',
 'shapes',
 'and',
 'forms',
 'long',
 'since',
 'withdrawn',
 'before',
 'the',
 'tide',
 'of',
 'advancing',
 'humanity',
 'forms',
 'of',
 'which',
 'poetry',
 'and',
 'legend',
 'alone',
 'have',
 'caught',
 'a',
 'flying',
 'memory',
 'and',
 'called',
 'them',
 'gods',
 'monsters',
 'mythical',
 'beings',
 'of',
 'all',
 'sorts',
 'and',
 'Blackwood',
 'Horror',
 'in',
 'Clay',
 'The',
 'most',
 'merciful',
 'thing',
 'in',
 'the',
 'world',
 'I',
 'think',
 'is',
 'the',
 'inability',
 'of',
 'the',
 'human',
 'mind',
 'to',
 'correlate',
 'all',
 'its',
 'contents',
 'We',
 'live',
 'on',
 'a',
 'placid',
 'island',
 'of',
 'ignorance',
 'in',
 'the',
 'midst',
 'of',
 'black',
 'seas',
 'of',
 'infinity',
 'and',
 'it',
 'was',
 'no

In [20]:
token_freq = FreqDist(tokens)
token_freq.most_common(50)

[('the', 738),
 ('of', 523),
 ('and', 476),
 ('a', 236),
 ('in', 212),
 ('to', 206),
 ('was', 170),
 ('had', 157),
 ('I', 112),
 ('that', 107),
 ('which', 94),
 ('his', 89),
 ('with', 85),
 ('he', 84),
 ('it', 82),
 ('from', 79),
 ('for', 73),
 ('on', 72),
 ('at', 70),
 ('by', 69),
 ('as', 69),
 ('s', 61),
 ('The', 60),
 ('not', 59),
 ('but', 53),
 ('were', 50),
 ('my', 48),
 ('an', 47),
 ('all', 44),
 ('be', 43),
 ('or', 40),
 ('this', 40),
 ('have', 38),
 ('some', 37),
 ('could', 37),
 ('its', 35),
 ('men', 34),
 ('been', 32),
 ('no', 31),
 ('one', 31),
 ('him', 30),
 ('so', 29),
 ('only', 29),
 ('is', 28),
 ('would', 28),
 ('It', 28),
 ('cult', 27),
 ('their', 27),
 ('Johansen', 27),
 ('when', 26)]

### Case folding

In [21]:
tokens_lower = [t.lower() for t in tokens]

In [22]:
tokens_lower

['of',
 'such',
 'great',
 'powers',
 'or',
 'beings',
 'there',
 'may',
 'be',
 'conceivably',
 'a',
 'survival',
 'a',
 'survival',
 'of',
 'a',
 'hugely',
 'remote',
 'period',
 'when',
 'consciousness',
 'was',
 'manifested',
 'perhaps',
 'in',
 'shapes',
 'and',
 'forms',
 'long',
 'since',
 'withdrawn',
 'before',
 'the',
 'tide',
 'of',
 'advancing',
 'humanity',
 'forms',
 'of',
 'which',
 'poetry',
 'and',
 'legend',
 'alone',
 'have',
 'caught',
 'a',
 'flying',
 'memory',
 'and',
 'called',
 'them',
 'gods',
 'monsters',
 'mythical',
 'beings',
 'of',
 'all',
 'sorts',
 'and',
 'blackwood',
 'horror',
 'in',
 'clay',
 'the',
 'most',
 'merciful',
 'thing',
 'in',
 'the',
 'world',
 'i',
 'think',
 'is',
 'the',
 'inability',
 'of',
 'the',
 'human',
 'mind',
 'to',
 'correlate',
 'all',
 'its',
 'contents',
 'we',
 'live',
 'on',
 'a',
 'placid',
 'island',
 'of',
 'ignorance',
 'in',
 'the',
 'midst',
 'of',
 'black',
 'seas',
 'of',
 'infinity',
 'and',
 'it',
 'was',
 'no

### Lemmatization

#### Using Porter stemmer

In [23]:
ps = PorterStemmer()

In [24]:
ps.stem('computational')

'comput'

In [25]:
ps.stem('linguistics')

'linguist'

In [26]:
tokens_stemmed = [ps.stem(w) for w in tokens_lower]

In [27]:
tokens_stemmed

['of',
 'such',
 'great',
 'power',
 'or',
 'be',
 'there',
 'may',
 'be',
 'conceiv',
 'a',
 'surviv',
 'a',
 'surviv',
 'of',
 'a',
 'huge',
 'remot',
 'period',
 'when',
 'conscious',
 'wa',
 'manifest',
 'perhap',
 'in',
 'shape',
 'and',
 'form',
 'long',
 'sinc',
 'withdrawn',
 'befor',
 'the',
 'tide',
 'of',
 'advanc',
 'human',
 'form',
 'of',
 'which',
 'poetri',
 'and',
 'legend',
 'alon',
 'have',
 'caught',
 'a',
 'fli',
 'memori',
 'and',
 'call',
 'them',
 'god',
 'monster',
 'mythic',
 'be',
 'of',
 'all',
 'sort',
 'and',
 'blackwood',
 'horror',
 'in',
 'clay',
 'the',
 'most',
 'merci',
 'thing',
 'in',
 'the',
 'world',
 'i',
 'think',
 'is',
 'the',
 'inabl',
 'of',
 'the',
 'human',
 'mind',
 'to',
 'correl',
 'all',
 'it',
 'content',
 'we',
 'live',
 'on',
 'a',
 'placid',
 'island',
 'of',
 'ignor',
 'in',
 'the',
 'midst',
 'of',
 'black',
 'sea',
 'of',
 'infin',
 'and',
 'it',
 'wa',
 'not',
 'meant',
 'that',
 'we',
 'should',
 'voyag',
 'far',
 'the',
 'sc

In [28]:
# save normalized tokens
with open('tokens_normed.txt', 'w') as f:
  f.write('\n'.join(tokens_lower))

#### Using BPE tokenizer

In [29]:
# import BPE tokenizer and train on the normalized text
bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
bpe_tokenizer.train(['tokens_normed.txt'], trainer)






In [30]:
# test BPE tokenizer on unseen words
output = bpe_tokenizer.encode('computational linguistics')
output.tokens

['comp', 'ut', 'at', 'ion', 'al', '[UNK]', 'ling', 'u', 'ist', 'ic', 's']

#### Using mGPT tokenizer

In [31]:
tokenizer = MT5Tokenizer.from_pretrained("THUMT/mGPT")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading config.json: 0.00B [00:00, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'T5Tokenizer'.
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [32]:
input_ids = tokenizer('computational linguistics').input_ids

In [33]:
input_ids

[93928, 65662, 259, 151025, 263, 1]

In [34]:
[tokenizer.decode(i) for i in input_ids]

['comput', 'ational', '', 'linguistic', 's', '</s>']

In [35]:
text = '当我还只有六岁的时候在一本描写原始森林的名叫真实的故事的书中看到了一幅精彩的插画'

In [36]:
input_ids = tokenizer(text).input_ids

In [37]:
input_ids

[259,
 7031,
 3003,
 8349,
 29505,
 10534,
 27622,
 37374,
 1083,
 76896,
 204604,
 209877,
 92418,
 493,
 3094,
 27333,
 62903,
 127390,
 493,
 13107,
 1223,
 214835,
 1374,
 13558,
 114526,
 493,
 35858,
 15828,
 1]

In [38]:
[tokenizer.decode(i) for i in input_ids]

['',
 '当',
 '我',
 '还',
 '只有',
 '六',
 '岁',
 '的时候',
 '在',
 '一本',
 '描写',
 '原始',
 '森林',
 '的',
 '名',
 '叫',
 '真实',
 '的故事',
 '的',
 '书',
 '中',
 '看到了',
 '一',
 '幅',
 '精彩',
 '的',
 '插',
 '画',
 '</s>']

In [None]:
with open ('tokens_normed.txt', 'r') as f:
  tokens_normed = f.read()

In [None]:
tokenizer(text).input_ids