Tokenisation

**Instructor: Haiqin Yang**: yanghaiqin@sztu.edu.cn

---
# Setup

In [None]:
# @title Imports
!pip install tokenizers
!pip install SentencePiece
import re
from urllib.request import urlopen

import nltk
from bs4 import BeautifulSoup
from nltk import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from transformers import MT5Tokenizer, GPT2LMHeadModel, TextGenerationPipeline

nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Collecting SentencePiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.3 MB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.3 MB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m0.8/1.3 MB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m1.2/1.3 MB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SentencePiece
Successfully installed SentencePiece-0.1.99


---
# Read text


## From webpage

In [None]:
# Download the page's HTML as raw bytes.
# urlopen() returns a response object that holds an open connection; the
# `with` block closes it as soon as the body has been read.
with urlopen('https://www.hplovecraft.com/writings/texts/fiction/cc.aspx') as response:
  html = response.read()

In [None]:
# Preview only the first 500 bytes; the whole page is too long to read in the output.
html[:500]

In [None]:
# Remove html markup and keep only the text content.
# The parser is named explicitly so the result does not depend on which
# optional parsers (lxml, html5lib) happen to be installed, and so that bs4
# does not emit its GuessedAtParserWarning.
raw = BeautifulSoup(html, 'html.parser').get_text()

In [None]:
# Preview the first 500 characters of the extracted text instead of the whole page.
raw[:500]



In [None]:
# trim the text to where you want it to begin
start_match = re.search('“Of such great powers', raw)
if start_match is None:
  # Fail with a clear message (rather than an AttributeError on None) if the
  # opening quotation can no longer be found in the downloaded text.
  raise ValueError('Start marker not found in the downloaded text')
# character offset of the marker; everything before it is discarded
ind_start = start_match.start()
raw = raw[ind_start:]

In [None]:
# character offset at which the start marker was found in the untrimmed text
ind_start

316

In [None]:
# Preview the first 500 characters to confirm the text now starts at the marker.
raw[:500]



In [None]:
# persist the trimmed text to disk so later cells can reload it from a file
with open('cthulhu.txt', mode='w') as text_file:
  text_file.write(raw)

## From text file

In [None]:
# load the text back from the file saved in the previous section
with open('cthulhu.txt', mode='r') as text_file:
  raw = text_file.read()

In [None]:
# Preview the first 500 characters of the text read back from the file.
raw[:500]



---
# Tokenization

## Using whitespace

In [None]:
# segment words using Python's built-in str.split: with no separator given
# (sep=None) it splits on any run of whitespace
tokens = raw.split(sep=None)
# a different separator can be passed instead, e.g. a full stop for a crude
# sentence segmentation
sent_tokens = raw.split(sep='.')

In [None]:
# Show the first 50 whitespace-separated tokens rather than the whole list.
tokens[:50]

In [None]:
# Show the first 10 full-stop-separated segments rather than the whole list.
sent_tokens[:10]

## Using NLTK's tokenizer

In [None]:
# split the raw text into word tokens with NLTK's word_tokenize;
# this replaces the whitespace-based `tokens` list from the previous section
tokens = word_tokenize(raw)

In [None]:
# Show the first 50 NLTK tokens rather than the whole list.
tokens[:50]

In [None]:
# Show frequency of each word
token_freq = FreqDist(tokens) # nltk's function to find how many times each token occurs in the text

In [None]:
# the 50 most frequent tokens together with their counts
token_freq.most_common(50)

# Text normalization

### Remove punctuation

In [None]:
# keep only tokens made up entirely of letters; this drops punctuation, and
# also any token containing digits, apostrophes or hyphens
tokens = list(filter(str.isalpha, tokens))

In [None]:
# Show the first 50 remaining tokens rather than the whole list.
tokens[:50]

In [None]:
# recount frequencies now that non-alphabetic tokens have been filtered out,
# then show the 50 most frequent tokens with their counts
token_freq = FreqDist(tokens)
token_freq.most_common(50)

### Case folding

In [None]:
# case folding: lowercase every token so that e.g. "The" and "the" are counted together
tokens_lower = list(map(str.lower, tokens))

In [None]:
# Show the first 50 lowercased tokens rather than the whole list.
tokens_lower[:50]

### Stemming and subword tokenization

#### Using Porter stemmer

In [None]:
# create one Porter stemmer instance; it is reused by the cells below
ps = PorterStemmer()

In [None]:
# stem a single word to see which suffixes the stemmer strips
ps.stem('computational')

'comput'

In [None]:
# second single-word example of the stemmer's output
ps.stem('linguistics')

'linguist'

In [None]:
# apply the Porter stemmer to every lowercased token
tokens_stemmed = list(map(ps.stem, tokens_lower))

In [None]:
# Show the first 50 stemmed tokens rather than the whole list.
tokens_stemmed[:50]

In [None]:
# save the normalized tokens (the lowercased list, not the stemmed one),
# one token per line
with open('tokens_normed.txt', mode='w') as token_file:
  token_file.write('\n'.join(tokens_lower))

#### Using BPE tokenizer

In [None]:
# Build a BPE tokenizer and train it on the normalized tokens.
# unk_token is the token emitted for any symbol that is not in the learned vocabulary.
bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Split input into words before BPE is applied. Without a pre-tokenizer the
# space between words is treated as an ordinary symbol, and because the
# training file holds one token per line (no spaces), every space in new
# text is encoded as [UNK].
bpe_tokenizer.pre_tokenizer = Whitespace()
# reserve the special tokens so they get fixed ids in the vocabulary
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
bpe_tokenizer.train(['tokens_normed.txt'], trainer)

In [None]:
# test BPE tokenizer on unseen words
output = bpe_tokenizer.encode('computational linguistics')
# .tokens holds the subword strings the input was split into
output.tokens

['comp', 'ut', 'at', 'ion', 'al', '[UNK]', 'ling', 'u', 'ist', 'ic', 's']

#### Using mGPT tokenizer

In [None]:
# load the pretrained tokenizer published with the THUMT/mGPT checkpoint
# NOTE(review): transformers warns (see output below) that the checkpoint
# declares 'GPT2Tokenizer' while it is loaded here through MT5Tokenizer —
# presumably intentional for this checkpoint; confirm against the model card.
tokenizer = MT5Tokenizer.from_pretrained("THUMT/mGPT")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'T5Tokenizer'.


In [None]:
# encode an English phrase and keep only the list of token ids
input_ids = tokenizer('computational linguistics').input_ids

In [None]:
# the token ids produced for the English phrase
input_ids

[93928, 65662, 259, 151025, 263, 1]

In [None]:
# decode each id on its own to see which piece of text it stands for
[tokenizer.decode(i) for i in input_ids]

['comput', 'ational', '', 'linguistic', 's', '</s>']

In [None]:
# a Chinese sentence written without spaces between words, used to try the
# same tokenizer on non-English input
text = '当我还只有六岁的时候在一本描写原始森林的名叫真实的故事的书中看到了一幅精彩的插画'

In [None]:
# encode the Chinese sentence; this overwrites the ids of the English example
input_ids = tokenizer(text).input_ids

In [None]:
# the token ids produced for the Chinese sentence
input_ids

In [None]:
# decode each id on its own to see how the sentence was segmented
[tokenizer.decode(i) for i in input_ids]

In [None]:
# reload the normalized tokens saved earlier as a single newline-separated string
with open('tokens_normed.txt', mode='r') as token_file:
  tokens_normed = token_file.read()

In [None]:
# NOTE(review): this repeats the earlier encoding of `text`; the
# `tokens_normed` string loaded in the previous cell is never used —
# possibly `tokenizer(tokens_normed)` was intended here. Confirm.
tokenizer(text).input_ids