In [None]:
# Tokenization Examples in Python

## 1. Using NLTK for Tokenization
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the Punkt resources ('punkt' and 'punkt_tab') before tokenizing
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text that the following tokenization examples operate on
text = "Tokenization is a key step in NLP. It splits text into tokens."

# Tokenize at both granularities first: individual words, then whole sentences
word_tokens = word_tokenize(text)
sentence_tokens = sent_tokenize(text)

# Report both results
print("Word Tokens:", word_tokens)
print("Sentence Tokens:", sentence_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...


Word Tokens: ['Tokenization', 'is', 'a', 'key', 'step', 'in', 'NLP', '.', 'It', 'splits', 'text', 'into', 'tokens', '.']
Sentence Tokens: ['Tokenization is a key step in NLP.', 'It splits text into tokens.']


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
## 2. Using spaCy for Tokenization
import spacy

# Load the "en_core_web_sm" English pipeline
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over the sample text defined in the NLTK section
doc = nlp(text)

# Collect the surface text of every token and of every sentence span
word_tokens = [tok.text for tok in doc]
sentence_tokens = [span.text for span in doc.sents]

# Report both results
print("Word Tokens:", word_tokens)
print("Sentence Tokens:", sentence_tokens)

Word Tokens: ['Tokenization', 'is', 'a', 'key', 'step', 'in', 'NLP', '.', 'It', 'splits', 'text', 'into', 'tokens', '.']
Sentence Tokens: ['Tokenization is a key step in NLP.', 'It splits text into tokens.']


In [None]:
## 3. Custom Tokenization using Regular Expressions
import re

# Word tokenization: every maximal run of word characters (\w) is one token,
# so punctuation is dropped rather than emitted as its own token.
word_tokens = re.findall(r'\w+', text)
print("Word Tokens:", word_tokens)

# Sentence tokenization: split on '.', '!' or '?' (the terminator itself is
# discarded). Fragments are stripped BEFORE the emptiness check so that
# whitespace-only pieces (e.g. trailing spaces after the last period, or the
# gaps inside "...") are removed instead of surviving as '' entries.
sentence_tokens = [
    sent.strip()
    for sent in re.split(r'[.!?]', text)
    if sent.strip()
]
print("Sentence Tokens:", sentence_tokens)


Word Tokens: ['Tokenization', 'is', 'a', 'key', 'step', 'in', 'NLP', 'It', 'splits', 'text', 'into', 'tokens']
Sentence Tokens: ['Tokenization is a key step in NLP', 'It splits text into tokens']


In [None]:
## 4. Subword Tokenization using Hugging Face Transformers
from transformers import AutoTokenizer

# Fetch the pretrained tokenizer for the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Split the text into subword pieces, then map each piece to its ID
tokens = tokenizer.tokenize(text)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Report the pieces alongside their IDs
print("Subword Tokens:", tokens)
print("Input IDs:", input_ids)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Subword Tokens: ['token', '##ization', 'is', 'a', 'key', 'step', 'in', 'nl', '##p', '.', 'it', 'splits', 'text', 'into', 'token', '##s', '.']
Input IDs: [19204, 3989, 2003, 1037, 3145, 3357, 1999, 17953, 2361, 1012, 2009, 19584, 3793, 2046, 19204, 2015, 1012]


In [None]:
pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.10.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.1 stanza-1.10.1


In [None]:
import stanza
# Download the Stanford CoreNLP package through Stanza. In the recorded run
# the log shows it being installed into /root/stanza_corenlp.
stanza.install_corenlp()

INFO:stanza:Installing CoreNLP package into /root/stanza_corenlp


Downloading https://huggingface.co/stanfordnlp/CoreNLP/resolve/main/stanford-corenlp-latest.zip:   0%|        …

INFO:stanza:Downloaded file to /root/stanza_corenlp/corenlp.zip


In [None]:
## 5. Using Stanford CoreNLP for Tokenization
from stanza.server import CoreNLPClient
text = "Tokenization is a key step in NLP. It splits text into tokens."
# Start CoreNLP client (the context manager shuts the server down on exit).
# NOTE(review): the recorded run of this cell failed with
# "Timed out waiting for service to come alive" before any annotation ran;
# the cause is not visible here -- confirm Java availability and that the
# default port is free.
with CoreNLPClient(annotators=['tokenize'], timeout=30000, memory='2G') as client:
    # Annotate the text
    ann = client.annotate(text)

    # Word tokens: flatten the tokens of every sentence into one list
    word_tokens = [token.word for sentence in ann.sentence for token in sentence.token]
    print("Word Tokens:", word_tokens)

    # Sentence tokens: `sentence.text` is not populated in the response (a
    # recorded run of this snippet printed ['', '']), so each sentence is
    # recovered by slicing the input between its first token's start offset
    # and its last token's end offset.
    # NOTE(review): assumes CoreNLP character offsets index directly into the
    # Python string -- confirm for text containing non-BMP characters.
    sentence_tokens = [
        text[sentence.token[0].beginChar:sentence.token[-1].endChar]
        for sentence in ann.sentence
        if sentence.token
    ]
    print("Sentence Tokens:", sentence_tokens)

INFO:stanza:Writing properties to tmp file: corenlp_server-89a39ced38a140c8.props
INFO:stanza:Starting server with command: java -Xmx2G -cp /root/stanza_corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-89a39ced38a140c8.props -annotators tokenize -preload -outputFormat serialized


PermanentlyFailedException: Timed out waiting for service to come alive.

In [None]:
## 5. Using Stanford CoreNLP for Tokenization
from stanza.server import CoreNLPClient
text = "Tokenization is a key step in NLP. It splits text into tokens."
# Start CoreNLP client with settings changed from the first attempt:
#   - timeout raised to 60000 ms and memory to 4G
#   - be_quiet=False so the server's own output is shown for debugging
#   - endpoint moved to port 9001
with CoreNLPClient(annotators=['tokenize'], timeout=60000, memory='4G', be_quiet=False, endpoint='http://localhost:9001') as client:
    # Annotate the text
    ann = client.annotate(text)

    # Word tokens: flatten the tokens of every sentence into one list
    word_tokens = [token.word for sentence in ann.sentence for token in sentence.token]
    print("Word Tokens:", word_tokens)

    # Sentence tokens: `sentence.text` is not populated in the response (the
    # recorded run printed ['', '']), so each sentence is recovered by slicing
    # the input between its first token's start offset and its last token's
    # end offset.
    # NOTE(review): assumes CoreNLP character offsets index directly into the
    # Python string -- confirm for text containing non-BMP characters.
    sentence_tokens = [
        text[sentence.token[0].beginChar:sentence.token[-1].endChar]
        for sentence in ann.sentence
        if sentence.token
    ]
    print("Sentence Tokens:", sentence_tokens)

INFO:stanza:Writing properties to tmp file: corenlp_server-c8d538b68fcf41bf.props
INFO:stanza:Starting server with command: java -Xmx4G -cp /root/stanza_corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-c8d538b68fcf41bf.props -annotators tokenize -preload -outputFormat serialized


Word Tokens: ['Tokenization', 'is', 'a', 'key', 'step', 'in', 'NLP', '.', 'It', 'splits', 'text', 'into', 'tokens', '.']
Sentence Tokens: ['', '']


In [None]:
pip install polyglot pyicu pycld2  morfessor


Collecting pyicu
  Downloading PyICU-2.14.tar.gz (263 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.9/263.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pycld2
  Downloading pycld2-0.41.tar.gz (41.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Building wheels for collected packages: pyicu, pycld2
  Building wheel for pyicu (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pyicu: filename=PyICU-2.14-cp311-cp311-linux_x86_64.whl size=1825010 sha256=b74dfd68a46a306165ae61e642eda6fcf1a81fc39f6ed4

In [None]:
## 6. Tokenization using Polyglot
from polyglot.text import Text

# Wrap the sample text in a Polyglot Text object
polyglot_text = Text(text)

# Read the word-level and sentence-level views off the Text object
word_tokens = polyglot_text.words
sentence_tokens = polyglot_text.sentences

# Report both results
print("Word Tokens:", word_tokens)
print("Sentence Tokens:", sentence_tokens)

ModuleNotFoundError: No module named 'icu'

In [None]:
from gensim.models import Word2Vec

# Example corpus: four short, already-tokenized sentences
sentences = [
    ["dog", "barks", "at", "the", "cat"],
    ["cat", "meows", "loudly"],
    ["birds", "chirp", "in", "the", "morning"],
    ["dogs", "and", "cats", "are", "pets"],
]

# Train Word2Vec model.
# The seed is spelled out and training is limited to a single worker thread so
# that re-running the cell gives the same vectors; with several workers the
# thread scheduling makes results vary from run to run.
# NOTE(review): gensim documents that reproducibility across interpreter
# launches additionally depends on PYTHONHASHSEED -- confirm if that matters.
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every word; most occur only once in this corpus
    seed=1,
    workers=1,
)

# Find words contextually similar to "cat"
# NOTE(review): with a corpus this small the similarity scores are unlikely to
# be meaningful; treat the output as an API demonstration only.
similar_words = model.wv.most_similar("cat", topn=5)
print("Words contextually similar to 'cat':", similar_words)


Words contextually similar to 'cat': [('chirp', 0.21617142856121063), ('meows', 0.0931011214852333), ('morning', 0.09291722625494003), ('loudly', 0.07963486760854721), ('birds', 0.06285078823566437)]


In [None]:
from sklearn.cluster import KMeans

# Extract word vectors: one vector per vocabulary word, in vocabulary order
word_vectors = model.wv
vocab = list(word_vectors.index_to_key)
vectors = [word_vectors[word] for word in vocab]

# Apply KMeans.
# The cluster count is capped at the vocabulary size because KMeans cannot fit
# more clusters than samples. random_state fixes the centroid initialisation
# so the grouping is repeatable, and n_init is stated explicitly because its
# default differs between scikit-learn versions.
num_clusters = min(5, len(vocab))
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(vectors)

# Group words by clusters: labels_ is aligned with `vectors`, hence with `vocab`
clustered_words = {i: [] for i in range(num_clusters)}
for word, cluster_id in zip(vocab, kmeans.labels_):
    # Cast the NumPy integer label to a plain int to match the dict keys
    clustered_words[int(cluster_id)].append(word)

# Display clusters
for cluster_id, words in clustered_words.items():
    print(f"Cluster {cluster_id}: {words}")


Cluster 0: ['pets', 'cats', 'dogs', 'in']
Cluster 1: ['and', 'morning', 'birds', 'loudly']
Cluster 2: ['the', 'are', 'at', 'barks']
Cluster 3: ['cat', 'chirp', 'dog']
Cluster 4: ['meows']
