# Tokenization (Sentence / Word) - nltk

In [None]:
!pip install nltk



In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# 1. Sentence Tokenization

In [None]:
import nltk

def sentence_tokenize(text):
    """Split ``text`` into sentences.

    Delegates to ``nltk.sent_tokenize`` and returns its result unchanged
    (a list of sentence strings in the example shown in this notebook).
    """
    sentences = nltk.sent_tokenize(text)
    return sentences

text = "Hello! How are you? I hope you're doing well. This is a tokenization example."
sentences = sentence_tokenize(text)

print("Sentences:", sentences)


Sentences: ['Hello!', 'How are you?', "I hope you're doing well.", 'This is a tokenization example.']


# 2. Word Tokenization

In [None]:
import nltk

def word_tokenize(text):
    """Split ``text`` into word and punctuation tokens.

    Delegates to ``nltk.word_tokenize`` and returns its result unchanged.
    """
    tokens = nltk.word_tokenize(text)
    return tokens

text = "Hello! How are you? I hope you're doing well. This is a tokenization example."
words = word_tokenize(text)

print("Words:", words)


Words: ['Hello', '!', 'How', 'are', 'you', '?', 'I', 'hope', 'you', "'re", 'doing', 'well', '.', 'This', 'is', 'a', 'tokenization', 'example', '.']


# 3. Custom Tokenization

In [None]:
from nltk.tokenize import RegexpTokenizer

# Custom word tokenization using a regular expression (keeps runs of word characters, dropping punctuation)
tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(text)

print("Custom Words:", words)


Custom Words: ['Hello', 'How', 'are', 'you', 'I', 'hope', 'you', 're', 'doing', 'well', 'This', 'is', 'a', 'tokenization', 'example']


# 4. Removing Stopwords using NLTK

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    """Tokenize ``text`` and drop English stopwords.

    Each token is compared in lowercase against NLTK's English stopword
    list; tokens that are not in the list (including punctuation) are kept
    with their original casing and order.

    Returns:
        list: the surviving tokens.
    """
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stopwords:
            continue  # stopword: skip it
        kept_tokens.append(token)
    return kept_tokens

# Example usage
text = "This is an example of removing stopwords from a given sentence."
filtered_text = remove_stopwords(text)

print("Original Text:", text)
print("Filtered Text (Without Stopwords):", filtered_text)


Original Text: This is an example of removing stopwords from a given sentence.
Filtered Text (Without Stopwords): ['example', 'removing', 'stopwords', 'given', 'sentence', '.']


In [None]:
# Asking for a language that is missing from the installed stopwords corpus
# raises OSError, so check the available file ids before loading the list.
if 'hindi' in stopwords.fileids():
    print(stopwords.words('hindi'))
else:
    print("No 'hindi' stopword list in the installed NLTK corpus. Available languages:", stopwords.fileids())

OSError: No such file or directory: '/root/nltk_data/corpora/stopwords/hindi'

# 5.1 Stemming using Porter Stemmer

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def stem_words(text):
    """Tokenize ``text`` and return the Porter stem of every token.

    Returns:
        list: one stemmed string per token, in token order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, word_tokenize(text)))

# Example usage
text = "I am loving the way you are running and playing outside."
stemmed_text = stem_words(text)

print("Original Text:", text)
print("Stemmed Text:", stemmed_text)


Original Text: I am loving the way you are running and playing outside.
Stemmed Text: ['i', 'am', 'love', 'the', 'way', 'you', 'are', 'run', 'and', 'play', 'outsid', '.']


# 5.2 Stemming using Lancaster Stemmer

In [None]:
from nltk.stem import LancasterStemmer

def stem_words_lancaster(text):
    """Tokenize ``text`` and return the Lancaster stem of every token.

    Returns:
        list: one stemmed string per token, in token order.
    """
    lancaster = LancasterStemmer()
    stems = []
    for token in word_tokenize(text):
        stems.append(lancaster.stem(token))
    return stems

# Example usage
stemmed_text_lancaster = stem_words_lancaster(text)

print("Lancaster Stemmed Text:", stemmed_text_lancaster)


Lancaster Stemmed Text: ['i', 'am', 'lov', 'the', 'way', 'you', 'ar', 'run', 'and', 'play', 'outsid', '.']


# 6. Lemmatization Using WordNet Lemmatizer

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # For additional languages
nltk.download('punkt')  # For tokenization


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def lemmatize_words(text):
    """Tokenize ``text`` and lemmatize every token with WordNet.

    ``lemmatize`` is called without a part-of-speech argument, so each
    token is handled with the lemmatizer's default setting.

    Returns:
        list: one lemma per token, in token order.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

# Example usage
text = "The striped bats are hanging on their feet for best"
lemmatized_text = lemmatize_words(text)

print("Original Text:", text)
print("Lemmatized Text:", lemmatized_text)


Original Text: The striped bats are hanging on their feet for best
Lemmatized Text: ['The', 'striped', 'bat', 'are', 'hanging', 'on', 'their', 'foot', 'for', 'best']


# 7. Removing Digits

In [None]:
import re

def remove_digits(text):
    """Return ``text`` with every run of digits deleted.

    Only the digits are removed; surrounding whitespace is left as is, so
    a number between two spaces leaves a double space behind.
    """
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

# Example usage
text_with_digits = "I have 2 cats and 3 dogs in 2024."
cleaned_text = remove_digits(text_with_digits)

print("Original Text:", text_with_digits)
print("Text Without Digits:", cleaned_text)


Original Text: I have 2 cats and 3 dogs in 2024.
Text Without Digits: I have  cats and  dogs in .


# 8. Convert to Lowercase

In [None]:
def convert_to_lowercase(text):
    """Return a lowercase copy of ``text`` (via its ``lower()`` method)."""
    lowered = text.lower()
    return lowered

# Example usage
original_text = "Hello, World! This Is A Test String."
lowercase_text = convert_to_lowercase(original_text)

print("Original Text:", original_text)
print("Lowercase Text:", lowercase_text)


Original Text: Hello, World! This Is A Test String.
Lowercase Text: hello, world! this is a test string.


# 9. Language Detection

In [None]:
! pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/981.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m15.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993221 sha256=1601942a45af8f56d647a58b34c93acce2d68cbb641c9e064610ecc053fa5921
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711

In [None]:
from langdetect import detect, DetectorFactory

# Set a seed for reproducibility
DetectorFactory.seed = 0

def detect_language(text):
    """Detect the language of ``text`` with ``langdetect.detect``.

    Returns whatever ``detect`` returns; in this notebook's examples that
    is a short language code such as 'en', 'es' or 'fr'.
    """
    language_code = detect(text)
    return language_code

# Example usage
text_english = "Hello, how are you?"
text_spanish = "Hola, ¿cómo estás?"
text_french = "Bonjour, comment ça va?"

print(f"Text: '{text_english}' - Detected Language: {detect_language(text_english)}")
print(f"Text: '{text_spanish}' - Detected Language: {detect_language(text_spanish)}")
print(f"Text: '{text_french}' - Detected Language: {detect_language(text_french)}")


Text: 'Hello, how are you?' - Detected Language: en
Text: 'Hola, ¿cómo estás?' - Detected Language: es
Text: 'Bonjour, comment ça va?' - Detected Language: fr


# 10. POS Tagging
## Common POS Tags
Here are some common POS tags used by NLTK:

- NN: Noun, singular
- NNS: Noun, plural
- JJ: Adjective
- DT: Determiner
- VB: Verb, base form
- VBD: Verb, past tense
- VBG: Verb, gerund/present participle
- VBZ: Verb, 3rd person singular present
- IN: Preposition or subordinating conjunction

In [None]:
import nltk
nltk.download('punkt')  # For tokenization
nltk.download('averaged_perceptron_tagger')  # For POS tagging

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
import nltk
from nltk.tokenize import word_tokenize

def pos_tagging(text):
    """Tokenize ``text`` and tag each token with its part of speech.

    Returns the result of ``nltk.pos_tag`` on the tokens; in this
    notebook's example that is a list of ``(token, tag)`` tuples.
    """
    return nltk.pos_tag(word_tokenize(text))

# Example usage
text = "The quick brown fox jumps over the lazy dog."
pos_tags = pos_tagging(text)

print("Original Text:", text)
print("POS Tags:", pos_tags)


Original Text: The quick brown fox jumps over the lazy dog.
POS Tags: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


# 11. Parsing

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import spacy

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

def dependency_parsing(text):
    """Print the dependency relation of every token in ``text``.

    Runs the module-level spaCy pipeline ``nlp`` on ``text`` and prints one
    line per token showing the token, its head token and the dependency
    label. Nothing is returned.
    """
    for token in nlp(text):
        print(f"Word: {token.text}, Dependent on: {token.head.text}, Dependency: {token.dep_}")

# Example usage
text = "The quick brown fox jumps over the lazy dog."
dependency_parsing(text)


Word: The, Dependent on: fox, Dependency: det
Word: quick, Dependent on: fox, Dependency: amod
Word: brown, Dependent on: fox, Dependency: amod
Word: fox, Dependent on: jumps, Dependency: nsubj
Word: jumps, Dependent on: jumps, Dependency: ROOT
Word: over, Dependent on: jumps, Dependency: prep
Word: the, Dependent on: dog, Dependency: det
Word: lazy, Dependent on: dog, Dependency: amod
Word: dog, Dependent on: over, Dependency: pobj
Word: ., Dependent on: jumps, Dependency: punct


# 12. Coreference Resolution

Coreference resolution is the task of determining when two or more expressions in a text refer to the same entity. For example, in the sentences "John went to the store. He bought milk," the word "He" refers to "John." Coreference resolution is important in natural language understanding, as it helps maintain context and coherence.

## Using AllenNLP for Coreference Resolution
NLTK does not have built-in support for coreference resolution. The spaCy library offers this capability when combined with the neuralcoref package, but the example below instead uses AllenNLP's pretrained SpanBERT coreference model.

In [None]:
!pip install allennlp allennlp-models spacy
!python -m spacy download en_core_web_sm


Collecting allennlp
  Downloading allennlp-2.10.1-py3-none-any.whl.metadata (21 kB)
Collecting allennlp-models
  Downloading allennlp_models-2.10.1-py3-none-any.whl.metadata (23 kB)
Collecting torch<1.13.0,>=1.10.0 (from allennlp)
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl.metadata (22 kB)
Collecting torchvision<0.14.0,>=0.8.1 (from allennlp)
  Downloading torchvision-0.13.1-cp310-cp310-manylinux1_x86_64.whl.metadata (10 kB)
Collecting cached-path<1.2.0,>=1.1.3 (from allennlp)
  Downloading cached_path-1.1.6-py3-none-any.whl.metadata (6.0 kB)
Collecting fairscale==0.4.6 (from allennlp)
  Downloading fairscale-0.4.6.tar.gz (248 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.2/248.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l

2024-09-19 08:01:17.624306: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-19 08:01:17.972924: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-19 08:01:18.066158: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-19 08:01:18.638407: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.3.0
  Downloading https:

In [None]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
import spacy

# Load the AllenNLP Coreference Resolution model
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz")

def coreference_resolution(text):
    """Replace later mentions of an entity with that entity's first mention.

    Runs the module-level AllenNLP ``predictor`` on ``text`` and reads two
    entries from its result: ``'document'`` (the list of tokens) and
    ``'clusters'`` (one list per entity of inclusive ``[start, end]`` token
    spans, the first span being the first mention).

    Every non-first mention is replaced, as a whole, by the complete token
    span of its cluster's first mention. Substituting per token (as an
    earlier version did) truncates multi-word antecedents and repeats the
    antecedent once for every token of a multi-word mention.

    Args:
        text: Raw input string passed to ``predictor.predict``.

    Returns:
        str: the resolved tokens joined with single spaces (so punctuation
        ends up separated from the preceding word by a space).
    """
    result = predictor.predict(document=text)
    words = result['document']
    clusters = result['clusters']

    # Map the start index of each mention to replace -> (end index, antecedent tokens).
    replacements = {}
    for cluster in clusters:
        if not cluster:
            continue
        head_start, head_end = cluster[0]
        head_tokens = words[head_start:head_end + 1]
        for start, end in cluster[1:]:
            # If two mentions start on the same token, keep the longer span.
            if start not in replacements or end > replacements[start][0]:
                replacements[start] = (end, head_tokens)

    # Rebuild the token sequence left to right. Jumping past a replaced span
    # means mentions nested inside it are not substituted a second time.
    resolved = []
    index = 0
    while index < len(words):
        if index in replacements:
            end, head_tokens = replacements[index]
            resolved.extend(head_tokens)
            index = end + 1
        else:
            resolved.append(words[index])
            index += 1

    return ' '.join(resolved)

# Example usage
text = "Alice went to the park. She had a great time there."
resolved_text = coreference_resolution(text)

print("Original Text:", text)
print("Resolved Text:", resolved_text)
