<a href="https://colab.research.google.com/github/martinthetechie/nlp-guide/blob/main/pos_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
quote = "Happiness can be found, even in the darkest of times, if one only remembers to turn on the light."


<h2>NLTK Demo</h2>

In [8]:
import nltk
from nltk.tag import RegexpTagger
from nltk.tokenize import word_tokenize

# Define patterns for rule-based tagging
patterns = [
    (r'^Happiness$', 'NOUN'),
    (r'^can$', 'VERB'),
    (r'^be$', 'VERB'),
    (r'^found$', 'VERB'),
    (r'^even$', 'ADV'),
    (r'^in$', 'ADP'),
    (r'^the$', 'DET'),
    (r'^darkest$', 'ADJ'),
    (r'^of$', 'ADP'),
    (r'^times$', 'NOUN'),
    (r'^if$', 'SCONJ'),
    (r'^one$', 'PRON'),
    (r'^only$', 'ADV'),
    (r'^remembers$', 'VERB'),
    (r'^to$', 'PART'),
    (r'^turn$', 'VERB'),
    (r'^on$', 'ADP'),
    (r'^light$', 'NOUN'),
    (r'^,$', 'PUNC'),
    (r'^.$', 'PUNC'),
]

# Create a RegexpTagger with the defined patterns
tagger = RegexpTagger(patterns)

# Tokenize and apply rule-based tagging
tokens = word_tokenize(quote)
rule_based_tags = tagger.tag(tokens)

word_dict = {word: tag for word, tag in rule_based_tags}
print(word_dict)

# {'Happiness': 'NOUN', 'can': 'VERB', 'be': 'VERB', 'found': 'VERB', ',': 'PUNC', 'even': 'ADV', 'in': 'ADP', 'the': 'DET', 'darkest': 'ADJ', 'of': 'ADP', 'times': 'NOUN', 'if': 'SCONJ', 'one': 'PRON', 'only': 'ADV', 'remembers': 'VERB', 'to': 'PART', 'turn': 'VERB', 'on': 'ADP', 'light': 'NOUN', '.': 'PUNC'}

{'Happiness': 'NOUN', 'can': 'VERB', 'be': 'VERB', 'found': 'VERB', ',': 'PUNC', 'even': 'ADV', 'in': 'ADP', 'the': 'DET', 'darkest': 'ADJ', 'of': 'ADP', 'times': 'NOUN', 'if': 'SCONJ', 'one': 'PRON', 'only': 'ADV', 'remembers': 'VERB', 'to': 'PART', 'turn': 'VERB', 'on': 'ADP', 'light': 'NOUN', '.': 'PUNC'}


In [13]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# nltk.download('averaged_perceptron_tagger')

# Tokenize the sentence
tokens = word_tokenize(quote)

# Apply statistical POS tagging
statistical_tags = pos_tag(tokens)

# Print the statistical POS tags
word_dict = {word: tag for word, tag in rule_based_tags}
print(word_dict)

# {'Happiness': 'NOUN', 'can': 'VERB', 'be': 'VERB', 'found': 'VERB', ',': 'PUNC', 'even': 'ADV', 'in': 'ADP', 'the': 'DET', 'darkest': 'ADJ', 'of': 'ADP', 'times': 'NOUN', 'if': 'SCONJ', 'one': 'PRON', 'only': 'ADV', 'remembers': 'VERB', 'to': 'PART', 'turn': 'VERB', 'on': 'ADP', 'light': 'NOUN', '.': 'PUNC'}


{'Happiness': 'NOUN', 'can': 'VERB', 'be': 'VERB', 'found': 'VERB', ',': 'PUNC', 'even': 'ADV', 'in': 'ADP', 'the': 'DET', 'darkest': 'ADJ', 'of': 'ADP', 'times': 'NOUN', 'if': 'SCONJ', 'one': 'PRON', 'only': 'ADV', 'remembers': 'VERB', 'to': 'PART', 'turn': 'VERB', 'on': 'ADP', 'light': 'NOUN', '.': 'PUNC'}


<h2>Spacy Demo</h2>

In [23]:
import spacy
import pprint

# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

# Example sentence
sentence = "Happiness can be found, even in the darkest of times, if one only remembers to turn on the light."

# Process the sentence
doc = nlp(sentence)

print(" | ".join(f"{token.text}: {token.pos_} ({token.tag_})" for token in doc))

Happiness: NOUN (NN) | can: AUX (MD) | be: AUX (VB) | found: VERB (VBN) | ,: PUNCT (,) | even: ADV (RB) | in: ADP (IN) | the: DET (DT) | darkest: NOUN (NN) | of: ADP (IN) | times: NOUN (NNS) | ,: PUNCT (,) | if: SCONJ (IN) | one: NUM (CD) | only: ADV (RB) | remembers: VERB (VBZ) | to: PART (TO) | turn: VERB (VB) | on: ADP (IN) | the: DET (DT) | light: NOUN (NN) | .: PUNCT (.)


<h2>Bert Based</h2>

In [None]:
!pip install torch transfomers

In [29]:
import torch
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline

# Load a pre-trained BERT model for token classification (fine-tuned for POS tagging)
tokenizer = BertTokenizer.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
model = BertForTokenClassification.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")

# Create a pipeline for token classification (POS tagging)
nlp = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example sentence
sentence = "Happiness can be found, even in the darkest of times, if one only remembers to turn on the light."

# Perform POS tagging
bert_tags = nlp(sentence)

# Perform POS tagging and store in the dictionary
pos_dict = {word['word']: word['entity_group'] for word in bert_tags}

pos_sentence = ', '.join([f"{word}: {tag}" for word, tag in pos_dict.items()])
print(f"POS Tags: {pos_sentence}")
# POS Tags: Happiness: NOUN, can: AUX, be: AUX, found: VERB, , : PUNCT, even: ADV, in: ADP, the: DET, darkest: ADJ, of: ADP, times: NOUN, , : PUNCT, if: SCONJ, one: PRON, only: ADV, remembers: VERB, to: PART, turn: VERB, on: ADP, the: DET, light: NOUN, . : PUNCT


Some weights of the model checkpoint at vblagoje/bert-english-uncased-finetuned-pos were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


POS Tags: happiness: NOUN, can be: AUX, found: VERB, ,: PUNCT, even: ADV, in: ADP, the: DET, darkest: ADJ, of: ADP, times: NOUN, if: SCONJ, one: PRON, only: ADV, remembers: VERB, to: PART, turn: VERB, on: ADP, light: NOUN, .: PUNCT
