**Stanford POS Tagger**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import StanfordPOSTagger
#nltk.download('punkt')

import numpy as np

In [None]:
class POSTagEmbedder:
    """Tag tokens with the Stanford POS tagger and map each tag to a vector.

    The embedding table is a plain dict from tag string to a 1-D NumPy
    array of length ``embedding_dim``. It starts out empty and is filled
    by :meth:`initialize_embeddings`.
    """

    def __init__(self, path_POS_model, path_jar, embedding_dim: int = 50):
        """Create the underlying tagger and an empty embedding table.

        Args:
            path_POS_model: Path to the Stanford tagger model file.
            path_jar: Path to the Stanford POS tagger jar.
            embedding_dim: Length of every embedding vector.
        """
        self.stanford_tagger = StanfordPOSTagger(
            path_POS_model, path_jar, encoding='utf-8'
        )
        self.embedding_dim = embedding_dim
        # Start with an empty table so embedding_lookup() is usable (and
        # returns zero vectors) even before initialize_embeddings() is
        # called, instead of failing with AttributeError.
        self.embeddings = {}

    def tokenize_text(self, text: str):
        """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def tag_tokens_pos(self, tokens):
        """Run the Stanford tagger over ``tokens`` and return its output.

        The other methods of this class consume the result as an iterable
        of ``(token, tag)`` pairs.
        """
        return self.stanford_tagger.tag(tokens)

    def unique_tags(self, tags):
        """Return the set of distinct tags found in ``(token, tag)`` pairs."""
        return {tag for _, tag in tags}

    def initialize_embeddings(self, unique_tags):
        """Replace the embedding table with fresh random vectors.

        Each tag in ``unique_tags`` gets a vector of ``embedding_dim``
        values drawn from ``np.random.rand`` (uniform on [0, 1)). Any
        previously stored embeddings are discarded.
        """
        self.embeddings = {
            tag: np.random.rand(self.embedding_dim) for tag in unique_tags
        }

    def embedding_lookup(self, pos_tags):
        """Return one embedding vector per ``(token, tag)`` pair, in order.

        Tags that are missing from the embedding table are represented by
        a zero vector of length ``embedding_dim``. Known tags return the
        stored array itself (not a copy).
        """
        embedding_tags = []
        for _, tag in pos_tags:
            vector = self.embeddings.get(tag)
            if vector is None:
                # Allocate the fallback only on a miss; every unknown tag
                # gets its own array so callers never share a buffer.
                vector = np.zeros(self.embedding_dim)
            embedding_tags.append(vector)
        return embedding_tags

    def optimize_embeddings(self):
        """Placeholder for training the embeddings; currently a no-op."""
        pass  # implement later

In [None]:
# Relative paths to the Stanford POS tagger jar and the English
# bidirectional distsim model inside the 2020-11-17 distribution folder.
path_to_jar = "../stanford-postagger-full-2020-11-17/stanford-postagger.jar"
path_to_model = "../stanford-postagger-full-2020-11-17/models/english-bidirectional-distsim.tagger"

# Note the argument order: model path first, then jar path.
# embedding_dim is not passed, so the constructor default is used.
pos_tagger = POSTagEmbedder(path_to_model, path_to_jar)

In [None]:
# End-to-end demo on one sentence: tokenize, tag, then print the tagger output.
example = "The cat is grey"
example_tokens = pos_tagger.tokenize_text(example)
example_tags = pos_tagger.tag_tokens_pos(example_tokens)
print(example_tags)

# Build the embedding table from only the tags seen in this sentence;
# each tag gets a fresh random vector.
example_unique_tags = pos_tagger.unique_tags(example_tags)
pos_tagger.initialize_embeddings(example_unique_tags)

# One embedding vector per token, in token order.
example_embedded_tags = pos_tagger.embedding_lookup(example_tags)
print(example_embedded_tags)
