**Stanford POS Tagger**

- This module re-uses Stanford's Part-of-Speech Tagger [Link](https://nlp.stanford.edu/software/tagger.shtml).
- A Java jar is provided together with the tagger model; they are used to generate the part-of-speech tag corresponding to each token in a sentence.

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import StanfordPOSTagger
#nltk.download('punkt')

import numpy as np

In [2]:
class POSTagEmbedder:
    """Tag sentences with the Stanford POS tagger and map each tag to a vector.

    Each distinct tag is assigned a vector from ``np.random.rand`` the first
    time it is requested; that vector is cached in ``posEmbeddings`` so later
    requests for the same tag return the very same array object.
    """

    def __init__(self, pathPOSmodel, pathJPOSar, dimension=50):
        """Create the tagger and start with an empty tag-to-vector cache.

        Args:
            pathPOSmodel: Path to the tagger model file.
            pathJPOSar: Path to the Stanford POS tagger jar.
            dimension: Length of every tag embedding vector.
        """
        self.posEmbeddings = {}
        self.dimension = dimension
        self.tagger = StanfordPOSTagger(
            pathPOSmodel,
            pathJPOSar,
            encoding='utf-8',
        )

    def tagSentence(self, sentence):
        """Tokenize ``sentence`` and return the tagger's output for the tokens."""
        return self.tagger.tag(word_tokenize(sentence))

    def getEmbedding(self, pos_tag):
        """Return the cached vector for ``pos_tag``, creating it on first use."""
        cache = self.posEmbeddings
        try:
            return cache[pos_tag]
        except KeyError:
            # First request for this tag: draw a new vector and remember it.
            vector = np.random.rand(self.dimension)
            cache[pos_tag] = vector
            return vector

    def embedSentence(self, sentence):
        """Tag ``sentence`` and look up one embedding per tagged token.

        Returns:
            A pair ``(tagged, vectors)``: the tagger output and a list holding
            the embedding of each entry's tag, in the same order.
        """
        tagged = self.tagSentence(sentence)
        vectors = []
        for _token, tag in tagged:
            vectors.append(self.getEmbedding(tag))
        return tagged, vectors

In [3]:
# Relative paths to the tagger model and jar inside the Stanford POS tagger distribution.
pathToModel = "../stanford-postagger-full-2020-11-17/models/english-bidirectional-distsim.tagger"
pathToJar = "../stanford-postagger-full-2020-11-17/stanford-postagger.jar"

# Keyword arguments make the model/jar order explicit at the call site.
posTagger = POSTagEmbedder(pathPOSmodel=pathToModel, pathJPOSar=pathToJar)

In [4]:
# Tag a sample sentence and fetch one embedding per token's tag.
example = "The cat is sad due to rain"
posTags, tagEmbeddings = posTagger.embedSentence(example)

# Print the tagger output, then the embedding list, each on its own line.
print(posTags, tagEmbeddings, sep="\n")

[('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('sad', 'JJ'), ('due', 'IN'), ('to', 'IN'), ('rain', 'NN')]
[array([0.62350653, 0.49142007, 0.28382648, 0.42488843, 0.52547651,
       0.11839107, 0.64939801, 0.24312467, 0.58772552, 0.90246642,
       0.94267591, 0.49687355, 0.83961333, 0.89259469, 0.32418369,
       0.77324447, 0.46254004, 0.07644416, 0.2814345 , 0.52109409,
       0.02358542, 0.14972748, 0.16439624, 0.77817287, 0.6612806 ,
       0.59720241, 0.95396548, 0.39849196, 0.82673474, 0.7092472 ,
       0.83497306, 0.7251456 , 0.18940127, 0.7848052 , 0.97499977,
       0.22448261, 0.36080076, 0.04087741, 0.41268394, 0.96959068,
       0.06704056, 0.73576081, 0.97487543, 0.53728036, 0.17981098,
       0.45863861, 0.4824587 , 0.71526599, 0.69348503, 0.66626118]), array([0.58141075, 0.61902197, 0.27277961, 0.23953159, 0.33621592,
       0.54364576, 0.47304658, 0.86061852, 0.97098178, 0.92195853,
       0.18099482, 0.41250385, 0.65994024, 0.77107188, 0.52930002,
       0.44933094, 0