# POS tag using Logistic Regression

## Loading word embeddings
First we load the pretrained GloVe word embeddings trained on twitter data.

In [1]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

import re
import string
import numpy as np
import os.path

# Create a directory 'pretrained_embeds/' in the same directory as this notebook
# Download twitter embeddings from http://nlp.stanford.edu/data/glove.twitter.27B.zip
# Unzip it and place file 'glove.twitter.27B.25d.txt' in 'pretrained_embeds/' directory.

# We are doing it with 25 dimensional word embeddings, however we can try doing with more 
# dimensional embeddings available.

# If glove embeds is not in word2vec form then first convert it then load it
if os.path.isfile('pretrained_embeds/gensim_glove_vectors.txt'):
    glove_model = KeyedVectors.load_word2vec_format("pretrained_embeds/gensim_glove_vectors.txt", binary=False)
else:
    glove2word2vec(glove_input_file="pretrained_embeds/glove.twitter.27B.25d.txt", word2vec_output_file="pretrained_embeds/gensim_glove_vectors.txt")
    glove_model = KeyedVectors.load_word2vec_format("pretrained_embeds/gensim_glove_vectors.txt", binary=False)

def get_embed(word):
    # Case folding
    word = word.lower()
    try:
        return (glove_model.get_vector(word))
    except:
        return (glove_model.get_vector('unk'))


## Creating dataset

Loading data using nltk (we are using brown corpus) and splitting data in train and test.

In [2]:
from nltk.corpus import brown

tagged_sents = brown.tagged_sents(tagset='universal')

# Splitting train and test(80:20)
train_len = int(len(tagged_sents) * 0.8)
train_sents = tagged_sents[:train_len]
test_sents = tagged_sents[train_len:]

brown_tags_words = []
train_tags = []
train_words = []
train_embeds = []

test_tags = []
test_words = []
test_embeds = []

# Create Train data
for sent in train_sents:
    brown_tags_words.extend([ (tag, word) for (word, tag) in sent ])

# Seperate out tag and word sequences
for (tag, word) in brown_tags_words:
    train_tags.append(tag)
    train_words.append(word)
    train_embeds.append(get_embed(word))
    
brown_tags_words = []
# Create Test data
for sent in test_sents:
    brown_tags_words.extend([ (tag, word) for (word, tag) in sent ])

# Seperate out tag and word sequences
for (tag, word) in brown_tags_words:
    test_tags.append(tag)
    test_words.append(word)
    test_embeds.append(get_embed(word))

# Adding bias at the end of each embedding
train_embeds = np.asarray(train_embeds)
temp = np.ones((train_embeds.shape[0], train_embeds.shape[1] + 1))
temp[:,:-1] = train_embeds
train_embeds = temp

## Logistic Regression 

We will use one v/s all Logistic Regression as this is multiclass classification.

In [3]:
class logisitic_regression:
    
    def __init__(self, tag, inp, out):
        self.tag = tag
        self.inp = inp
        self.out = out
        self.weights = np.zeros((train_embeds.shape[1], 1))
        self.iterations = 300
        self.lr = 0.99
        
        # Consider 1 for which tag model is being trained, others will be considered as 0
        self.out = np.asarray([1 if i == self.tag else 0 for i in self.out]).reshape(-1,1)
        
        # train method will update weights.
        self.train(self.inp, self.out, self.weights)
        
        
    def sigmoid(self, u):
        return (1.0 / (1.0 + np.exp((-1.0) * u)))


    def hypothesis(self, w, X):
        return self.sigmoid(np.matmul(X, w))
        
        
    def cost(self, X, Y, w):
        h = self.hypothesis(w, X)
        cost = (-1) * (np.matmul(Y.T, np.log(h)) + np.matmul(1 - Y.T, np.log(1 - h))) / X.shape[0]
        return cost
    
    
    def gradient_descent(self, X, Y, w, lr, iterations):
        loss_list = []
        for i in range(iterations):
            loss = self.cost(X, Y, w)
            loss_list.append(loss)
            gradient = np.matmul(X.T, (self.hypothesis(w, X) - Y)) / X.shape[0]
            w = w - lr * gradient
        return loss_list, w
        
        
    def train(self, inp, out, w):
        losses, trained_w = self.gradient_descent(inp, out, w, self.lr, self.iterations)
        self.weights = trained_w
        print("Trained for : " + self.tag)
        
    # We will use this to get score of a given word after training.   
    def get_score(self, X):
        return self.hypothesis(self.weights, X)



## Training

For each label we train seperate LR models.

In [4]:
possible_tags = set(train_tags)
models = {}

# We train models for all the possible labels we have seen in train data.
# If there are N unique labels in train data, we will have N models to be trained.
for t in possible_tags:
    models[t] = logisitic_regression(t, train_embeds, train_tags)
    

Trained for : ADP
Trained for : PRT
Trained for : NOUN
Trained for : DET
Trained for : X
Trained for : ADV
Trained for : VERB
Trained for : PRON
Trained for : NUM
Trained for : CONJ
Trained for : ADJ
Trained for : .


## Testing

We test the trained model on the data split we had done before and find the accuracy.

In [5]:
# Function returns a tag for a given word. It calculates score for each tag and returns tag with maximum score.
def pred_tag(word):
    scores = {}
    for k, v in models.items():
        scores[k] = v.get_score(np.asarray([np.append(get_embed(word), 1)]))
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[0][0]

# Predict tags for each word
pred_tags = [pred_tag(test_word) for test_word in test_words]
# Find the accuracy of predicted tags
accuracy_list = [1 if pred_tags[i] == test_tags[i] else 0 for i in range(len(pred_tags))]
print('Accuracy on test set : ' + "{0:.2f}".format(sum(accuracy_list)/len(accuracy_list)*100) + ' %')


Accuracy on test set : 73.23 %


## Sentence POS Prediction

Predicting POS for a given sentence.

In [11]:
import nltk

def pred_tag_sequence(sentence):
    for w in nltk.word_tokenize(sentence):
        print('{:<15s}'.format(w) + '{:<10s}'.format(pred_tag(w)))

pred_tag_sequence("Mrs Miller wants the entire house repainted.")

Mrs            VERB      
Miller         NOUN      
wants          VERB      
the            DET       
entire         DET       
house          NOUN      
repainted      VERB      
.              .         


# Future Work

We can trying predicting POS given some context as well, so by using bigram or higher n-gram. We can concatenate the embedding vectors or even try averaging embedding vectors of surrounding words(a window around the center word of whose POS we are trying to predict).