# POS Tagging using Logistic Regression

## Imports and Initialisations

In [1]:
from collections import Counter
from copy import deepcopy
import matplotlib.pyplot as plt
import numpy as np
import nltk

nltk.download('brown')
nltk.download('universal_tagset')

from nltk.corpus import brown

[nltk_data] Downloading package brown to
[nltk_data]     /Users/arghyabhattacharya/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/arghyabhattacharya/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [2]:
sents = brown.tagged_sents(tagset='universal')
_N = int(len(sents) * (8 / 10))
sents_train = sents[:_N]
sents_test = sents[_N:]

In [3]:
sents[0]

[('The', 'DET'),
 ('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP'),
 ("Atlanta's", 'NOUN'),
 ('recent', 'ADJ'),
 ('primary', 'NOUN'),
 ('election', 'NOUN'),
 ('produced', 'VERB'),
 ('``', '.'),
 ('no', 'DET'),
 ('evidence', 'NOUN'),
 ("''", '.'),
 ('that', 'ADP'),
 ('any', 'DET'),
 ('irregularities', 'NOUN'),
 ('took', 'VERB'),
 ('place', 'NOUN'),
 ('.', '.')]

## One vs Many Logistic Regression

In [4]:
class Multiclass_LR:
    def tag_sent(self, sent):
        sent_tagged = []
        for i, (word, tag) in enumerate(sent):
            x = np.array([int(ex(sent, i)) for k, ex in self.feature_extractors.items()])
            max_tag = max([(word, tag) for tag in self.tags], key=lambda k: np.dot(np.append(x, 1), W[k[1]]))
            sent_tagged.append(max_tag)
        return sent_tagged
        
    def train_model(self):
        for tag in self.tags:
            print('Training for tag', tag)
            self.W[tag] = self.logistic_regression(tag)
    
    def sigmoid(self, a):
        return 1.0 / (1 + np.exp(-a))
        
    def logistic_regression(self, tag, num_steps=300000, learning_rate=5e-5, add_intercept=True):
        X = self.X[tag]
        Y = self.Y[tag]
        if add_intercept:
            intercept = np.ones((X.shape[0], 1))
            X = np.hstack((X, intercept))
            
        w = np.zeros(X.shape[1])
        for step in range(num_steps):
            scores = np.dot(X, w)
            predictions = self.sigmoid(scores)
            output_error_signal = Y - predictions
            gradient = np.dot(X.T, output_error_signal)
            w += learning_rate * gradient
        return w
        
    def loss(self, w, X, Y):
        w = np.array(w).reshape(-1, 1)
        sum = 0
        for i in range(Y.shape[0]):
            x = X[i].reshape(-1, 1)
            wTx = np.matmul(w.T, x)
            sum += Y[i] * wTx - np.log(1 + np.exp(wTx))
        return sum.reshape(())
        
    def get_feature_set(self):
        self.puncs = {'!', "'", "''", '(', ')', ',', '--', '.', ':', ';', '?', '[', ']', '``'}
        self.feature_extractors = {
            'is_upper_first_letter': lambda sent, i: sent[i][0][0].isupper(),
            'is_upper_all': lambda sent, i: sent[i][0].isupper(),
            'is_hyphenated': lambda sent, i: '-' in sent[i][0],
            'is_punctuation': lambda sent, i: sent[i][0] in self.puncs,
            'current_tag': lambda sent, i: self.tags.index(sent[i][1]),
            'previous_tag': lambda sent, i: self.tags.index(sent[i-1][1]) if i != 0 else -1,
        }
        
        for t in range(len(self.tags)):
            self.X[self.tags[t]] = []
            self.Y[self.tags[t]] = []
            for sent in self.sents:
                for i, (word, tag) in enumerate(sent):
                    
                    x = [int(ex(sent, i)) for k, ex in self.feature_extractors.items()]
                    print("this is the x", x)
                    self.X[self.tags[t]].append(x)
                    self.Y[self.tags[t]].append(1 if tag == self.tags[t] else 0)
            self.X[self.tags[t]] = np.array(self.X[self.tags[t]])
            self.Y[self.tags[t]] = np.array(self.Y[self.tags[t]])

    def __init__(self, tagged_sents):
        self.sents = tagged_sents
        self.tags = list(set([tag for sent in self.sents for word, tag in sent]))
        self.X = {}
        self.Y = {}
        self.W = {}
        self.get_feature_set()
        self.train_model()
        pass

In [None]:
T = Multiclass_LR(sents_train[:100])

Training for tag NOUN
Training for tag PRT
Training for tag .


In [222]:
score = 0
total_count = 0
for i, sent in enumerate(sents_test):
    try:
        res = T.tag_sent(sent)
        N = len(sent)
        total_count += N
        for i in range(N):
            score += 1 if res[i][1] == sent[i][1] else 0
    except ValueError:
        pass
            
accuracy = (score / total_count) * 100
print(accuracy)

68.8615061499447
