In [38]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import random

In [17]:
from nltk.corpus import brown
from nltk.tokenize import word_tokenize

In [4]:
words = []
word_id = 0
word2id = {}

vectors = []

with open('./glove.6B.300d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2id[word] = word_id
        word_id += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)

In [5]:
glove = {w: vectors[word2id[w]] for w in words}

In [6]:
tagged_corpus = brown.tagged_words(tagset="universal")

In [7]:
X = []
y = []

pos2num = {}
num2pos = {}

count = 0

for tagged in tagged_corpus:
    word = tagged[0].lower()
    if word in glove:
        X.append(np.asarray(glove[word]))
    else:
        X.append(np.asarray(glove['unk']))
    pos = tagged[1]
    
    if pos not in pos2num:
        pos2num[pos] = count
        num2pos[count] = pos
        count += 1
        
    y.append(pos2num[pos])

In [8]:
X[:5]

[array([  4.65600000e-02,   2.13180000e-01,  -7.43640000e-03,
         -4.58540000e-01,  -3.56390000e-02,   2.36430000e-01,
         -2.88360000e-01,   2.15210000e-01,  -1.34860000e-01,
         -1.64130000e+00,  -2.60910000e-01,   3.24340000e-02,
          5.66210000e-02,  -4.32960000e-02,  -2.16720000e-02,
          2.24760000e-01,  -7.51290000e-02,  -6.70180000e-02,
         -1.42470000e-01,   3.88250000e-02,  -1.89510000e-01,
          2.99770000e-01,   3.93050000e-01,   1.78870000e-01,
         -1.73430000e-01,  -2.11780000e-01,   2.36170000e-01,
         -6.36810000e-02,  -4.23180000e-01,  -1.16610000e-01,
          9.37540000e-02,   1.72960000e-01,  -3.30730000e-01,
          4.91120000e-01,  -6.89950000e-01,  -9.24620000e-02,
          2.47420000e-01,  -1.79910000e-01,   9.79080000e-02,
          8.31180000e-02,   1.52990000e-01,  -2.72760000e-01,
         -3.89340000e-02,   5.44530000e-01,   5.37370000e-01,
          2.91050000e-01,  -7.35140000e-03,   4.78800000e-02,
        

In [9]:
X = np.asarray(X)
y = np.asarray(y)

In [10]:
temp_y = [y_val if y_val == 1 else 0 for y_val in y]

In [11]:
temp_y = np.asarray(temp_y)

In [49]:
class LogisticRegression:
    def __init__(self, lr=0.1, num_iter=100000, tagnums=None,fit_intercept=True, verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.weights = []
        self.tagnums = tagnums
    
    def add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)
    
    def prediction(self, z):
        return 1 / (1 + np.exp(-z))
    
    def softmax(self,x):
        e = np.exp(np.asarray(x))
        dist = e / np.sum(e)
        return dist

    def loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    def fit(self, X, y):
        if self.fit_intercept:
            X = self.add_intercept(X)
        
        # weights initialization
        self.theta = np.zeros(X.shape[1])
        
        prev_loss = 10000
        for i in tqdm(range(self.num_iter)):
            z = np.dot(X, self.theta)
            h = self.prediction(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient
            cur_loss = self.loss(h,y)
                        
            if prev_loss - cur_loss < 0.00001:
                break
            prev_loss = cur_loss
    
        return self.theta
    
    def fit_all_classes(self,X,y):
        for i in self.tagnums:
            temp_y = [1 if y_val == i else 0 for y_val in y]
            print(i)

            temp_y = np.asarray(temp_y)    
            weight_i = self.fit(X,temp_y)    
            self.weights.append(weight_i)
        
        self.weights = np.asarray(self.weights)
    
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.add_intercept(X)
    
        return self.softmax(np.matmul(X, self.weights.T))
    
    def predict(self, sentence, threshold = 0.5):
        inp = []
        tokens = word_tokenize(sentence)

        for token in tokens:
            if token in glove:
                inp.append(glove[token])
            else:
                inp.append(random.choice(list(glove.items()))[1])

        inp = np.asarray(inp)
        preds = self.predict_prob(inp)
        
        args = []
        pos_preds = []
    
        for subarr in preds:
#             print(subarr)
#             print(np.argmax(subarr))
            
            args.append(np.argmax(subarr))
        
        for arg in args:
            pos_preds.append(num2pos[arg])
        
        return list(zip(tokens, pos_preds))
    
    def save_weights(self):
        f = open('./weight_matrix.npy','wb+')
        np.save(f,self.weights)
        f.close()
        
        return

In [50]:
model = LogisticRegression(lr=0.1, num_iter=1000, tagnums = list(num2pos.keys()))

In [51]:
try:
    model.weights = np.load('./weight_matrix.npy')
except:
    print('No weight matrix found')

In [67]:
model.fit_all_classes(X,y)

0


1


2


3


4


5


6


7


8


9


10


11


In [15]:
np.load('./weight_matrix.npy').shape

(12, 301)

In [46]:
model.predict('drink the juice.')

[  3.56608693e-04   3.00913413e-02   9.97972918e-04   1.12701677e-02
   2.22316769e-04   1.27413456e-04   3.34941105e-03   8.45076542e-04
   8.98588153e-04   3.42165537e-03   1.15803777e-04   7.65550195e-05]
1
[  6.04657069e-01   1.58667272e-03   6.16136750e-04   4.53207404e-04
   1.48688826e-03   2.80904510e-05   2.10732208e-04   6.77691792e-05
   1.31607938e-04   7.90689741e-05   7.60453194e-05   2.95348764e-05]
0
[ 0.00051293  0.0252777   0.00217276  0.00542226  0.00127674  0.00118712
  0.00151924  0.00211511  0.00233626  0.00104114  0.00056048  0.00034412]
1
[  3.32716380e-04   1.38746552e-03   4.86867549e-04   4.22419980e-04
   1.33904629e-03   2.89054134e-01   6.71391126e-04   3.86469636e-04
   3.25003242e-04   4.70100404e-04   1.20646446e-04   4.21364974e-05]
5


[('drink', 'NOUN'), ('the', 'DET'), ('juice', 'NOUN'), ('.', '.')]

In [19]:
model.weights

array([[-3.08131938,  0.05501284,  0.38959653, ...,  0.01581842,
        -0.30220087,  0.00446651],
       [ 0.90732368,  0.04204459, -0.10560233, ...,  0.22742927,
         0.01642113,  0.15283748],
       [-1.65234994,  0.04609446,  0.00440945, ...,  0.02773002,
        -0.01678408,  0.06766884],
       ..., 
       [-3.03938943, -0.05423672,  0.31715306, ...,  0.00812884,
         0.05121112, -0.10589744],
       [-2.28769017, -0.09387534,  0.17014126, ..., -0.2220913 ,
        -0.01165212, -0.06781459],
       [-2.38894539,  0.07117724,  0.0765835 , ..., -0.05124994,
        -0.02699758,  0.00363509]])

In [20]:
model.weights = np.asarray(model.weights)

In [21]:
model.weights.shape

(12, 301)

In [22]:
model.save_weights()

## Code Evaluation

In [52]:
brown.sents()[0]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [63]:
validation = brown.tagged_sents(tagset='universal')

In [54]:
sent_preds = []

for sent in brown.sents():
    fullsent = ''
    for i in sent:
        fullsent += i.lower()+' '
    
    pred = model.predict(fullsent)
    
    sent_preds.append(pred)

In [57]:
sent_preds[0]

[('the', 'DET'),
 ('fulton', 'NOUN'),
 ('county', 'NOUN'),
 ('grand', 'NOUN'),
 ('jury', 'VERB'),
 ('said', 'VERB'),
 ('friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP'),
 ('atlanta', 'NOUN'),
 ("'s", 'DET'),
 ('recent', 'NOUN'),
 ('primary', 'NOUN'),
 ('election', 'ADP'),
 ('produced', 'NOUN'),
 ('``', '.'),
 ('no', 'DET'),
 ('evidence', 'NOUN'),
 ('``', '.'),
 ('that', 'DET'),
 ('any', 'DET'),
 ('irregularities', 'ADJ'),
 ('took', 'NOUN'),
 ('place', 'ADP'),
 ('.', '.')]

In [65]:
len(validation)

57340

In [66]:
len(sent_preds)

57340

In [70]:
cor = 0
incor = 0
total = 0

for i in range(len(validation)):
    if (len(validation[i]) == len(sent_preds[i])):
        s1 = validation[i]
        s2 = sent_preds[i]
        
        for j in range(len(s1)):
            if s1[j][1] == s2[j][1]:
                cor += 1
            else:
                incor += 1
#                 print('word: ',s1[j][0],' pred_tag: ',s2[j][1],' act_tag: ',s1[j][1])

In [71]:
acc = cor/(cor+incor)

In [72]:
acc

0.7440845177307271