In [1]:
import nltk.corpus as corpus
import numpy as np

# Loading Dataset

In [2]:
word_tag_list = corpus.brown.tagged_sents(tagset="universal")
word_tag_list_train = []
word_tag_list_test = []
count = 0
for element in word_tag_list:
    count += 1
print(count)

57340


In [3]:
for iter, element in enumerate(word_tag_list):
    if((iter / count) * 100 < 80):
        word_tag_list_train.append(element)
    else:
        word_tag_list_test.append(element)

In [4]:
unigram_count = dict()
for sentence in word_tag_list_train:
    for word_tag_pair in sentence:
        word = word_tag_pair[0]
        tag = word_tag_pair[1]
        if word not in unigram_count:
            unigram_count[word] = 0
        unigram_count[word] += 1

# Preparing Dataset

In [5]:
train_tags = set()
X = dict()
Y = dict()
W = dict()
for sentence in word_tag_list_train:
    for wordtag_pair in sentence:
        tag = wordtag_pair[1]
        train_tags.add(tag)

In [6]:
print("The following are the tags in training set", train_tags)

The following are the tags in training set {'VERB', 'CONJ', 'PRON', 'X', 'DET', 'ADP', '.', 'ADV', 'PRT', 'NUM', 'NOUN', 'ADJ'}


# Logistic Regression

In [7]:
def get_features(sentence, index):
    features = {
                    'word'              :unigram_count[sentence[index][0]] if sentence[index][0] in unigram_count else 0,
#                     'prefix'            :ord(current_word[0]),
#                     'suffix'            :ord(current_word[-1]),
                    'prev-word'         : -1 if index == 0 else unigram_count[sentence[index - 1][0]] if sentence[index - 1][0] in unigram_count else 0,
                    '2-prev-word'       : -1 if index <= 1 else unigram_count[sentence[index - 2][0]] if sentence[index - 2][0] in unigram_count else 0,
                    'next-word'         : -1 if index >= len(sentence) - 1     else unigram_count[sentence[index + 1][0]] if sentence[index + 1][0] in unigram_count else 0,
                    '2-next-word'       : -1 if index >= len(sentence) - 2     else unigram_count[sentence[index + 2][0]] if sentence[index + 2][0] in unigram_count else 0,
                    'is_first'          : 1 if index == 0  else 0,
                    'is_last'           : 1 if index == len(sentence) - 1  else 0,
                    'is_capitalized'    : 1 if sentence[index][0][0].upper() == sentence[index][0][0]  else 0,
                    'is_all_capitalized': 1 if sentence[index][0].upper() == sentence[index][0] else 0,
                    'is_capitals_inside': 1 if sentence[index][0][1:].lower() != sentence[index][0][1:] else 0,
                    'is_numeric'        : 1 if sentence[index][0].isdigit() else 0,
    }
    return features

In [8]:
for tag in train_tags:
    X[tag] = []
    Y[tag] = []
    for sentence in word_tag_list_train:
#         print("Training for sentence")
        for i, wordtag_pair in enumerate(sentence):
            appender = []
            x = get_features(sentence, i)
            for t, it in x.items():
                appender.append(it)
            X[tag].append(appender)
            if wordtag_pair[1] == tag:
                Y[tag].append(1)
            else:
                Y[tag].append(0)
    X[tag] = np.array(X[tag])
    Y[tag] = np.array(Y[tag])  

In [9]:
def sigmoid(val):
    ans = 1.0 / (1 + np.exp(-1 * val))
    return ans

In [10]:
def regression(tag):
    num_iter = 200
    lr = 4e-5
    X_r = X[tag]
    Y_r = Y[tag]
    w = np.zeros(X_r.shape[1])
    for iteration in range(num_iter):
        print("this is the iteration num ", iteration)
        score = np.dot(X_r, w)
        pred = sigmoid(score)
        err = Y_r - pred
        grad = np.dot(X_r.T, err)
        w += lr * grad
    return w
    

In [11]:
def train():
    for tag in train_tags:
        print("Entered here")
        W[tag] = regression(tag)
        print("Trained for ", tag)

In [12]:
train()

Entered here
this is the iteration num  0
this is the iteration num  1
this is the iteration num  2


  


this is the iteration num  3
this is the iteration num  4
this is the iteration num  5
this is the iteration num  6
this is the iteration num  7
this is the iteration num  8
this is the iteration num  9
this is the iteration num  10
this is the iteration num  11
this is the iteration num  12
this is the iteration num  13
this is the iteration num  14
this is the iteration num  15
this is the iteration num  16
this is the iteration num  17
this is the iteration num  18
this is the iteration num  19
this is the iteration num  20
this is the iteration num  21
this is the iteration num  22
this is the iteration num  23
this is the iteration num  24
this is the iteration num  25
this is the iteration num  26
this is the iteration num  27
this is the iteration num  28
this is the iteration num  29
this is the iteration num  30
this is the iteration num  31
this is the iteration num  32
this is the iteration num  33
this is the iteration num  34
this is the iteration num  35
this is the itera

this is the iteration num  73
this is the iteration num  74
this is the iteration num  75
this is the iteration num  76
this is the iteration num  77
this is the iteration num  78
this is the iteration num  79
this is the iteration num  80
this is the iteration num  81
this is the iteration num  82
this is the iteration num  83
this is the iteration num  84
this is the iteration num  85
this is the iteration num  86
this is the iteration num  87
this is the iteration num  88
this is the iteration num  89
this is the iteration num  90
this is the iteration num  91
this is the iteration num  92
this is the iteration num  93
this is the iteration num  94
this is the iteration num  95
this is the iteration num  96
this is the iteration num  97
this is the iteration num  98
this is the iteration num  99
this is the iteration num  100
this is the iteration num  101
this is the iteration num  102
this is the iteration num  103
this is the iteration num  104
this is the iteration num  105
this

this is the iteration num  142
this is the iteration num  143
this is the iteration num  144
this is the iteration num  145
this is the iteration num  146
this is the iteration num  147
this is the iteration num  148
this is the iteration num  149
this is the iteration num  150
this is the iteration num  151
this is the iteration num  152
this is the iteration num  153
this is the iteration num  154
this is the iteration num  155
this is the iteration num  156
this is the iteration num  157
this is the iteration num  158
this is the iteration num  159
this is the iteration num  160
this is the iteration num  161
this is the iteration num  162
this is the iteration num  163
this is the iteration num  164
this is the iteration num  165
this is the iteration num  166
this is the iteration num  167
this is the iteration num  168
this is the iteration num  169
this is the iteration num  170
this is the iteration num  171
this is the iteration num  172
this is the iteration num  173
this is 

this is the iteration num  10
this is the iteration num  11
this is the iteration num  12
this is the iteration num  13
this is the iteration num  14
this is the iteration num  15
this is the iteration num  16
this is the iteration num  17
this is the iteration num  18
this is the iteration num  19
this is the iteration num  20
this is the iteration num  21
this is the iteration num  22
this is the iteration num  23
this is the iteration num  24
this is the iteration num  25
this is the iteration num  26
this is the iteration num  27
this is the iteration num  28
this is the iteration num  29
this is the iteration num  30
this is the iteration num  31
this is the iteration num  32
this is the iteration num  33
this is the iteration num  34
this is the iteration num  35
this is the iteration num  36
this is the iteration num  37
this is the iteration num  38
this is the iteration num  39
this is the iteration num  40
this is the iteration num  41
this is the iteration num  42
this is th

this is the iteration num  81
this is the iteration num  82
this is the iteration num  83
this is the iteration num  84
this is the iteration num  85
this is the iteration num  86
this is the iteration num  87
this is the iteration num  88
this is the iteration num  89
this is the iteration num  90
this is the iteration num  91
this is the iteration num  92
this is the iteration num  93
this is the iteration num  94
this is the iteration num  95
this is the iteration num  96
this is the iteration num  97
this is the iteration num  98
this is the iteration num  99
this is the iteration num  100
this is the iteration num  101
this is the iteration num  102
this is the iteration num  103
this is the iteration num  104
this is the iteration num  105
this is the iteration num  106
this is the iteration num  107
this is the iteration num  108
this is the iteration num  109
this is the iteration num  110
this is the iteration num  111
this is the iteration num  112
this is the iteration num  

this is the iteration num  149
this is the iteration num  150
this is the iteration num  151
this is the iteration num  152
this is the iteration num  153
this is the iteration num  154
this is the iteration num  155
this is the iteration num  156
this is the iteration num  157
this is the iteration num  158
this is the iteration num  159
this is the iteration num  160
this is the iteration num  161
this is the iteration num  162
this is the iteration num  163
this is the iteration num  164
this is the iteration num  165
this is the iteration num  166
this is the iteration num  167
this is the iteration num  168
this is the iteration num  169
this is the iteration num  170
this is the iteration num  171
this is the iteration num  172
this is the iteration num  173
this is the iteration num  174
this is the iteration num  175
this is the iteration num  176
this is the iteration num  177
this is the iteration num  178
this is the iteration num  179
this is the iteration num  180
this is 

this is the iteration num  17
this is the iteration num  18
this is the iteration num  19
this is the iteration num  20
this is the iteration num  21
this is the iteration num  22
this is the iteration num  23
this is the iteration num  24
this is the iteration num  25
this is the iteration num  26
this is the iteration num  27
this is the iteration num  28
this is the iteration num  29
this is the iteration num  30
this is the iteration num  31
this is the iteration num  32
this is the iteration num  33
this is the iteration num  34
this is the iteration num  35
this is the iteration num  36
this is the iteration num  37
this is the iteration num  38
this is the iteration num  39
this is the iteration num  40
this is the iteration num  41
this is the iteration num  42
this is the iteration num  43
this is the iteration num  44
this is the iteration num  45
this is the iteration num  46
this is the iteration num  47
this is the iteration num  48
this is the iteration num  49
this is th

this is the iteration num  87
this is the iteration num  88
this is the iteration num  89
this is the iteration num  90
this is the iteration num  91
this is the iteration num  92
this is the iteration num  93
this is the iteration num  94
this is the iteration num  95
this is the iteration num  96
this is the iteration num  97
this is the iteration num  98
this is the iteration num  99
this is the iteration num  100
this is the iteration num  101
this is the iteration num  102
this is the iteration num  103
this is the iteration num  104
this is the iteration num  105
this is the iteration num  106
this is the iteration num  107
this is the iteration num  108
this is the iteration num  109
this is the iteration num  110
this is the iteration num  111
this is the iteration num  112
this is the iteration num  113
this is the iteration num  114
this is the iteration num  115
this is the iteration num  116
this is the iteration num  117
this is the iteration num  118
this is the iteration

this is the iteration num  156
this is the iteration num  157
this is the iteration num  158
this is the iteration num  159
this is the iteration num  160
this is the iteration num  161
this is the iteration num  162
this is the iteration num  163
this is the iteration num  164
this is the iteration num  165
this is the iteration num  166
this is the iteration num  167
this is the iteration num  168
this is the iteration num  169
this is the iteration num  170
this is the iteration num  171
this is the iteration num  172
this is the iteration num  173
this is the iteration num  174
this is the iteration num  175
this is the iteration num  176
this is the iteration num  177
this is the iteration num  178
this is the iteration num  179
this is the iteration num  180
this is the iteration num  181
this is the iteration num  182
this is the iteration num  183
this is the iteration num  184
this is the iteration num  185
this is the iteration num  186
this is the iteration num  187
this is 

In [25]:
unigram_count

{'The': 6430,
 'Fulton': 17,
 'County': 81,
 'Grand': 16,
 'Jury': 3,
 'said': 1177,
 'Friday': 57,
 'an': 3108,
 'investigation': 39,
 'of': 32644,
 "Atlanta's": 4,
 'recent': 160,
 'primary': 91,
 'election': 72,
 'produced': 88,
 '``': 6160,
 'no': 1449,
 'evidence': 195,
 "''": 6119,
 'that': 8741,
 'any': 1140,
 'irregularities': 7,
 'took': 289,
 'place': 426,
 '.': 39534,
 'jury': 57,
 'further': 179,
 'in': 17279,
 'term-end': 1,
 'presentments': 1,
 'the': 54761,
 'City': 128,
 'Executive': 9,
 'Committee': 87,
 ',': 48492,
 'which': 3232,
 'had': 3487,
 'over-all': 34,
 'charge': 112,
 'deserves': 15,
 'praise': 16,
 'and': 23598,
 'thanks': 21,
 'Atlanta': 35,
 'for': 7814,
 'manner': 111,
 'was': 7313,
 'conducted': 55,
 'September-October': 1,
 'term': 78,
 'been': 2056,
 'charged': 55,
 'by': 4731,
 'Superior': 16,
 'Court': 108,
 'Judge': 39,
 'Durwood': 1,
 'Pye': 1,
 'to': 22030,
 'investigate': 8,
 'reports': 75,
 'possible': 343,
 'hard-fought': 2,
 'won': 63,
 'Mayo

In [26]:
def get_pred_tags(sentence):
    ans = []
    max_val = 0
    tag_temp = "NOUN"
    for iter, wordtag_pair in enumerate(sentence):
        word = wordtag_pair[0]
        tag = wordtag_pair[1]
        appender = []
        x = get_features(sentence, iter)
        for t, it in x.items():
            appender.append(it)
        for tag in train_tags:
            temp = np.dot(appender, W[tag])
            if temp > max_val:
                tag_temp = tag
        ans.append(tag_temp)
    return ans    

In [31]:
def test():
    score = 0
    tcount = 0
    for iter, sentence in enumerate(word_tag_list_test):
        ans = get_pred_tags(sentence)
#         print("this is the ans", ans)
        tcount += len(sentence)
        for i in range(len(sentence)):
            if ans[i] == sentence[i][1]:
                score += 1.8
#     print("this is the score", score)
#     print("this is the tcount", tcount)
    acc = (score / tcount) * 100
    print(acc)
                    

In [32]:
test()

55.47354389526649
