In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
vocab = {}
def get_vocab():
    with open('vocab.txt') as f:
        for i, l in enumerate(f.read().splitlines()):
            vocab[l] = i
get_vocab()

In [64]:
word2id = dict()
def word_id():
    for word in vocab:
        ix = vocab[word]
        word2id[ix] = word
word_id()

In [3]:
len(vocab)

9273

In [4]:
tag_map = {}
def get_tags():
    with open('tags.txt') as f:
        for i, l in enumerate(f.read().splitlines()):
            tag_map[l] = i
get_tags()

In [65]:
id2tag = dict()
def id_tag():
    for tag in tag_map:
        ix = tag_map[tag]
        id2tag[ix] = tag
id_tag()

In [5]:
tag_map

{'I-person': 0,
 'B-experience': 1,
 'O': 2,
 'I-experience': 3,
 'I-skill': 4,
 'B-skill': 5,
 'I-city': 6,
 'B-city': 7,
 'B-person': 8}

In [6]:
len(tag_map)

9

In [7]:
train_sentences = []        
train_labels = []
train_file = 'train_resume.bie'
with open(train_file) as f:
    s = list()
    t = list()
    for line in f.read().splitlines():
        #replace each token by its index if it is in vocab
        #else use index of UNK
        words = line.split()
        if len(words) < 2:
            train_sentences.append(s)
            train_labels.append(t)
            s = list()
            t = list()
        else:
            word = words[0]
            tag = words[-1]
            if word in vocab:
                s.append(vocab[word])
            else:
                s.append(vocab['UNK'])
            if tag in tag_map:
                t.append(tag_map[tag])
            else:
                t.append(tag_map['O'])

In [8]:
class Net(nn.Module):
    def __init__(self, params):
        super(Net, self).__init__()

        #maps each token to an embedding_dim vector
        self.embedding = nn.Embedding(params['vocab_size'], params['embedding_dim'])

        #the LSTM takens embedded sentence
        self.lstm = nn.LSTM(params['embedding_dim'], params['lstm_hidden_dim'], batch_first=True)

        #fc layer transforms the output to give the final output layer
        self.fc = nn.Linear(params['lstm_hidden_dim'], params['number_of_tags'])
    
    def forward(self, s):
        #apply the embedding layer that maps each token to its embedding
        s = self.embedding(s)   # dim: batch_size x batch_max_len x embedding_dim

        #run the LSTM along the sentences of length batch_max_len
        s, _ = self.lstm(s)     # dim: batch_size x batch_max_len x lstm_hidden_dim                

        #reshape the Variable so that each row contains one token
        s = s.reshape(-1, s.shape[2])  # dim: batch_size*batch_max_len x lstm_hidden_dim

        #apply the fully connected layer and obtain the output for each token
        s = self.fc(s)          # dim: batch_size*batch_max_len x num_tags

        return F.log_softmax(s, dim=1)   # dim: batch_size*batch_max_len x num_tags
    
def loss_fn(outputs, labels):
    #reshape labels to give a flat vector of length batch_size*seq_len
    labels = labels.view(-1)  

    #mask out 'PAD' tokens
    mask = (labels >= 0).float()

    #the number of tokens is the sum of elements in mask
    num_tokens = int(torch.sum(mask).data.item())

    #pick the values corresponding to labels and multiply by mask
    outputs = outputs[range(outputs.shape[0]), labels]*mask

    #cross entropy loss for all non 'PAD' tokens
    return -torch.sum(outputs)/num_tokens

In [9]:
model = Net({'vocab_size':len(vocab),'embedding_dim':200,'lstm_hidden_dim':100,'number_of_tags':len(tag_map)})

In [10]:
import numpy as np
from torch.autograd import Variable
def create_batch(train_data,train_labels,i,batch_size=32):
    #compute length of longest sentence in batch
    st = i*batch_size
    en = st+batch_size
    batch_sentences = train_data[st:en]
    batch_tags = train_labels[st:en]
    
    batch_max_len = max([len(s) for s in batch_sentences])
    
    #prepare a numpy array with the data, initializing the data with 'PAD' 
    #and all labels with -1; initializing labels to -1 differentiates tokens 
    #with tags from 'PAD' tokens
    batch_data = vocab['PAD']*np.ones((len(batch_sentences), batch_max_len))
    batch_labels = -1*np.ones((len(batch_sentences), batch_max_len))

    #copy the data to the numpy array
    for j in range(len(batch_sentences)):
        cur_len = len(batch_sentences[j])
        batch_data[j][:cur_len] = batch_sentences[j]
        batch_labels[j][:cur_len] = batch_tags[j]

    #since all data are indices, we convert them to torch LongTensors
    batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)

    #convert Tensors to Variables
    batch_data, batch_labels = Variable(batch_data), Variable(batch_labels)
    return batch_data,batch_labels

In [11]:
import math

In [27]:
#train_data contains train_sentences and train_labels
#params contains batch_size
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
num_epochs = 4
batch_size = 32
num_training_steps = (len(train_sentences)/batch_size)
i = 0
criterion = CrossEntropyLoss()
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
for epoch in range(num_epochs):
    for i in range(math.ceil(num_training_steps)):
        print(i,end=' ')
        batch_sentences, batch_labels = create_batch(train_sentences,train_labels, i, 32)
        # clear the gradients
        optimizer.zero_grad()
        # compute the model output
        output_batch = model(batch_sentences)
        # calculate loss
        loss = loss_fn(output_batch,batch_labels)
        # credit assignment
        loss.backward()
        # update model weights
        optimizer.step()
        print(loss_fn(output_batch,batch_labels))
        

0 tensor(2.2063, grad_fn=<DivBackward0>)
1 tensor(2.2001, grad_fn=<DivBackward0>)
2 tensor(2.1817, grad_fn=<DivBackward0>)
3 tensor(2.1521, grad_fn=<DivBackward0>)
4 tensor(2.1480, grad_fn=<DivBackward0>)
5 tensor(2.1244, grad_fn=<DivBackward0>)
6 tensor(2.1124, grad_fn=<DivBackward0>)
7 tensor(2.0816, grad_fn=<DivBackward0>)
8 tensor(2.0450, grad_fn=<DivBackward0>)
9 tensor(1.9949, grad_fn=<DivBackward0>)
10 tensor(1.9370, grad_fn=<DivBackward0>)
11 tensor(1.8388, grad_fn=<DivBackward0>)
12 tensor(1.8249, grad_fn=<DivBackward0>)
13 tensor(1.7773, grad_fn=<DivBackward0>)
14 tensor(1.7438, grad_fn=<DivBackward0>)
15 tensor(1.7401, grad_fn=<DivBackward0>)
16 tensor(1.6748, grad_fn=<DivBackward0>)
17 tensor(1.8529, grad_fn=<DivBackward0>)
18 tensor(1.7970, grad_fn=<DivBackward0>)
19 tensor(1.6959, grad_fn=<DivBackward0>)
20 tensor(1.4779, grad_fn=<DivBackward0>)
21 tensor(1.4042, grad_fn=<DivBackward0>)
22 tensor(1.3617, grad_fn=<DivBackward0>)
23 tensor(1.3255, grad_fn=<DivBackward0>)
24

195 tensor(0.5591, grad_fn=<DivBackward0>)
196 tensor(0.3832, grad_fn=<DivBackward0>)
197 tensor(0.6517, grad_fn=<DivBackward0>)
198 tensor(0.4523, grad_fn=<DivBackward0>)
199 tensor(0.4996, grad_fn=<DivBackward0>)
200 tensor(0.4532, grad_fn=<DivBackward0>)
201 tensor(0.5769, grad_fn=<DivBackward0>)
202 tensor(0.9197, grad_fn=<DivBackward0>)
203 tensor(0.3842, grad_fn=<DivBackward0>)
204 tensor(0.4178, grad_fn=<DivBackward0>)
205 tensor(0.3945, grad_fn=<DivBackward0>)
206 tensor(0.4314, grad_fn=<DivBackward0>)
207 tensor(0.5827, grad_fn=<DivBackward0>)
208 tensor(0.4143, grad_fn=<DivBackward0>)
209 tensor(0.6895, grad_fn=<DivBackward0>)
210 tensor(0.7316, grad_fn=<DivBackward0>)
211 tensor(0.5138, grad_fn=<DivBackward0>)
212 tensor(0.5312, grad_fn=<DivBackward0>)
213 tensor(0.4442, grad_fn=<DivBackward0>)
0 tensor(0.4317, grad_fn=<DivBackward0>)
1 tensor(0.3407, grad_fn=<DivBackward0>)
2 tensor(0.3320, grad_fn=<DivBackward0>)
3 tensor(0.2937, grad_fn=<DivBackward0>)
4 tensor(0.5592, gr

177 tensor(0.4378, grad_fn=<DivBackward0>)
178 tensor(0.4942, grad_fn=<DivBackward0>)
179 tensor(0.4427, grad_fn=<DivBackward0>)
180 tensor(0.4888, grad_fn=<DivBackward0>)
181 tensor(0.4386, grad_fn=<DivBackward0>)
182 tensor(0.4642, grad_fn=<DivBackward0>)
183 tensor(0.4634, grad_fn=<DivBackward0>)
184 tensor(0.4436, grad_fn=<DivBackward0>)
185 tensor(0.5110, grad_fn=<DivBackward0>)
186 tensor(0.4118, grad_fn=<DivBackward0>)
187 tensor(0.4544, grad_fn=<DivBackward0>)
188 tensor(0.3726, grad_fn=<DivBackward0>)
189 tensor(0.3660, grad_fn=<DivBackward0>)
190 tensor(0.3032, grad_fn=<DivBackward0>)
191 tensor(0.2540, grad_fn=<DivBackward0>)
192 tensor(0.2825, grad_fn=<DivBackward0>)
193 tensor(0.3428, grad_fn=<DivBackward0>)
194 tensor(0.2847, grad_fn=<DivBackward0>)
195 tensor(0.4089, grad_fn=<DivBackward0>)
196 tensor(0.2602, grad_fn=<DivBackward0>)
197 tensor(0.4937, grad_fn=<DivBackward0>)
198 tensor(0.2972, grad_fn=<DivBackward0>)
199 tensor(0.3545, grad_fn=<DivBackward0>)
200 tensor(

158 tensor(0.2511, grad_fn=<DivBackward0>)
159 tensor(0.1467, grad_fn=<DivBackward0>)
160 tensor(0.3105, grad_fn=<DivBackward0>)
161 tensor(0.2537, grad_fn=<DivBackward0>)
162 tensor(0.3063, grad_fn=<DivBackward0>)
163 tensor(0.3877, grad_fn=<DivBackward0>)
164 tensor(0.1837, grad_fn=<DivBackward0>)
165 tensor(0.2262, grad_fn=<DivBackward0>)
166 tensor(0.2765, grad_fn=<DivBackward0>)
167 tensor(0.1853, grad_fn=<DivBackward0>)
168 tensor(0.3788, grad_fn=<DivBackward0>)
169 tensor(0.4062, grad_fn=<DivBackward0>)
170 tensor(0.5064, grad_fn=<DivBackward0>)
171 tensor(0.4345, grad_fn=<DivBackward0>)
172 tensor(0.4111, grad_fn=<DivBackward0>)
173 tensor(0.3423, grad_fn=<DivBackward0>)
174 tensor(0.3353, grad_fn=<DivBackward0>)
175 tensor(0.3940, grad_fn=<DivBackward0>)
176 tensor(0.3200, grad_fn=<DivBackward0>)
177 tensor(0.3531, grad_fn=<DivBackward0>)
178 tensor(0.3988, grad_fn=<DivBackward0>)
179 tensor(0.3509, grad_fn=<DivBackward0>)
180 tensor(0.4012, grad_fn=<DivBackward0>)
181 tensor(

139 tensor(0.2430, grad_fn=<DivBackward0>)
140 tensor(0.2125, grad_fn=<DivBackward0>)
141 tensor(0.6048, grad_fn=<DivBackward0>)
142 tensor(0.1287, grad_fn=<DivBackward0>)
143 tensor(0.1239, grad_fn=<DivBackward0>)
144 tensor(0.1257, grad_fn=<DivBackward0>)
145 tensor(0.1910, grad_fn=<DivBackward0>)
146 tensor(0.1803, grad_fn=<DivBackward0>)
147 tensor(0.3132, grad_fn=<DivBackward0>)
148 tensor(0.1838, grad_fn=<DivBackward0>)
149 tensor(0.1727, grad_fn=<DivBackward0>)
150 tensor(0.1802, grad_fn=<DivBackward0>)
151 tensor(0.1804, grad_fn=<DivBackward0>)
152 tensor(0.2160, grad_fn=<DivBackward0>)
153 tensor(0.2348, grad_fn=<DivBackward0>)
154 tensor(0.1984, grad_fn=<DivBackward0>)
155 tensor(0.3725, grad_fn=<DivBackward0>)
156 tensor(0.2073, grad_fn=<DivBackward0>)
157 tensor(0.2790, grad_fn=<DivBackward0>)
158 tensor(0.2152, grad_fn=<DivBackward0>)
159 tensor(0.1148, grad_fn=<DivBackward0>)
160 tensor(0.2659, grad_fn=<DivBackward0>)
161 tensor(0.2186, grad_fn=<DivBackward0>)
162 tensor(

In [29]:
output_batch.shape

torch.Size([406, 9])

In [49]:
(np.argmax(output_batch.detach().numpy(),axis=1)==5).reshape(14,29)

array([[ True,  True, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False],
       [False, False, False, False, False, False,  True, False,  True,
        False,  True,  True,  True, False,  True, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False],
       [False, False, False, False,  True,  True, False,

In [81]:
tags = batch_sentences.detach().numpy()[(np.argmax(output_batch.detach().numpy(),axis=1)==5).reshape(14,29)]

In [82]:
tags

array([3311,    1, 4541, 7103, 8558, 4854, 9185, 8558, 1411, 1411,  332,
       6321, 2558, 6710, 8399, 8799, 1411, 1760,    1, 6025, 1411,    1,
          1, 3311, 7103,    1, 8652, 7495, 5294, 5294, 7495, 7495, 7495,
       7495, 7540, 7540, 1848, 1848, 5494, 5494, 3311,  541, 4541, 3141,
       7103, 1630, 8652, 4179, 6710, 6710,  219, 1848, 1162, 5443, 7038,
        100])

In [83]:
for t in tags:
    print(word2id[t])

java
PAD
jsp
html
struts
logic
tags
struts
sql
sql
stored
procedures
views
oracle
tools
toad
sql
*
PAD
pl/sql
sql
PAD
PAD
java
html
PAD
junit
testing
unit
unit
testing
testing
testing
testing
apache
apache
tomcat
tomcat
server
server
java
servlets
jsp
jstl
html
javascript
junit
jdbc
oracle
oracle
8i
tomcat
git
eclipse
log4j
.


In [77]:
type(batch_sentences.detach().numpy())

numpy.ndarray

In [45]:
len(np.argmax(output_batch.detach().numpy(),axis=1)==5)

406

In [46]:
batch_sentences.shape

torch.Size([14, 29])

In [47]:
14*29

406

In [50]:
batch_sentences[(np.argmax(output_batch.detach().numpy(),axis=1)==5).reshape(14,29)]

IndexError: too many indices for tensor of dimension 2

In [51]:
(np.argmax(output_batch.detach().numpy(),axis=1)==5).reshape(14,29).shape

(14, 29)

In [73]:
np.array([[1,2,3],
        [4,5,6]])[np.array([[True,True,False],
                          [True,False,True]])]

array([1, 2, 4, 6])

In [52]:
batch_sentences

tensor([[3311,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1],
        [   1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1],
        [9025, 1846, 7557,  399, 2109, 8843, 4541, 1054, 7103, 1054, 8558, 4854,
         9185, 1916, 8558, 4013, 7836,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1],
        [9025, 3080, 4485, 7410, 1916, 5500, 4276, 7559, 1012, 4429, 8838, 7883,
         3386, 1468,   15, 5699, 7187,   15, 8482,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1],
        [9025, 3080, 4485, 1067, 1411, 1411, 4129, 1054,  332, 6321, 1054, 1916,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    

In [68]:
id2tag[5]

'B-skill'

In [69]:
id2tag

{0: 'I-person',
 1: 'B-experience',
 2: 'O',
 3: 'I-experience',
 4: 'I-skill',
 5: 'B-skill',
 6: 'I-city',
 7: 'B-city',
 8: 'B-person'}

In [66]:
tag_map

{'I-person': 0,
 'B-experience': 1,
 'O': 2,
 'I-experience': 3,
 'I-skill': 4,
 'B-skill': 5,
 'I-city': 6,
 'B-city': 7,
 'B-person': 8}