In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
vocab = {}
def get_vocab():
    with open('vocab.txt') as f:
        for i, l in enumerate(f.read().splitlines()):
            vocab[l] = i
get_vocab()

In [3]:
word2id = dict()
def word_id():
    for word in vocab:
        ix = vocab[word]
        word2id[ix] = word
word_id()

In [4]:
len(vocab)

9273

In [5]:
tag_map = {}
def get_tags():
    with open('tags.txt') as f:
        for i, l in enumerate(f.read().splitlines()):
            tag_map[l] = i
get_tags()

In [6]:
id2tag = dict()
def id_tag():
    for tag in tag_map:
        ix = tag_map[tag]
        id2tag[ix] = tag
id_tag()

In [7]:
tag_map

{'I-person': 0,
 'B-experience': 1,
 'O': 2,
 'I-experience': 3,
 'I-skill': 4,
 'B-skill': 5,
 'I-city': 6,
 'B-city': 7,
 'B-person': 8}

In [8]:
len(tag_map)

9

In [9]:
train_sentences = []        
train_labels = []
train_file = 'train_resume.bie'
with open(train_file) as f:
    s = list()
    t = list()
    for line in f.read().splitlines():
        #replace each token by its index if it is in vocab
        #else use index of UNK
        words = line.split()
        if len(words) < 2:
            train_sentences.append(s)
            train_labels.append(t)
            s = list()
            t = list()
        else:
            word = words[0]
            tag = words[-1]
            if word in vocab:
                s.append(vocab[word])
            else:
                s.append(vocab['UNK'])
            if tag in tag_map:
                t.append(tag_map[tag])
            else:
                t.append(tag_map['O'])

In [10]:
class Net(nn.Module):
    def __init__(self, params):
        super(Net, self).__init__()

        #maps each token to an embedding_dim vector
        self.embedding = nn.Embedding(params['vocab_size'], params['embedding_dim'])

        #the LSTM takens embedded sentence
        self.lstm = nn.LSTM(params['embedding_dim'], params['lstm_hidden_dim'], batch_first=True)

        #fc layer transforms the output to give the final output layer
        self.fc = nn.Linear(params['lstm_hidden_dim'], params['number_of_tags'])
    
    def forward(self, s):
        #apply the embedding layer that maps each token to its embedding
        s = self.embedding(s)   # dim: batch_size x batch_max_len x embedding_dim

        #run the LSTM along the sentences of length batch_max_len
        s, _ = self.lstm(s)     # dim: batch_size x batch_max_len x lstm_hidden_dim                

        #reshape the Variable so that each row contains one token
        s = s.reshape(-1, s.shape[2])  # dim: batch_size*batch_max_len x lstm_hidden_dim

        #apply the fully connected layer and obtain the output for each token
        s = self.fc(s)          # dim: batch_size*batch_max_len x num_tags

        return F.log_softmax(s, dim=1)   # dim: batch_size*batch_max_len x num_tags
    
def loss_fn(outputs, labels):
    #reshape labels to give a flat vector of length batch_size*seq_len
    labels = labels.view(-1)  

    #mask out 'PAD' tokens
    mask = (labels >= 0).float()

    #the number of tokens is the sum of elements in mask
    num_tokens = int(torch.sum(mask).data.item())

    #pick the values corresponding to labels and multiply by mask
    outputs = outputs[range(outputs.shape[0]), labels]*mask

    #cross entropy loss for all non 'PAD' tokens
    return -torch.sum(outputs)/num_tokens

In [11]:
model = Net({'vocab_size':len(vocab),'embedding_dim':200,'lstm_hidden_dim':100,'number_of_tags':len(tag_map)})

In [12]:
import numpy as np
from torch.autograd import Variable
def create_batch(train_data,train_labels,i,batch_size=32):
    #compute length of longest sentence in batch
    st = i*batch_size
    en = st+batch_size
    batch_sentences = train_data[st:en]
    batch_tags = train_labels[st:en]
    
    batch_max_len = max([len(s) for s in batch_sentences])
    
    #prepare a numpy array with the data, initializing the data with 'PAD' 
    #and all labels with -1; initializing labels to -1 differentiates tokens 
    #with tags from 'PAD' tokens
    batch_data = vocab['PAD']*np.ones((len(batch_sentences), batch_max_len))
    batch_labels = -1*np.ones((len(batch_sentences), batch_max_len))

    #copy the data to the numpy array
    for j in range(len(batch_sentences)):
        cur_len = len(batch_sentences[j])
        batch_data[j][:cur_len] = batch_sentences[j]
        batch_labels[j][:cur_len] = batch_tags[j]

    #since all data are indices, we convert them to torch LongTensors
    batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)

    #convert Tensors to Variables
    batch_data, batch_labels = Variable(batch_data), Variable(batch_labels)
    return batch_data,batch_labels

In [13]:
import math

In [14]:
#train_data contains train_sentences and train_labels
#params contains batch_size
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
num_epochs = 4
batch_size = 32
num_training_steps = (len(train_sentences)/batch_size)
i = 0
criterion = CrossEntropyLoss()
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
for epoch in range(num_epochs):
    for i in range(math.ceil(num_training_steps)):
        print(i,end=' ')
        batch_sentences, batch_labels = create_batch(train_sentences,train_labels, i, 32)
        # clear the gradients
        optimizer.zero_grad()
        # compute the model output
        output_batch = model(batch_sentences)
        # calculate loss
        loss = loss_fn(output_batch,batch_labels)
        # credit assignment
        loss.backward()
        # update model weights
        optimizer.step()
        print(loss_fn(output_batch,batch_labels))
        

0 tensor(2.2783, grad_fn=<DivBackward0>)
1 tensor(2.2782, grad_fn=<DivBackward0>)
2 tensor(2.2625, grad_fn=<DivBackward0>)
3 tensor(2.2345, grad_fn=<DivBackward0>)
4 tensor(2.1997, grad_fn=<DivBackward0>)
5 tensor(2.1900, grad_fn=<DivBackward0>)
6 tensor(2.1625, grad_fn=<DivBackward0>)
7 tensor(2.1322, grad_fn=<DivBackward0>)
8 tensor(2.0921, grad_fn=<DivBackward0>)
9 tensor(2.0562, grad_fn=<DivBackward0>)
10 tensor(1.9976, grad_fn=<DivBackward0>)
11 tensor(1.9227, grad_fn=<DivBackward0>)
12 tensor(1.8877, grad_fn=<DivBackward0>)
13 tensor(1.8362, grad_fn=<DivBackward0>)
14 tensor(1.8087, grad_fn=<DivBackward0>)
15 tensor(1.7974, grad_fn=<DivBackward0>)
16 tensor(1.7455, grad_fn=<DivBackward0>)
17 tensor(1.8638, grad_fn=<DivBackward0>)
18 tensor(1.8292, grad_fn=<DivBackward0>)
19 tensor(1.7043, grad_fn=<DivBackward0>)
20 tensor(1.5096, grad_fn=<DivBackward0>)
21 tensor(1.4349, grad_fn=<DivBackward0>)
22 tensor(1.4002, grad_fn=<DivBackward0>)
23 tensor(1.3339, grad_fn=<DivBackward0>)
24

KeyboardInterrupt: 

In [15]:
output_batch.shape

torch.Size([992, 9])

In [19]:
len(output_batch)

992

In [20]:
(np.argmax(output_batch.detach().numpy(),axis=1)==5).reshape(992,9)

ValueError: cannot reshape array of size 992 into shape (992,9)

In [81]:
tags = batch_sentences.detach().numpy()[(np.argmax(output_batch.detach().numpy(),axis=1)==5).reshape(14,29)]

In [82]:
tags

array([3311,    1, 4541, 7103, 8558, 4854, 9185, 8558, 1411, 1411,  332,
       6321, 2558, 6710, 8399, 8799, 1411, 1760,    1, 6025, 1411,    1,
          1, 3311, 7103,    1, 8652, 7495, 5294, 5294, 7495, 7495, 7495,
       7495, 7540, 7540, 1848, 1848, 5494, 5494, 3311,  541, 4541, 3141,
       7103, 1630, 8652, 4179, 6710, 6710,  219, 1848, 1162, 5443, 7038,
        100])

In [83]:
for t in tags:
    print(word2id[t])

java
PAD
jsp
html
struts
logic
tags
struts
sql
sql
stored
procedures
views
oracle
tools
toad
sql
*
PAD
pl/sql
sql
PAD
PAD
java
html
PAD
junit
testing
unit
unit
testing
testing
testing
testing
apache
apache
tomcat
tomcat
server
server
java
servlets
jsp
jstl
html
javascript
junit
jdbc
oracle
oracle
8i
tomcat
git
eclipse
log4j
.


In [77]:
type(batch_sentences.detach().numpy())

numpy.ndarray

In [45]:
len(np.argmax(output_batch.detach().numpy(),axis=1)==5)

406

In [46]:
batch_sentences.shape

torch.Size([14, 29])

In [50]:
batch_sentences[(np.argmax(output_batch.detach().numpy(),axis=1)==5).reshape(14,29)]

IndexError: too many indices for tensor of dimension 2

In [51]:
(np.argmax(output_batch.detach().numpy(),axis=1)==5).reshape(14,29).shape

(14, 29)

In [52]:
batch_sentences

tensor([[3311,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1],
        [   1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1],
        [9025, 1846, 7557,  399, 2109, 8843, 4541, 1054, 7103, 1054, 8558, 4854,
         9185, 1916, 8558, 4013, 7836,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1],
        [9025, 3080, 4485, 7410, 1916, 5500, 4276, 7559, 1012, 4429, 8838, 7883,
         3386, 1468,   15, 5699, 7187,   15, 8482,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1],
        [9025, 3080, 4485, 1067, 1411, 1411, 4129, 1054,  332, 6321, 1054, 1916,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    

In [68]:
id2tag[5]

'B-skill'

In [69]:
id2tag

{0: 'I-person',
 1: 'B-experience',
 2: 'O',
 3: 'I-experience',
 4: 'I-skill',
 5: 'B-skill',
 6: 'I-city',
 7: 'B-city',
 8: 'B-person'}

In [66]:
tag_map

{'I-person': 0,
 'B-experience': 1,
 'O': 2,
 'I-experience': 3,
 'I-skill': 4,
 'B-skill': 5,
 'I-city': 6,
 'B-city': 7,
 'B-person': 8}

In [22]:
# Specify a path
PATH = "../model/resume_ner_model.pt"

# Save
#torch.save(model.state_dict(), PATH)

In [23]:
# Load
# Load
model = Net({'vocab_size':len(vocab),'embedding_dim':200,'lstm_hidden_dim':100,'number_of_tags':len(tag_map)})
model.load_state_dict(torch.load(PATH))
model.eval()

Net(
  (embedding): Embedding(9273, 200)
  (lstm): LSTM(200, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=9, bias=True)
)

In [24]:
output_batch = model(batch_sentences)

In [26]:
output_batch.shape

torch.Size([992, 9])

In [90]:
tags = batch_sentences.detach().numpy()[(np.argmax(output_batch.detach().numpy(),axis=1)==5).reshape(14,29)]

In [91]:
tags

array([3311,    1, 4541, 7103, 8558, 4854, 9185, 8558, 1411, 1411,  332,
       6321, 2558, 6710, 8399, 8799, 1411, 1760,    1, 6025, 1411,    1,
          1, 3311, 7103,    1, 8652, 7495, 5294, 5294, 7495, 7495, 7495,
       7495, 7540, 7540, 1848, 1848, 5494, 5494, 3311,  541, 4541, 3141,
       7103, 1630, 8652, 4179, 6710, 6710,  219, 1848, 1162, 5443, 7038,
        100])