In [1]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset
import torch.utils.data as data
import os
import re

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Data Preprocessing

In [None]:
# ## Data preprocessing
# 1. Create train_sentence, train_sentence_label and vocabulary.
# 2. Add '<unk>' and '<padding>' to the vocabulary.
# 3. Pad each sentence to a fixed length.
# 4. Convert every sentence to a list of vocabulary indices.

# Entity categories; get_data turns each one into a 'B-<category>' and an
# 'I-<category>' label.
ner_categories = ['person', 'geo-loc', 'company', 'facility', 'product',
            'musicartist', 'movie', 'sportsteam', 'tvshow', 'other']
# Tag prefixes. Only the first two ('B', 'I') are combined with the categories;
# a single plain 'O' label is appended after them.
ner_format = ['B','I','O']

def convert_to_indices(sentence,to_idx):
    """Translate a sequence of tokens into their integer ids.

    Example input: ['I','am','a','pig'].

    A token that is not a key of ``to_idx`` is replaced by the id stored under
    '<unk>'; that key is only looked up when an unknown token is met, so a
    mapping without '<unk>' works as long as every token is known.
    """
    return [to_idx[tok] if tok in to_idx else to_idx['<unk>'] for tok in sentence]

def get_data(path,ner_categories,ner_format,train_vocabulary=None,max_sentence_len=41):
    """Read a labelled NER file and convert it to padded index vectors.

    The file is expected to contain one ``token<TAB>label`` pair per line and a
    blank line after each sentence. Tokens are lower-cased; tokens starting
    with 'http:' are replaced by '<url>' and '@name' tokens by '<user>'.

    Args:
        path: Path of the labelled text file.
        ner_categories: Entity category names used to build the label set.
        ner_format: Tag prefixes; the first two are combined with every
            category ('B-person', 'I-person', ...), then 'O' is appended.
        train_vocabulary: Optional vocabulary built from the training file.
            When given it replaces the vocabulary collected from ``path`` so
            that token indices agree with the training ones.
        max_sentence_len: Length that shorter sentences / label lists are
            padded to. Longer sentences are kept as they are (not truncated).

    Returns:
        A tuple ``(train_vector, train_vector_label, vocabulary)``: the token
        index lists, the label index lists and the vocabulary that was used
        for the lookup.

    Raises:
        ValueError: If a non-blank line contains no tab character.
        KeyError: If a label is not part of the generated label set (the label
            mapping has no '<unk>' entry to fall back to).
    """
    vocabulary = set()
    train_sentence = []
    train_sentence_label = []
    sentence = []
    label = []
    with open(path,'r') as f1:
        for line in f1:
            # Strip only the line terminator, so a final line without a
            # trailing newline keeps its complete label.
            line = line.rstrip('\n')
            if line:
                TAB_idx = line.index('\t')
                # Lower-case every token.
                word = line[:TAB_idx].lower()
                if re.match('http:',word):
                    word = '<url>'
                elif len(word)>1 and word.startswith('@'):
                    word = '<user>'
                vocabulary.add(word)
                sentence.append(word)
                label.append(line[TAB_idx+1:])
            else:
                # A blank line closes the current sentence.
                train_sentence.append(sentence)
                train_sentence_label.append(label)
                sentence = []
                label = []
    # Keep the last sentence when the file does not end with a blank line.
    if sentence:
        train_sentence.append(sentence)
        train_sentence_label.append(label)
    vocabulary.add('<unk>')
    vocabulary.add('<padding>')

    # Pad short sentences with '<padding>' and their labels with 'O'.
    for idx,sentence in enumerate(train_sentence):
        if len(sentence)<max_sentence_len:
            train_sentence[idx] = sentence+['<padding>']*(max_sentence_len-len(sentence))
    for idx,label in enumerate(train_sentence_label):
        if len(label)<max_sentence_len:
            train_sentence_label[idx] = label+['O']*(max_sentence_len-len(label))

    # Build word_to_idx from the sorted vocabulary so the ids are reproducible.
    if train_vocabulary is not None: vocabulary=train_vocabulary
    idx_to_word = sorted(list(vocabulary))
    word_to_idx = {word: ind for ind, word in enumerate(idx_to_word)}

    train_vector = [convert_to_indices(sentence,word_to_idx) for sentence in train_sentence]

    # Label set: 'B-<cat>', 'I-<cat>' for every category, followed by 'O'.
    idx_to_label = []
    for category in ner_categories:
        for j in ner_format[:2]:
            idx_to_label.append(j+'-'+category)
    idx_to_label.append('O')
    label_to_idx = {label:i for i,label in enumerate(idx_to_label)}

    train_vector_label = [convert_to_indices(label,label_to_idx) for label in train_sentence_label]

    return train_vector,train_vector_label,vocabulary

In [None]:
class CustomDataset(Dataset):
    '''Wraps pre-computed index vectors so a DataLoader can iterate over them.

    In 'train' and 'val' mode an item is an ``(x, y)`` pair of tensors; in any
    other mode only the ``x`` tensor is returned.
    '''
    def __init__(self, mode, x_vectors, y_vectors=None):
        self.mode = mode
        self.x_vectors = x_vectors
        self.y_vectors = y_vectors

    def __len__(self):
        return len(self.x_vectors)

    def __getitem__(self, idx):
        features = self.x_vectors[idx]
        # Unlabelled modes: hand back the input tensor only.
        if self.mode not in ('train', 'val'):
            return torch.tensor(features)
        targets = self.y_vectors[idx]
        return torch.tensor(features),torch.tensor(targets)

In [None]:
# Hyper-parameters
# data
batch_size = 128
# model
# 10 categories x ('B', 'I') + 'O' = 21 labels.
NER_class = 21
# Padded sentence length; the same value (41) is hard-coded as a default in the
# preprocessing functions.
max_sentence_len = 41
# NOTE(review): passed to NER_model, but the model builds its LSTM with
# input_size=embedding_dim, so this value is not used by the visible model code.
lstm_input_size = max_sentence_len
embedding_dim = 100
lstm_hidden_size = 128
lstm_num_layers = 2

In [None]:
# Names used by the preprocessing code, with toy examples:
# train_sentence:       [['I','am','a','pig'],['HAHA'],['Yes','you','are']]
# train_vector:         [[0,1,2,5],[29],[33,8,100]], each followed by padding

# train_sentence_label: [['B-person','O','O','B-other'],['B-other'],['O','B-person','O']]
# train_vector_label:   [[0,20,20,18],[18],[20,18,20]], each followed by padding

# vocabulary:           set('I','am','a',......,'are')

# idx_to_word:          sorted(list(vocabulary))
# word_to_idx:          {'I':0,'am':1, 'a':2,......,'are':7}
# idx_to_label:         ['B-person', 'I-person',......,'O']
# label_to_idx:         {'B-person':0, 'I-person':1,......,'O':20}


# Get train/validation data. The validation file is indexed with the training
# vocabulary so that token ids agree between the two sets.
train_vector,train_vector_label, train_vocabulary = get_data('./train.txt',ner_categories,ner_format)
val_vector,val_vector_label, _ = get_data('./dev.txt',ner_categories,ner_format,train_vocabulary)

train_data = CustomDataset(mode='train', x_vectors=train_vector, y_vectors=train_vector_label)
train_loader = data.DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True, drop_last=False)

# Validation is read one sentence at a time, in file order.
val_data = CustomDataset(mode='val', x_vectors=val_vector, y_vectors=val_vector_label)
val_loader = data.DataLoader(dataset=val_data, batch_size=1, shuffle=False, drop_last=False)

# Training

In [None]:
# Create the model
# Restrict CUDA to the device with index 1.
# NOTE(review): hard-coded GPU index; on a machine with a single GPU this
# presumably hides it and the code falls back to CPU. Confirm before reuse.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class NER_model(nn.Module):
    """Embedding -> bidirectional LSTM -> linear layer giving per-token label scores.

    Args:
        NER_class: Number of output labels.
        lstm_input_size: Not used; kept so existing calls keep working. The
            LSTM input size is always the embedding dimension.
        lstm_hidden_size: Hidden size of each LSTM direction.
        lstm_num_layers: Number of stacked LSTM layers.
        vocab_size: Number of rows of the embedding table. When omitted it
            falls back to ``len(train_vocabulary)`` from the notebook globals.
        embed_dim: Size of each embedding vector. When omitted it falls back
            to the notebook global ``embedding_dim``.
    """
    def __init__(self, NER_class, lstm_input_size, lstm_hidden_size, lstm_num_layers,
                 vocab_size=None, embed_dim=None):
        super(NER_model, self).__init__()
        # Explicit sizes make the model usable without the notebook's global
        # state; the globals remain the default for backward compatibility.
        if vocab_size is None:
            vocab_size = len(train_vocabulary)
        if embed_dim is None:
            embed_dim = embedding_dim
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # Inter-layer dropout only exists when there is more than one layer;
        # passing it for a single layer just triggers a PyTorch warning.
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=lstm_hidden_size,
                            num_layers=lstm_num_layers, batch_first=True, bidirectional=True,
                            dropout=0.5 if lstm_num_layers>1 else 0.0)
        # Forward and backward hidden states are concatenated, hence *2.
        self.fc = nn.Linear(lstm_hidden_size*2, NER_class)

    def forward(self,x):
        """Map token ids of shape (batch, seq_len) to scores of shape (batch, seq_len, NER_class)."""
        embedded = self.embed(x)
        lstm_out, _ = self.lstm(embedded)
        return self.fc(lstm_out)
# Build the network and move it to the selected device. lstm_input_size is
# accepted by the constructor but not used when the layers are created.
model = NER_model(NER_class, lstm_input_size, lstm_hidden_size, lstm_num_layers).to(device)

In [None]:
# Validation helper: calculate the macro F1 score
def cal_metrics(model,val_loader,device):
    """Return the macro-averaged F1 score of ``model`` over ``val_loader``.

    Every position of every sample is scored, padding included, so the
    function works for any validation batch size (with batch_size=1 the result
    is the same as scoring only the first sample of each batch).

    Args:
        model: Network returning scores of shape (batch, seq_len, n_class).
        val_loader: Iterable yielding ``(x, y)`` batches of index tensors.
        device: Device the inputs are moved to before the forward pass.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    y_label = []
    y_pred = []

    with torch.no_grad():
        for x,y in val_loader:
            model_pred = model(x.to(device))
            # Flatten (batch, seq_len) into one token list; extend() avoids the
            # quadratic cost of repeated list concatenation.
            y_label.extend(y.reshape(-1).tolist())
            y_pred.extend(torch.argmax(model_pred, dim=2).reshape(-1).tolist())
    return f1_score(y_label,y_pred,average='macro')


# In[ ]:


# training
epochs = 1000
lr = 0.01
# reduction='none' keeps one loss value per token so each can be re-weighted.
loss_fun = nn.CrossEntropyLoss(reduction='none')
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

min_val_loss = 10000
max_f1score = 0

# Index of the 'O' label (last entry of the label list) and the factor applied
# to its loss so that the dominant 'O' class contributes less.
o_label_idx = 20
o_label_weight = 0.5
# Same file the evaluation cells read back with torch.load.
best_model_path = './best_f1score.pth'

for epoch in range(epochs):
    model.train()
    print('epoch {}:'.format(epoch))
    metrics = {'accuracy':None,'precision':None,'recall':None,'f1':None}

    batch_losses = []
    for x,y in train_loader:
        optimizer.zero_grad()
        x,y = x.to(device), y.to(device)    # x, y: batch_size*seq_len
        model_pred = model(x)    # model_pred.shape: batch*seq_len*n_class
        # CrossEntropyLoss expects the class dimension second.
        loss = loss_fun(model_pred.permute(0, 2, 1), y)    # batch*seq_len
        # Weighted loss computed in one tensor operation instead of a Python
        # loop over every token.
        token_weight = torch.ones_like(loss).masked_fill(y==o_label_idx, o_label_weight)
        # Mean over the tokens actually present in this batch.
        weight_loss = (loss*token_weight).sum()/y.numel()
        weight_loss.backward()
        optimizer.step()
        # Store a plain float so the autograd graph of each batch can be freed.
        batch_losses.append(weight_loss.item())
    print('train loss:{}'.format(sum(batch_losses)/len(train_loader)))

    # Save the model whenever the validation macro F1 improves.
    metrics['f1']  = cal_metrics(model,val_loader,device)
    if metrics['f1']>max_f1score:
        max_f1score=metrics['f1']
        torch.save({'best_state_dict':model.state_dict()},best_model_path)
        print('save new high f1 score:{}\n'.format(metrics['f1']))

In [None]:
# check validation result

# In[ ]:


# Reload the best checkpoint. map_location lets a checkpoint saved on GPU be
# loaded when only the CPU is available.
checkpoint  = torch.load('./best_f1score.pth', map_location=device)
model.load_state_dict(checkpoint['best_state_dict'])
model.eval()

# Collect gold labels and predictions for every validation token (padding
# positions included).
y_label = []
y_pred = []
with torch.no_grad():
    for x,y in val_loader:
        # x.size(): batch_size*seqlen
        # y.size(): batch_size*seqlen
        model_pred = model(x.to(device))    # batch_size*seqlen*n_class
        y_label.extend(y.reshape(-1).tolist())
        y_pred.extend(torch.argmax(model_pred, dim=2).reshape(-1).tolist())


# In[ ]:


# Token-level metrics on the validation set. Padding positions (labelled 'O'
# during preprocessing) are part of y_label / y_pred and therefore counted.
acc = accuracy_score(y_label,y_pred)
# average=None yields one value per label instead of a single aggregate.
precision = precision_score(y_label,y_pred,average=None)
recall = recall_score(y_label,y_pred,average=None)
# Macro average: unweighted mean of the per-label F1 scores.
f1 = f1_score(y_label,y_pred,average='macro')

# Generate predictions for the test set (reads result.txt, writes output.txt)

In [None]:
# Test-set preprocessing: original tokens are kept for the output file while
# normalized tokens are used for the vocabulary lookup, in a single pass.
def get_test_data(path,ner_categories,ner_format,train_vocabulary=None,max_sentence_len=41):
    """Read an unlabelled test file and convert it to padded index vectors.

    The file is expected to contain one token per line and a blank line after
    each sentence. For the index lookup every token is normalized exactly like
    in ``get_data`` (lower-cased, 'http:...' -> '<url>', '@name' -> '<user>');
    the returned sentences keep the original, unmodified tokens so they can be
    written back to the output file.

    Args:
        path: Path of the test file.
        ner_categories: Entity category names used to build the label list.
        ner_format: Tag prefixes; the first two are combined with every category.
        train_vocabulary: Optional training vocabulary. When given it replaces
            the vocabulary collected from ``path``.
        max_sentence_len: Length that shorter sentences are padded to. Longer
            sentences are kept as they are (not truncated).

    Returns:
        A tuple ``(train_vector, word_to_idx, idx_to_label, train_sentence)``:
        token index lists, the word-to-index mapping, the label list and the
        padded sentences with their original tokens.
    """
    vocabulary = set()
    train_sentence = []     # original tokens, returned to the caller
    lookup_sentence = []    # normalized tokens, used for the index lookup
    sentence = []
    normalized = []
    with open(path,'r') as f1:
        for line in f1:
            # Strip only the line terminator, so a final line without a
            # trailing newline keeps its last character.
            word = line.rstrip('\n')
            if word:
                token = word.lower()
                if re.match('http:',token):
                    token = '<url>'
                elif len(token)>1 and token.startswith('@'):
                    token = '<user>'
                vocabulary.add(token)
                sentence.append(word)
                normalized.append(token)
            else:
                # A blank line closes the current sentence.
                train_sentence.append(sentence)
                lookup_sentence.append(normalized)
                sentence = []
                normalized = []
    # Keep the last sentence when the file does not end with a blank line.
    if sentence:
        train_sentence.append(sentence)
        lookup_sentence.append(normalized)
    vocabulary.add('<unk>')
    vocabulary.add('<padding>')

    # Report sentences that are longer than the padded length.
    for i in train_sentence:
        if len(i)>max_sentence_len:print('True')

    # Pad both views of every short sentence to the same length.
    for idx,sentence in enumerate(train_sentence):
        if len(sentence)<max_sentence_len:
            pad = ['<padding>']*(max_sentence_len-len(sentence))
            train_sentence[idx] = sentence+pad
            lookup_sentence[idx] = lookup_sentence[idx]+pad

    # Build word_to_idx from the sorted vocabulary so the ids are reproducible.
    if train_vocabulary is not None: vocabulary=train_vocabulary
    idx_to_word = sorted(list(vocabulary))
    word_to_idx = {word: ind for ind, word in enumerate(idx_to_word)}

    train_vector = [convert_to_indices(tokens,word_to_idx) for tokens in lookup_sentence]

    # Label list: 'B-<cat>', 'I-<cat>' for every category, followed by 'O'.
    idx_to_label = []
    for category in ner_categories:
        for j in ner_format[:2]:
            idx_to_label.append(j+'-'+category)
    idx_to_label.append('O')

    return train_vector,word_to_idx,idx_to_label,train_sentence

In [None]:
# './result.txt' is the unlabelled test input. One call provides both the index
# vectors for the model and the padded token lists used to write the output,
# so the file does not need to be parsed a second time.
test_vector,word_to_idx,idx_to_label,train_sentence = get_test_data('./result.txt',ner_categories,ner_format,train_vocabulary)

test_data = CustomDataset(mode='test', x_vectors=test_vector)
test_loader = data.DataLoader(dataset=test_data, batch_size=1, shuffle=False, drop_last=False)

In [None]:
# Reload the best checkpoint. map_location lets a checkpoint saved on GPU be
# loaded when only the CPU is available.
checkpoint = torch.load('./best_f1score.pth', map_location=device)
model.load_state_dict(checkpoint['best_state_dict'])
model.eval()

# One predicted label id per (padded) token position, in file order.
y_pred = []
with torch.no_grad():
    for x in test_loader:
        # x.size(): batch_size*seqlen
        model_pred = model(x.to(device))    # batch_size*seqlen*n_class
        y_pred.extend(torch.argmax(model_pred, dim=2).reshape(-1).tolist())

In [None]:
# Build the output first, then write it, so './output.txt' is not truncated
# when the loop fails. Format: one 'token<TAB>label' line per real token and a
# blank line after every sentence.
lines = []
count = -1
for sentence in train_sentence:
    for token in sentence:
        # y_pred holds one prediction per padded position, so the counter
        # advances for padding tokens as well.
        count+=1
        if token!= '<padding>':
            lines.append(token+'\t'+idx_to_label[y_pred[count]]+'\n')
    lines.append('\n')

with open('./output.txt','w') as f1:
    f1.writelines(lines)