In [99]:
import numpy as np 
import jieba 
import langid 
from tqdm import tqdm
from torch.utils.data import TensorDataset,DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import torch.nn.functional as F
from sklearn import metrics

In [2]:
def read_file(data):
	''' This function is reading and preprocess the data '''
	
	word_insts = []
	labels = []
	with open(data,encoding='utf-8') as f :
		for sent in tqdm(f):
			full_line = sent.split('	')
			label = int(full_line[0])
			sent = full_line[1].replace(' ','')
			sent = sent.strip()
			sent_tuple = langid.classify(sent)
			if sent_tuple[0] == 'zh':
				words = list(jieba.cut(sent))
				words.insert(0,'<zh>')
			#elif sent_tuple[0] == 'ja':
			#	words = (mecab.parse(sent)).split()
				#words.insert(0,'<ja>')
		
			if words and label:
				word_insts += [words]
				labels.append(label)
	
	return word_insts,labels

In [3]:
def build_vocab_idx(word_insts):
	''' Trim vocab by number of occurence '''
	
	full_vocab = set(w for sent in word_insts for w in sent)
	print('[Info] Original Vocabulary size =', len(full_vocab))
	
	word2idx = {
		'<pad>': 0,
		'<unk>': 1,
		'<zh>' : 2,
		'<ja>' : 3}
	word_count = {w: 0 for w in full_vocab}
	
	for sent in word_insts:
		for word in sent:
			word_count[word] += 1
			
	for word,count in word_count.items():
		if word not in word2idx:
			word2idx[word] = len(word2idx)
	return word2idx

In [4]:
def convert_instance_to_idx_seq(word_insts,word2idx):
	''' Mapping  words to idx sequence.'''
	return [[word2idx.get(w,'<unk>') for w in s] for s in word_insts]

In [5]:
def pad_features(train_sec_insts, seq_length):
	''' Padding the insts'''
	
	features = np.zeros((len(train_sec_insts), seq_length), dtype=int)
	for i, row in enumerate(train_sec_insts):
		features[i,:len(row)] = np.array(row)[:seq_length]
	return features 

In [6]:
def split_data(features,labels):
	'''Split the dataset into train_data and val_data'''
	
	split_frac = 0.9
	split_idx = int(len(features) * split_frac)
	train_x, val_x = features[:split_idx], features[split_idx:]
	train_y, val_y = labels[:split_idx], labels[split_idx:]
	print("Train set: \t\t{}".format(train_x.shape), 
		"\nValidation set: \t{}".format(val_x.shape))
	return (train_x, train_y, val_x, val_y)

In [7]:
def prepare_dataloader(splited_data,batch_size):
	'''Turn the data into dataloader '''
	
	train_x, train_y, val_x, val_y = splited_data
	train_data = TensorDataset(torch.from_numpy(train_x),torch.from_numpy(train_y))
	val_data = TensorDataset(torch.from_numpy(val_x),torch.from_numpy(val_y))
	train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
	val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size)
	return train_loader, val_loader

In [8]:
def turn_label(labels):
    label_dict = {}
    label_dict[4] = 0
    for i in range(1,len(labels)):
        if labels[i] != labels[i-1] and labels[i] not in label_dict.keys():
            label_dict[labels[i]] = len(label_dict)
    new_label = []
    for j in labels:
        new_label.append(label_dict[j])
    return new_label

In [9]:
def prepare_dataloader(data,labels,batch_size):
    ''' Turn the data into dataloader'''
    
    tensor_data = TensorDataset(torch.from_numpy(data),torch.from_numpy(labels))
    data_loader = DataLoader(tensor_data,shuffle=True,batch_size=batch_size)
    return data_loader

In [10]:
data = 'midea.train'
train_insts,labels = read_file(data)
#labels = np.array(labels)
test_data = 'midea.test'
test_insts,test_labels = read_file(test_data)

0it [00:00, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.950 seconds.
Prefix dict has been built succesfully.
57849it [01:34, 612.28it/s]
1470it [00:02, 626.26it/s]


In [11]:
test_data = 'midea.test'
test_insts,test_labels = read_file(test_data)
print(test_insts[:10])

1470it [00:02, 649.68it/s]

[['<zh>', '启动', '微波炉'], ['<zh>', '开', '一下', '微波炉'], ['<zh>', '开启', '微波炉'], ['<zh>', '开始', '使用', '微波炉'], ['<zh>', '微波炉', '启动'], ['<zh>', '微波炉', '开机'], ['<zh>', '我要', '用', '微波炉'], ['<zh>', '打开', '微波炉'], ['<zh>', '打开', '微波炉', '电源'], ['<zh>', '把', '微波炉', '开开']]





In [12]:
all_insts = train_insts + test_insts
all_labels = labels + test_labels

train_insts = all_insts[:len(train_insts)]
train_labels = all_insts[:len(train_insts)]
test_insts = all_insts[len(train_insts):]
test_labels = all_labels[len(train_insts):]

print('The length of the train_insts is {}'.format(len(train_insts)))
print('The length of the test insts is {} '.format(len(test_insts)))
print('The length of the train label is {}'.format(len(train_labels)))
print('The length of the test label is {}'.format(len(test_labels)))

The length of the train_insts is 57849
The length of the test insts is 1470 
The length of the train label is 57849
The length of the test label is 1470


In [13]:
print(all_insts[:10])
print(test_insts[:10])

[['<zh>', '一定'], ['<zh>', '定个'], ['<zh>', '好', '的', '哎'], ['<zh>', '可以', '哟'], ['<zh>', '可以'], ['<zh>', '可以'], ['<zh>', '可以'], ['<zh>', '行', '吧'], ['<zh>', '行', '吧'], ['<zh>', '要', '确定']]
[['<zh>', '启动', '微波炉'], ['<zh>', '开', '一下', '微波炉'], ['<zh>', '开启', '微波炉'], ['<zh>', '开始', '使用', '微波炉'], ['<zh>', '微波炉', '启动'], ['<zh>', '微波炉', '开机'], ['<zh>', '我要', '用', '微波炉'], ['<zh>', '打开', '微波炉'], ['<zh>', '打开', '微波炉', '电源'], ['<zh>', '把', '微波炉', '开开']]


In [14]:
word2idx = build_vocab_idx(all_insts)
# train_sec_insts = convert_instance_to_idx_seq(train_insts,word2idx)
# test_sec_insts = convert_instance_to_idx_seq(test_insts,word2idx)
all_insts = convert_instance_to_idx_seq(all_insts,word2idx)
np_all_label = turn_label(all_labels)
# np_train_label = turn_label(train_labels)
# np_test_label = turn_label(test_labels)
train_sec_insts = all_insts[:len(train_insts)]
test_sec_insts = all_insts[len(train_insts):]

np_train_label = np_all_label[:len(train_insts)]
np_test_label = np_all_label[len(train_insts):]

np_train_label = np.array(np_train_label)
np_test_label = np.array(np_test_label)


train_feature = pad_features(train_sec_insts,30)
test_feature = pad_features(test_sec_insts ,30)

print("Train set: \t{}".format(train_feature.shape), 
		"\ntest set: \t{}".format(test_feature.shape))

train_loader = prepare_dataloader(train_feature, np_train_label, batch_size=50)
test_loader = prepare_dataloader(test_feature, np_test_label, batch_size=50)

[Info] Original Vocabulary size = 7269
Train set: 	(57849, 30) 
test set: 	(1470, 30)


In [None]:
# labels = np.array(new_labels)
# word2idx = build_vocab_idx(words_ints)
# train_sec_insts = convert_instance_to_idx_seq(words_ints,word2idx)
# features = pad_features(train_sec_insts,30)
# splited_data = split_data(features,labels)
# train_loader,val_loader = prepare_dataloader(splited_data,50)

In [15]:
dataiter = iter(test_loader)
sample_x  , sample_y = dataiter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 30])
Sample input: 
 tensor([[   2, 6299, 4915,  ...,    0,    0,    0],
        [   2, 4746, 6566,  ...,    0,    0,    0],
        [   2, 2754, 5417,  ...,    0,    0,    0],
        ...,
        [   2, 2771,  368,  ...,    0,    0,    0],
        [   2, 6299, 1215,  ...,    0,    0,    0],
        [   2, 6299,  132,  ...,    0,    0,    0]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([64, 23, 85, 80, 27, 10, 47, 45, 13, 40, 73, 26, 84, 92, 72, 11, 47, 65,
        20, 61, 95, 95, 25, 82, 41, 99, 25, 89, 43, 35, 10, 88, 11, 81, 34, 37,
        31, 54, 17, 28, 97, 53, 78, 11, 17, 12, 71, 14, 90, 23])


In [16]:
dataiter = iter(train_loader)
sample_x  , sample_y = dataiter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 30])
Sample input: 
 tensor([[   2,  149,  741,  ...,    0,    0,    0],
        [   2, 1273, 6614,  ...,    0,    0,    0],
        [   2, 7229, 6678,  ...,    0,    0,    0],
        ...,
        [   2, 1961, 2430,  ...,    0,    0,    0],
        [   2, 6516, 1784,  ...,    0,    0,    0],
        [   2, 5943,  102,  ...,    0,    0,    0]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([ 59,   4,  59,  59,  83,   3,  65,  57,  47,  21,  11,  25,  57, 101,
         90,  21,  27,  54,  84,   4,  73,  59,  16,  11,  59,  59,   3,  27,
         60,  47,   3,  17,  17,  82,  12,  94,  94,  72,  41,   1,  57,  57,
         94,  32,  17,  25,  28,  24,  11,  57])


In [80]:
class LSTM(nn.Module):
    
    def __init__(self,input_size, hidden_size, batch_size=50,num_layer=2,batch_first= True,
                dropout=0.5, output_size=105, embedding_dim=300,train_on_gpu=True):
        
        super(LSTM,self).__init__()
        
        self.num_layer = num_layer 
        self.train_on_gpu = train_on_gpu
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_dim, padding_idx=0)
        self.LSTM = nn.LSTM(input_size=embedding_dim,hidden_size=hidden_size,
                           num_layers=num_layer,#,batch_first=batch_first)
                           bidirectional=True)
        self.fc = nn.Linear(hidden_size*2,output_size)
    
    def init_hidden(self,batch_size=None):
        if batch_size is None:
            batch_size = self.batch_size
        if self.train_on_gpu:
            h0 = Variable(torch.zeros(self.num_layer*2, batch_size, self.hidden_size).cuda())
            c0 = Variable(torch.zeros(self.num_layer*2, batch_size, self.hidden_size).cuda())
        else:
            h0 = Variable(torch.zeros(self.num_layer*2, batch_size, self.hidden_size))
            c0 = Variable(torch.zeros(self.num_layer*2, batch_size, self.hidden_size))
    
    def forward(self, x):
        
#         batch_size = x.size(0)
        
#         hidden_state = Variable(torch.zeros(1, batch_size, self.hidden_size))   # [num_layers(=1) * num_directions(=1), batch_size, n_hidden]
#         cell_state = Variable(torch.zeros(1,batch_size , self.hidden_size))  
        
#         hidden_state,cell_state = hidden_state.cuda(),cell_state.cuda()

        
        
#         embed = self.embedding(x)
        
#         lstm_out ,(_,_) = self.LSTM(embed,(hidden_state,cell_state))
#         lstm_out = lstm_out[:,-1,:]
#         out = self.fc(lstm_out)
#         #out = out[:,-1,:]
        
        embeds = self.embedding(x)
        x = embeds.permute(1,0,2)
        self.hidden = self.init_hidden(x.size()[0])
        lstm_out,self.hidden = self.LSTM(x,self.hidden)
        final = lstm_out[-1]
        out = self.fc(final)
        
        return out

In [93]:
vocab_size = len(word2idx)
net = LSTM(input_size=vocab_size,hidden_size=256)
print(net)

LSTM(
  (embedding): Embedding(7272, 300, padding_idx=0)
  (LSTM): LSTM(300, 256, num_layers=2, bidirectional=True)
  (fc): Linear(in_features=512, out_features=105, bias=True)
)


In [94]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(),lr=0.001)

In [95]:
train_on_gpu = torch.cuda.is_available()
print_every = 100
if train_on_gpu:
    net.cuda()
    
counter = 0
epochs =  5
clip=5

net.train()
for epoch in range(epochs):
    
    for inputs,labels in train_loader:
        counter += 1
        if train_on_gpu:
            inputs, labels = inputs.cuda(),labels.cuda()
        
        net.zero_grad()
        output = net(inputs)
        loss = criterion(output.squeeze(),labels)
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        if counter % print_every == 0:
#             test_losses = []
#             net.eval()
#             for inputs,labels in test_loader:
#                 if(train_on_gpu):
#                     inputs, labels = inputs.cuda(), labels.cuda()
                
#                 outputs = net(inputs)
#                 test_loss = criterion(outputs.squeeze(),labels)
#                 test_losses.append(test_loss.item())
# #                 print(val_losses)
#             net.train()
            print("Epoch: {}/{}...".format(epoch+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()))#,
#                   "Val Loss: {:.6f}".format(np.mean(test_losses)))

Epoch: 1/5... Step: 100... Loss: 3.581267...
Epoch: 1/5... Step: 200... Loss: 3.765254...
Epoch: 1/5... Step: 300... Loss: 2.970754...
Epoch: 1/5... Step: 400... Loss: 3.035429...
Epoch: 1/5... Step: 500... Loss: 2.724170...
Epoch: 1/5... Step: 600... Loss: 2.683354...
Epoch: 1/5... Step: 700... Loss: 1.886301...
Epoch: 1/5... Step: 800... Loss: 1.888956...
Epoch: 1/5... Step: 900... Loss: 1.511148...
Epoch: 1/5... Step: 1000... Loss: 1.245008...
Epoch: 1/5... Step: 1100... Loss: 0.708896...
Epoch: 2/5... Step: 1200... Loss: 0.767166...
Epoch: 2/5... Step: 1300... Loss: 0.417788...
Epoch: 2/5... Step: 1400... Loss: 0.519082...
Epoch: 2/5... Step: 1500... Loss: 0.521403...
Epoch: 2/5... Step: 1600... Loss: 0.321520...
Epoch: 2/5... Step: 1700... Loss: 0.382657...
Epoch: 2/5... Step: 1800... Loss: 0.345871...
Epoch: 2/5... Step: 1900... Loss: 0.124673...
Epoch: 2/5... Step: 2000... Loss: 0.182428...
Epoch: 2/5... Step: 2100... Loss: 0.308351...
Epoch: 2/5... Step: 2200... Loss: 0.276433.

In [105]:
num_correct = 0
net.eval() 

test_losses = []
pred_labels = []
true_labels = []
for inputs ,labels in test_loader:
    
    if  train_on_gpu:
        inputs , labels = inputs.cuda() , labels.cuda()
        
    output = net(inputs)
    test_loss = criterion(output.squeeze(),labels)
    test_losses.append(test_loss.item())
    pred = torch.round(output.squeeze())
#     print(pred.size())
#     print(pred.data.max(1)[1])
    pred = pred.data.max(1)[1]
    pred_labels += list(pred.cpu().numpy())
    true_labels += list(labels.cpu().numpy())
    correct_tensor = pred.eq(labels).view_as(pred)
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)
    
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))


Test loss: 1.116
Test accuracy: 0.786


In [113]:
# print(pred_labels)
# real_label = list(test_labels)
print(true_labels[:10])
print(pred_labels[:10])
f1_score = metrics.f1_score(true_labels,pred_labels,average='weighted')
print("This test's F1 score is {}".format(f1_score))
# print(metrics.f1_score(true_labels,pred_labels,average='weighted'))

[27, 76, 17, 50, 17, 9, 85, 104, 88, 65]
[27, 76, 17, 57, 17, 9, 85, 104, 88, 65]
This test's F1 score is 0.7866241638925211


In [61]:
def predict(net,test_review,word2idx,seqence_length = 30,train_on_put=True):
    
    net.eval()
    test_review = test_review.replace(' ','')
    test_review = list(jieba.cut(test_review))
    test_review.insert(0,'<zh>')
    test_ints = [[word2idx[word] for word in test_review]]
    
    features = pad_features(test_ints,30)
    
    feature_tensor = torch.from_numpy(features)
    
    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()
    
    output = net(feature_tensor)
    print(output.data.max(1))
    output = output.data.max(1)[1]
    print(output.size())
    print(output)
    #pred = torch.round(output.squeeze())
    
    #print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
#     print(len(pred))
#     print(pred)

In [62]:
test_review = '恢复 运行 一下'
predict(net,test_review,word2idx)

(tensor([13.0319], device='cuda:0'), tensor([104], device='cuda:0'))
torch.Size([1])
tensor([104], device='cuda:0')
