In [14]:
import numpy as np
import random
import math
import collections
import gensim
from gensim.models import Word2Vec

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [15]:
# Load the pre-trained gensim Word2Vec model from disk; `model.wv` is queried
# in the following cell to inspect individual word vectors.
embedding_path = "../word2vec/AJData.word2vec.model"
model = Word2Vec.load(embedding_path)

In [44]:
# Sanity-check the loaded word2vec model: stack the vectors of two known words
# into one array, then probe vocabulary membership and the embedding width.
a = np.array([model.wv[word] for word in ('好好', '法庭')])
print(a.shape)
print('法庭d' in model.wv)
print(model.layer1_size)

(2, 256)
False
256


In [27]:
def load_data(data_path):
    """
    Load the tokenised corpus from a tab-separated text file.

    Each non-blank line must have at least three tab-separated fields; the
    second field holds the space-separated tokens and the third the integer
    label. The first field is ignored.

    Args:
        data_path: path of the file to read (decoded as UTF-8).

    Returns:
        (data, labels): `data` is a list of token lists, `labels` the list of
        integer labels in the same order.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or its
            label is not an integer.
    """
    data = []
    labels = []
    max_sentence_len = 0
    # Explicit encoding: the corpus is Chinese text, so relying on the
    # platform default encoding is fragile. The `with` block closes the file.
    with open(data_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.rstrip('\r\n')
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            line_list = line.split('\t')
            if len(line_list) < 3:
                raise ValueError(
                    "line %d of %s has %d tab-separated fields, expected at least 3"
                    % (line_no, data_path, len(line_list)))
            one_data = line_list[1].split(' ')
            max_sentence_len = max(max_sentence_len, len(one_data))
            data.append(one_data)
            labels.append(int(line_list[2]))
    print("max sentence length: ", max_sentence_len)
    return data, labels

In [28]:
from itertools import groupby

def show_text_len_distribution(data, step=500):
    """
    Print a histogram of item lengths, bucketed into ranges of width `step`.

    One line is printed per non-empty bucket, in ascending order, formatted as
    "<low>-<high>:<count>" where the bucket covers lengths low..high inclusive
    (1-500, 501-1000, ... for the default step). Items of length 0 fall into
    the bucket below the first one.

    Args:
        data: iterable of sized items (e.g. token lists).
        step: positive integer bucket width; defaults to 500.

    Raises:
        ValueError: if `step` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")
    # groupby only merges adjacent equal keys, so the lengths must be sorted.
    len_list = sorted(len(text) for text in data)
    for k, g in groupby(len_list, key=lambda x: (x - 1) // step):
        bucket_count = sum(1 for _ in g)
        print('{}-{}'.format(k*step+1, (k+1)*step)+":"+str(bucket_count))


In [29]:
def build_voabulary(data, vocabulary_size=50000):
    """
    Build the vocabulary from all of the data.

    Args:
        data: iterable of token lists.
        vocabulary_size: maximum number of vocabulary entries, including the
            'UNK' placeholder that always takes index 0.

    Returns:
        (count, dict_word2index, dict_index2word):
        `count` is a list starting with ['UNK', -1] followed by (word, freq)
        pairs for the `vocabulary_size - 1` most frequent words;
        `dict_word2index` maps each word to its index and `dict_index2word`
        is the inverse mapping.
    """
    count = [['UNK', -1]]
    # Count every token exactly once (the corpus was previously walked twice,
    # which doubled every reported frequency).
    word_counter = collections.Counter()
    for line in data:
        word_counter.update(line)
    count.extend(word_counter.most_common(vocabulary_size - 1))

    dict_word2index = dict()
    for word, _ in count:
        # Skip words that already have an index (e.g. a literal 'UNK' token in
        # the data); re-assigning would give two words the same index.
        if word not in dict_word2index:
            dict_word2index[word] = len(dict_word2index)
    dict_index2word = {index: word for word, index in dict_word2index.items()}

    return count, dict_word2index, dict_index2word

In [30]:
def build_dataset(data, labels, dict_word2index, max_sentence_len=1000, label_size=8):
    """
    Turn tokenised texts into a fixed-width integer matrix using the vocabulary.

    Samples are visited in a random order drawn from numpy's global RNG.
    Tokens missing from `dict_word2index` map to 0 (the UNK index), rows
    shorter than `max_sentence_len` are right-padded with 0 and longer rows
    are truncated. Every label is shifted down by one.

    Args:
        data: list of token lists.
        labels: list of integer labels aligned with `data`.
        dict_word2index: mapping from token to vocabulary index.
        max_sentence_len: width of every output row.
        label_size: accepted but not used by this function.

    Returns:
        (dataset, new_labels): int64 arrays holding the shuffled, encoded
        rows and the matching shifted labels.
    """
    order = np.arange(len(labels))
    np.random.shuffle(order)

    encoded_rows = []
    shifted_labels = []
    for sample_idx in order:
        shifted_labels.append(labels[sample_idx] - 1)
        # 0 doubles as the UNK index and as the padding value.
        row = [dict_word2index.get(token, 0) for token in data[sample_idx]]
        missing = max_sentence_len - len(row)
        if missing > 0:
            row.extend([0] * missing)
        encoded_rows.append(row[:max_sentence_len])

    return (np.array(encoded_rows, dtype=np.int64),
            np.array(shifted_labels, dtype=np.int64))

In [31]:
def split_data(data, radio=0.7):
    """
    Split a sequence into a leading training part and a trailing validation part.

    Args:
        data: any sliceable sequence (list, numpy array, ...).
        radio: fraction of the items that goes into the first part; the split
            index is int(len(data) * radio). Defaults to 0.7.

    Returns:
        (new_data1, new_data2): the first `radio` share and the remainder.
    """
    # Honour the caller's ratio (the value 0.7 used to be hard-coded here).
    split_index = int(len(data) * radio)
    new_data1 = data[:split_index]
    new_data2 = data[split_index:]
    return new_data1, new_data2

In [32]:
from torch.utils import data

class MingLueData(Dataset):
    """
    Minimal map-style dataset pairing encoded texts with their labels.

    Subclasses `Dataset` as imported from `torch.utils.data` at the top of the
    notebook rather than going through the `data` module alias: the name
    `data` is rebound to other objects by later cells, which would break this
    class definition when its cell is re-run.

    Args:
        X: array-like with a `.shape` attribute; the first axis indexes samples.
        y: sequence of labels with one entry per row of `X`.

    Raises:
        ValueError: if `X` and `y` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        if X.shape[0] != len(y):
            raise ValueError(
                "X has %d samples but y has %d labels" % (X.shape[0], len(y)))
        self.len = X.shape[0]
        self.x_data = X
        self.y_data = y

    def __getitem__(self, index):
        """Return the (text, label) pair stored at `index`."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

In [33]:
# ---- configuration -------------------------------------------------------
data_path = '../data/seg_sample_train.txt'
vocab_size = 100000
max_text_len = 1000
batch_size = 4
num_workers = 2

# ---- load the corpus and look at its length distribution -----------------
data, labels = load_data(data_path)
show_text_len_distribution(data)

# ---- vocabulary and numeric encoding (build_dataset also shuffles) -------
count, dict_word2index, dict_index2word = build_voabulary(
    data, vocabulary_size=vocab_size)
train_data, train_labels = build_dataset(
    data, labels, dict_word2index, max_sentence_len=max_text_len)

# ---- train / validation split: same ratio on texts and labels ------------
train_X, valid_X = split_data(train_data)
train_y, valid_y = split_data(train_labels)
print(train_X.shape)
print(train_y.shape)

# ---- loaders over the first 3000 training and 1000 validation samples ----
dataset = MingLueData(train_X[:3000], train_y[:3000])
train_loader = DataLoader(dataset,
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=num_workers)
dataset = MingLueData(valid_X[:1000], valid_y[:1000])
valid_loader = DataLoader(dataset,
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=num_workers)

max sentence length:  20420
1-500:6015
501-1000:2406
1001-1500:647
1501-2000:303
2001-2500:206
2501-3000:125
3001-3500:70
3501-4000:55
4001-4500:37
4501-5000:33
5001-5500:19
5501-6000:17
6001-6500:9
6501-7000:7
7001-7500:5
7501-8000:6
8001-8500:7
8501-9000:4
9001-9500:7
9501-10000:5
10501-11000:2
11501-12000:1
12001-12500:2
12501-13000:3
14501-15000:3
15001-15500:1
16001-16500:1
17001-17500:1
17501-18000:2
20001-20500:1
(7000, 1000)
(7000,)


In [34]:
# Default hyper-parameters for the model below.
embedding_size = 128        # width of each word embedding
sentence_size = 1000        # number of tokens in every (padded/truncated) input
feature_size = 100          # convolution output channels per window size
num_class = 8               # number of target classes
vocab_size = 100000         # rows in the embedding table
window_sizes = [3, 4, 5]    # convolution kernel widths
dropout_rate = 0.5          # dropout probability applied before the classifier


class TextCNN(nn.Module):
    """
    Convolutional text classifier: embedding -> parallel Conv1d/ReLU/max-pool
    branches (one per window size) -> dropout -> linear layer.

    All constructor arguments are optional and default to the module-level
    hyper-parameters as they were when this class was defined, so `TextCNN()`
    builds the same network as before.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: embedding dimension (Conv1d input channels).
        sentence_size: expected token length of every input row; each pooling
            kernel is sized so that it spans the whole convolution output.
        feature_size: output channels of each convolution branch.
        num_class: number of output logits.
        window_sizes: iterable of convolution kernel widths.
        dropout_rate: dropout probability used while the module is training.
    """

    def __init__(self, vocab_size=vocab_size, embedding_size=embedding_size,
                 sentence_size=sentence_size, feature_size=feature_size,
                 num_class=num_class, window_sizes=window_sizes,
                 dropout_rate=dropout_rate):
        super(TextCNN, self).__init__()
        window_sizes = tuple(window_sizes)
        self.dropout_rate = dropout_rate
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.convs = nn.ModuleList([
                nn.Sequential(nn.Conv1d(in_channels=embedding_size,
                                        out_channels=feature_size,
                                        kernel_size=h),
                              nn.ReLU(),
                              # Kernel equals the conv output length, so each
                              # channel is reduced to a single value.
                              nn.MaxPool1d(kernel_size=sentence_size-h+1))
                     for h in window_sizes
                    ])
        self.fc = nn.Linear(in_features=feature_size*len(window_sizes), out_features=num_class)

    def forward(self, x):
        """
        Compute class logits for a batch of token-index rows.

        Args:
            x: integer tensor of token indices, one row per sample with
               `sentence_size` columns.

        Returns:
            Tensor with one row of `num_class` logits per sample.
        """
        embed_x = self.embedding(x)
        # Conv1d wants channels before length: e.g. 4 x 1000 x 128 -> 4 x 128 x 1000.
        embed_x = embed_x.permute(0, 2, 1)
        out = [conv(embed_x) for conv in self.convs]
        out = torch.cat(out, dim=1)
        # Flatten per sample. Keeping the batch axis fixed means an input whose
        # length does not match `sentence_size` fails loudly in the linear
        # layer instead of silently mixing values from different samples.
        out = out.view(out.size(0), -1)
        # Tie dropout to the module's train/eval state; without the explicit
        # flag the functional form ignores text_cnn.train() / text_cnn.eval().
        out = F.dropout(input=out, p=self.dropout_rate, training=self.training)
        out = self.fc(out)
        return out
    
        
# Instantiate the model and print its layer structure.
text_cnn = TextCNN()
print(text_cnn)

TextCNN (
  (embedding): Embedding(100000, 128)
  (convs): ModuleList (
    (0): Sequential (
      (0): Conv1d(128, 100, kernel_size=(3,), stride=(1,))
      (1): ReLU ()
      (2): MaxPool1d (size=998, stride=998, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential (
      (0): Conv1d(128, 100, kernel_size=(4,), stride=(1,))
      (1): ReLU ()
      (2): MaxPool1d (size=997, stride=997, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential (
      (0): Conv1d(128, 100, kernel_size=(5,), stride=(1,))
      (1): ReLU ()
      (2): MaxPool1d (size=996, stride=996, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (fc): Linear (300 -> 8)
)


In [35]:
# Optimisation hyper-parameters.
learning_rate = 0.001
weight_decay = 0.001

# Cross-entropy over the class logits produced by the network.
loss_fun = nn.CrossEntropyLoss()

# weight_decay is equivalent to L2 regularisation of the parameters.
optimizer = optim.Adam(
    text_cnn.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

In [36]:
epoch_num = 3

# Put the model in training mode explicitly, in case an earlier cell left it
# in eval mode.
text_cnn.train()

for epoch in range(epoch_num):
    running_loss = 0.0
    # Unpack the batch in the loop header so no global named `data` is rebound.
    for i, (texts, labels) in enumerate(train_loader, 0):
        inputs, labels = Variable(texts), Variable(labels)

        optimizer.zero_grad()
        outputs = text_cnn(inputs)
        loss = loss_fun(outputs, labels)
        loss.backward()
        optimizer.step()

        # NOTE(review): `loss.data[0]` relies on the loss being a 1-element
        # tensor, which stopped being true when PyTorch introduced 0-dim
        # scalars; summing and converting to float is intended to work with
        # either representation - confirm on the targeted PyTorch version.
        running_loss += float(loss.data.sum())
        # Report the mean loss of each full window of 100 batches.
        if i % 100 == 99:
            print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0

[1,   100] loss: 2.056
[1,   200] loss: 2.003
[1,   300] loss: 2.086
[1,   400] loss: 2.029
[1,   500] loss: 2.032
[1,   600] loss: 1.941
[1,   700] loss: 1.955
[2,   100] loss: 1.975
[2,   200] loss: 1.943
[2,   300] loss: 1.972
[2,   400] loss: 1.940
[2,   500] loss: 1.914
[2,   600] loss: 1.878
[2,   700] loss: 1.875
[3,   100] loss: 1.935
[3,   200] loss: 1.900
[3,   300] loss: 1.939
[3,   400] loss: 1.919
[3,   500] loss: 1.901
[3,   600] loss: 1.860
[3,   700] loss: 1.862


In [37]:
from collections import Counter


def micro_avg_f1(predict_label, true_label, num_class):
    """
    Average the per-class F1 scores, weighting each class by its support.

    For every class i in range(num_class) the one-vs-rest F1 from `f1` is
    multiplied by the number of true samples of class i, and the sum is
    divided by the number of predictions. Despite the function's name, this is
    the support-weighted mean of per-class F1, not pooled-count micro-F1.

    Args:
        predict_label: sequence of predicted class ids.
        true_label: sequence of ground-truth class ids, aligned with
            `predict_label`.
        num_class: number of classes; ids are expected in range(num_class).

    Returns:
        The weighted F1 as a float; 0.0 when there are no predictions.
    """
    N = len(predict_label)
    if N == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0
    m = num_class
    w = Counter(true_label)
    print(w)
    score = 0
    for i in range(m):
        score += w[i] * f1(predict_label, true_label, i)

    return score / float(N)


def f1(predict_label, true_label, cur_label):
    """
    One-vs-rest F1 score of class `cur_label`.

    Args:
        predict_label: sequence of predicted class ids.
        true_label: sequence of ground-truth class ids, indexed in step with
            `predict_label`.
        cur_label: the class treated as positive.

    Returns:
        2 * precision * recall / (precision + recall), or 0 when the class
        has no true positives.
    """
    hits = 0            # predicted cur_label, truth is cur_label
    false_alarms = 0    # predicted cur_label, truth is something else
    misses = 0          # predicted something else, truth is cur_label
    for idx, guess in enumerate(predict_label):
        is_actual = true_label[idx] == cur_label
        if guess == cur_label:
            if is_actual:
                hits += 1
            else:
                false_alarms += 1
        elif is_actual:
            misses += 1

    # Without true positives precision and recall are both zero.
    if hits == 0:
        return 0

    precision = hits / float(hits + false_alarms)
    recall = hits / float(hits + misses)
    return 2 * precision * recall / (precision + recall)

In [39]:
true_labels = []
predicted_labels = []

# Evaluate in eval mode so train-only layers such as dropout are disabled,
# then restore whatever mode the model was in before this cell ran.
was_training = text_cnn.training
text_cnn.eval()

for texts, labels in valid_loader:
    outputs = text_cnn(Variable(texts))
    # Index of the highest logit per sample = predicted class.
    _, predicted = torch.max(outputs.data, 1)
    # Flatten before converting so plain Python ints are collected whether the
    # tensors are shaped (batch,) or (batch, 1); Counter and `==` in the metric
    # code need ints rather than tensor elements.
    true_labels.extend(labels.view(-1).tolist())
    predicted_labels.extend(predicted.view(-1).tolist())

text_cnn.train(was_training)

print(true_labels[:10])
print(predicted_labels[:10])
print("Micro-Averaged F1:",micro_avg_f1(predicted_labels, true_labels, num_class))

[5, 1, 2, 0, 6, 3, 6, 0, 0, 1]
[6, 1, 6, 6, 6, 6, 6, 6, 6, 6]
Counter({6: 233, 1: 167, 5: 160, 0: 141, 4: 125, 2: 111, 3: 58, 7: 5})
Micro-Averaged F1: 0.23868395004777757
