In [9]:
import pandas as pd
import numpy as np 
from process import *
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader


# Train on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"using {device} for training")

using cuda for training


In [10]:
# Prepare the CSV training data from the raw JSON data
#def prepare_data(train_data_origin_path, train_data_path, sampled_label=None):
#     df = pd.read_json(train_data_origin_path, lines=True)
#     # df = df.loc[df['label'].isin(sampled_label)]
#     df.to_csv(train_data_path, encoding='utf-8')
#     return df
# prepare_data('data/train.json','data/train.csv')
# prepare_data('data/dev.json','data/dev.csv')

In [11]:
# Fixed number of token positions every sentence is vectorized to.
lens = 23
# Seed for the random out-of-vocabulary vector, so that restarting the kernel
# reproduces the same OOV embedding (and therefore the same model inputs).
OOV_SEED = 0
embedding_path = './embedding_models/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'
print(" == loading word embedding")
# `load_embeddings` comes from `process`; `size` is not read by the cells visible here.
vectors, size, dim = load_embeddings(embedding_path)
# Special tokens: OOV gets a reproducible uniform [0, 1) vector, PAD is all zeros.
vectors['OOV'] = np.random.default_rng(OOV_SEED).random(dim)
vectors['PAD'] = np.zeros(dim)

 == loading word embedding
word embedding vocab loaded


In [12]:
def embed_sentences(sentence_list):
    """Embed raw sentences into a 4-D array.

    Each sentence is passed to `vectorize` (from `process`) together with the
    notebook-level `lens` and `vectors`; the stacked result is reshaped to
    (len(sentence_list), lens, dim, 1).
    """
    embedded = np.array([
        vectorize(sentence=sentence, length=lens, padding='PAD', oov='OOV', vectors=vectors)
        for sentence in sentence_list
    ])
    return embedded.reshape(len(sentence_list), lens, dim, 1)


# load train data
train_data = pd.read_csv('./data/train.csv')
dev_data = pd.read_csv('./data/dev.csv')
sentences = train_data['sentence'].values.tolist()
sentences_embedded = embed_sentences(sentences)

labels = train_data['label'].values.tolist()
label_set = set(labels)
# Number the classes in sorted order. Iterating the set directly gives an order
# that can change between interpreter sessions (string hashing is randomized),
# which would silently change the label -> index mapping from run to run.
label_map = {label: index for index, label in enumerate(sorted(label_set))}
labels_mapped = np.array([label_map[label] for label in labels])

# load dev data
dev_sentences = dev_data['sentence'].values.tolist()
dev_sentences_embedded = embed_sentences(dev_sentences)

dev_labels = dev_data['label'].values.tolist()
# Fail with a readable message if the dev split contains a class never seen in training.
unseen_dev_labels = set(dev_labels) - label_set
if unseen_dev_labels:
    raise KeyError(f"dev labels missing from the training set: {sorted(unseen_dev_labels)}")
dev_labels_mapped = np.array([label_map[label] for label in dev_labels])

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.689 seconds.
Prefix dict has been built successfully.


In [13]:
# Convert the NumPy arrays to tensors. `permute(0, 3, 1, 2)` moves the last axis
# to position 1, so the inputs become channel-first for the Conv2d layers.
train_inputs = torch.tensor(sentences_embedded, dtype=torch.float).permute(0, 3, 1, 2)
train_targets = torch.tensor(labels_mapped, dtype=torch.long)
train_dataset = TensorDataset(train_inputs, train_targets)

dev_inputs = torch.tensor(dev_sentences_embedded, dtype=torch.float).permute(0, 3, 1, 2)
dev_targets = torch.tensor(dev_labels_mapped, dtype=torch.long)
dev_dataset = TensorDataset(dev_inputs, dev_targets)

# Only the training batches are shuffled; the dev set is read in file order.
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=256, shuffle=False)

## 模型构建

In [14]:
class textCNN(nn.Module):
    """TextCNN classifier over pre-embedded sentences.

    Several parallel 2-D convolutions, each with a kernel spanning the full
    embedding width, slide over the token axis. Every feature map is max-pooled
    over the token axis, the pooled features are concatenated, and a two-layer
    fully connected head produces the class logits.

    Args:
        num_class: number of output classes (length of the logit vector).
        emb_dim: width of one token embedding; also the width of every kernel.
        kernel_sizes: token-window heights, one convolution branch per entry.
        num_filters: output channels of each convolution branch.
        hidden_dim: width of the hidden fully connected layer.
        conv_dropout_p: dropout probability applied to the activated feature maps.
        fc_dropout_p: dropout probability applied after the hidden layer.

    The defaults reproduce the original fixed architecture (windows 4/5/6/7,
    32 filters each, 64 hidden units), and the attribute names are unchanged,
    so existing state dicts still load.
    """

    def __init__(self, num_class, emb_dim=100, kernel_sizes=(4, 5, 6, 7),
                 num_filters=32, hidden_dim=64, conv_dropout_p=0.3, fc_dropout_p=0.6):
        super().__init__()

        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one window size")

        self.convs = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (window, emb_dim)) for window in kernel_sizes]
        )
        self.conv_dropout = nn.Dropout(conv_dropout_p)

        # The head input width follows the number of branches instead of a literal 4*32.
        self.fc1 = nn.Linear(len(kernel_sizes) * num_filters, hidden_dim)
        self.fc_dropout = nn.Dropout(fc_dropout_p)
        self.fc2 = nn.Linear(hidden_dim, num_class)

    def forward(self, x):
        """Return unnormalised class scores for a batch.

        Args:
            x: float tensor of shape (batch, 1, seq_len, emb_dim); seq_len must
                be at least the largest kernel size or the convolution fails.

        Returns:
            Tensor of shape (batch, num_class) holding the logits.
        """
        feature_maps = [self.conv_dropout(F.relu(conv(x))) for conv in self.convs]

        # Each map is (batch, num_filters, positions, 1): drop the width-1 axis,
        # then take the maximum over all positions, leaving (batch, num_filters, 1).
        pooled = [F.max_pool1d(fmap.squeeze(-1), fmap.size(2)) for fmap in feature_maps]

        features = torch.cat(pooled, dim=1).squeeze(-1)

        hidden = self.fc_dropout(F.relu(self.fc1(features)))
        return self.fc2(hidden)

In [15]:
# Size the output layer from the label mapping built during data loading, so the
# number of logits always matches the number of classes present in the training data.
model = textCNN(len(label_map)).to(device)

In [16]:
EPOCHS = 30
# Not read anywhere in this cell: the data loaders are built earlier with their
# own batch size. Kept so that any other cell referring to it keeps working.
BATCH_SIZE = 256

loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())


def train_one_epoch(model, dataloader, loss_func, optimizer, epoch, log_every=50):
    """Run one optimisation pass over `dataloader`.

    Every `log_every` batches (except the first) prints the loss of the current
    batch and the accuracy accumulated since the start of the epoch.

    Args:
        model: network to optimise; switched to training mode here.
        dataloader: iterable yielding (inputs, targets) batches.
        loss_func: callable mapping (logits, targets) to a scalar loss.
        optimizer: optimizer stepping the model's parameters.
        epoch: 1-based epoch number, used only in the log line.
        log_every: number of batches between two log lines.

    Returns:
        Training accuracy over the whole epoch, or NaN if the loader was empty.
    """
    model.train()
    correct, seen = 0, 0

    for step, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        logits = model(inputs)
        loss = loss_func(logits, targets)
        loss.backward()
        optimizer.step()

        correct += (logits.argmax(1) == targets).sum().item()
        seen += targets.size(0)
        if step % log_every == 0 and step > 0:
            print(f'{epoch:2d}-{step:04d}: loss:{loss.item():7.4f}; accuracy:{correct/seen:.4f}')

    return correct / seen if seen else float('nan')


def evaluate_accuracy(model, dataloader):
    """Return the classification accuracy of `model` on `dataloader`.

    The model is switched to evaluation mode and gradients are disabled. No loss
    is computed, since only the accuracy is reported. An empty loader yields NaN
    instead of raising ZeroDivisionError.
    """
    model.eval()
    correct, seen = 0, 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            correct += (model(inputs).argmax(1) == targets).sum().item()
            seen += targets.size(0)
    return correct / seen if seen else float('nan')


for epoch in range(1, EPOCHS + 1):
    train_one_epoch(model, train_dataloader, loss_func, optimizer, epoch)
    valid_acc = evaluate_accuracy(model, dev_dataloader)
    print(f'accuracy on valid set :{valid_acc:.3f}')


 1-0050: loss: 2.3473; accuracy:0.1497
 1-0100: loss: 1.9828; accuracy:0.2273
 1-0150: loss: 1.8125; accuracy:0.2814
 1-0200: loss: 1.8869; accuracy:0.3189
accuracy on valid set :0.528
 2-0050: loss: 1.6059; accuracy:0.4593
 2-0100: loss: 1.6283; accuracy:0.4691
 2-0150: loss: 1.6090; accuracy:0.4717
 2-0200: loss: 1.5761; accuracy:0.4766
accuracy on valid set :0.547
 3-0050: loss: 1.6332; accuracy:0.4939
 3-0100: loss: 1.5843; accuracy:0.5046
 3-0150: loss: 1.4423; accuracy:0.5066
 3-0200: loss: 1.4960; accuracy:0.5083
accuracy on valid set :0.558
 4-0050: loss: 1.5496; accuracy:0.5188
 4-0100: loss: 1.4704; accuracy:0.5215
 4-0150: loss: 1.5301; accuracy:0.5212
 4-0200: loss: 1.6085; accuracy:0.5218
accuracy on valid set :0.563
 5-0050: loss: 1.4572; accuracy:0.5293
 5-0100: loss: 1.5084; accuracy:0.5314
 5-0150: loss: 1.4211; accuracy:0.5309
 5-0200: loss: 1.4810; accuracy:0.5310
accuracy on valid set :0.568
 6-0050: loss: 1.6412; accuracy:0.5424
 6-0100: loss: 1.4275; accuracy:0.54