In [1]:
import torch
import numpy as np
import pandas as pd
import torchtext
import nltk
from torch import nn, optim
from tqdm import tqdm
from torch.autograd import Variable
import torch.nn.functional as F
from sklearn.metrics import f1_score

In [2]:
# Use CUDA only when a device is actually present, so the notebook still runs
# end-to-end on a CPU-only machine instead of failing at the first `.cuda()` call.
USE_GPU = torch.cuda.is_available()

# 数据加载

In [3]:
# Load the pre-trained GloVe word vectors from the competition's input directory.
glove_vectors = torchtext.vocab.Vectors(
    name="../input/embeddings/glove.840B.300d/glove.840B.300d.txt",
)

# 数据预处理

In [4]:
# Fixed sequence length handed to the text field (fix_length).
FIX_LENGTH = 70
# NLTK English stop-word list; loaded here but not applied, because the text
# field below is created with stop_words=None.
stop_words = nltk.corpus.stopwords.words('english')

# Question text: toktok tokeniser, lower-cased, wrapped in <SOS>/<EOS>,
# batch dimension first.
TEXT = torchtext.data.Field(
    tokenize=torchtext.data.get_tokenizer('toktok'),
    init_token='<SOS>',
    eos_token='<EOS>',
    lower=True,
    fix_length=FIX_LENGTH,
    stop_words=None,
    batch_first=True,
)

# Target: a single numeric value per example (no vocabulary), stored as float64.
LABEL = torchtext.data.Field(
    sequential=False,
    use_vocab=False,
    is_target=True,
    batch_first=True,
    dtype=torch.float64,
)

In [5]:
# Read the labelled training CSV. The header row is skipped, the `qid` column
# is discarded (mapped to None) and the remaining two columns are bound to the
# TEXT / LABEL fields.
train_data = torchtext.data.TabularDataset(
    path='../input/train.csv',
    format='csv',
    skip_header=True,
    fields=[
        ('qid', None),
        ('question_text', TEXT),
        ('target', LABEL),
    ],
)

In [6]:
# Hold out 5% of the labelled data for evaluation. `strata_field` is only
# honoured when `stratified=True`; without that flag the split is purely random
# and the class ratio of the held-out part is not tied to the training part.
train, test = train_data.split(split_ratio=0.95, stratified=True, strata_field='target')

In [7]:
BATCH_SIZE = 256
# Training batches are reshuffled on every pass. `repeat=False` is spelled out
# (as it already was for the held-out iterator) so that one pass over the
# iterator is exactly one epoch regardless of the library's default.
train_iter = torchtext.data.Iterator(dataset=train, batch_size=BATCH_SIZE, shuffle=True,
                                     repeat=False)
# Held-out batches: shuffling brings nothing to evaluation, so keep a fixed
# order; no sorting, single pass.
test_iter = torchtext.data.Iterator(dataset=test, batch_size=BATCH_SIZE, shuffle=False,
                                    sort=False, sort_within_batch=False, repeat=False)

In [8]:
# Build the vocabulary from the training split only and attach the GloVe
# vectors to it; the size and frequency limits are passed straight to torchtext.
TEXT.build_vocab(
    train,
    vectors=glove_vectors,
    max_size=20000,
    min_freq=10,
)
# Short alias used by the cells below.
vocab = TEXT.vocab

In [9]:
# Report how many entries the vocabulary ended up with.
print(
    f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}"
)

Unique tokens in TEXT vocabulary: 20004


In [10]:
# Embedding matrix attached to the vocabulary by build_vocab.
pretrained_embeddings = vocab.vectors
print(f"Pretrained embeddings shape: {pretrained_embeddings.shape}")
# Take the embedding width from the loaded matrix rather than hard-coding 300,
# so swapping in vectors of another dimensionality cannot drift out of sync
# with the model definition.
EMBEDDING_LENGTH = pretrained_embeddings.shape[1]

Pretrained embeddings shape: torch.Size([20004, 300])


# 定义模型

In [11]:
CNN_DROPOUT = 0.1
CNN_KERNEL_NUM = 50
class TextCNN(nn.Module):
    """
    A CNN for binary text classification.

    Token ids are embedded, convolved in parallel with kernels of height 3, 4
    and 5 that span the full embedding width, max-pooled over the sequence
    axis, concatenated, passed through dropout and a single linear unit, and
    finally squashed by a sigmoid.
    """
    def __init__(self, pretrained_weight, is_static=False):
        """
        :param pretrained_weight: 2-D tensor (vocab_size, embedding_dim) used to
            initialise the embedding layer; its second dimension also sets the
            width of the convolution kernels.
        :param is_static: the embedding is frozen when this is False (the
            default) and trainable when it is True.
            NOTE(review): this is the opposite of what the name "static"
            usually suggests -- confirm the intended meaning before relying on it.
        """
        super(TextCNN, self).__init__()
        self.with_embedding = False
        in_channel = 1
        out_channel = CNN_KERNEL_NUM
        kernel_sizes = [3, 4, 5]
        # Read the kernel width from the weights themselves so the module does
        # not depend on a notebook-level constant staying in sync with them.
        embedding_dim = pretrained_weight.size(1)
        self.embedding = nn.Embedding.from_pretrained(pretrained_weight, freeze=(not is_static))
        self.conv = nn.ModuleList([nn.Conv2d(in_channel, out_channel, (K, embedding_dim)) for K in kernel_sizes])

        self.dropout = nn.Dropout(CNN_DROPOUT)
        self.fc = nn.Linear(len(kernel_sizes) * out_channel, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_x):
        """
        :param input_x: integer tensor of token ids, shape (batch_size, seq_len);
            seq_len must be at least 5 (the largest kernel height, no padding is applied)
        :return: tensor of shape (batch_size, 1) with values in [0, 1]
        """
        # Tensors can be fed to the embedding directly; the legacy Variable
        # wrapper is not needed.
        x = self.embedding(input_x)

        # Conv & max pool
        x = x.unsqueeze(1)  # dim: (batch_size, 1, seq_len, embedding_dim)

        # One feature map per kernel height K: (batch_size, num_kernels, seq_len - K + 1)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv]

        # Max over the sequence axis: (batch_size, num_kernels) for each kernel height
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)

        # Dropout & output
        x = self.dropout(x)  # (batch_size, len(kernel_sizes) * num_kernels)
        fc = self.fc(x)
        sigmoid = self.sigmoid(fc)
        return sigmoid

# 模型训练

In [12]:
class NormTrainer:
    """
    Minimal training / evaluation harness for a model that outputs one
    probability per example.

    Uses Adam with default settings and binary cross-entropy on probabilities
    (BCELoss). Both loaders are expected to yield (data, label) pairs.
    """
    def __init__(self, model, train_loader, test_loader, batch_size=BATCH_SIZE):
        """
        :param model: module mapping a batch of inputs to per-example probabilities
        :param train_loader: iterable of (data, label) training batches; must support len()
        :param test_loader: iterable of (data, label) evaluation batches
        :param batch_size: accepted for interface compatibility; not used by the trainer
        """
        self.model = model
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.optimizer = optim.Adam(self.model.parameters())
        self.criterion = torch.nn.BCELoss()
        if USE_GPU:
            self.model.cuda()

    def train(self):
        """Run one pass over the training loader and print the mean batch loss."""
        self.model.train()
        train_loss = 0
        for data, label in tqdm(self.train_loader):
            self.optimizer.zero_grad()
            if USE_GPU:
                data = data.cuda()
                label = label.cuda()
            # reshape(-1) instead of a bare squeeze(): squeeze() would also drop
            # the batch axis of a single-example batch, leaving a 0-d prediction
            # that no longer matches the (1,) label expected by BCELoss.
            predict = self.model(data).reshape(-1)
            loss = self.criterion(predict, label)
            loss.backward()
            train_loss += loss.item()
            self.optimizer.step()
        print('Average loss: {:.4f}'.format(train_loss / len(self.train_loader)))

    def test(self):
        """Evaluate on the test loader; print the summed batch loss and the F1 score."""
        self.model.eval()
        test_loss = 0
        # Collect per-batch arrays and join them once at the end instead of
        # re-allocating a growing array with np.append on every batch.
        predict_chunks = []
        label_chunks = []
        with torch.no_grad():
            for data, label in tqdm(self.test_loader):
                if USE_GPU:
                    data = data.cuda()
                    label = label.cuda()
                predict = self.model(data).reshape(-1)
                predict_chunks.append(predict.cpu().numpy())
                label_chunks.append(label.cpu().numpy())
                loss = self.criterion(predict, label)
                test_loss += loss.item()
        predict_numpy = np.concatenate(predict_chunks) if predict_chunks else np.zeros(0)
        label_numpy = np.concatenate(label_chunks) if label_chunks else np.zeros(0)
        print("Total loss: {:.4f}".format(test_loss))
        print("F1 score is: {: .4f}".format(NormTrainer.get_f1_score_by_predict_sigmoid(predict_numpy, label_numpy)))

    @staticmethod
    def get_f1_score_by_predict_sigmoid(predict, true_label):
        """
        Binary F1 after rounding probabilities (and labels) to 0/1.

        :param predict: array of predicted probabilities
        :param true_label: array of ground-truth labels
        :return: F1 score as returned by sklearn's f1_score
        """
        predict = np.round(predict)
        true_label = np.round(true_label)
        # sklearn's signature is f1_score(y_true, y_pred).
        return f1_score(true_label, predict)

    def save_model(self, path):
        """Serialise the whole model object (not just its state_dict) to `path`."""
        torch.save(self.model, path)

In [13]:
# Number of passes over the training iterator performed by the training cell.
EPOCH: int = 25
# Mini-batch size; same value as the one assigned before the iterators were built.
BATCH_SIZE: int = 256

In [14]:
# The label field is float64, so the network must be double precision on every
# device -- not only when CUDA is used -- or BCELoss receives mismatched dtypes.
model = TextCNN(pretrained_embeddings).double()
if USE_GPU:
    model = model.cuda()

In [None]:
# Train for EPOCH passes; the held-out set is evaluated on every fifth epoch
# (0, 5, 10, ...).
trainer = NormTrainer(model=model, train_loader=train_iter, test_loader=test_iter)
for epoch in range(EPOCH):
    print("===============================  EPOCH {:d}  ===============================".format(epoch))
    trainer.train()
    if epoch % 5 != 0:
        continue
    print("===============================  Test  ===============================")
    trainer.test()

  0%|          | 0/4847 [00:00<?, ?it/s]



 27%|██▋       | 1303/4847 [01:21<03:34, 16.53it/s]

# 预测

In [16]:
# Unlabelled competition questions and the submission template, read in that order.
test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/test.csv', '../input/sample_submission.csv')
)

In [17]:
def predict(model, text_numpy):
    """
    Score raw question strings with a trained model.

    :param model: module mapping a batch of token-id tensors to per-example probabilities
    :param text_numpy: sequence (list / numpy array) of raw text strings
    :return: 1-D float64 numpy array with one probability per input string
    """
    # Nothing to score: return the same empty float array the loop would start from.
    if len(text_numpy) == 0:
        return np.zeros(0)

    # Tokenise with the training-time field, then pad and numericalise in one go.
    tokenised_texts = [TEXT.preprocess(text) for text in text_numpy]
    processed_text = TEXT.process(tokenised_texts)

    TEST_BATCH_SIZE = 256
    # Inference must run in eval mode: the training loop leaves the model in
    # train mode, in which dropout would randomly perturb the predictions.
    was_training = model.training
    model.eval()
    batch_outputs = []
    try:
        with torch.no_grad():
            for start in range(0, len(processed_text), TEST_BATCH_SIZE):
                batch_data = processed_text[start: start + TEST_BATCH_SIZE]
                if USE_GPU:
                    batch_data = batch_data.cuda()
                # reshape(-1) keeps a 1-D result even for a single-row batch.
                predicts = model(batch_data).reshape(-1)
                batch_outputs.append(predicts.cpu().numpy())
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)
    # Join once at the end; cast so the result is float64 as before.
    return np.concatenate(batch_outputs).astype(np.float64, copy=False)

In [18]:
# Probabilities for every question in the competition test set.
predict_result = predict(model, test_df["question_text"].values)

In [19]:
# Round each probability to the nearest integer to obtain a 0/1 class label.
result = np.rint(predict_result).astype(np.int32)

# 输出结果

In [20]:
# Re-read the submission template and fill in its `prediction` column.
submission_df = pd.read_csv('../input/sample_submission.csv').assign(prediction=result)

In [1]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)