In [1]:
import torch
import numpy as np
import pandas as pd
import torchtext
import nltk
from torch import nn, optim
from tqdm import tqdm
from torch.autograd import Variable
import torch.nn.functional as F
from sklearn.metrics import f1_score

In [2]:
USE_GPU = True

# 工具函数

In [147]:
def get_label(predict, threshold=0.23):
    return (predict >= threshold).astype(int)

# 数据加载

In [4]:
glove_vectors = torchtext.vocab.Vectors("../input/embeddings/glove.840B.300d/glove.840B.300d.txt")

# 数据预处理

In [5]:
FIX_LENGTH = 70
stop_words = nltk.corpus.stopwords.words('english')
TEXT = torchtext.data.Field(tokenize=torchtext.data.get_tokenizer('toktok'), init_token='<SOS>', 
                            eos_token='<EOS>',lower=True, fix_length=FIX_LENGTH, stop_words=None, 
                            batch_first=True)
LABEL = torchtext.data.Field(sequential=False, use_vocab=False, is_target=True, batch_first=True, 
                             dtype=torch.float)

In [6]:
train_data = torchtext.data.TabularDataset(
    path='../input/train.csv', format = 'csv', 
    fields=[('qid', None),
            ('question_text', TEXT), 
            ('target', LABEL)], 
    skip_header = True)

In [7]:
train, test = train_data.split(split_ratio=0.95, strata_field='target')

In [8]:
BATCH_SIZE = 256
train_iter = torchtext.data.Iterator(dataset=train, batch_size=BATCH_SIZE, shuffle=True)
test_iter = torchtext.data.Iterator(dataset=test, batch_size=BATCH_SIZE, shuffle=True, 
                          sort=False, sort_within_batch=False, repeat=False)

In [9]:
TEXT.build_vocab(train, vectors = glove_vectors, max_size=40000)
vocab = TEXT.vocab

In [10]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")

Unique tokens in TEXT vocabulary: 40004


In [11]:
pretrained_embeddings = vocab.vectors
print(f"Pretrained embeddings shape: {pretrained_embeddings.shape}")
EMBEDDING_LENGTH = 300

Pretrained embeddings shape: torch.Size([40004, 300])


# 定义模型

In [12]:
# filter_sizes = [1,2,3,5]
# filter_sizes_sum = sum(filter_sizes)
# num_filters = 42
# CNN_DROPOUT = 0.1
# class TextCNN(nn.Module):
#     def __init__(self, pretrained_weight):
#         super(TextCNN, self).__init__()
#         self.embedding = nn.Embedding.from_pretrained(pretrained_weight, freeze=False)
#         self.conv0 = nn.Sequential(
#             nn.Conv2d(1, num_filters, kernel_size=(filter_sizes[0], EMBEDDING_LENGTH)),
#             nn.Tanh())
#         self.conv0.apply(self.init_weight)
#         self.pool0 = nn.MaxPool2d((FIX_LENGTH-filter_sizes[0]+1, 1), stride=1)
#         self.conv1 = nn.Sequential(
#             nn.Conv2d(1, num_filters, kernel_size=(filter_sizes[0], EMBEDDING_LENGTH)),
#             nn.Tanh())
#         self.conv1.apply(self.init_weight)
#         self.pool1 = nn.MaxPool2d((FIX_LENGTH-filter_sizes[1]+1, 1), stride=1)
#         self.conv2 = nn.Sequential(
#             nn.Conv2d(1, num_filters, kernel_size=(filter_sizes[0], EMBEDDING_LENGTH)),
#             nn.Tanh())
#         self.conv2.apply(self.init_weight)
#         self.pool2 = nn.MaxPool2d((FIX_LENGTH-filter_sizes[2]+1, 1), stride=1)
#         self.conv3 = nn.Sequential(
#             nn.Conv2d(1, num_filters, kernel_size=(filter_sizes[0], EMBEDDING_LENGTH)),
#             nn.Tanh())
#         self.conv3.apply(self.init_weight)
#         self.pool3 = nn.MaxPool2d((FIX_LENGTH-filter_sizes[3]+1, 1), stride=1)
#         self.dropout = nn.Dropout(CNN_DROPOUT)
#         self.fc = nn.Linear(filter_sizes_sum * 42, 1)
#         self.sigmoid = nn.Sigmoid()
        
#     def forward(self, x_input):
#         x = self.embedding(x_input)
#         x = x.unsqueeze(1)
#         conv0 = self.conv0(x)
#         pool0 = self.pool0(conv0)
#         conv1 = self.conv1(x)
#         pool1 = self.pool1(conv1)
#         conv2 = self.conv2(x)
#         pool2 = self.pool2(conv2)
#         conv3 = self.conv3(x)
#         pool3 = self.pool3(conv3)
#         z = torch.cat((pool0, pool1, pool2, pool3), 2)
#         z = z.view(z.size(0), -1)
#         z = self.dropout(z)
#         x = self.fc(z)
#         x = self.sigmoid(x)
#         return x

#     def init_weight(self, layer):
#         if (type(layer) == nn.Conv2d):
#             nn.init.kaiming_normal_(layer.weight, mode='fan_in')

In [141]:
filter_sizes = [1,2,3,5]
filter_sizes_len = len(filter_sizes)
num_filters = 36
CNN_DROPOUT = 0.1
class TextCNN(nn.Module):
    def __init__(self, pretrained_weight):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_weight, freeze=False)
        self.conv0 = nn.Sequential(
            nn.Conv1d(EMBEDDING_LENGTH, num_filters, kernel_size=(filter_sizes[0])),
            nn.ELU())
        self.conv0.apply(self.init_weight)
        self.pool0 = nn.MaxPool1d((FIX_LENGTH-filter_sizes[0]+1))
        self.conv1 = nn.Sequential(
            nn.Conv1d(EMBEDDING_LENGTH, num_filters, kernel_size=(filter_sizes[1])),
            nn.ELU())
        self.conv1.apply(self.init_weight)
        self.pool1 = nn.MaxPool1d((FIX_LENGTH-filter_sizes[1]+1))
        self.conv2 = nn.Sequential(
            nn.Conv1d(EMBEDDING_LENGTH, num_filters, kernel_size=(filter_sizes[2])),
            nn.ELU())
        self.conv2.apply(self.init_weight)
        self.pool2 = nn.MaxPool1d((FIX_LENGTH-filter_sizes[2]+1))
        self.conv3 = nn.Sequential(
            nn.Conv1d(EMBEDDING_LENGTH, num_filters, kernel_size=(filter_sizes[3])),
            nn.ELU())
        self.conv3.apply(self.init_weight)
        self.pool3 = nn.MaxPool1d((FIX_LENGTH-filter_sizes[3]+1))
        self.batch_norm = nn.BatchNorm1d(filter_sizes_len * num_filters)
        self.dropout = nn.Dropout(CNN_DROPOUT)
        self.fc = nn.Linear(filter_sizes_len * num_filters, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x_input):
        x = self.embedding(x_input)
        x = x.permute(0, 2, 1)
        conv0 = self.conv0(x)
        pool0 = self.pool0(conv0)
        conv1 = self.conv1(x)
        pool1 = self.pool1(conv1)
        conv2 = self.conv2(x)
        pool2 = self.pool2(conv2)
        conv3 = self.conv3(x)
        pool3 = self.pool3(conv3)
        z = torch.cat((pool0, pool1, pool2, pool3), 1)
        z = z.view(z.size(0), -1)
#         z = self.dropout(z)
        z = self.batch_norm(z)
        x = self.fc(z)
        x = self.sigmoid(x)
        return x

    def init_weight(self, layer):
        if (type(layer) == nn.Conv2d):
            nn.init.kaiming_normal_(layer.weight, mode='fan_in')

# 模型训练

In [142]:
class NormTrainer:
    def __init__(self, model, train_loader, test_loader, get_label_f, batch_size=BATCH_SIZE):
        self.model = model
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.optimizer = optim.Adam(self.model.parameters())
        self.criterion = torch.nn.BCELoss()
        if USE_GPU:
            self.model.cuda()
        self.get_label_f = get_label_f

    def train(self):
        self.model.train()
        train_loss = 0
        for batch_idx, (data, label) in enumerate(self.train_loader):
            self.optimizer.zero_grad()
            vdata = Variable(data)
            vlabel = Variable(label)
            if USE_GPU:
                vdata = vdata.cuda()
                vlabel = vlabel.cuda()
            predict = self.model(vdata)
            predict = torch.squeeze(predict)
            loss = self.criterion(predict, vlabel)
            loss.backward()
            train_loss += loss.item()
            self.optimizer.step()
        print('Average loss: {:.4f}'.format(train_loss / len(self.train_loader)))

    def test(self):
        self.model.eval()
        test_loss = 0
        predict_numpy = np.zeros(0)
        label_numpy = np.zeros(0)
        with torch.no_grad():
            for batch_idx, (data, label) in enumerate(self.test_loader):
                vdata = torch.LongTensor(data)
                vlabel = torch.Tensor(label)
                if USE_GPU:
                    vdata = vdata.cuda()
                    vlabel = vlabel.cuda()
                predict = self.model(vdata)
                predict = torch.squeeze(predict)
                predict_numpy = np.append(predict_numpy, predict.cpu().numpy())
                label_numpy = np.append(label_numpy, vlabel.cpu().numpy())
                loss = self.criterion(predict, vlabel)
                test_loss += loss.item()
        print("Total loss: {:.4f}".format(test_loss))
        print("Threshold F1 score is: {: .4f}".format(self.get_f1_score_by_predict_sigmoid(predict_numpy, label_numpy)))
        print("Standard F1 score is: {: .4f}".format(self.get_f1_score_by_predict_sigmoid(predict_numpy, label_numpy, 0.5)))
    
    def get_f1_score_by_predict_sigmoid(self, predict, true_label, threshold=None):
        if threshold is not None:
            predict = self.get_label_f(predict, threshold)
        else:
            predict = self.get_label_f(predict)
        true_label = true_label.astype(np.int32)
        return f1_score(predict, true_label)
    
    def save_model(self, path):
        torch.save(self.model, path)

In [143]:
EPOCH = 2
BATCH_SIZE = 256

In [144]:
model = TextCNN(pretrained_embeddings)
if USE_GPU:
    model = model.cuda()

In [148]:
trainer = NormTrainer(model=model, train_loader=train_iter, test_loader=test_iter, get_label_f=get_label)
for epoch in range(EPOCH):
        print("===============================  EPOCH {:d}  ===============================".format(epoch))
#         trainer.train()
        print("===============================  Test  ===============================")
        trainer.test()

Total loss: 33.5108
Threshold F1 score is:  0.6304
Standard F1 score is:  0.6194
Total loss: 33.3757
Threshold F1 score is:  0.6304
Standard F1 score is:  0.6194


# 预测

In [None]:
test_df = pd.read_csv('../input/test.csv')
submission_df = pd.read_csv('../input/sample_submission.csv')

In [None]:
def predict(model, text_numpy):
    preprocess_text_result = []
    for text in text_numpy:
        preprocess_text = TEXT.preprocess(text)
        preprocess_text_result.append(preprocess_text)
    processed_text = TEXT.process(preprocess_text_result)
    TEST_BATCH_SIZE = 256
    idx = 0
    predict_vector = np.zeros(0)
    with torch.no_grad():
        while idx < len(processed_text):
            batch_data = processed_text[idx: idx+TEST_BATCH_SIZE]
            if USE_GPU:
                batch_data = batch_data.cuda()
            predicts = model(batch_data)
            predicts = torch.squeeze(predicts)
            predicts_numpy = predicts.cpu().numpy()
            predict_vector = np.append(predict_vector, predicts_numpy)
            idx += TEST_BATCH_SIZE
    return predict_vector

In [None]:
predict_result = predict(model, test_df.question_text.values)

In [None]:
result = get_label(predict_result)

# 输出结果

In [None]:
submission_df = pd.read_csv('../input/sample_submission.csv')
submission_df['prediction'] = result

In [None]:
submission_df.to_csv('submission.csv', index=False)