In [1]:
import torch
import numpy as np
import pandas as pd
import torchtext
import nltk
from torch import nn, optim
from tqdm import tqdm
from torch.autograd import Variable
import torch.nn.functional as F
from sklearn.metrics import f1_score
import math
import random

In [2]:
USE_GPU = True

# 工具函数

In [48]:
def get_label(predict, threshold=0.36):
    return (predict >= threshold).astype(int)

In [78]:
class EnhanceIter(object):
    """ 数据加强类
    input: 数据集 原生数据集比率 
    """ 
    
    def __init__(self, dataset, ratio=0.8, batch_size=256):
        self.batch_size = batch_size
        self.ratio = ratio
        self.dataset = dataset
        self.origin_loader = torchtext.data.Iterator(dataset=dataset, 
                                                     batch_size=math.ceil(ratio*batch_size), 
                                                     shuffle=True)
        self.postive_idxs = self._get_postive_idxs()
        self.TEXT = self.dataset.fields['question_text']
        self.iter_origin = iter(self.origin_loader)
        self.postive_count = self.batch_size - math.ceil(self.ratio*self.batch_size)
        
    def _get_postive_idxs(self):
        postive_list = []
        for idx, example in enumerate(self.dataset):
            if (example.target == str(1)):
                postive_list.append(idx)
        return np.array(postive_list, dtype=np.int64)
    
    def init_epoch(self):
        self.origin_loader = torchtext.data.Iterator(dataset=self.dataset, 
                                             batch_size=math.ceil(self.ratio*self.batch_size), 
                                             shuffle=True)
        self.iter_origin = iter(self.origin_loader)

    def __len__(self):
        return math.floor(len(self.dataset) / self.ratio / self.batch_size)
        
    def __iter__(self):
        while True:
            self.init_epoch()
            for _ in range(len(self)-1):
                enhace_data = self.get_batch_enhace_data()
                enhace_label = torch.ones(self.postive_count)
                origin_data, origin_label = next(self.iter_origin)
                batch_data = torch.cat((enhace_data, origin_data), 0)
                batch_label = torch.cat((enhace_label, origin_label), 0)
                idx = torch.randperm(len(batch_label))
                batch_data = batch_data[idx]
                batch_label = batch_label[idx]
                yield batch_data, batch_label
            return

    def get_batch_enhace_data(self):
        idxs = self._get_postive_idx_in_batch()
        enhance_texts = self._get_enhance_texts_with_idxs(idxs)
        preprocess_texts = []
        for text in enhance_texts:
            preprocess_text = self.TEXT.preprocess(text)
            preprocess_texts.append(preprocess_text)
        processed_text = self.TEXT.process(preprocess_texts)
        return processed_text
        
    def _get_enhance_texts_with_idxs(self, idxs):
        enhance_texts = []
        for idx in idxs:
            postive_text = self.dataset[idx].question_text
            enhance_text = self._enhance_text(postive_text)
            enhance_texts.append(enhance_text)
        return enhance_texts
    
    def _get_postive_idx_in_batch(self):
        idxs = np.random.choice(self.postive_idxs, self.postive_count)
        return idxs
    
    def _enhance_text(self, text):
        """
        对文本进行增强
        
        目前的方法是打乱顺序
        
        input: text 为一个文本 eg. ['hello', 'world']
        output: 增强后的文本 eg. ['world', 'hello']
        
        """
        return random.sample(text, k=len(text))

# 数据加载

In [None]:
glove_vectors = torchtext.vocab.Vectors("../input/embeddings/glove.840B.300d/glove.840B.300d.txt")

# 数据预处理

In [None]:
FIX_LENGTH = 70
stop_words = nltk.corpus.stopwords.words('english')
TEXT = torchtext.data.Field(tokenize=torchtext.data.get_tokenizer('toktok'), init_token='<SOS>', 
                            eos_token='<EOS>',lower=True, fix_length=FIX_LENGTH, stop_words=None, 
                            batch_first=True)
LABEL = torchtext.data.Field(sequential=False, use_vocab=False, is_target=True, batch_first=True, 
                             dtype=torch.float)

In [None]:
train_data = torchtext.data.TabularDataset(
    path='../input/train.csv', format = 'csv', 
    fields=[('qid', None),
            ('question_text', TEXT), 
            ('target', LABEL)], 
    skip_header = True)

In [None]:
train, test = train_data.split(split_ratio=0.95, strata_field='target')

In [10]:
TEXT.build_vocab(train, vectors = glove_vectors, max_size=40000)
vocab = TEXT.vocab

In [11]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")

Unique tokens in TEXT vocabulary: 40004


In [12]:
pretrained_embeddings = vocab.vectors
print(f"Pretrained embeddings shape: {pretrained_embeddings.shape}")
EMBEDDING_LENGTH = 300

Pretrained embeddings shape: torch.Size([40004, 300])


# 定义模型

In [13]:
# filter_sizes = [1,2,3,5]
# filter_sizes_sum = sum(filter_sizes)
# num_filters = 42
# CNN_DROPOUT = 0.1
# class TextCNN(nn.Module):
#     def __init__(self, pretrained_weight):
#         super(TextCNN, self).__init__()
#         self.embedding = nn.Embedding.from_pretrained(pretrained_weight, freeze=False)
#         self.conv0 = nn.Sequential(
#             nn.Conv2d(1, num_filters, kernel_size=(filter_sizes[0], EMBEDDING_LENGTH)),
#             nn.Tanh())
#         self.conv0.apply(self.init_weight)
#         self.pool0 = nn.MaxPool2d((FIX_LENGTH-filter_sizes[0]+1, 1), stride=1)
#         self.conv1 = nn.Sequential(
#             nn.Conv2d(1, num_filters, kernel_size=(filter_sizes[0], EMBEDDING_LENGTH)),
#             nn.Tanh())
#         self.conv1.apply(self.init_weight)
#         self.pool1 = nn.MaxPool2d((FIX_LENGTH-filter_sizes[1]+1, 1), stride=1)
#         self.conv2 = nn.Sequential(
#             nn.Conv2d(1, num_filters, kernel_size=(filter_sizes[0], EMBEDDING_LENGTH)),
#             nn.Tanh())
#         self.conv2.apply(self.init_weight)
#         self.pool2 = nn.MaxPool2d((FIX_LENGTH-filter_sizes[2]+1, 1), stride=1)
#         self.conv3 = nn.Sequential(
#             nn.Conv2d(1, num_filters, kernel_size=(filter_sizes[0], EMBEDDING_LENGTH)),
#             nn.Tanh())
#         self.conv3.apply(self.init_weight)
#         self.pool3 = nn.MaxPool2d((FIX_LENGTH-filter_sizes[3]+1, 1), stride=1)
#         self.dropout = nn.Dropout(CNN_DROPOUT)
#         self.fc = nn.Linear(filter_sizes_sum * 42, 1)
#         self.sigmoid = nn.Sigmoid()
        
#     def forward(self, x_input):
#         x = self.embedding(x_input)
#         x = x.unsqueeze(1)
#         conv0 = self.conv0(x)
#         pool0 = self.pool0(conv0)
#         conv1 = self.conv1(x)
#         pool1 = self.pool1(conv1)
#         conv2 = self.conv2(x)
#         pool2 = self.pool2(conv2)
#         conv3 = self.conv3(x)
#         pool3 = self.pool3(conv3)
#         z = torch.cat((pool0, pool1, pool2, pool3), 2)
#         z = z.view(z.size(0), -1)
#         z = self.dropout(z)
#         x = self.fc(z)
#         x = self.sigmoid(x)
#         return x

#     def init_weight(self, layer):
#         if (type(layer) == nn.Conv2d):
#             nn.init.kaiming_normal_(layer.weight, mode='fan_in')

In [72]:
filter_sizes = [1,2,3,6]
filter_sizes_len = len(filter_sizes)
num_filters = 36
CNN_DROPOUT = 0.1
class TextCNN(nn.Module):
    def __init__(self, pretrained_weight):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_weight, freeze=False)
        self.conv0 = nn.Sequential(
            nn.Conv1d(EMBEDDING_LENGTH, num_filters, kernel_size=(filter_sizes[0])),
            nn.ELU())
        self.conv0.apply(self.init_weight)
        self.pool0 = nn.MaxPool1d((FIX_LENGTH-filter_sizes[0]+1))
        self.conv1 = nn.Sequential(
            nn.Conv1d(EMBEDDING_LENGTH, num_filters, kernel_size=(filter_sizes[1])),
            nn.ELU())
        self.conv1.apply(self.init_weight)
        self.pool1 = nn.MaxPool1d((FIX_LENGTH-filter_sizes[1]+1))
        self.conv2 = nn.Sequential(
            nn.Conv1d(EMBEDDING_LENGTH, num_filters, kernel_size=(filter_sizes[2])),
            nn.ELU())
        self.conv2.apply(self.init_weight)
        self.pool2 = nn.MaxPool1d((FIX_LENGTH-filter_sizes[2]+1))
        self.conv3 = nn.Sequential(
            nn.Conv1d(EMBEDDING_LENGTH, num_filters, kernel_size=(filter_sizes[3])),
            nn.ELU())
        self.conv3.apply(self.init_weight)
        self.pool3 = nn.MaxPool1d((FIX_LENGTH-filter_sizes[3]+1))
        self.batch_norm = nn.BatchNorm1d(filter_sizes_len * num_filters)
        self.dropout = nn.Dropout(CNN_DROPOUT)
        self.fc = nn.Linear(filter_sizes_len * num_filters, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x_input):
        x = self.embedding(x_input)
        x = x.permute(0, 2, 1)
        conv0 = self.conv0(x)
        pool0 = self.pool0(conv0)
        conv1 = self.conv1(x)
        pool1 = self.pool1(conv1)
        conv2 = self.conv2(x)
        pool2 = self.pool2(conv2)
        conv3 = self.conv3(x)
        pool3 = self.pool3(conv3)
        z = torch.cat((pool0, pool1, pool2, pool3), 1)
        z = z.view(z.size(0), -1)
        z = self.dropout(z)
#         z = self.batch_norm(z)
        x = self.fc(z)
        x = self.sigmoid(x)
        return x

    def init_weight(self, layer):
        if (type(layer) == nn.Conv2d):
            nn.init.kaiming_normal_(layer.weight, mode='fan_in')

# 模型训练

In [73]:
class NormTrainer:
    def __init__(self, model, train_loader, test_loader, get_label_f, batch_size=BATCH_SIZE):
        self.model = model
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.optimizer = optim.Adam(self.model.parameters())
        self.criterion = torch.nn.BCELoss()
        if USE_GPU:
            self.model.cuda()
        self.get_label_f = get_label_f

    def train(self):
        self.model.train()
        train_loss = 0
        for batch_idx, (data, label) in enumerate(self.train_loader):
            self.optimizer.zero_grad()
            vdata = Variable(data)
            vlabel = Variable(label)
            if USE_GPU:
                vdata = vdata.cuda()
                vlabel = vlabel.cuda()
            predict = self.model(vdata)
            predict = torch.squeeze(predict)
            loss = self.criterion(predict, vlabel)
            loss.backward()
            train_loss += loss.item()
            self.optimizer.step()
        print('Average loss: {:.4f}'.format(train_loss / len(self.train_loader)))

    def test(self):
        self.model.eval()
        test_loss = 0
        predict_numpy = np.zeros(0)
        label_numpy = np.zeros(0)
        with torch.no_grad():
            for batch_idx, (data, label) in enumerate(self.test_loader):
                vdata = torch.LongTensor(data)
                vlabel = torch.Tensor(label)
                if USE_GPU:
                    vdata = vdata.cuda()
                    vlabel = vlabel.cuda()
                predict = self.model(vdata)
                predict = torch.squeeze(predict)
                predict_numpy = np.append(predict_numpy, predict.cpu().numpy())
                label_numpy = np.append(label_numpy, vlabel.cpu().numpy())
                loss = self.criterion(predict, vlabel)
                test_loss += loss.item()
        print("Total loss: {:.4f}".format(test_loss))
        print("Threshold F1 score is: {: .4f}".format(self.get_f1_score_by_predict_sigmoid(predict_numpy, label_numpy)))
        print("Standard F1 score is: {: .4f}".format(self.get_f1_score_by_predict_sigmoid(predict_numpy, label_numpy, 0.5)))
    
    def get_f1_score_by_predict_sigmoid(self, predict, true_label, threshold=None):
        if threshold is not None:
            predict = self.get_label_f(predict, threshold)
        else:
            predict = self.get_label_f(predict)
        true_label = true_label.astype(np.int32)
        return f1_score(predict, true_label)
    
    def save_model(self, path):
        torch.save(self.model, path)

In [79]:
EPOCH = 2
BATCH_SIZE = 256

In [80]:
# train_iter = torchtext.data.Iterator(dataset=train, batch_size=BATCH_SIZE, shuffle=True)
train_iter = EnhanceIter(dataset=train, batch_size=BATCH_SIZE, ratio=0.8)
test_iter = torchtext.data.Iterator(dataset=test, batch_size=BATCH_SIZE, shuffle=True, 
                          sort=False, sort_within_batch=False, repeat=False)

In [81]:
model = TextCNN(pretrained_embeddings)
if USE_GPU:
    model = model.cuda()

In [82]:
trainer = NormTrainer(model=model, train_loader=train_iter, test_loader=test_iter, get_label_f=get_label)
for epoch in range(EPOCH):
        print("===============================  EPOCH {:d}  ===============================".format(epoch))
        trainer.train()
        print("===============================  Test  ===============================")
        trainer.test()



  from ipykernel import kernelapp as app


Average loss: 0.0984
Total loss: 27.4417
Threshold F1 score is:  0.6504
Standard F1 score is:  0.6364
Average loss: 0.0788
Total loss: 27.4590
Threshold F1 score is:  0.6538
Standard F1 score is:  0.6341


# 预测

In [50]:
test_df = pd.read_csv('../input/test.csv')
submission_df = pd.read_csv('../input/sample_submission.csv')

In [51]:
def predict(model, text_numpy):
    preprocess_text_result = []
    for text in text_numpy:
        preprocess_text = TEXT.preprocess(text)
        preprocess_text_result.append(preprocess_text)
    processed_text = TEXT.process(preprocess_text_result)
    TEST_BATCH_SIZE = 256
    idx = 0
    predict_vector = np.zeros(0)
    with torch.no_grad():
        while idx < len(processed_text):
            batch_data = processed_text[idx: idx+TEST_BATCH_SIZE]
            if USE_GPU:
                batch_data = batch_data.cuda()
            predicts = model(batch_data)
            predicts = torch.squeeze(predicts)
            predicts_numpy = predicts.cpu().numpy()
            predict_vector = np.append(predict_vector, predicts_numpy)
            idx += TEST_BATCH_SIZE
    return predict_vector

In [52]:
predict_result = predict(model, test_df.question_text.values)

In [53]:
result = get_label(predict_result)

# 输出结果

In [54]:
submission_df = pd.read_csv('../input/sample_submission.csv')
submission_df['prediction'] = result

In [55]:
submission_df.to_csv('submission.csv', index=False)

In [56]:
s = '['
for i in result:
    s += f"{i},"