In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
from tqdm import tqdm
tqdm.pandas()
import time
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import torch
import torch.utils.data
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
import torch.nn as nn
import torch.nn.functional as F
import random
import gc

Using TensorFlow backend.


In [2]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
print('Train data dimension: ', train_df.shape)
display(train_df.head())
print('Test data dimension: ', test_df.shape)
display(test_df.head())

Train data dimension:  (1306122, 3)


Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


Test data dimension:  (56370, 2)


Unnamed: 0,qid,question_text
0,00014894849d00ba98a9,My voice range is A2-C5. My chest voice goes u...
1,000156468431f09b3cae,How much does a tutor earn in Bangalore?
2,000227734433360e1aae,What are the best made pocket knives under $20...
3,0005e06fbe3045bd2a92,Why would they add a hypothetical scenario tha...
4,00068a0f7f41f50fc399,What is the dresscode for Techmahindra freshers?


In [3]:
enable_local_test = False
if enable_local_test:
    n_test = len(test_df)*4
    train_df,local_test_df = (train_df.iloc[:-n_test].reset_index(drop=True),
                             train_df.iloc[-n_test:].reset_index(drop=True))
else:
    local_test_df = pd.DataFrame([[None,None,0],[None,None,0]],columns=['qid','question_text','target'])
    n_test = 2

In [4]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(4396)

In [5]:
def threshold_search(y_true, y_proba):
    best_threshold = 0
    best_score = 0
    for threshold in tqdm([i * 0.01 for i in range(100)]):
        score = f1_score(y_true=y_true, y_pred=y_proba > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'f1': best_score}
    return search_result

In [6]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [7]:
from contextlib import contextmanager
@contextmanager
def timer(name):
    """
    Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin
    in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
    """
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')


In [8]:
embed_size = 300 
max_features = 120000 
maxlen = 70 

In [9]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }
def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

In [10]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]
def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

In [11]:
def reverse_text(text):
    t = text.split()
    list.reverse(t)
    return " ".join(t)

In [12]:
with timer("processing"):
    train_df["clean_question"] = train_df["question_text"].str.lower()
    test_df["clean_question"] = test_df["question_text"].str.lower()
    
    train_df["clean_question"] = train_df["clean_question"].apply(lambda x: clean_text(x))
    test_df["clean_question"] = test_df["clean_question"].apply(lambda x:clean_text(x))

    train_df['reverse_question'] = train_df['clean_question'].apply(lambda x:reverse_text(x))
    test_df['reverse_question'] = test_df['clean_question'].apply(lambda x:reverse_text(x))
    
    x_train = train_df["reverse_question"].fillna("_##_").values
    x_test = test_df["reverse_question"].fillna("_##_").values

    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(x_train)+list(x_test))
    
    x_train = tokenizer.texts_to_sequences(x_train)
    x_test = tokenizer.texts_to_sequences(x_test)

    x_train = pad_sequences(x_train, maxlen=maxlen)
    x_test = pad_sequences(x_test, maxlen=maxlen)

    y_train = train_df['target'].values

[processing] done in 117 s


In [13]:
from gensim import utils
def fast_load_all(embedding_path,emb_mean,emb_std,ready_pos = None,\
                  encoding='latin1',unicode_errors='strict',datatype=np.float32,word_index=None):
    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, 300))
    in_words = 0
    stop = 0
    new_pos = []
    count = 0
    with utils.smart_open(embedding_path) as fin:
        end = False
        while end==False:
            count+=1
            word = []
            while True:
                ch = fin.read(1)
                if ch == b' ':
                    break
                elif ch == b'':
                    end=True
                    break
                else:
                    word.append(ch)
            word_length = len(word)
            try:
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
            except:
                word = utils.to_unicode(b''.join(word), encoding='latin1', errors=unicode_errors)
            if end==True:
                print(count)
                break
            if word not in word_index:
                fin.readline()
                stop += 1
            else:
                in_words+=1
                i = word_index[word]
                if i>=max_features:
                    continue
                if ready_pos is not None:
                    if ready_pos[i]==1:
                        continue
                    else:
                        ready_pos[i]=1
                        new_pos.append(i)
                embedding_matrix[i] = np.asarray(fin.readline().split()[:300],dtype='float32')
                stop = 0
    print("total ooi words now:{0},total finding words:{2},stopping at:{1}".format(in_words,stop,np.sum(ready_pos)))
    return in_words,embedding_matrix,new_pos

In [14]:
def load_word2vec(fname,emb_mean,emb_std,ready_pos = None,\
                  encoding='utf-8',unicode_errors='strict',datatype=np.float32,word_index=None):
    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, 300))
    in_words = 0
    stop = 0
    new_pos = []
    count = 0
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size,vector_size = (int(x) for x in header.split())
        binary_len = np.dtype(datatype).itemsize*vector_size
        for _ in range(vocab_size-180000):
            word = []
            while True:
                ch = fin.read(1)
                if ch==b' ':
                    break
                if ch==b'':
                    raise EOFError("unexpected end of input")
                if ch!=b'\n':
                    word.append(ch)
            try:
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
            except:
                word = utils.to_unicode(b''.join(word), encoding='latin1', errors=unicode_errors)
            weights = np.fromstring(fin.read(binary_len),dtype=datatype).astype(datatype)
            if word not in word_index:
                stop += 1
                continue
            else:
                in_words+=1
                i = word_index[word]
                if i>=max_features:
                    continue
                if ready_pos is not None:
                    if ready_pos[i]==1:
                        continue
                    else:
                        ready_pos[i]=1
                        new_pos.append(i)
                embedding_matrix[i] = weights
                stop = 0
    print("total ooi words now:{0},total finding words:{2},stopping at:{1}".format(in_words,stop,np.sum(ready_pos)))
    return in_words,embedding_matrix,new_pos

In [15]:
seed_everything(42)
glove_pos = np.zeros(max_features)
para_pos = np.zeros(max_features)
fast_pos = np.zeros(max_features)
with timer("build embeddings"):
    in_glove,embedding_glove,glove_pos = fast_load_all('../input/embeddings/glove.840B.300d/glove.840B.300d.txt',-0.005838499,0.48782197,encoding='utf-8',word_index=tokenizer.word_index,ready_pos=glove_pos)
    in_para,embedding_para,para_pos = fast_load_all('../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt',-0.0053247833,0.49346462,encoding='utf-8',word_index=tokenizer.word_index,ready_pos=para_pos)
    in_fast,embedding_fasttext,fast_pos = fast_load_all('../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec',-0.0051106834, 0.18445626,encoding='utf-8',word_index=tokenizer.word_index,ready_pos=fast_pos)

    print(len(glove_pos),len(para_pos),len(fast_pos))

2229343
total ooi words now:126249,total finding words:92924.0,stopping at:72
1748657
total ooi words now:148066,total finding words:103166.0,stopping at:57
1020334
total ooi words now:96138,total finding words:75800.0,stopping at:42
92924 103166 75800
[build embeddings] done in 58 s


In [16]:
diff_glove_para = list(set(para_pos)-set(glove_pos))
diff_glove_fast = list(set(fast_pos)-set(glove_pos))
diff_glove_p_f = list(set(diff_glove_fast)-set(diff_glove_para))

In [17]:
splits = list(StratifiedKFold(n_splits=9, shuffle=True, random_state=10).split(x_train, y_train))

In [18]:
class CyclicLR(object):
    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, mode='triangular', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):

        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer
        self.lr_history = []
        if isinstance(base_lr, list) or isinstance(base_lr, tuple):
            if len(base_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} base_lr, got {}".format(
                    len(optimizer.param_groups), len(base_lr)))
            self.base_lrs = list(base_lr)
        else:
            self.base_lrs = [base_lr] * len(optimizer.param_groups)

        if isinstance(max_lr, list) or isinstance(max_lr, tuple):
            if len(max_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} max_lr, got {}".format(
                    len(optimizer.param_groups), len(max_lr)))
            self.max_lrs = list(max_lr)
        else:
            self.max_lrs = [max_lr] * len(optimizer.param_groups)

        self.step_size = step_size

        if mode not in ['triangular', 'triangular2', 'exp_range'] \
                and scale_fn is None:
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        
        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def batch_step(self, batch_iteration=None):
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

    def _triangular_scale_fn(self, x):
        return 1.

    def _triangular2_scale_fn(self, x):
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        return self.gamma**(x)

    def get_lr(self):
        step_size = float(self.step_size)
        #why 2* step_size?
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)

        lrs = []
        param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        for param_group, base_lr, max_lr in param_lrs:
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == 'cycle':
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        self.lr_history.append(lrs)
        return lrs

In [19]:
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
        
    def forward(self, x, mask=None):
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask

        a = a / torch.sum(a, 1, keepdim=True) + 1e-10

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)

In [20]:
use_pretrained_embedding = True
hidden_size = 60
gru_len = hidden_size
Routings = 4
Num_capsule = 5
Dim_capsule = 5
T_epsilon = 1e-7

class Caps_layer(nn.Module):
    def __init__(self, input_dim_capsule=gru_len * 2, num_capsule=Num_capsule, dim_capsule=Dim_capsule, \
                 routings=Routings, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Caps_layer, self).__init__(**kwargs)

        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size 
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = self.squash
        else:
            self.activation = nn.ReLU(inplace=True)

        if self.share_weights:
            self.W = nn.Parameter(
                nn.init.xavier_normal_(torch.empty(1, input_dim_capsule, self.num_capsule * self.dim_capsule)))
        else:
            self.W = nn.Parameter(
                torch.randn(BATCH_SIZE, input_dim_capsule, self.num_capsule * self.dim_capsule))  # 64即batch_size

    def forward(self, x):

        if self.share_weights:
            u_hat_vecs = torch.matmul(x, self.W)
        else:
            print('add later')

        batch_size = x.size(0)
        input_num_capsule = x.size(1)
        u_hat_vecs = u_hat_vecs.view((batch_size, input_num_capsule,
                                      self.num_capsule, self.dim_capsule))
        u_hat_vecs = u_hat_vecs.permute(0, 2, 1, 3) 
        b = torch.zeros_like(u_hat_vecs[:, :, :, 0])  

        for i in range(self.routings):
            b = b.permute(0, 2, 1)
            c = F.softmax(b, dim=2)
            c = c.permute(0, 2, 1)
            b = b.permute(0, 2, 1)
            outputs = self.activation(torch.einsum('bij,bijk->bik', (c, u_hat_vecs))) 

            if i < self.routings - 1:
                b = torch.einsum('bik,bijk->bij', (outputs, u_hat_vecs))  
        return outputs  

    def squash(self, x, axis=-1):
        s_squared_norm = (x ** 2).sum(axis, keepdim=True)
        scale = torch.sqrt(s_squared_norm + T_epsilon)
        return x / scale


'\nclass Capsule_Main(nn.Module):\n    def __init__(self, embedding_matrix=None, vocab_size=None):\n        super(Capsule_Main, self).__init__()\n        self.embed_layer = Embed_Layer(embedding_matrix, vocab_size)\n        self.gru_layer = GRU_Layer()\n        # 【重要】初始化GRU权重操作，这一步非常关键，acc上升到0.98，如果用默认的uniform初始化则acc一直在0.5左右\n        self.gru_layer.init_weights()\n        self.caps_layer = Caps_Layer()\n        self.dense_layer = Dense_Layer()\n\n    def forward(self, content):\n        content1 = self.embed_layer(content)\n        content2, _ = self.gru_layer(\n            content1)  # 这个输出是个tuple，一个output(seq_len, batch_size, num_directions * hidden_size)，一个hn\n        content3 = self.caps_layer(content2)\n        output = self.dense_layer(content3)\n        return output\n'

In [21]:
class NeuralNet_2(nn.Module):
    def __init__(self,embeddings=None):
        super(NeuralNet_2,self).__init__()
        hidden_size = 60
        fc_layer = 16
        fc_layer1 = 16
        self.embedding = nn.Embedding(max_features,embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embeddings,dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embed_size,hidden_size,bidirectional=True,batch_first = True)
        self.gru = nn.GRU(hidden_size*2,hidden_size,bidirectional=True,batch_first=True)        
        self.lstm_attention = Attention(hidden_size*2,maxlen)
        self.gru_attention = Attention(hidden_size*2,maxlen)
        self.bn = nn.BatchNorm1d(16,momentum=0.5)
        self.linear = nn.Linear(hidden_size*8+1,fc_layer1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(fc_layer**2,fc_layer)
        self.out = nn.Linear(fc_layer,1)        
        self.lincaps = nn.Linear(Num_capsule*Dim_capsule,1)
        self.caps_layer = Caps_layer()
    def forward(self,x):
        h_embedding = self.embedding(x)      
        h_lstm,_ = self.lstm(h_embedding)
        h_gru,_ = self.gru(h_lstm)
        contents3 = self.caps_layer(h_gru)
        contents3 = self.dropout(contents3)
        batch_size =contents3.size(0)
        contents3 = contents3.view(batch_size,-1)
        contents3 = self.relu(self.lincaps(contents3))
        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)    
        avg_pool = torch.mean(h_gru,1)
        max_pool,_ = torch.max(h_gru,1)
        conc = torch.cat((h_lstm_atten,h_gru_atten,contents3,avg_pool,max_pool),1)
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        out = self.out(conc)
        return out

In [22]:
class NeuralNet(nn.Module):
    def __init__(self,embeddings=None):
        super(NeuralNet, self).__init__()
        
        hidden_size = 60
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embeddings, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = nn.Dropout2d(0.1)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)        
        self.lstm_attention = Attention(hidden_size * 2, maxlen)
        self.gru_attention = Attention(hidden_size * 2, maxlen)        
        self.linear = nn.Linear(hidden_size*8, 16)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.out = nn.Linear(16, 1)
    
    def forward(self, x):
        h_embedding = self.embedding(x)
        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)     
        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)
        avg_pool = torch.mean(h_gru, 1)
        max_pool, _ = torch.max(h_gru, 1)        
        conc = torch.cat((h_lstm_atten, h_gru_atten, avg_pool, max_pool), 1)   
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        out = self.out(conc)
        
        return out

In [23]:
def create_pseudo_data(model,unlabeled_data_loader,batch_size,sample_rate = 0.2):
    num_of_samples = int(len(unlabeled_data_loader.dataset)*sample_rate)
    pseudo_preds = np.zeros(len(unlabeled_data_loader.dataset))
    print("creating pseudo labels!!")
    model.eval()
    for i,(x_batch,) in enumerate(unlabeled_data_loader):
        y_pred = model(x_batch).detach()
        pseudo_preds[i*batch_size:(i+1)*batch_size] = sigmoid(y_pred.cpu().numpy())[:,0]
    model.train()
    pseudo_index = np.random.choice(range(len(pseudo_preds)),int(len(pseudo_preds)*sample_rate),replace=False)
    return pseudo_index,pseudo_preds[pseudo_index]

def create_augmented_train(model,train_data,train_y,unlabeled_data,batch_size,sample_rate=0.2):
    unlabeled_data_cuda = torch.tensor(unlabeled_data,dtype=torch.long).cuda()
    unlabeled_data_ = torch.utils.data.TensorDataset(unlabeled_data_cuda)
    unlabeled_data_loader = torch.utils.data.DataLoader(unlabeled_data_,batch_size=batch_size,shuffle=False)
    pseudo_index,pseudo_labels = create_pseudo_data(model,unlabeled_data_loader,batch_size,sample_rate=sample_rate)
    pseudo_data = unlabeled_data[pseudo_index].copy()
    new_train_data = train_data.copy()
    new_train_y = train_y.copy()
    new_train_data = np.concatenate([new_train_data,pseudo_data],axis=0)
    new_train_y = np.concatenate([new_train_y,pseudo_labels[:,np.newaxis]],axis=0)    
    new_x_train_fold = torch.tensor(new_train_data,dtype=torch.long).cuda()
    new_y_train_fold = torch.tensor(new_train_y,dtype=torch.float32).cuda()    
    new_train = torch.utils.data.TensorDataset(new_x_train_fold,new_y_train_fold)
    new_train_loader = torch.utils.data.DataLoader(new_train,batch_size=batch_size,shuffle=True)
    return new_train_loader

In [24]:
batch_size = 512
n_epochs =4

In [25]:
def train_model(model,x_train,y_train,x_val=None,y_val=None,n_epochs=4,validate=True,using_pseudo=False):
    x_train_fold = torch.tensor(x_train, dtype=torch.long).cuda()
    y_train_fold = torch.tensor(y_train, dtype=torch.float32).cuda()
    train = torch.utils.data.TensorDataset(x_train_fold,y_train_fold)
    train_loader = torch.utils.data.DataLoader(train,batch_size=batch_size,shuffle=True)
    if validate:
        x_val_fold = torch.tensor(x_val, dtype=torch.long).cuda()
        y_val_fold = torch.tensor(y_val, dtype=torch.float32).cuda()
        valid = torch.utils.data.TensorDataset(x_val_fold,y_val_fold)
        val_loader = torch.utils.data.DataLoader(valid,batch_size=batch_size,shuffle=False)

    base_lr, max_lr = 0.001, 0.003 
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 
                             lr=max_lr)

    scheduler = CyclicLR(optimizer,base_lr =base_lr ,max_lr=max_lr,
                        step_size =len(train_loader) ,mode='triangular',gamma=0.994)
    
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='mean')
    best_score = -np.inf
    test_preds_local = np.zeros(len(test_local_loader.dataset))
    for epoch in range(n_epochs):
        start_time = time.time()
        model.train()
        avg_loss = 0.
        aug_num = 0
        
        if epoch==n_epochs-1:
            if using_pseudo==True:
                train_loader = create_augmented_train(model,x_train,y_train,x_test_local,batch_size=batch_size,sample_rate=0.2)
        
        for x_batch, y_batch in tqdm(train_loader, disable=True):
            y_pred = model(x_batch)
            if scheduler:
                scheduler.batch_step()           
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        model.eval()
        if validate:
            valid_preds = np.zeros((x_val_fold.size(0)))
            avg_val_loss = 0.
            for i,(x_batch,y_batch) in enumerate(val_loader):
                y_pred = model(x_batch).detach()
                avg_val_loss+= loss_fn(y_pred,y_batch).item()/len(val_loader)
                valid_preds[i*batch_size:(i+1)*batch_size] = sigmoid(y_pred.cpu().numpy())[:,0]
            search_result = threshold_search(y_val, valid_preds)
            val_f1, val_threshold = search_result['f1'], search_result['threshold']
            elapsed_time = time.time() - start_time
            print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t val_f1={:.4f} best_t={:.2f} \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, val_f1, val_threshold, elapsed_time))
        else:
            elapsed_time = time.time()-start_time
            print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, elapsed_time))
    if validate:
        valid_preds = np.zeros((x_val_fold.size(0)))
        avg_val_loss = 0.
        for i, (x_batch, y_batch) in enumerate(val_loader):
            y_pred = model(x_batch).detach()

            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(val_loader)
            valid_preds[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

        print('Validation loss: ', avg_val_loss)
          
    test_preds = np.zeros(len(test_df))
    test_preds_local = np.zeros(len(test_local_loader.dataset))
    for i,(x_batch,)in enumerate(test_local_loader):
        y_pred = model(x_batch).detach()
        test_preds_local[i*batch_size:(i+1)*batch_size]=sigmoid(y_pred.cpu().numpy())[:,0]
    if validate:
        return valid_preds,test_preds,test_preds_local
    else:
        return test_preds,test_preds_local

In [26]:
x_test_local = x_test
x_train = x_train
y_train = y_train
single_or_multi = 2

**Training for single models with folds**

In [27]:
if single_or_multi==1:
    seed=6017
    train_preds = np.zeros(len(train_df))
    test_preds = np.zeros((len(test_df), len(splits)))
    test_preds_local = np.zeros((n_test, len(splits)))
    x_test_local_cuda = torch.tensor(x_test_local,dtype=torch.long).cuda()
    test_local = torch.utils.data.TensorDataset(x_test_local_cuda)
    test_local_loader = torch.utils.data.DataLoader(test_local, batch_size=batch_size, shuffle=False)

    for i,(train_idx,valid_idx) in enumerate(splits):
        x_train_fold = x_train[train_idx]
        y_train_fold = y_train[train_idx, np.newaxis]
        x_val_fold = x_train[valid_idx]
        y_val_fold = y_train[valid_idx, np.newaxis]
        print(f'Fold {i+1}')
        seed_everything(seed+i)
        model = NeuralNet()
        model.cuda()
        valid_preds_fold, test_preds_fold, test_preds_local_fold = train_model(model,
                                                                               x_train_fold, 
                                                                               y_train_fold, 
                                                                               x_val_fold, 
                                                                               y_val_fold, 
                                                                               n_epochs=n_epochs,
                                                                               validate=True,
                                                                              using_pseudo=False)
        train_preds[valid_idx] = valid_preds_fold
        test_preds[:,i] = test_preds_fold
        test_preds_local[:,i] = test_preds_local_fold

**Training many models and stacking them**

In [28]:
embedding_matrix_1 = embedding_glove.copy()
embedding_matrix_1[diff_glove_para] = embedding_para[diff_glove_para]
embedding_matrix_1[diff_glove_p_f] = embedding_fasttext[diff_glove_p_f]
embedding_matrix_2 = embedding_para.copy()
embedding_matrix_2[diff_glove_p_f] = embedding_fasttext[diff_glove_p_f]

embedding_matrix_3 = (embedding_glove+embedding_para)/2.0
embedding_matrix_4 = (0.45*embedding_glove+0.35*embedding_para+0.20*embedding_fasttext)

In [29]:
del embedding_glove,embedding_para,embedding_fasttext
gc.collect()

0

**defing models**

In [30]:
models={
    "glove_main_model_1":NeuralNet(embeddings=embedding_matrix_1),
    "avg_2_model_1":NeuralNet(embeddings=embedding_matrix_3),
    "avg_3_model_1":NeuralNet(embeddings=embedding_matrix_4),
    "glove_main_model_2":NeuralNet_2(embeddings=embedding_matrix_1),
    "para_main_model_2":NeuralNet_2(embeddings=embedding_matrix_2),
    "avg_2_model_2":NeuralNet_2(embeddings=embedding_matrix_3),
    "avg_3_model_2":NeuralNet_2(embeddings=embedding_matrix_4)
}

In [31]:
if single_or_multi==2:
    seed = 6017
    
    epochs = [4]*len(models)

    y_train = y_train[:, np.newaxis]
    
    test_preds = np.zeros((len(test_df),len(models)))
    test_preds_local = np.zeros((n_test,len(models)))

    x_test_local_cuda = torch.tensor(x_test_local,dtype=torch.long).cuda()
    test_local = torch.utils.data.TensorDataset(x_test_local_cuda)
    test_local_loader = torch.utils.data.DataLoader(test_local, batch_size=batch_size, shuffle=False)
    
    i=0
    for key,model in models.items():
        print('model {0}'.format(key))
        model.cuda()
        test_preds_fold,test_preds_local_fold = train_model(
            model,x_train,y_train,n_epochs=epochs[i],validate=False)

        test_preds[:,i] = test_preds_local_fold
        i+=1

model glove_main_model_1
Epoch 1/4 	 loss=0.1202 	 time=208.12s
Epoch 2/4 	 loss=0.0996 	 time=208.33s
Epoch 3/4 	 loss=0.0927 	 time=208.84s
Epoch 4/4 	 loss=0.0884 	 time=209.04s
model avg_2_model_1
Epoch 1/4 	 loss=0.1160 	 time=209.02s
Epoch 2/4 	 loss=0.0974 	 time=209.01s
Epoch 3/4 	 loss=0.0900 	 time=207.97s
Epoch 4/4 	 loss=0.0856 	 time=208.21s
model avg_3_model_1
Epoch 1/4 	 loss=0.1166 	 time=207.88s
Epoch 2/4 	 loss=0.0988 	 time=209.26s
Epoch 3/4 	 loss=0.0916 	 time=209.19s
Epoch 4/4 	 loss=0.0870 	 time=208.57s
model glove_main_model_2
Epoch 1/4 	 loss=0.1173 	 time=285.93s
Epoch 2/4 	 loss=0.0988 	 time=286.86s
Epoch 3/4 	 loss=0.0916 	 time=286.43s
Epoch 4/4 	 loss=0.0872 	 time=286.38s
model para_main_model_2
Epoch 1/4 	 loss=0.1193 	 time=287.01s
Epoch 2/4 	 loss=0.0989 	 time=286.98s
Epoch 3/4 	 loss=0.0902 	 time=287.53s
Epoch 4/4 	 loss=0.0858 	 time=287.09s
model avg_2_model_2
Epoch 1/4 	 loss=0.1169 	 time=287.14s
Epoch 2/4 	 loss=0.0981 	 time=287.25s
Epoch 3/

In [32]:
magic_numbers = [0.22161071,0.13261581,0.13514855,0.18424001,0.09617107,0.13212665,0.0977806]

In [33]:
submission = test_df[['qid']].copy()
submission['prediction'] = np.sum(test_preds*np.array(magic_numbers),axis=1)>0.37
submission.to_csv('submission.csv',index=False)