In [1]:
# Notebook-wide switches: edit the values on these lines to change the run mode.
on_kaggle = False        # set to True when running inside a Kaggle kernel
TRAIN_PREDICT = 'train'  # set to 'predict' to run in prediction mode
SEED = 42                # seed passed to seed_everything() in the next cell

if not on_kaggle:
    # Offline runs: restrict CUDA to GPU index 3. This cell runs before the
    # cell that imports TensorFlow.
    import os
    os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil
import random

from unidecode import unidecode
import re

import tensorflow as tf
import tensorflow.keras.backend as K

#fix bug with using CuDNNLSTM
#gpus = tf.config.experimental.list_physical_devices('GPU')
#tf.config.experimental.set_memory_growth(gpus[0], True)

from transformers import *
import transformers
print(transformers.__version__)

np.set_printoptions(suppress=True)

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets the PYTHONHASHSEED environment variable and seeds Python's `random`
    module, NumPy's global generator and TensorFlow's global generator.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)


seed_everything(SEED)

2.3.0


#### 1. Read data and tokenizer

Read the tokenizer and the data, and define the maximum sequence length used for the transformer input (BERT/RoBERTa accept at most 512 tokens).

In [3]:
# Choose data / model / checkpoint locations and the matching tokenizer
# for the environment selected by `on_kaggle`.
if on_kaggle:
    PATH = '../input/google-quest-challenge/'
    BERT_PATH = '../input/bert-base-uncased-huggingface-transformer/'
    # One checkpoint per fold (fold0 .. fold4).
    CKPT_LOAD_PATH = [
        '../input/google-quest-qa-labeling-checkpoints-v3/fold0-epoch3.h5py',
        '../input/google-quest-qa-labeling-checkpoints-v3/fold1-epoch3.h5py',
        '../input/google-quest-qa-labeling-checkpoints-v3/fold2-epoch3.h5py',
        '../input/google-quest-qa-labeling-checkpoints-v3/fold3-epoch3.h5py',
        '../input/google-quest-qa-labeling-checkpoints-v3/fold4-epoch3.h5py',
    ]
    tokenizer = BertTokenizer.from_pretrained(BERT_PATH+'bert-base-uncased-vocab.txt')
else:
    # Offline run.
    PATH = '../data/'
    BERT_PATH = '../model/roberta-base'
    CKPT_SAVE_PATH = '../checkpoint/roberta-base-v2/'
    CKPT_LOAD_PATH = ['../checkpoint/roberta-base-v2/']
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

MAX_SEQUENCE_LENGTH = 512

# Read the three competition files in one pass, in the same order as before.
df_train, df_test, df_sub = (
    pd.read_csv(PATH+csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

# Target columns start at position 11 of the training frame.
output_categories = df_train.columns[11:].tolist()
# Positions 1, 2, 5 are the text columns; 9: category, 10: host.
input_categories = df_train.columns[[1,2,5,9,10]].tolist()
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

train shape = (6079, 41)
test shape = (476, 11)

output categories:
	 ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written', 'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 'answer_well_written']

input categories:
	 ['question_title', 'question_body', 'answer', 'category', 'host']


#### 2. Preprocessing functions

These are some functions that will be used to preprocess the raw text data into useable Bert inputs.

In [4]:
# t,q,a = df_train[input_categories].iloc[3].to_list()
# ids = tokenizer.encode_plus(a, None, 'longest_first', max_length=None)

# input_ids = ids['input_ids']
# len(input_ids)

#input_segments = ids['token_type_ids']
# for i in range(5):
#     t,q,a = df_train.loc[i, ['question_title', 'question_body', 'answer']]
#     print(t)
#     print('-'*10)
#     print(q)
#     print('-'*10)
#     print(a)
#     print('='*40)

# t = df_train.loc[df_train.host=='stackoverflow.com', 'question_title'].values[11]
# q = df_train.loc[df_train.host=='stackoverflow.com', 'question_body'].values[11]

# tokens = tokenizer.encode_plus(t, q, add_special_tokens=True, max_length=512, truncation_strategy='longest_first')

# print(t+'END OF TITLE\n'+q)
# print(' '.join(tokenizer.ids_to_tokens[i] for i in tokens['input_ids']))
# [tokenizer.ids_to_tokens[i] for i in tokens['input_ids']]

In [4]:
def count_special_char(x):
    """Count the "special" characters in `x`.

    The text is first transliterated to ASCII with `unidecode`. Every
    remaining character that is not a letter, a space, or one of
    `. - ? ! , # @ %` is counted; digits and newlines therefore count as
    special.

    Args:
        x: Input text.

    Returns:
        Number of special characters (int).
    """
    not_plain = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]', re.IGNORECASE)
    return len(not_plain.findall(unidecode(x)))

def count_cap_char(x):
    """Return the number of uppercase characters in the string `x`."""
    return len([ch for ch in x if ch.isupper()])

def count_unique_words(x):
    """Return the ratio of distinct words to total words in `x`.

    Words are the non-empty pieces obtained by splitting `x` on every
    character that is not an ASCII letter or digit. Comparison is
    case-sensitive ("The" and "the" are two different words).

    Args:
        x: Input text.

    Returns:
        float in [0, 1]; 0.0 when `x` contains no word at all (empty or
        punctuation-only text), which previously raised ZeroDivisionError.
    """
    separator = re.compile(r'[^A-Za-z0-9]', re.IGNORECASE)
    words = [w for w in separator.split(x) if w != '']
    if not words:
        # No words: define the ratio as 0 instead of dividing by zero.
        return 0.0
    return len(set(words)) / len(words)

#print(t)
#count_cap_char(t), len(t)
#count_special_char(t), len(t)
#string.printable

#from textblob import TextBlob
# print(t.lower())
# print('='*40)
# print(TextBlob(t.lower()).correct())

In [5]:
# Map every category / host found in the training data to a positive integer
# id (1, 2, ... in the order returned by `unique()`). Id 0 is reserved for the
# 'UNK' entry, which is what lookups fall back to for values not seen here.
category2index = {
    category: idx
    for idx, category in enumerate(df_train.category.unique().tolist(), start=1)
}
category2index['UNK'] = 0

host2index = {
    host: idx
    for idx, host in enumerate(df_train.host.unique().tolist(), start=1)
}
host2index['UNK'] = 0

category2index

{'LIFE_ARTS': 1,
 'CULTURE': 2,
 'SCIENCE': 3,
 'STACKOVERFLOW': 4,
 'TECHNOLOGY': 5,
 'UNK': 0}

In [6]:
def _convert_to_transformer_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Converts tokenized input to ids, masks and segments for transformer (including bert)
    
    NOTE: USE Head + Tail truncation
    """
    
    def return_id(str1, str2, truncation_strategy, length):

        inputs = tokenizer.encode_plus(str1, str2,
            add_special_tokens=True,
            max_length=None,
            truncation_strategy=truncation_strategy)
        
        input_ids =  inputs["input_ids"]
        input_masks = [1] * len(input_ids)
        input_segments = inputs["token_type_ids"]
        if len(input_ids)>length:#Head + Tail truncate
            input_ids = input_ids[:128] + input_ids[-384:]
            input_masks = input_masks[:128] + input_masks[-384:]
            input_segments = input_segments[:128] + input_segments[-384:]
        padding_length = length - len(input_ids)
        padding_id = tokenizer.pad_token_id
        input_ids = input_ids + ([padding_id] * padding_length)
        input_masks = input_masks + ([0] * padding_length)
        input_segments = input_segments + ([0] * padding_length)
        
        return [input_ids, input_masks, input_segments]
    
    input_ids_q, input_masks_q, input_segments_q = return_id(
        title + ' ' + question, None, 'longest_first', max_sequence_length)
    
    input_ids_a, input_masks_a, input_segments_a = return_id(
        answer, None, 'longest_first', max_sequence_length)
    
    return [input_ids_q, input_masks_q, input_segments_q,
            input_ids_a, input_masks_a, input_segments_a]

def compute_input_arrays(df, columns, tokenizer, max_sequence_length):
    """Build all model input arrays for the rows of `df`.

    For every row the question (title + body) and the answer are encoded with
    `_convert_to_transformer_inputs`, six hand-crafted ratios are computed and
    the category / host are mapped to integer ids through `category2index` /
    `host2index` (unknown values map to 0).

    Args:
        df: DataFrame holding the raw samples.
        columns: Columns to iterate over; must include `question_title`,
            `question_body`, `answer`, `category` and `host`.
        tokenizer: Tokenizer forwarded to `_convert_to_transformer_inputs`.
        max_sequence_length: Token length of each encoded sequence.

    Returns:
        List of nine NumPy arrays, in this order:
        question ids, masks, segments; answer ids, masks, segments (int32);
        a float32 feature matrix with one row per sample and six columns
        (special-char ratio q/a, uppercase ratio q/a, unique-word ratio q/a);
        category ids and host ids (int32, one column each).
    """
    input_ids_q, input_masks_q, input_segments_q = [], [], []
    input_ids_a, input_masks_a, input_segments_a = [], [], []
    special_char_q, special_char_a = [], []
    cap_char_q, cap_char_a = [], []
    unique_words_q, unique_words_a = [], []
    category_index, host_index = [], []
    # `iterrows()` is a generator without a length; passing `total` lets tqdm
    # show real progress instead of an indeterminate bar.
    for _, instance in tqdm(df[columns].iterrows(), total=len(df)):
        t, q, a = instance.question_title, instance.question_body, instance.answer

        ids_q, masks_q, segments_q, ids_a, masks_a, segments_a = \
        _convert_to_transformer_inputs(t, q, a, tokenizer, max_sequence_length)

        input_ids_q.append(ids_q)
        input_masks_q.append(masks_q)
        input_segments_q.append(segments_q)

        input_ids_a.append(ids_a)
        input_masks_a.append(masks_a)
        input_segments_a.append(segments_a)

        # Title and body are concatenated without a separator for the
        # character-level statistics; build the string once per row.
        tq = t + q
        special_char_q.append(count_special_char(tq)/len(tq))
        special_char_a.append(count_special_char(a)/len(a))
        cap_char_q.append(count_cap_char(tq)/len(tq))
        cap_char_a.append(count_cap_char(a)/len(a))
        unique_words_q.append(count_unique_words(tq))
        unique_words_a.append(count_unique_words(a))

        category, host = instance.category, instance.host
        category_index.append([category2index.get(category, 0)])
        host_index.append([host2index.get(host, 0)])

    return [np.asarray(input_ids_q, dtype=np.int32),
            np.asarray(input_masks_q, dtype=np.int32),
            np.asarray(input_segments_q, dtype=np.int32),
            np.asarray(input_ids_a, dtype=np.int32),
            np.asarray(input_masks_a, dtype=np.int32),
            np.asarray(input_segments_a, dtype=np.int32),
            # Six per-feature lists stacked, then transposed to one row per sample.
            np.asarray([special_char_q, special_char_a, cap_char_q, cap_char_a, unique_words_q, unique_words_a], dtype=np.float32).T,
            np.asarray(category_index, dtype=np.int32),
            np.asarray(host_index, dtype=np.int32)
           ]

def compute_output_arrays(df, columns):
    """Return the `columns` of `df` as a NumPy array.

    Args:
        df: Source DataFrame.
        columns: Column label(s) to select.

    Returns:
        `np.asarray` of the selected data.
    """
    selected = df[columns]
    return np.asarray(selected)

#### 3. Create model

`compute_spearmanr_ignore_nan()` computes the competition metric for the validation set: the mean column-wise Spearman correlation, ignoring columns whose correlation is NaN.
<br><br>
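`CustomCallback()` is a class which inherits from `tf.keras.callbacks.Callback`. After each epoch it predicts on the validation set, prints the validation score of the predictions averaged over all epochs so far, and (from the third epoch on, when a fold is given) saves the model weights. Test-set prediction inside the callback is currently commented out.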
<br><br>
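`roberta_model()` contains the actual architecture that will be used to fine-tune the transformer on our dataset. The question and the answer are encoded separately by the same transformer layer; the token embeddings go through bidirectional LSTMs and average pooling, are combined with six hand-crafted text features and category/host embeddings, and feed three sigmoid heads (21 question targets, 9 answer targets, and all 30 targets jointly) that are blended into the final 30 predictions.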
<br><br>
`train_and_predict()` compiles and fits the model, and returns the callback that holds the per-epoch validation predictions.

In [22]:
def compute_spearmanr_ignore_nan(trues, preds):
    """Mean column-wise Spearman correlation between `trues` and `preds`.

    Both inputs are transposed and paired up, so the correlation is computed
    once per column. Columns whose correlation is NaN are left out of the
    mean (`np.nanmean`).

    Args:
        trues: 2-D array-like of ground-truth values.
        preds: 2-D array-like of predictions, same layout as `trues`.

    Returns:
        The NaN-ignoring mean of the per-column correlations.
    """
    column_pairs = zip(np.transpose(trues), np.transpose(preds))
    rhos = [spearmanr(true_col, pred_col).correlation
            for true_col, pred_col in column_pairs]
    return np.nanmean(rhos)


class CustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that scores the validation set and checkpoints the model.

    After every epoch the model predicts on the validation inputs. The
    predictions of all epochs so far are averaged and scored with
    `compute_spearmanr_ignore_nan`. When `fold` is given, the weights are also
    saved under `CKPT_SAVE_PATH` for every epoch index greater than 1.

    Args:
        valid_data: Pair `(inputs, targets)` for the validation split.
        test_data: Test inputs; stored, but test prediction is currently
            disabled in `on_epoch_end`.
        batch_size: Batch size used for prediction.
        fold: Fold identifier used in checkpoint file names; `None` disables
            checkpointing.
        stage2: When true, checkpoint file names get a `-stage2` suffix.
    """

    def __init__(self, valid_data, test_data=None, batch_size=16, fold=None, stage2=False):
        super().__init__()

        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]
        self.test_inputs = test_data

        self.batch_size = batch_size
        self.fold = fold
        self.stage2 = stage2

    def on_train_begin(self, logs=None):
        # Start each training run with empty prediction histories.
        self.valid_predictions = []
        self.test_predictions = []

    def on_epoch_end(self, epoch, logs=None):
        self.valid_predictions.append(
            self.model.predict(self.valid_inputs, batch_size=self.batch_size))

        # Score the average of all per-epoch predictions collected so far.
        rho_val = compute_spearmanr_ignore_nan(
            self.valid_outputs, np.average(self.valid_predictions, axis=0))

        print("\nvalidation rho: %.4f" % rho_val)

        # `epoch` is zero-based, so checkpoints start with the third epoch.
        if self.fold is not None and epoch>1:
            # Use this callback's own fold; the previous code read a global
            # `fold`, which is undefined (or stale) outside the training loop.
            suffix = '-stage2' if self.stage2 else ''
            self.model.save_weights(
                CKPT_SAVE_PATH+f'fold{self.fold}-epoch{epoch}{suffix}.h5py')

        # Test-set prediction is disabled:
#         self.test_predictions.append(
#             self.model.predict(self.test_inputs, batch_size=self.batch_size)
#         )

def roberta_model():
    """Build the question/answer Keras model on top of a pretrained transformer.

    The question and the answer token ids are encoded separately by the same
    transformer layer (`TFRobertaModel` offline, `TFBertModel` on Kaggle).
    The token embeddings feed bidirectional LSTMs. Average-pooled LSTM and
    transformer outputs are concatenated with six numeric features and the
    category / host embeddings, and passed to three sigmoid heads:

    * question head: 21 outputs
    * answer head: 9 outputs
    * joint question+answer head: 30 outputs

    The final output is `0.6 * concat(question, answer) + 0.4 * joint`.

    Reads the module-level `MAX_SEQUENCE_LENGTH` and `on_kaggle`, plus
    `BERT_PATH` on Kaggle.

    Returns:
        `tf.keras.models.Model` whose inputs are, in order,
        `[q_id, q_mask, q_atn, a_id, a_mask, a_atn, feats, category_index,
        host_index]` and whose output has 30 columns.
    """
    q_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    q_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    # Despite the `_atn` name these two inputs are passed as `token_type_ids`.
    q_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    feats = tf.keras.layers.Input((6,), dtype=tf.float32)#set dim of additional numeric feats
    category_index = tf.keras.layers.Input((1,), dtype=tf.int32)
    host_index = tf.keras.layers.Input((1,), dtype=tf.int32)

    if on_kaggle:
        # No internet on Kaggle: the pretrained weights are loaded from an
        # uploaded file. `config` was previously used here without being
        # defined (NameError), so it is now created explicitly.
        # NOTE(review): assumes the BertConfig() defaults match the
        # bert-base-uncased checkpoint - confirm against its config file.
        config = BertConfig()
        config.output_hidden_states = False
        roberta_layer = TFBertModel.from_pretrained(BERT_PATH+'bert-base-uncased-tf_model.h5', config=config)
    else:
        roberta_layer = TFRobertaModel.from_pretrained('roberta-base')

    # Element [0] of the transformer output is used as the per-token embedding.
    q_embedding = roberta_layer(q_id, attention_mask=q_mask, token_type_ids=q_atn)[0]
    a_embedding = roberta_layer(a_id, attention_mask=a_mask, token_type_ids=a_atn)[0]

    q_feats = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
        1024, return_sequences=True,
        activation='tanh', recurrent_activation='sigmoid',
        recurrent_dropout=0, unroll=False, use_bias=False
    ))(q_embedding)
    a_feats = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
        512, return_sequences=True,
        activation='tanh', recurrent_activation='sigmoid',
        recurrent_dropout=0, unroll=False, use_bias=False
    ))(a_embedding)

    # Joint branch: concatenate the question and answer LSTM outputs and run
    # a further BiLSTM over them.
    qa_feats = tf.keras.layers.Concatenate()([q_feats, a_feats])
    qa_feats = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
        256, return_sequences=True,
        activation='tanh', recurrent_activation='sigmoid',
        recurrent_dropout=0, unroll=False, use_bias=False
    ))(qa_feats)

    q_feats_avgpool = tf.keras.layers.GlobalAveragePooling1D()(q_feats)
    a_feats_avgpool = tf.keras.layers.GlobalAveragePooling1D()(a_feats)
    qa_feats_avgpool = tf.keras.layers.GlobalAveragePooling1D()(qa_feats)

    q_avgpool = tf.keras.layers.GlobalAveragePooling1D()(q_embedding)
    a_avgpool = tf.keras.layers.GlobalAveragePooling1D()(a_embedding)

    # Embedding tables: 6 category ids and 64 host ids, 8 dimensions each.
    category_embed_layer = tf.keras.layers.Embedding(6, 8, input_length=1)
    host_embed_layer = tf.keras.layers.Embedding(64, 8, input_length=1)
    cat_embed = category_embed_layer(category_index)
    host_embed = host_embed_layer(host_index)
    cat_embed = tf.keras.layers.Flatten()(cat_embed)
    host_embed = tf.keras.layers.Flatten()(host_embed)

    # Question head (21 outputs).
    x_q = tf.keras.layers.Concatenate()([q_feats_avgpool, q_avgpool, feats, cat_embed, host_embed])
    x_q = tf.keras.layers.Dropout(0.2)(x_q)
    q_logit = tf.keras.layers.Dense(21, activation='sigmoid')(x_q)

    # Answer head (9 outputs).
    x_a = tf.keras.layers.Concatenate()([a_feats_avgpool, a_avgpool, feats, cat_embed, host_embed])
    x_a = tf.keras.layers.Dropout(0.2)(x_a)
    a_logit = tf.keras.layers.Dense(9, activation='sigmoid')(x_a)

    # Joint head (all 30 outputs).
    x_qa = tf.keras.layers.Concatenate()([qa_feats_avgpool, feats, cat_embed, host_embed])
    x_qa = tf.keras.layers.Dropout(0.2)(x_qa)
    qa_logit = tf.keras.layers.Dense(30, activation='sigmoid')(x_qa)

    # Blend: 60% separate question/answer heads, 40% joint head.
    x = tf.keras.layers.Concatenate()([q_logit, a_logit])
    x = tf.add(tf.multiply(x, 0.6), tf.multiply(qa_logit, 0.4))

    model = tf.keras.models.Model(inputs=[q_id, q_mask, q_atn, a_id, a_mask, a_atn,
                                          feats, category_index, host_index, ], outputs=x)

    return model


def train_and_predict(model, train_data, valid_data, test_data, 
                      learning_rate, epochs, batch_size, loss_function, fold, stage2=False):
    """Compile `model` with Adam, fit it, and return the validation callback.

    Args:
        model: Keras model to train.
        train_data: Indexable with training inputs at [0] and targets at [1].
        valid_data: Indexable with validation inputs at [0] and targets at [1].
        test_data: Test inputs, handed to the callback.
        learning_rate: Learning rate of the Adam optimizer.
        epochs: Number of training epochs.
        batch_size: Batch size for fitting and for the callback's predictions.
        loss_function: Loss passed to `model.compile`.
        fold: Fold identifier forwarded to the callback.
        stage2: Forwarded to the callback.

    Returns:
        The `CustomCallback` instance that was attached to the fit.
    """
    validation_pair = (valid_data[0], valid_data[1])
    custom_callback = CustomCallback(
        valid_data=validation_pair,
        test_data=test_data,
        batch_size=batch_size,
        fold=fold,
        stage2=stage2,
    )

    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss=loss_function, optimizer=adam)

    train_inputs, train_targets = train_data[0], train_data[1]
    model.fit(
        train_inputs,
        train_targets,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[custom_callback],
    )

    return custom_callback

def predict(model, test_data, load_weights_path, batch_size=None):
    """Load weights into `model` and return its predictions for `test_data`.

    Args:
        model: Object exposing `load_weights(path)` and
            `predict(data, batch_size=...)`.
        test_data: Inputs forwarded to `model.predict`.
        load_weights_path: Path of the weights file to load first.
        batch_size: Prediction batch size. When omitted, the module-level
            `BATCH_SIZE` is used, as before.

    Returns:
        Whatever `model.predict` returns.
    """
    if batch_size is None:
        # Backward-compatible default: fall back to the notebook-level constant.
        batch_size = BATCH_SIZE
    model.load_weights(load_weights_path)
    return model.predict(test_data, batch_size=batch_size)

In [23]:
#config = RobertaConfig()
#print(config)
#config.output_hidden_states = False
#roberta_layer = TFRobertaModel.from_pretrained('roberta-base')
# Build the network once and print its layer summary for inspection.
model = roberta_model()
model.summary()

#ckpt = torch.load('../model/detector-base.pt')
#ckpt['args']

#base_model.load_weights('../checkpoint/bert-base-uncased-v3/fold0-epoch3.h5py')

#bert_weights = base_model.layers[8].get_weights()

#new_bert_layer = TFBertModel.from_pretrained('bert-base-uncased', config=config)
#new_bert_layer.set_weights(bert_weights)

#inputs[0][2]

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_55 (InputLayer)           [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_57 (InputLayer)           [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_59 (InputLayer)           [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_56 (InputLayer)           [(None, 512)]        0                                            
____________________________________________________________________________________________

#### 4. Obtain inputs and targets, as well as the indices of the train/validation splits

In [24]:
# Training mode only: build the target array and the model inputs from df_train.
# compute_output_arrays / compute_input_arrays are defined outside this section.
if TRAIN_PREDICT == 'train':
    outputs = compute_output_arrays(df_train, output_categories)
    # The recorded output of this cell shows tokenizer warnings for texts longer
    # than 512 tokens; presumably compute_input_arrays trims them to
    # MAX_SEQUENCE_LENGTH -- TODO confirm.
    inputs = compute_input_arrays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (881 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2908 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (848 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (976 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1078 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (705 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Token indices sequence length is longer than the specified maximum sequence length for this model (1299 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1079 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (5402 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (707 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for 

Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (588 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (795 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (557 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Token indices sequence length is longer than the specified maximum sequence length for this model (1082 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (754 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (541 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1362 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (712 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

Token indices sequence length is longer than the specified maximum sequence length for this model (612 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (801 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (833 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Token indices sequence length is longer than the specified maximum sequence length for this model (834 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1133 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (707 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (847 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Token indices sequence length is longer than the specified maximum sequence length for this model (782 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (545 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (646 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Token indices sequence length is longer than the specified maximum sequence length for this model (1368 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (806 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (663 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Token indices sequence length is longer than the specified maximum sequence length for this model (2612 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2404 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (819 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1186 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for 

Token indices sequence length is longer than the specified maximum sequence length for this model (1099 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1768 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (643 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (934 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

Token indices sequence length is longer than the specified maximum sequence length for this model (820 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (622 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1024 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (577 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Token indices sequence length is longer than the specified maximum sequence length for this model (2306 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4309 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3185 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (672 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1579 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for

Token indices sequence length is longer than the specified maximum sequence length for this model (576 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2286 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (737 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1772 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2122 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for 

Token indices sequence length is longer than the specified maximum sequence length for this model (1450 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (846 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1036 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (610 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

Token indices sequence length is longer than the specified maximum sequence length for this model (4977 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (994 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (642 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1267 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

Token indices sequence length is longer than the specified maximum sequence length for this model (632 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (630 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (577 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (576 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Token indices sequence length is longer than the specified maximum sequence length for this model (899 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (830 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (754 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (780 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Token indices sequence length is longer than the specified maximum sequence length for this model (639 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1571 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (868 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1410 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

Token indices sequence length is longer than the specified maximum sequence length for this model (1776 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2221 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (568 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (762 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (658 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

Token indices sequence length is longer than the specified maximum sequence length for this model (616 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (806 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1062 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (654 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Token indices sequence length is longer than the specified maximum sequence length for this model (779 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (610 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (685 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1078 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Token indices sequence length is longer than the specified maximum sequence length for this model (786 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1953 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4585 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1088 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for 

Token indices sequence length is longer than the specified maximum sequence length for this model (1083 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1484 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (623 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (588 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

Token indices sequence length is longer than the specified maximum sequence length for this model (732 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1099 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3476 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1111 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (898 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for 

Token indices sequence length is longer than the specified maximum sequence length for this model (1062 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (806 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (773 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (653 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (719 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th




In [25]:
import pickle

# Build (or reload) the 5-fold GroupKFold split, grouped by question_body so that
# rows sharing the same question text never straddle train and validation.
# The splits are cached on disk so every run reuses identical folds.
if TRAIN_PREDICT == 'train':
    gkf_cache_path = '../data/gkf%d.pkl'%SEED
    if not os.path.isfile(gkf_cache_path):
        print('Create gkf')
        # GroupKFold.split() returns a one-shot generator. Materialise it into a
        # list *before* pickling so that `gkf` is still iterable afterwards;
        # previously the generator was exhausted by the dump and the training
        # loop saw no folds until the cell was run a second time.
        gkf = list(GroupKFold(n_splits=5).split(X=df_train.question_body,
                                                groups=df_train.question_body))
        with open(gkf_cache_path, 'wb') as f:
            pickle.dump(gkf, f)
    else:
        print('Load gkf')
        # NOTE: pickle.load executes arbitrary code; only load a cache file this
        # notebook created itself.
        with open(gkf_cache_path, 'rb') as f:
            gkf = pickle.load(f)

Load gkf


In [12]:
#print(len(inputs), inputs[6].shape, inputs[7], inputs[8])
#inputs[6][:,4:]
# for i,(train_idx,val_idx) in enumerate(gkf):
#     if i==0:
#         break
# train_idx[20:30]

#gkf0 = list(gkf)

#### 5. Training, validation and testing

Loops over the folds in `gkf` and trains each fold for `NUM_EPOCHS` (4) epochs with a learning rate of 3e-5 and a batch size of 8. A column-weighted binary cross-entropy (`custom_bce_loss`) is used as the loss function.

In [26]:
# Training hyperparameters consumed by the fold loop below.
NUM_EPOCHS = 4  # epochs trained per fold
BATCH_SIZE = 8  # batch size for fit(); also the batch size used by predict()
LearningRate = 3e-5  # Adam learning rate passed to train_and_predict()

In [27]:
def custom_bce_loss(y_true, y_pred):
    """Binary cross-entropy with a fixed weight per target column.

    The 30 hard-coded weights are multiplied against the element-wise
    log-likelihood, so the last axis of `y_true` / `y_pred` must be
    broadcast-compatible with 30 entries. Before use the weights are divided
    by their own mean.

    According to the original author's note, the weights were derived offline
    from the number of distinct values in each target column of `df_train`
    using the mapping {3: 0.5, 5: 0.5, 9: 1.0, 17: 1.5}.

    Args:
        y_true: target tensor.
        y_pred: predicted probabilities, same shape as `y_true`.

    Returns:
        Scalar tensor: mean of the negated, weighted log-likelihood over all
        elements.
    """
    column_weights = tf.convert_to_tensor([
        1.0, 1.0, 0.5, 0.5, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5,
        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
        1.0, 1.0, 1.0, 1.0, 1.0, 1.5, 0.5, 0.5, 0.5, 1.0,
    ], dtype=float)
    column_weights = column_weights/K.mean(column_weights)

    # K.epsilon() keeps both logarithms finite when a prediction hits 0 or 1.
    eps = K.epsilon()
    positive_term = tf.multiply(y_true, K.log(y_pred+eps))
    negative_term = tf.multiply((1-y_true), K.log(1-y_pred+eps))
    log_likelihood = positive_term + negative_term

    weighted = tf.multiply(log_likelihood, column_weights)
    return tf.reduce_mean(-weighted)
#     y_true_clip = K.clip(y_true, K.epsilon(), 1)
#     y_pred_clip = K.clip(y_pred, K.epsilon(), 1)
#     kl_loss = tf.reduce_mean(tf.reduce_sum(y_true_clip * K.log(y_true_clip / y_pred_clip), axis=0))
#     return bce_loss*0.9 + kl_loss*0.1

#y_pred = tf.random.uniform((8, 30))
#y_true = tf.random.uniform((8, 30))

#custom_bce_loss(y_true, y_pred)

In [28]:
## training
## For a stage-2 run, two places need changing (author's note; presumably one
## of them is the `stage2` flag below -- TODO confirm the other).
if TRAIN_PREDICT == 'train':
    histories = []
    for fold, (train_idx, valid_idx) in enumerate(gkf):

        # Fold filter: as written it lets every fold through (fold is never
        # <= -1). Tighten the condition to train only a subset of folds,
        # e.g. when comparing models on a single fold.
        if fold <= -1:
            continue

        print('========training fold %d========'%fold)

        # Reset the Keras session, then get a new model from roberta_model().
        K.clear_session()
        model = roberta_model()
        #model=None#use multi-gpu in train_and_predict()

        # Slice every input array and the targets with this fold's indices.
        train_inputs = [arr[train_idx] for arr in inputs]
        valid_inputs = [arr[valid_idx] for arr in inputs]
        train_outputs = outputs[train_idx]
        valid_outputs = outputs[valid_idx]

        # Whatever train_and_predict returns for the fold is collected in
        # `histories`.
        history = train_and_predict(
            model,
            train_data=(train_inputs, train_outputs),
            valid_data=(valid_inputs, valid_outputs),
            test_data=None,
            learning_rate=LearningRate,
            epochs=NUM_EPOCHS,
            batch_size=BATCH_SIZE,
            loss_function=custom_bce_loss,  # alternative: 'binary_crossentropy'
            fold=fold,
            stage2=False,
        )
        histories.append(history)

# ## training full trainset to lift LB score in the final
# if TRAIN_PREDICT == 'train':
#     histories = []
#     print('========Start training========')
#     K.clear_session()
#     model = bert_model()

#     train_inputs = inputs
#     train_outputs = outputs

#     history = train_and_predict(model, 
#                       train_data=(train_inputs, train_outputs), 
#                       valid_data=None,
#                       test_data=None, 
#                       learning_rate=LearningRate, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
#                       loss_function='binary_crossentropy', fold=fold)

#     histories.append(history)

Train on 4863 samples
Epoch 1/4
validation rho: 0.3778
Epoch 2/4
validation rho: 0.3996
Epoch 3/4
validation rho: 0.4082
Epoch 4/4
validation rho: 0.4115
Train on 4863 samples
Epoch 1/4
validation rho: 0.3701
Epoch 2/4
validation rho: 0.3939
Epoch 3/4
validation rho: 0.4041
Epoch 4/4
validation rho: 0.4072
Train on 4863 samples
Epoch 1/4
validation rho: 0.3880
Epoch 2/4
validation rho: 0.4113
Epoch 3/4
validation rho: 0.4224
Epoch 4/4
validation rho: 0.4254
Train on 4863 samples
Epoch 1/4
validation rho: 0.3667
Epoch 2/4
validation rho: 0.3883
Epoch 3/4
validation rho: 0.3964
Epoch 4/4
validation rho: 0.4026
Train on 4864 samples
Epoch 1/4
validation rho: 0.3583
Epoch 2/4
validation rho: 0.3828
Epoch 3/4
validation rho: 0.3986
Epoch 4/4
validation rho: 0.4025


In [16]:
#more ideas todo: 
#1.1 modify model
#1.2 OOV words spelling correction
#2. loss, ranking loss?
#3. add custom new tokens?(e.g stackoverflow)
#4. roberta?alxnet?
#5. RankGauss average folds?
#6. freeze some layers of bert?
"""
CV history
---------------
#### bert-base ####
Epoch 5/15
4856/4863 [============================>.] - ETA: 0s - loss: 0.3091
validation rho: 0.3923
4863/4863 [==============================] - 339s 70ms/sample - loss: 0.3091
------LB=0.346

switch to HuggingFace
----------------------
SEEMS DEPENDS ON THE SEED!!!
----------------------------
t + q[:1/2], q[1/2:], a
same

category + host + t + q, category + host + a
same


CUSTOM LOSS
Epoch 4/4
4856/4863 [============================>.] - ETA: 0s - loss: 0.3612
validation rho: 0.3989
4863/4863 [==============================] - 711s 146ms/sample - loss: 0.3612

Epoch 4/4
4856/4863 [============================>.] - ETA: 0s - loss: 0.3597
validation rho: 0.3998
4863/4863 [==============================] - 724s 149ms/sample - loss: 0.3597

add 2 feats
Epoch 4/4
4856/4863 [============================>.] - ETA: 0s - loss: 0.3616
validation rho: 0.4049
4863/4863 [==============================] - 718s 148ms/sample - loss: 0.3616

4 feats
Epoch 4/4
4856/4863 [============================>.] - ETA: 0s - loss: 0.3592
validation rho: 0.4027
4863/4863 [==============================] - 706s 145ms/sample - loss: 0.3591

add cat+host embed dim=16
Epoch 4/4
4856/4863 [============================>.] - ETA: 0s - loss: 0.3594
validation rho: 0.4005
4863/4863 [==============================] - 698s 144ms/sample - loss: 0.3594

embed dim=8
Epoch 4/4
4856/4863 [============================>.] - ETA: 0s - loss: 0.3592
validation rho: 0.4071
4863/4863 [==============================] - 734s 151ms/sample - loss: 0.3592

add LSTM features
validation rho: 0.4081 --fold0
validation rho: 0.4050 --fold1
validation rho: 0.4138 --fold2


--------
epochs progression:

validation rho: 0.3810
validation rho: 0.3974
validation rho: 0.4035
validation rho: 0.4057

"""



#### 6. Process and submit test predictions

First the test set is converted to model inputs. A prediction is then made with each checkpoint listed in `CKPT_LOAD_PATH` (one per fold in the Kaggle configuration), and the per-checkpoint predictions are averaged element-wise to get a single prediction for each data point. Finally, the result is saved to `submission.csv`.

In [17]:
if TRAIN_PREDICT == 'predict':

    # Encode the test set the same way as the training data.
    test_inputs = compute_input_arrays(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)

    # NOTE(review): the training cell builds its model with roberta_model();
    # confirm that bert_model() matches the architecture the checkpoints in
    # CKPT_LOAD_PATH were saved from.
    model = bert_model()

    # One prediction array per checkpoint; the same model object is reused and
    # each checkpoint path is handed to predict() via load_weights_path.
    checkpoint_predictions = []
    for ckpt_load_path in CKPT_LOAD_PATH:
        checkpoint_predictions.append(
            predict(model, test_inputs, load_weights_path=ckpt_load_path))

    # Element-wise average across checkpoints.
    test_predictions = np.mean(checkpoint_predictions, axis=0)

    # Column 0 of df_sub is left untouched; every other column is overwritten.
    df_sub.iloc[:, 1:] = test_predictions
    df_sub.to_csv('submission.csv', index=False)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [18]:
# Preview the first ten rows of the submission frame as a quick sanity check.
df_sub.head(10)

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.950409,0.667482,0.239416,0.339715,0.570079,0.466379,0.700281,0.700085,0.571651,...,0.939537,0.927281,0.502933,0.972254,0.975164,0.780713,0.031082,0.050007,0.861596,0.923908
1,46,0.859322,0.47154,0.005667,0.772744,0.760206,0.914244,0.574795,0.459414,0.150252,...,0.614215,0.9653,0.622804,0.984339,0.987231,0.901024,0.941507,0.119569,0.09648,0.902197
2,70,0.908096,0.620519,0.014831,0.787047,0.934155,0.9627,0.608684,0.413993,0.300243,...,0.878956,0.914537,0.562891,0.970561,0.969091,0.786442,0.036871,0.066868,0.924306,0.903338
3,132,0.894989,0.414773,0.009222,0.709674,0.744439,0.904797,0.542779,0.426597,0.098071,...,0.731927,0.948509,0.699444,0.967164,0.983113,0.905921,0.857659,0.183874,0.640012,0.921505
4,200,0.921447,0.422136,0.035422,0.858205,0.73669,0.84148,0.63592,0.6014,0.237748,...,0.662975,0.926064,0.67784,0.978456,0.973675,0.848344,0.341229,0.161605,0.517119,0.905178
5,245,0.949044,0.838918,0.037897,0.670364,0.950395,0.876493,0.649106,0.48559,0.183473,...,0.928902,0.98858,0.675213,0.990091,0.993232,0.954224,0.005941,0.106807,0.949878,0.933668
6,257,0.8957,0.486271,0.006919,0.726316,0.739863,0.893735,0.538539,0.457449,0.074345,...,0.751802,0.954958,0.695134,0.972706,0.987342,0.906162,0.864686,0.182657,0.50921,0.922592
7,267,0.967291,0.712311,0.267039,0.739393,0.795763,0.796656,0.677582,0.649118,0.201059,...,0.906255,0.908691,0.674458,0.960482,0.976234,0.81804,0.005299,0.00622,0.976141,0.915883
8,284,0.892987,0.445296,0.00682,0.72465,0.789642,0.890839,0.530727,0.467378,0.412352,...,0.803386,0.972898,0.646042,0.983505,0.991843,0.919998,0.803075,0.159615,0.55188,0.917416
9,292,0.969639,0.732235,0.017571,0.876691,0.911313,0.86033,0.687019,0.57353,0.061705,...,0.913862,0.876258,0.63274,0.962797,0.977443,0.785966,0.316395,0.109399,0.727858,0.927811


In [19]:
# ##post process
# cols2process = df_train.columns.tolist()[11:]
# print(len(cols2process))

In [20]:
# for col in tqdm(cols2process):
#     ###step1
#     quantiles = df_train[col].value_counts()/len(df_train.index)
#     quantiles = quantiles.to_dict()
#     quantiles = {k: v for k, v in sorted(quantiles.items(), key=lambda item: item[0])}
#     ks = list(quantiles.keys())
#     vs = list(quantiles.values())
#     qs = np.cumsum(vs)
#     #print(ks)
#     #print(qs)
#     ###step2
#     qs = np.quantile(df_sub[col], qs)
#     #print(qs)
#     for i in range(len(qs)):
#         if i==0:
#             q = qs[0]
#             df_sub.loc[df_sub[col]<q, col] = ks[i]
#         elif i>0 and i<=len(qs)-1:
#             q0,q1 = qs[i-1], qs[i]
#             df_sub.loc[(df_sub[col]<q1)&(df_sub[col]>=q0), col] = ks[i]

In [21]:
# df_sub.head(10)

In [22]:
# df_sub.to_csv('submission.csv', index=False)