In [1]:
import numpy as np 
import pandas as pd
import os, gc
#regex libraries
import re, regex

#model libraries
from sklearn import preprocessing
from scipy.sparse import hstack, csr_matrix, load_npz
from itertools import combinations

# keras-tensorflow
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Embedding, Dropout, Activation, concatenate, BatchNormalization, Flatten, Concatenate, Conv1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from sklearn.model_selection import KFold, train_test_split
from keras import backend as K
from keras.engine.topology import Layer
from keras_utils import KMaxPooling
from keras.losses import mean_squared_error
#peter's cache
from cache import save_in_cache, load_cache

from utils import rmse, print_step
# timer function
import time
from contextlib import contextmanager
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(mean_squared_error(y_true, y_pred)) 
    
start_time = time.time()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
embed_file = 'embeddings/avito_lookup_cc_ru_300.txt'
data_dir = 'data'
feat_dir = 'cache'

In [4]:
# prepare data
with timer("load data:"):
    usecols = ['image_top_1','city','price','region','title','description','parent_category_name', 'user_type','category_name',
              'item_seq_number', 'param_1', 'param_2', 'param_3', 'image']
    train = pd.read_csv(f'{data_dir}/train.csv', usecols=usecols+['deal_probability'])
    test = pd.read_csv(f'{data_dir}/test.csv', usecols=usecols)

    train_split = len(train)
    y = train['deal_probability'].copy()
    train.drop("deal_probability",axis=1, inplace=True)
    
    df = pd.concat([train,test],axis=0)

    gc.collect()

with timer("Loading Lat-Lon & Region:"):
    # Lat Lon
    df['location'] = df['city'] + ', ' + df['region']
    locations = pd.read_csv('city_latlons.csv')
    df = df.merge(locations, how='left', on='location')
    df.drop('location', axis=1, inplace=True)
    ## Region Macro
    region_macro = pd.read_csv('region_macro.csv')
    df = df.merge(region_macro, how='left', on='region')
    for c in ['lat', 'lon', 'unemployment_rate', 'GDP_PC_PPP', 'HDI']:
        df[c] = (df[c] - np.mean(df[c]))/np.std(df[c])

with timer('bring in nima features'):
    train_img_nima = pd.read_csv(f'{feat_dir}/train_img_nima.csv')
    test_img_nima = pd.read_csv(f'{feat_dir}/test_img_nima.csv')
    df_img_nima = pd.concat([train_img_nima,test_img_nima],axis=0)
    train_img_nima_softmax = pd.read_csv(f'{feat_dir}/train_img_nima_softmax.csv')
    test_img_nima_softmax = pd.read_csv(f'{feat_dir}/test_img_nima_softmax.csv')
    df_img_nima_softmax = pd.concat([train_img_nima_softmax,test_img_nima_softmax],axis=0)
    df = df.merge(df_img_nima, on = 'image', how = 'left').merge(df_img_nima_softmax, on = 'image', how = 'left')
    df.drop(['image'], axis=1, inplace=True)
    nima_cols = list(np.setdiff1d(df_img_nima.columns, ['image'])) + list(np.setdiff1d(df_img_nima_softmax.columns, ['image']))
    df[nima_cols] = df[nima_cols].fillna(0)
    for c in nima_cols:
        df[c] = (df[c] - np.mean(df[c]))/np.std(df[c])
        
# pre-processing
with timer("preprocess dense features:"):
       
    # merge params
    param_cols = ['param_1', 'param_2', 'param_3']
    for c in param_cols:
        df[c] = df[c].astype(str)
        df[c] = df[c].fillna(value='missing')

    df['param123'] = (df['param_1']+'_'+df['param_2']+'_'+df['param_3']).astype(str)
    df.drop(['param_2','param_3'], axis=1, inplace=True)

    gc.collect()
    
with timer("bring in time features:"):
    train_time = pd.read_csv(f'{feat_dir}/train_time.csv')
    test_time = pd.read_csv(f'{feat_dir}/test_time.csv')
    df_time = pd.concat([train_time, test_time], axis=0).reset_index(drop = True)
    df = pd.concat([df, df_time], axis=1)
    
with timer('Processing Numerical Features'):
    # price - log transform & fill NA
    num_cols = ['price', 'item_seq_number','days_up_user_mean','times_up_user_mean','days_up_user_median',
                'times_up_user_median','days_up_user_min','times_up_user_min','days_up_user_max','times_up_user_max',
                'n_user_items']
    
    for c in num_cols:
        df[c+'_missing'] = 0
        df[c] = df[c].replace([-999], np.NaN) # remove previously imputed NaN (-999)
        df[c+'_missing'] = np.where(df[c].isnull(), 1, df[c+'_missing'])
        df[c] = df[c].replace([np.NaN], 0)
        df[c] = np.log1p(df[c])
        df[c] = (df[c] - np.mean(df[c]))/np.std(df[c])
    df.drop(['item_seq_number_missing','n_user_items_missing'], axis=1, inplace=True)
    
    ## add in previously processed features
    num_cols += nima_cols
    num_cols += ['lat', 'lon', 'unemployment_rate', 'GDP_PC_PPP', 'HDI']
    
    miss_cols = [col for col in df.columns if '_missing' in col]

with timer("preprocess text data:"):
    def clean_text(text):
        text = bytes(text, encoding="utf-8")
        text = text.lower()
        text = re.sub(b'(?<! )(?=[.,!?()])|(?<=[.,!?()])(?! )', b' ', text)
        text = re.sub(b'\s+(?=\d)|(?<=\d)\s+', b' ', text)
        text = text.replace(b"\b", b" ")
        text = text.replace(b"\r", b" ")
        text = regex.sub(b"\s+", b" ", text)
        text = str(text, 'utf-8')
        text = re.sub(r"\W+", " ", text.lower())
        return text
    
    text_input = df['title'].str.cat([
        df['description'], df['parent_category_name']], sep=' ', na_rep='').astype(str).fillna('missing')
    
    text_output = [clean_text(x) for x in text_input]
    
    df.drop(['description','title'], axis=1, inplace=True)
    gc.collect()
    
with timer('Processing Categorical Features'):
    cat_cols = ['image_top_1', 'city','region','parent_category_name','user_type','category_name','param123','param_1']
    for c in cat_cols:
        df[c] = df[c].astype(str)
        df[c].fillna(value='missing', inplace=True)
        df[c] = df[c].str.lower()
        df[c] = df[c].replace(to_replace=' +', value=' ', regex=True)

    lbl = preprocessing.LabelEncoder()
    for c in cat_cols:
        df[c] = lbl.fit_transform(df[c])
    
with timer("resize features:"):
    # reduce size
    for c in cat_cols:
        if df[c].max()<2**7:
            df[c] = df[c].astype('int8')
        elif df[c].max()<2**15:
            df[c] = df[c].astype('int16')
        elif df[c].max()<2**31:
            df[c] = df[c].astype('int32')
        else:
            continue

            cat_cols = cat_cols+miss_cols
            
with timer("split data back to train and test:"):
    df_train = df[:train_split]
    df_test = df[train_split:]
    
    text_train = text_output[:train_split]
    text_test = text_output[train_split:]

    del df

    # get max cat size
    emb_cat_max = {}
    for c in cat_cols:
        emb_cat_max[c] = max(df_train[c].max(), df_test[c].max())+1
    
    # get embedding cat size
    emb_cat_size = {}
    for c in cat_cols:
        emb_cat_size[c] = int(min(50, pd.concat([df_train[c], df_test[c]], axis=0).nunique() / 2))
    
    gc.collect()
    
with timer("Loading Image data:"):
    train_img, test_img = load_cache('img_data')
    cols = ['img_size_x', 'img_size_y', 'img_file_size', 'img_mean_color', 'img_dullness_light_percent', 'img_dullness_dark_percent', 'img_blur', 'img_blue_mean', 'img_green_mean', 'img_red_mean', 'img_blue_std', 'img_green_std', 'img_red_std', 'img_average_red', 'img_average_green', 'img_average_blue', 'img_sobel00', 'img_sobel10', 'img_sobel20', 'img_sobel01', 'img_sobel11', 'img_sobel21', 'img_kurtosis', 'img_skew', 'thing1', 'thing2']
    train_img[cols] = train_img[cols].fillna(0)
    test_img[cols] = test_img[cols].fillna(0)
    img = pd.concat([train_img[cols],test_img[cols]],axis=0)
    for c in cols:
        img[c] = (img[c] - np.mean(img[c]))/np.std(img[c])
    train_img = img[:train_split]
    test_img = img[train_split:]
    df_train = pd.concat([df_train, train_img], axis=1)
    df_test = pd.concat([df_test.reset_index(drop = True), test_img], axis=1)
    num_cols += cols

[load data:] done in 12 s
[Loading Lat-Lon & Region:] done in 2 s
[bring in nima features] done in 17 s
[preprocess dense features:] done in 2 s
[bring in time features:] done in 1 s
[Processing Numerical Features] done in 2 s
[preprocess text data:] done in 107 s
[Processing Categorical Features] done in 36 s
[resize features:] done in 0 s
[split data back to train and test:] done in 0 s
Test shape: (508438, 302)
Train shape: (1503424, 303)
[2018-06-24 15:10:08.958821] Skipped... Loaded cache/train_img_data.csv and cache/test_img_data.csv from cache!
[Loading Image data:] done in 61 s


In [5]:
# prepare embeddings
embed_size = 300
maxlen = 250

with timer("tokenize text data:"):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_output)
    word_index = tokenizer.word_index
    
with timer("prepare embeddings:"):
    # embed_size = 300
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')

    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(embed_file, encoding="utf8"))

    all_embs = np.stack(embeddings_index.values())
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    hit = 0
    total = 0
    nb_words = len(word_index) + 1
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: 
            hit += 1
            embedding_matrix[i] = embedding_vector
        total += 1
    print("Unique Words: found {} out of total {} words at a rate of {:.2f}%".format(hit, total, hit * 100.0 / total))
    del embeddings_index, embedding_vector
    gc.collect()
    
with timer("prepare text input:"):
    text_train_token = tokenizer.texts_to_sequences(text_train)
    text_train = pad_sequences(text_train_token, maxlen=maxlen)
    
    text_test_token = tokenizer.texts_to_sequences(text_test)
    text_test = pad_sequences(text_test_token, maxlen=maxlen)
    del text_output, text_train_token, text_test_token
    gc.collect()

[tokenize text data:] done in 66 s
Unique Words: found 793000 out of total 793308 words at a rate of 99.96%
[prepare embeddings:] done in 194 s
[prepare text input:] done in 62 s


In [6]:
from keras.callbacks import EarlyStopping, LearningRateScheduler, TensorBoard, ModelCheckpoint
def get_keras_data(df, text):
    X = {}
    for c in df.columns:
        X[c] = df[c].values
    X['text'] = text
    return X

def runCNN(train_X, train_y, val_X, val_y, test_X):
    model = CNN()
    
    ### Callbacks
    def schedule(ind):
        return(1.5e-3/(1.5**(ind)))

    lr = LearningRateScheduler(schedule)
    
    model_checkpoint = ModelCheckpoint(filepath = 'tmp_bst_mdl.hdf5', monitor='val_mean_squared_error', verbose=0, save_best_only = True, mode='min')

    tb = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)
    model.fit(train_X, train_y, 
              validation_data=(val_X, val_y),
              batch_size=256, epochs=4, verbose=2,
             callbacks = [lr, model_checkpoint])
    print_step("Loading best model")
    model.load_weights(filepath = 'tmp_bst_mdl.hdf5')
    print_step('Predict Val 1/2')
    pred_val_y = model.predict(val_X)
    print_step('Predict Test 2/2')
    pred_test_y = model.predict(test_X)
    return pred_val_y, pred_test_y

In [10]:
def CNN(): 
        
    K.clear_session()

    cats = [Input(shape=[1], name=name) for name in cat_cols]
    nums = [Input(shape=[1], name=name) for name in num_cols]

    emb_fn = lambda name: Embedding(emb_cat_max[name], emb_cat_size[name])
    embs = []
    for name, cat in zip(cat_cols, cats):
        embs.append(emb_fn(name)(cat))

    texts = Input(shape=(maxlen, ), name='text')

    text_emb = Embedding(nb_words, 
                         embed_size,
                         trainable=False,
                         weights=[embedding_matrix],
                         name='text_emb')(texts)
    outs = []

    filter_sizes = [1,2,3,4]
    convs = []
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=64, kernel_size=filter_size, padding='same', activation='relu')(text_emb)
        l_pool = KMaxPooling(k=20, axis=1)(l_conv)
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)
    l_flat = Flatten()(l_merge)
    conv_in = Dense(64, activation='relu')(l_flat)

    outs += [conv_in]

    # bring in embeddings        
    all_in = [Flatten()(emb) for emb in embs] + nums
    x_in = concatenate(all_in)

#     for idx, (drop_p, num_dense) in enumerate(zip([0.2, 0.2], [64, 32])):
    for num_dense in [192, 96]:
        x_in = Dense(num_dense, activation='relu')(x_in)
#         x_in = Dropout(drop_p)(x_in)

    deep = x_in

    outs += [deep]

    output = concatenate(outs) if len(outs)>1 else outs[0]
    

#    output = Dropout(0.2)(output)

    output = Dense(64, activation='relu')(output)
#     l_dense = Dropout(0.2)(l_dense)
#     output = Dense(32, activation='relu')(output)

    output = Dense(1, activation='sigmoid')(output)

    model = Model(inputs=cats+nums+[texts], outputs=output)
    model.compile(loss = 'mse',
                  metrics=['mse'],
                  optimizer='nadam')
    return model

In [11]:
def run_cv_model(train, text_train, test, text_test, target, model_fn, eval_fn):
    kf = KFold(n_splits=5, shuffle=True, random_state=2017)
    fold_splits = kf.split(train)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros(train.shape[0])
    i = 1
    test_X = get_keras_data(test, text_test)
    for dev_index, val_index in fold_splits:
        print_step('Started ' + ' fold ' + str(i) + '/5')
        dev_X = get_keras_data(train.loc[dev_index], text_train[dev_index])
        val_X = get_keras_data(train.loc[val_index], text_train[val_index])
        dev_y, val_y = target[dev_index], target[val_index]
        pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y, test_X)
        pred_val_y = pred_val_y.flatten()
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        cv_score = eval_fn(val_y, pred_val_y)
        cv_scores.append(eval_fn(val_y, pred_val_y))
        print_step(' cv score ' + str(i) + ' : ' + str(cv_score))
        i += 1
    print_step(' cv scores : ' + str(cv_scores))
    print_step(' mean cv score : ' + str(np.mean(cv_scores)))
    print_step(' std cv score : ' + str(np.std(cv_scores)))
    pred_full_test = pred_full_test / 5.0
    results = {'train': pred_train, 'test': pred_full_test,
                'cv': cv_scores}
    return results

In [12]:
results = run_cv_model(df_train, text_train, df_test, text_test, y, runCNN, rmse)

[2018-06-24 18:20:25.963430] Started  fold 1/5
Train on 1202739 samples, validate on 300685 samples
Epoch 1/4
 - 304s - loss: 0.0509 - mean_squared_error: 0.0509 - val_loss: 0.0502 - val_mean_squared_error: 0.0502
Epoch 2/4
 - 306s - loss: 0.0478 - mean_squared_error: 0.0478 - val_loss: 0.0486 - val_mean_squared_error: 0.0486
Epoch 3/4
 - 305s - loss: 0.0460 - mean_squared_error: 0.0460 - val_loss: 0.0479 - val_mean_squared_error: 0.0479
Epoch 4/4
 - 299s - loss: 0.0442 - mean_squared_error: 0.0442 - val_loss: 0.0483 - val_mean_squared_error: 0.0483
[2018-06-24 18:40:43.303013] Loading best model
[2018-06-24 18:40:43.835861] Predict Val 1/2
[2018-06-24 18:41:39.369875] Predict Test 2/2
[2018-06-24 18:43:12.453167]  cv score 1 : 0.21889769191455064
[2018-06-24 18:43:12.462730] Started  fold 2/5
Train on 1202739 samples, validate on 300685 samples
Epoch 1/4
 - 303s - loss: 0.0510 - mean_squared_error: 0.0510 - val_loss: 0.0492 - val_mean_squared_error: 0.0492
Epoch 2/4
 - 301s - loss: 0.

In [23]:
# base
# 0.2187

In [17]:
train_cnn_1, test_cnn_1 = load_cache('CNN_FastText')
train_cnn_4, test_cnn_4 = load_cache('CNN_FastText_4')
train_cnn_5, test_cnn_5 = load_cache('CNN_FastText_5')
train_cnn_6, test_cnn_6 = load_cache('CNN_FastText_6')

Test shape: (508438, 1)
Train shape: (1503424, 1)
[2018-06-24 21:14:01.428988] Skipped... Loaded cache/train_CNN_FastText.csv and cache/test_CNN_FastText.csv from cache!
Test shape: (508438, 1)
Train shape: (1503424, 1)
[2018-06-24 21:14:01.686429] Skipped... Loaded cache/train_CNN_FastText_4.csv and cache/test_CNN_FastText_4.csv from cache!
Test shape: (508438, 1)
Train shape: (1503424, 1)
[2018-06-24 21:14:01.943099] Skipped... Loaded cache/train_CNN_FastText_5.csv and cache/test_CNN_FastText_5.csv from cache!
Test shape: (508438, 1)
Train shape: (1503424, 1)
[2018-06-24 21:14:02.199830] Skipped... Loaded cache/train_CNN_FastText_6.csv and cache/test_CNN_FastText_6.csv from cache!


In [20]:
train_cnn_1['cnn_4'] =train_cnn_4['CNN_FastText_4']
train_cnn_1['cnn_5'] =train_cnn_5['CNN_FastText_5']
train_cnn_1['cnn_6'] =train_cnn_6['CNN_FastText_6']

In [21]:
train_cnn_1.corr()

Unnamed: 0,CNN_FastText,cnn_4,cnn_5,cnn_6
CNN_FastText,1.0,0.929525,0.931626,0.928311
cnn_4,0.929525,1.0,0.951353,0.946422
cnn_5,0.931626,0.951353,1.0,0.950345
cnn_6,0.928311,0.946422,0.950345,1.0


In [13]:
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print_step('Prepping submission file')
submission = pd.read_csv(f'{data_dir}/test.csv', usecols=['item_id'])
submission['deal_probability'] = results['test'].clip(0.0, 1.0)
submission.to_csv('submit/submit_CNN_FastText_6.csv', index=False)
print_step('Done!')

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[2018-06-24 21:05:01.795001] Prepping submission file
[2018-06-24 21:05:03.739539] Done!


In [16]:
print('~~~~~~~~~~')
print_step('Cache')
save_in_cache('CNN_FastText_6', pd.DataFrame({'CNN_FastText_6': results['train']}),
                           pd.DataFrame({'CNN_FastText_6': results['test'].flatten()}))

~~~~~~~~~~
[2018-06-24 21:13:39.073772] Cache
[2018-06-24 21:13:41.950718] Saved cache/train_CNN_FastText_6.csv and cache/test_CNN_FastText_6.csv to cache!
