In [49]:
import pandas as pd
import numpy as np
from keras.models import Model
from keras.layers import Dense, Input, Embedding, Dropout, Flatten
from keras.layers.merge import concatenate, dot, multiply, add
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Nadam, RMSprop, adam
from keras.layers.noise import AlphaDropout, GaussianNoise
from keras import backend as K
import keras


In [51]:
#Import Data
train_data = pd.read_csv("finaltrain.csv",encoding='utf-8') 
test_data  = pd.read_csv("finaltest.csv",encoding='utf-8')

In [53]:
#split into x_train/x_val
val_split = 0.3
train_data = train_data.sample(frac=1).reset_index(drop=True)
val_ix = int(np.rint(len(train_data)*(1.-val_split)))

In [54]:
#data frame formats with y-values packed in
train_df = train_data[:val_ix]
val_df = train_data[val_ix:]
test_df = test_data

In [55]:
#Create the Tokenizers
region_tk = {x:i+1 for i, x in enumerate(train_df.region.unique())}#+1 because we want to reserve 0 for new but not missing values
city_tk =  {x:i+1 for i, x in enumerate(train_df.city.unique())}
cat1_tk =  {x:i+1 for i, x in enumerate(train_df.parent_category_name.unique())}
cat2_tk =  {x:i+1 for i, x in enumerate(train_df.category_name.unique())}
seqnum_tk =  {x:i+1 for i, x in enumerate(train_df.item_seq_number.unique())}
usertype_tk = {x:i+1 for i, x in enumerate(train_df.user_type.unique())}
imgtype_tk = {x:i+1 for i, x in enumerate(train_df.image_top_1.unique())}
price_tk = {x:i+1 for i, x in enumerate(train_df.price.unique())}
a_tk = {x:i+1 for i, x in enumerate(train_df.description_num_chars.unique())}
b_tk = {x:i+1 for i, x in enumerate(train_df.description_num_words.unique())}
c_tk = {x:i+1 for i, x in enumerate(train_df.description_num_unique_words.unique())}
d_tk = {x:i+1 for i, x in enumerate(train_df.description_words_vs_unique.unique())}
e_tk = {x:i+1 for i, x in enumerate(train_df.text_feat_num_chars.unique())}
f_tk = {x:i+1 for i, x in enumerate(train_df.text_feat_num_words.unique())}
g_tk = {x:i+1 for i, x in enumerate(train_df.text_feat_num_unique_words.unique())}
h_tk = {x:i+1 for i, x in enumerate(train_df.text_feat_words_vs_unique.unique())}
i_tk = {x:i+1 for i, x in enumerate(train_df.title_num_chars.unique())}
j_tk = {x:i+1 for i, x in enumerate(train_df.title_num_words.unique())}
k_tk = {x:i+1 for i, x in enumerate(train_df.title_num_unique_words.unique())}
l_tk = {x:i+1 for i, x in enumerate(train_df.text_feat_words_vs_unique.unique())}
m_tk = {x:i+1 for i, x in enumerate(train_df.hi_dp_wordcount.unique())}
n_tk = {x:i+1 for i, x in enumerate(train_df.lo_dp_wordcount.unique())}
o_tk = {x:i+1 for i, x in enumerate(train_df.perform_white_analysis.unique())}
p_tk = {x:i+1 for i, x in enumerate(train_df.perform_black_analysis.unique())}
q_tk = {x:i+1 for i, x in enumerate(train_df.image_size.unique())}
r_tk = {x:i+1 for i, x in enumerate(train_df.average_pixel_width.unique())}
s_tk = {x:i+1 for i, x in enumerate(train_df.get_blurrness_score.unique())}
t_tk = {x:i+1 for i, x in enumerate(train_df.average_red.unique())}
u_tk = {x:i+1 for i, x in enumerate(train_df.average_green.unique())}
v_tk = {x:i+1 for i, x in enumerate(train_df.average_blue.unique())}
w_tk = {x:i+1 for i, x in enumerate(train_df.descsentiment.unique())}
x_tk = {x:i+1 for i, x in enumerate(train_df.titlesentiment.unique())}
y_tk = {x:i+1 for i, x in enumerate(train_df.reg_dense.unique())}
z_tk = {x:i+1 for i, x in enumerate(train_df.rural.unique())}
rp_tk = {x:i+1 for i, x in enumerate(train_df.reg_Population.unique())}
ru_tk = {x:i+1 for i, x in enumerate(train_df.reg_Urban.unique())}
cp_tk = {x:i+1 for i, x in enumerate(train_df.city_population.unique())}


tokenizers = [region_tk, city_tk, cat1_tk, cat2_tk, seqnum_tk, usertype_tk, imgtype_tk,price_tk,a_tk,b_tk,c_tk,d_tk,e_tk,f_tk,g_tk,h_tk,i_tk,j_tk,k_tk,l_tk,m_tk,n_tk,o_tk,p_tk,q_tk,r_tk,s_tk,t_tk,u_tk,v_tk,w_tk,x_tk,y_tk,z_tk,rp_tk,ru_tk,cp_tk]

In [56]:
#These functions are going to get repeated on train, val, and test data
def tokenize_data(data, tokenizers):
    region_tk, city_tk, cat1_tk, cat2_tk, seqnum_tk, usertype_tk, imgtype_tk,price_tk,a_tk,b_tk,c_tk,d_tk,e_tk,f_tk,g_tk,h_tk,i_tk,j_tk,k_tk,l_tk,m_tk,n_tk,o_tk,p_tk,q_tk,r_tk,s_tk,t_tk,u_tk,v_tk,w_tk,x_tk,y_tk,z_tk,rp_tk,ru_tk,cp_tk = tokenizers
    x_reg = np.asarray([region_tk.get(key, 0) for key in data.region], dtype=int)
    x_city   = np.asarray([city_tk.get(key, 0) for key in data.city], dtype=int)
    x_cat1   = np.asarray([cat1_tk.get(key, 0) for key in data.parent_category_name], dtype=int)
    x_cat2   = np.asarray([cat2_tk.get(key, 0) for key in data.category_name], dtype=int)
    x_sqnm = np.asarray([seqnum_tk.get(key, 0) for key in data.item_seq_number], dtype=int)
    x_usr = np.asarray([usertype_tk.get(key, 0) for key in data.user_type], dtype=int)
    x_itype = np.asarray([imgtype_tk.get(key, 0) for key in data.image_top_1], dtype=int)
    x_price = np.asarray([price_tk.get(key, 0) for key in data.price], dtype=float)
    x_a = np.asarray([a_tk.get(key, 0) for key in data.description_num_chars], dtype=float)
    x_b = np.asarray([b_tk.get(key, 0) for key in data.description_num_words], dtype=float)
    x_c = np.asarray([c_tk.get(key, 0) for key in data.description_num_unique_words], dtype=float)
    x_d = np.asarray([d_tk.get(key, 0) for key in data.description_words_vs_unique], dtype=float)
    x_e = np.asarray([e_tk.get(key, 0) for key in data.text_feat_num_chars], dtype=float)
    x_f = np.asarray([f_tk.get(key, 0) for key in data.text_feat_num_words], dtype=float)
    x_g = np.asarray([g_tk.get(key, 0) for key in data.text_feat_num_unique_words], dtype=float)
    x_h = np.asarray([h_tk.get(key, 0) for key in data.text_feat_words_vs_unique], dtype=float)
    x_i = np.asarray([i_tk.get(key, 0) for key in data.title_num_chars], dtype=float)
    x_j = np.asarray([j_tk.get(key, 0) for key in data.title_num_words], dtype=float)
    x_k = np.asarray([k_tk.get(key, 0) for key in data.title_num_unique_words], dtype=float)
    x_l = np.asarray([l_tk.get(key, 0) for key in data.text_feat_words_vs_unique], dtype=float)
    x_m = np.asarray([m_tk.get(key, 0) for key in data.hi_dp_wordcount], dtype=float)
    x_n = np.asarray([n_tk.get(key, 0) for key in data.lo_dp_wordcount], dtype=float)
    x_o = np.asarray([o_tk.get(key, 0) for key in data.perform_white_analysis], dtype=float)
    x_p = np.asarray([p_tk.get(key, 0) for key in data.perform_black_analysis], dtype=float)
    x_q = np.asarray([q_tk.get(key, 0) for key in data.image_size], dtype=float)
    x_r = np.asarray([r_tk.get(key, 0) for key in data.average_pixel_width], dtype=float)
    x_s = np.asarray([s_tk.get(key, 0) for key in data.get_blurrness_score], dtype=float)
    x_t = np.asarray([t_tk.get(key, 0) for key in data.average_red], dtype=float)
    x_u = np.asarray([u_tk.get(key, 0) for key in data.average_green], dtype=float)
    x_v = np.asarray([v_tk.get(key, 0) for key in data.average_blue], dtype=float)
    x_w = np.asarray([w_tk.get(key, 0) for key in data.descsentiment], dtype=float)
    x_x = np.asarray([x_tk.get(key, 0) for key in data.titlesentiment], dtype=float)
    x_y = np.asarray([y_tk.get(key, 0) for key in data.reg_dense], dtype=float)
    x_z = np.asarray([z_tk.get(key, 0) for key in data.rural], dtype=float)
    x_rp = np.asarray([rp_tk.get(key, 0) for key in data.reg_Population], dtype=float)
    x_ru = np.asarray([ru_tk.get(key, 0) for key in data.reg_Urban], dtype=float)
    x_cp = np.asarray([cp_tk.get(key, 0) for key in data.city_population], dtype=float)

    
    return [x_reg, x_city, x_cat1, x_cat2, x_sqnm, x_usr, x_itype,x_price,x_a,x_b,x_c,x_d,x_e,x_f,x_g,x_h,x_i,x_j,x_k,x_l,x_m,x_n,x_o,x_p,x_q,x_r,x_s,x_t,x_u,x_v,x_w,x_x,x_y,x_z,x_rp,x_ru,x_cp]


In [57]:
#Final Processing on x, y train, val, test data
x_train = tokenize_data(train_df, tokenizers)
y_train = train_df.deal_probability.as_matrix()

x_val = tokenize_data(val_df, tokenizers)
y_val = val_df.deal_probability.as_matrix()

x_test = tokenize_data(test_df, tokenizers)


In [58]:
#define the NN Model Outline.
def build_model():
    inp_reg = Input(shape=(1,))
    inp_city = Input(shape=(1,))
    inp_cat1 = Input(shape=(1,))
    inp_cat2 = Input(shape=(1,))
    inp_sqnm = Input(shape=(1,))
    inp_usr = Input(shape=(1,))
    inp_itype = Input(shape=(1,))
    inp_price = Input(shape=(1,))
    inp_a = Input(shape=(1,))
    inp_b = Input(shape=(1,))
    inp_c = Input(shape=(1,))
    inp_d = Input(shape=(1,))
    inp_e = Input(shape=(1,))
    inp_f = Input(shape=(1,))
    inp_g = Input(shape=(1,))
    inp_h = Input(shape=(1,))
    inp_i = Input(shape=(1,))
    inp_j = Input(shape=(1,))
    inp_k = Input(shape=(1,))
    inp_l = Input(shape=(1,))
    inp_m = Input(shape=(1,))
    inp_n = Input(shape=(1,))
    inp_o = Input(shape=(1,))
    inp_p = Input(shape=(1,))
    inp_q = Input(shape=(1,))
    inp_r = Input(shape=(1,))
    inp_s = Input(shape=(1,))
    inp_t = Input(shape=(1,))
    inp_u = Input(shape=(1,))
    inp_v = Input(shape=(1,))
    inp_w = Input(shape=(1,))
    inp_x = Input(shape=(1,))
    inp_y = Input(shape=(1,))
    inp_z = Input(shape=(1,))
    inp_rp = Input(shape=(1,))
    inp_ru = Input(shape=(1,))
    inp_cp = Input(shape=(1,))
  

    emb_size = 8
    emb_reg  = Embedding(len(region_tk)+1, emb_size)(inp_reg)
    emb_city = Embedding(len(city_tk)+1, emb_size)(inp_city)
    emb_cat1 = Embedding(len(cat1_tk)+1, emb_size)(inp_cat1)
    emb_cat2 = Embedding(len(cat2_tk)+1, emb_size)(inp_cat2)
    emb_sqnm = Embedding(len(seqnum_tk)+1, emb_size)(inp_sqnm)
    emb_usr  = Embedding(len(usertype_tk)+1, emb_size)(inp_usr)
    emb_itype= Embedding(len(imgtype_tk)+1, emb_size)(inp_itype)
    emb_price= Embedding(len(price_tk)+1, emb_size)(inp_price)
    emb_a= Embedding(len(a_tk)+1, emb_size)(inp_a)
    emb_b= Embedding(len(b_tk)+1, emb_size)(inp_b)
    emb_c= Embedding(len(c_tk)+1, emb_size)(inp_c)
    emb_d= Embedding(len(d_tk)+1, emb_size)(inp_d)
    emb_e= Embedding(len(e_tk)+1, emb_size)(inp_e)
    emb_f= Embedding(len(f_tk)+1, emb_size)(inp_f)
    emb_g= Embedding(len(g_tk)+1, emb_size)(inp_g)
    emb_h= Embedding(len(h_tk)+1, emb_size)(inp_h)
    emb_i= Embedding(len(i_tk)+1, emb_size)(inp_i)
    emb_j= Embedding(len(j_tk)+1, emb_size)(inp_j)
    emb_k= Embedding(len(k_tk)+1, emb_size)(inp_k)
    emb_l= Embedding(len(l_tk)+1, emb_size)(inp_l)
    emb_m= Embedding(len(m_tk)+1, emb_size)(inp_m)
    emb_n= Embedding(len(n_tk)+1, emb_size)(inp_n)
    emb_o= Embedding(len(o_tk)+1, emb_size)(inp_o)
    emb_p= Embedding(len(p_tk)+1, emb_size)(inp_p)
    emb_q= Embedding(len(q_tk)+1, emb_size)(inp_q)
    emb_r= Embedding(len(r_tk)+1, emb_size)(inp_r)
    emb_s= Embedding(len(s_tk)+1, emb_size)(inp_s)
    emb_t= Embedding(len(t_tk)+1, emb_size)(inp_t)
    emb_u= Embedding(len(u_tk)+1, emb_size)(inp_u)
    emb_v= Embedding(len(v_tk)+1, emb_size)(inp_v)
    emb_w= Embedding(len(w_tk)+1, emb_size)(inp_w)
    emb_x= Embedding(len(x_tk)+1, emb_size)(inp_x)
    emb_y= Embedding(len(y_tk)+1, emb_size)(inp_y)
    emb_z= Embedding(len(z_tk)+1, emb_size)(inp_z)
    emb_rp= Embedding(len(rp_tk)+1, emb_size)(inp_rp)
    emb_ru= Embedding(len(ru_tk)+1, emb_size)(inp_ru)
    emb_cp= Embedding(len(cp_tk)+1, emb_size)(inp_cp)

    x = concatenate([emb_reg,emb_city,emb_cat1,emb_cat2,emb_sqnm,emb_usr,emb_itype,emb_price,emb_a,emb_b,emb_c,emb_d,emb_e,emb_f,emb_g,emb_h,emb_i,emb_j,emb_k,emb_l,emb_m,emb_n,emb_o,emb_p,emb_q,emb_r,emb_s,emb_t,emb_u,emb_v,emb_w,emb_x,emb_y,emb_z,emb_rp,emb_ru,emb_cp])
    x = Flatten()(x)
    x = Dropout(0.5)(x)
    
    x = Dense(128, activation="relu", kernel_initializer="lecun_normal")(x)
    x = AlphaDropout(0.05)(x)
    x = Dense(64, activation="relu", kernel_initializer="lecun_normal")(x)
    x = AlphaDropout(0.05)(x)
    x = Dense(32, activation="relu", kernel_initializer="lecun_normal")(x)
    y = Dense(1, activation="sigmoid")(x)
    
    model = Model(inputs=[inp_reg, inp_city, inp_cat1, inp_cat2,inp_sqnm, inp_usr, inp_itype,inp_price,inp_a,inp_b,inp_c,inp_d,inp_e,inp_f,inp_g,inp_h,inp_i,inp_j,inp_k,inp_l,inp_m,inp_n,inp_o,inp_p,inp_q,inp_r,inp_s,inp_t,inp_u,inp_v,inp_w,inp_x,inp_y,inp_z,inp_rp,inp_ru,inp_cp],outputs=y)
    model.compile(optimizer="Nadam", loss=["MSE"], metrics=[root_mean_squared_error])
    model.summary()
    
    return model

In [59]:
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)) 


In [60]:
model = build_model()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_112 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
input_113 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
input_114 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
input_115 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
input_116 

In [61]:
#fit model
earlystop = EarlyStopping(monitor="val_loss",mode="auto",patience=5,verbose=0)
checkpt = ModelCheckpoint(monitor="val_loss",mode="auto",filepath='model_baseline_weights.hdf5',verbose=0,save_best_only=True)
rlrop = ReduceLROnPlateau(monitor='val_loss',mode='auto',patience=2,verbose=1,factor=0.1,cooldown=0,min_lr=1e-6)
batch_size = 2048
model.fit(x_train, y_train,batch_size=batch_size,validation_data=(x_val, y_val),
          epochs=100,verbose=2,callbacks =[checkpt, earlystop, rlrop])

Train on 378367 samples, validate on 162157 samples
Epoch 1/100
 - 16s - loss: 0.0642 - root_mean_squared_error: 0.1753 - val_loss: nan - val_root_mean_squared_error: nan
Epoch 2/100
 - 16s - loss: 0.0547 - root_mean_squared_error: 0.1562 - val_loss: nan - val_root_mean_squared_error: nan

Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 3/100
 - 15s - loss: 0.0457 - root_mean_squared_error: 0.1379 - val_loss: nan - val_root_mean_squared_error: nan
Epoch 4/100
 - 16s - loss: 0.0413 - root_mean_squared_error: 0.1279 - val_loss: nan - val_root_mean_squared_error: nan

Epoch 00004: ReduceLROnPlateau reducing learning rate to 2.0000000949949027e-05.
Epoch 5/100
 - 15s - loss: 0.0382 - root_mean_squared_error: 0.1218 - val_loss: nan - val_root_mean_squared_error: nan


<keras.callbacks.History at 0x187edec908>

In [62]:
#make prediction
preds = model.predict(x_test, batch_size=batch_size)

In [63]:
submission = pd.read_csv("sample_submission.csv")
submission['deal_probability'] = preds
submission.to_csv("NN_test.csv", index=False)

In [64]:
submission

Unnamed: 0,item_id,deal_probability
0,6544e41a8817,0.031554
1,65b9484d670f,0.173181
2,8bab230b2ecd,0.075820
3,8e348601fefc,0.212699
4,8bd2fe400b89,0.134131
5,c63dbd6c657f,0.093086
6,6d1a410df86e,0.013475
7,e8d3e7922b80,0.006246
8,2bc1ab208462,0.111949
9,7e05d77a9181,0.031037
