# Coding Log
---
## Start with Linear Regression 
    - variable - region, parent_category_name, category_name, user_type >> 0.2472 

## Try LightGBM
    - variable - region, parent_category_name, category_name, user_type >> 0.2412 

## Change Model to NN (feat. Keras)
|  Model        | Variables                            | Val_loss  | LB |
| ------------- |:-------------------------------- | -----:| -----:|
|||||
| NN      | region | 0.3032 | 0.2588 |
| NN      | region, pcn      |   0.2537 | 0.2475 |
| NN      | region, pcn, cn    |   0.2488 | 0.2430 |
| NN      | region, pcn, cn, ut    |   0.2459 | 0.2407 |
| NN      | region, pcn, cn, ut, city   |   Unknown | 0.2402 |
| NN      | region, pcn, cn, ut, city, price  |   Unknown | 0.2375 |

***
***

# Import basics

In [None]:
import numpy as np
import pandas as pd
import gc
import os
# List the files available in the input directory.
print(os.listdir("../input"))

# Data Load

In [None]:
# Read the training table and separate the regression target from the features.
df_train = pd.read_csv('../input/train.csv')
y_trn = df_train['deal_probability']
x_trn = df_train.drop(columns=['deal_probability'])

# Read the table that predictions will be produced for.
x_test = pd.read_csv('../input/test.csv')

In [None]:
# Print a per-column summary of the training features.
x_trn.info()

In [None]:
# Print a per-column summary of the test features.
x_test.info()

In [None]:
# Print the number of distinct values in each training column
# (a missing value counts as one distinct value).
for column_name in x_trn.columns:
    print(column_name, x_trn[column_name].nunique(dropna=False))

## Variable Plan

    item_id 1503424            => drop
    user_id 771769             => drop
    region 28                    => categorical embedding
    city 1733                    => categorical embedding
    parent_category_name 9       => categorical embedding
    category_name 47             => categorical embedding
    param_1 372          
    param_2 272
    param_3 1220
    title 788377
    description 1317103
    price 17007                 => continuous, log
    item_seq_number 28232
    activation_date 21          => week variable??
    user_type 3                 => categorical embedding
    image 1390837
    image_top_1 3063

In [None]:
# x_trn.fillna(value='UNK', inplace=True)

# Preprocess

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
import argparse

# Create an empty namespace that later cells fill with model settings
# (vocabulary sizes, embedding widths, batch size).
config = argparse.Namespace()

In [None]:
def tknzr_fit(col, trn, test):
    """Fit a Keras Tokenizer on one training column and encode both splits.

    The tokenizer is fitted on ``trn[col]`` only; ``test[col]`` is encoded
    with that same vocabulary, and the tokenizer is configured with the
    out-of-vocabulary token ``'oov'``.

    Args:
        col: Name of the column to encode.
        trn: Training table; ``trn[col]`` is used to fit and is encoded.
        test: Test table; ``test[col]`` is encoded only.

    Returns:
        A tuple ``(train_sequences, test_sequences, tokenizer)`` where the
        first two items are numpy arrays built from the token-id sequences
        and the last is the fitted ``Tokenizer``.
    """
    # No character filtering and no lower-casing, so values are used verbatim.
    # NOTE(review): the split character is presumably one that never occurs in
    # the data, so each cell value stays a single token — confirm.
    tknzr = Tokenizer(filters='', lower=False, split='뷁', oov_token='oov')
    tknzr.fit_on_texts(trn[col])

    train_sequences = np.array(tknzr.texts_to_sequences(trn[col]))
    test_sequences = np.array(tknzr.texts_to_sequences(test[col]))
    return train_sequences, test_sequences, tknzr

In [None]:
# Encode each categorical column as token ids: one (train, test, tokenizer)
# triple per column.
reg_tr, reg_te, reg_tknzr = tknzr_fit(col='region', trn=x_trn, test=x_test)
pcn_tr, pcn_te, pcn_tknzr = tknzr_fit(col='parent_category_name', trn=x_trn, test=x_test)
cn_tr, cn_te, cn_tknzr = tknzr_fit(col='category_name', trn=x_trn, test=x_test)
ut_tr, ut_te, ut_tknzr = tknzr_fit(col='user_type', trn=x_trn, test=x_test)
city_tr, city_te, city_tknzr = tknzr_fit(col='city', trn=x_trn, test=x_test)

In [None]:
# Log-transform the price; eps is added before the log so a zero price does
# not produce -inf. Values that are NaN after the log are encoded as -1.
eps = 1e-10
price_tr = np.log(x_trn['price'] + eps).fillna(-1.)
price_te = np.log(x_test['price'] + eps).fillna(-1.)

In [None]:
# Convert to plain numpy arrays and append a trailing axis of length 1.
price_tr = np.asarray(price_tr)[..., np.newaxis]
price_te = np.asarray(price_te)[..., np.newaxis]

In [None]:
# Display the shape of the training price array as a sanity check.
price_tr.shape

In [None]:
# Vocabulary sizes, used as the Embedding input dimension for each feature.
# Keras Tokenizer ids start at 1 (0 is never assigned to a word), so the
# largest id equals len(word_index). An Embedding needs input_dim to be the
# largest id + 1, hence the "+ 1"; without it the highest id falls outside
# the embedding table.
config.len_reg = len(reg_tknzr.word_index) + 1
config.len_pcn = len(pcn_tknzr.word_index) + 1
config.len_cn = len(cn_tknzr.word_index) + 1
config.len_ut = len(ut_tknzr.word_index) + 1
config.len_city = len(city_tknzr.word_index) + 1
# Price is a single continuous value, not a vocabulary.
config.len_price = 1

In [None]:
# Embedding width (output dimension) configured for each feature.
vars(config).update(
    emb_reg=20,
    emb_pcn=20,
    emb_cn=20,
    emb_ut=20,
    emb_city=32,
    emb_price=20,
)

# Train_Validation Split (random sample 80:20)

In [None]:
# Draw a reproducible random 20% of the rows for validation; every remaining
# row label goes to the training split.
valid_idx = y_trn.sample(frac=0.2, random_state=1991).index
train_idx = y_trn.index[~y_trn.index.isin(valid_idx)]

In [None]:
# Stack the per-feature arrays along a new leading axis. The feature order
# here (region, parent category, category, user type, city, price) is the
# same as the input order of the model returned by get_model().
# NOTE(review): this assumes every feature array has the same shape; mixing
# the integer token ids with the float price presumably upcasts everything
# to float — confirm.
X = np.array([reg_tr, pcn_tr, cn_tr, ut_tr, city_tr,price_tr])
X_test = np.array([reg_te, pcn_te, cn_te, ut_te, city_te, price_te])
Y = y_trn

In [None]:
# Turn the stacked arrays back into one array per model input, selecting the
# training / validation rows for each feature.
X_train = [feature[train_idx] for feature in X]
X_valid = [feature[valid_idx] for feature in X]
X_test = list(X_test)

# Targets for the same row labels.
Y_train = Y.loc[train_idx]
Y_valid = Y.loc[valid_idx]

# Keras Model & RMSE Loss

In [None]:
from keras.layers import Input, Embedding, Dense
from keras.layers import GlobalMaxPool1D, GlobalMaxPool2D
from keras.layers import concatenate
from keras.models import Model

from keras import backend as K

### rmse loss for keras
def root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared difference between the tensors.

    Built from Keras backend ops so it can be passed to ``model.compile`` as
    a loss or a metric.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted values.

    Returns:
        A tensor holding ``sqrt(mean((y_pred - y_true) ** 2))``.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [None]:
# Mini-batch size, stored on the shared config namespace.
config.batch_size = 1024

In [None]:
def get_model():
    """Build the (uncompiled) embedding network for deal-probability regression.

    The Keras session is cleared first. The model takes six inputs, in this
    order: region, parent category, category, user type, city (each a single
    token id fed to its own Embedding layer) and price (a single value used
    as-is). The embeddings are concatenated on the last axis, reduced with
    GlobalMaxPool1D, joined with the price, passed through a 32-unit ReLU
    layer and finished with a single sigmoid unit.

    Vocabulary sizes and embedding widths are read from the module-level
    ``config`` namespace (``len_*`` and ``emb_*`` attributes).

    Returns:
        A ``keras.models.Model`` with six inputs and one output.
    """
    K.clear_session()

    # (Embedding input_dim, Embedding output_dim) per categorical feature,
    # in model-input order.
    categorical_specs = [
        (config.len_reg, config.emb_reg),
        (config.len_pcn, config.emb_pcn),
        (config.len_cn, config.emb_cn),
        (config.len_ut, config.emb_ut),
        (config.len_city, config.emb_city),
    ]

    categorical_inputs = []
    embedded = []
    for vocab_size, emb_width in categorical_specs:
        token_input = Input(shape=(1, ))
        categorical_inputs.append(token_input)
        embedded.append(Embedding(vocab_size, emb_width)(token_input))

    # Price goes into the dense part directly, without an embedding.
    inp_price = Input(shape=(1, ))

    merged = concatenate(embedded, axis=-1)
    # NOTE(review): each input holds one token, so the pooled axis presumably
    # has length 1 and the pooling only drops that axis — confirm.
    merged = GlobalMaxPool1D()(merged)
    merged = concatenate([merged, inp_price], axis=-1)

    hidden = Dense(32, activation='relu')(merged)
    outp = Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=categorical_inputs + [inp_price], outputs=outp)

In [None]:
# Build a fresh network, optimise it directly on RMSE with Adam, and track
# both MSE and RMSE as metrics.
model = get_model()
model.compile(
    loss=root_mean_squared_error,
    optimizer='adam',
    metrics=['mse', root_mean_squared_error],
)
model.summary()

In [None]:
# Train for 10 epochs while monitoring the hold-out split. The batch size is
# read from config.batch_size instead of repeating the literal, and both
# target vectors are passed as plain numpy arrays for consistency.
model.fit(
    x=X_train,
    y=np.array(Y_train),
    validation_data=(X_valid, np.array(Y_valid)),
    batch_size=config.batch_size,
    epochs=10,
)

# Model

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn import tree, ensemble, linear_model

In [None]:
# def clip_result(pred):
#     pred[pred<0] = 0.
#     pred[pred>1] = 1.
#     return pred

In [None]:
# X_tr, X_val, Y_tr, Y_val = train_test_split(x_tr, y_trn, test_size = 0.15)

## Linear Regression

In [None]:
# clf = linear_model.LinearRegression(normalize=False, n_jobs=2, copy_X=True)
# clf.fit(X_tr, Y_tr)

In [None]:
# pred_val = clf.predict(X_val)
# cliped_pred_val = clip_result(pred_val)

# Random Forest

In [None]:
# rf = ensemble.RandomForestRegressor(n_estimators=50, max_depth=15, n_jobs=-1)
# rf.fit(X_tr, Y_tr)

In [None]:
# pred_val = rf.predict(X_val)
# cliped_pred_val = clip_result(pred_val)

# LGB

In [None]:
# import lightgbm as lgb

In [None]:
# clf = lgb.LGBMRegressor(n_estimators=500)
# clf.fit(X_tr, Y_tr)

In [None]:
# pred_val = clf.predict(X_val)
# cliped_pred_val = clip_result(pred_val)

In [None]:
# val_loss = mean_squared_error(Y_val, cliped_pred_val)**(1/2)
# print(val_loss)

# Metric

In [None]:
# from sklearn.metrics import mean_squared_error

In [None]:
# val_loss = mean_squared_error(Y_val, cliped_pred_val)**(1/2)
# print(val_loss)

# Test & submit

In [None]:
# Predict on the test features using the configured batch size, not the
# small Keras default.
pred = model.predict(X_test, batch_size=config.batch_size)

subm = pd.read_csv("../input/sample_submission.csv")
# The model has a single output unit, so predictions come back with one
# column; flatten them before assigning to a single DataFrame column.
subm['deal_probability'] = pred.ravel()
# NOTE(review): the model tag and score in the file name are hard-coded;
# update them so they describe the run that is actually being submitted.
subm.to_csv('submit_{}_{:.4f}.csv'.format('nn_reg_pcn', 0.2430), index=False)