In [None]:

import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, concatenate, GRU, Embedding, Flatten, Activation,BatchNormalization,GaussianNoise
from keras.layers import Bidirectional
from keras.optimizers import Adam,SGD
from keras.models import Model
from keras import backend as K
from nltk.corpus import stopwords
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Fix NumPy's global RNG so the run is repeatable.
np.random.seed(7)

# Both competition files are tab-separated.
train_df = pd.read_csv('../input/train.tsv', sep='\t')
test_df = pd.read_csv('../input/test.tsv', sep='\t')
print(train_df.shape, test_df.shape)


In [None]:
def rmsle(Y, Y_pred):
    """Root of the mean squared difference between ``Y_pred`` and ``Y``.

    No logarithm is applied here, so the result is an RMSLE only when both
    arguments are already on a log scale. The two arguments must have the
    same shape (checked with an assert).
    """
    assert Y.shape == Y_pred.shape
    residual = Y_pred - Y
    mean_squared = np.mean(np.square(residual))
    return np.sqrt(mean_squared)

def root_mean_squared_logarithmic_error(y_true, y_pred):
    """Keras loss: RMSE between log(1 + y_pred) and log(1 + y_true).

    Both tensors are clipped from below at K.epsilon() before the log, and a
    small constant is added under the square root. The mean is taken over the
    last axis.
    """
    log_pred = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
    log_true = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
    squared_gap = K.square(log_pred - log_true)
    return K.sqrt(K.mean(squared_gap, axis=-1) + 0.0000001)
def root_mean_squared_error(y_true, y_pred):
    """Keras loss: root of the mean squared error over the last axis.

    A small constant is added under the square root.
    """
    squared_gap = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_gap, axis=-1) + 0.0000001)
def wordCount(text):
    """Count the single-space-separated tokens in ``text``.

    Returns 0 for the placeholder string 'No description yet' and for any
    value that is not a string (for example NaN or None for a missing field).

    The split is on a single space, so consecutive spaces produce empty
    tokens that are counted, and the empty string counts as one token.
    """
    # Explicit type check instead of a bare ``except:``, which would also
    # swallow KeyboardInterrupt and hide unrelated errors.
    if not isinstance(text, str):
        return 0
    if text == 'No description yet':
        return 0
    return len(text.split(" "))
def split_cat(text):
    """Split a '/'-separated category path into exactly three levels.

    Returns a 3-tuple ``(level_0, level_1, level_2)``:
      * levels beyond the third are dropped;
      * missing levels are filled with "No Label";
      * a non-string value (e.g. NaN) gives three "No Label" entries.

    The caller unpacks the results into three columns, so a fixed length of
    three keeps that unpacking valid even for paths with fewer than three
    levels.
    """
    no_label = "No Label"
    # Explicit type check instead of a bare ``except:``.
    if not isinstance(text, str):
        return (no_label, no_label, no_label)
    levels = text.split("/")[:3]
    levels.extend([no_label] * (3 - len(levels)))
    return tuple(levels)
    
def brandfinder(line, brands=None):
    """Pick a brand for one row, recovering it from the item name if possible.

    Parameters
    ----------
    line : sequence whose first two items are (brand, name); a two-column
        DataFrame row as passed by ``DataFrame.apply(..., axis=1)`` works.
    brands : optional collection of known brand names. When omitted, the
        module-level ``all_brands`` is used.

    Returns
    -------
    * if the brand is 'missing' and a space-separated word of the name is a
      known brand, that word (first match wins);
    * otherwise, if the whole name is a known brand, the name;
    * otherwise the brand unchanged.
    """
    if brands is None:
        brands = all_brands

    # Iterate instead of indexing with [0]/[1]: integer keys on a
    # label-indexed Series are not reliably positional.
    brand, name = tuple(line)[:2]

    if brand == 'missing':
        for word in name.split(' '):
            if word in brands:
                # Return the matched brand word, not the whole item name.
                return word
    if name in brands:
        return name
    return brand

def fill_missing_values(df):
    """Replace missing text fields in ``df`` with the string "missing".

    NaNs in ``category_name``, ``brand_name`` and ``item_description`` are
    filled, and the placeholder description 'No description yet' is mapped to
    "missing" as well. ``df`` is modified in place and also returned.
    """
    # Assign the filled column back instead of calling
    # ``df.col.fillna(..., inplace=True)``: that form acts on a temporary
    # Series and does not update ``df`` under pandas Copy-on-Write.
    for column in ("category_name", "brand_name", "item_description"):
        df[column] = df[column].fillna("missing")
    df["item_description"] = df["item_description"].replace('No description yet', "missing")
    return df
def get_rnn_data(dataset):
    """Build the dict of named input arrays for the model from ``dataset``.

    The token-sequence columns are padded with ``pad_sequences`` to
    MAX_NAME_SEQ / MAX_ITEM_DESC_SEQ; every other entry is the matching
    column converted with ``np.array``.
    """
    features = {}

    # Text sequences.
    features['name'] = pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ)
    features['item_desc'] = pad_sequences(dataset.seq_item_description, maxlen=MAX_ITEM_DESC_SEQ)

    # Columns selected by attribute (a single Series each).
    features['brand_name'] = np.array(dataset.brand_name)
    features['category'] = np.array(dataset.category)
    # (a padded 'category_name' sequence input is intentionally not built)
    features['item_condition'] = np.array(dataset.item_condition_id)

    # Columns selected with a one-element list (a one-column frame each).
    features['num_vars'] = np.array(dataset[["shipping"]])
    features['desc_len'] = np.array(dataset[["desc_len"]])
    features['name_len'] = np.array(dataset[["name_len"]])

    # The three category levels.
    for level in ('subcat_0', 'subcat_1', 'subcat_2'):
        features[level] = np.array(dataset[level])

    return features



In [None]:

# Discard training rows priced below 3.0 (rows with a NaN price are kept).
is_low_price = train_df.price < 3.0
train_df = train_df.drop(train_df.index[is_low_price.values])
train_df.shape

In [None]:
# Add word-count and category-level features to both frames.
for frame in (train_df, test_df):
    # Word counts of the description and of the name.
    frame['desc_len'] = frame['item_description'].apply(wordCount)
    frame['name_len'] = frame['name'].apply(wordCount)

    # Split the category path and transpose the per-row results into columns.
    category_levels = zip(*frame['category_name'].apply(split_cat))
    frame['subcat_0'], frame['subcat_1'], frame['subcat_2'] = category_levels

print("DONE")

In [None]:
# try to get brand from text
# The brand vocabulary is collected BEFORE NaNs are replaced, so the "missing"
# placeholder introduced below is not added to the set of known brands.
full_set = pd.concat([train_df,test_df])
all_brands = set(full_set['brand_name'].dropna().values)

# Assign the filled column back: ``df.col.fillna(..., inplace=True)`` acts on
# a temporary Series and does not update the frame under pandas Copy-on-Write.
train_df['brand_name'] = train_df['brand_name'].fillna("missing")
test_df['brand_name'] = test_df['brand_name'].fillna("missing")

# Number of training rows without a brand before recovery (kept for the
# commented-out diagnostic below).
premissing = len(train_df.loc[train_df['brand_name'] == 'missing'])

train_df['brand_name'] = train_df[['brand_name','name']].apply(brandfinder, axis = 1)
test_df['brand_name'] = test_df[['brand_name','name']].apply(brandfinder, axis = 1)
#found = premissing-len(train_df.loc[train_df['brand_name'] == 'missing'])
print("DONE")

In [None]:
# Build three regression targets from the raw price:
#   price2 = raw price, price3 = log10(price), price = log1p(price).
# All three are derived from the raw values; "price" is overwritten last.
raw_price = train_df.price
train_df["price2"] = raw_price
train_df["price3"] = np.log10(raw_price)
train_df["price"] = np.log1p(raw_price)

# Hold out 1% of the training rows as a dev set.
train_df, dev_df = train_test_split(train_df, train_size=0.99, random_state=123)

# Row counts, used later to slice the combined frame back apart.
n_trains = len(train_df)
n_devs = len(dev_df)
n_tests = len(test_df)
print("DONE")

In [None]:
# Stack train, dev and test rows (in that order) and fill the missing text
# fields on the combined frame.
full_df = fill_missing_values(pd.concat([train_df, dev_df, test_df]))
#print(full_df.category_name[1])
print("DONE")

In [None]:
# Convert the categorical text columns to integer codes.
le = LabelEncoder()

# The full category path is encoded into a NEW column; category_name itself
# stays as text.
full_df['category'] = le.fit(full_df.category_name).transform(full_df.category_name)

# The remaining columns are replaced by their codes; the encoder is re-fitted
# for each column.
for column in ('brand_name', 'subcat_0', 'subcat_1', 'subcat_2'):
    full_df[column] = le.fit(full_df[column]).transform(full_df[column])

del le
print("done")

In [None]:
# This cell does no work; it only prints a progress marker.
print("DONE")

In [None]:
# Convert the description and name text to integer token sequences.
desc_lower = full_df.item_description.str.lower()
name_lower = full_df.name.str.lower()
category_lower = full_df.category_name.str.lower()

# One tokenizer is fitted on descriptions, names and category paths together,
# so both sequence columns share a single vocabulary.
raw_text = np.hstack([desc_lower, name_lower, category_lower])
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)

full_df['seq_item_description'] = tok_raw.texts_to_sequences(desc_lower)
full_df['seq_name'] = tok_raw.texts_to_sequences(name_lower)

del desc_lower, name_lower, category_lower
del tok_raw
print("DONE")

In [None]:


# Padded sequence lengths for the text inputs.
MAX_NAME_SEQ = 17 #17
MAX_ITEM_DESC_SEQ = 105 #269
MAX_CATEGORY_SEQ = 8 #8


def _max_token_id(sequences):
    """Largest token id found in any of the (possibly empty) sequences."""
    return max((max(seq) for seq in sequences if len(seq) > 0), default=0)


# Embedding input sizes: each must be strictly greater than the largest index
# fed to the layer.
# Series.max() on a column of lists returns the lexicographically greatest
# list, which need not contain the largest token id, so the true maximum is
# computed over every sequence instead.
MAX_TEXT = max(
    _max_token_id(full_df.seq_name),
    _max_token_id(full_df.seq_item_description),
) + 1
MAX_CATEGORY = full_df.category.max() + 1
MAX_BRAND = full_df.brand_name.max() + 1
MAX_CONDITION = full_df.item_condition_id.max() + 1
MAX_DESC_LEN = full_df.desc_len.max() + 1
MAX_NAME_LEN = full_df.name_len.max() + 1
MAX_SUBCAT_0 = full_df.subcat_0.max() + 1
MAX_SUBCAT_1 = full_df.subcat_1.max() + 1
MAX_SUBCAT_2 = full_df.subcat_2.max() + 1

# Slice the combined frame back into train / dev / test by position; the rows
# were concatenated in exactly this order.
train = full_df[:n_trains]
dev = full_df[n_trains:n_trains+n_devs]
test = full_df[n_trains+n_devs:]

# One target array per output head: log1p(price), raw price, log10(price).
X_train = get_rnn_data(train)
Y_train = train.price.values.reshape(-1, 1)
Y_train2 = train.price2.values.reshape(-1, 1)
Y_train3 = train.price3.values.reshape(-1, 1)
#Y_train = train.price2.values.reshape(-1, 1)

X_dev = get_rnn_data(dev)
Y_dev = dev.price.values.reshape(-1, 1)
Y_dev2 = dev.price2.values.reshape(-1, 1)
Y_dev3 = dev.price3.values.reshape(-1, 1)

X_test = get_rnn_data(test)
print("done")

In [None]:
# Re-seed NumPy's global RNG right before the model is defined and built.
# NOTE(review): this seeds NumPy only; whether it also fixes the Keras
# backend's weight initialisers depends on the backend in use — confirm.
np.random.seed(7)
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras.layers import Conv1D,Input, Dropout, Dense, concatenate, GRU, Embedding, Flatten, Activation,BatchNormalization,GaussianNoise
def get_model(lr=0.001, decay=0.0):
    """Build and compile the multi-input, three-output price model.

    Inputs (all named, matching the keys produced by ``get_rnn_data``):
    padded ``name`` and ``item_desc`` token sequences, the integer-coded
    ``brand_name``, ``item_condition``, ``desc_len``, ``name_len`` and
    ``subcat_0..2``, plus ``num_vars``. Input widths for ``name``,
    ``item_desc`` and ``num_vars`` are read from the global ``X_train``;
    embedding sizes come from the global ``MAX_*`` constants.

    The two text inputs are embedded and run through GRUs; the other coded
    inputs are embedded and flattened. Everything is concatenated with
    ``num_vars`` and passed through four Dense+Dropout layers that feed three
    separate single-unit linear outputs.

    Parameters
    ----------
    lr : learning rate passed to Adam.
    decay : decay passed to Adam.

    Returns
    -------
    The compiled Keras ``Model`` (loss: root_mean_squared_logarithmic_error,
    metrics: mae and mse).

    NOTE(review): a single loss function is given to ``compile`` while the
    model has three outputs; elsewhere in this notebook those outputs are
    trained against log1p(price), raw price and log10(price). Confirm that
    applying a log-based loss to the already log-scaled targets is intended.
    """
    # Inputs
    name = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand_name = Input(shape=[1], name="brand_name")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    desc_len = Input(shape=[1], name="desc_len")
    name_len = Input(shape=[1], name="name_len")
    subcat_0 = Input(shape=[1], name="subcat_0")
    subcat_1 = Input(shape=[1], name="subcat_1")
    subcat_2 = Input(shape=[1], name="subcat_2")
    #docvec=Input(shape=,name="docvec")
    
    # Embedding layers (output sizes can be tuned to help the model).
    # name and item_desc share the MAX_TEXT vocabulary size but use separate
    # embedding layers.
    emb_name = Embedding(MAX_TEXT, 20)(name)
    emb_item_desc = Embedding(MAX_TEXT, 60)(item_desc)
    emb_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
#     emb_category_name = Embedding(MAX_TEXT, 20)(category_name)
#     emb_category = Embedding(MAX_CATEGORY, 10)(category)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    # The word counts are embedded as if they were categories.
    emb_desc_len = Embedding(MAX_DESC_LEN, 5)(desc_len)
    emb_name_len = Embedding(MAX_NAME_LEN, 5)(name_len)
    emb_subcat_0 = Embedding(MAX_SUBCAT_0, 10)(subcat_0)
    emb_subcat_1 = Embedding(MAX_SUBCAT_1, 10)(subcat_1)
    emb_subcat_2 = Embedding(MAX_SUBCAT_2, 10)(subcat_2)
    

    # rnn layers (GRUs are faster than LSTMs and speed is important here)
    rnn_layer1 = GRU(16) (emb_item_desc)
    rnn_layer2 = GRU(8) (emb_name)
#     rnn_layer3 = GRU(8) (emb_category_name)

    # main layers: join every feature branch into one vector.
    main_l = concatenate([
        Flatten() (emb_brand_name)
#         , Flatten() (emb_category)
        , Flatten() (emb_item_condition)
        , Flatten() (emb_desc_len)
        , Flatten() (emb_name_len)
        , Flatten() (emb_subcat_0)
        , Flatten() (emb_subcat_1)
        , Flatten() (emb_subcat_2)
        , rnn_layer1
        , rnn_layer2
#         , rnn_layer3
        , num_vars
    ])
    #GaussianNoise
    # (increasing the nodes or adding layers does not affect the time quite as much as the rnn layers)
#model.add(Conv1D(64, 3, activation='relu'))
#model.add(MaxPooling1D(3))
#model.add(Conv1D(128, 3, activation='relu'))
#model.add(Conv1D(128, 3, activation='relu'))
#model.add(GlobalAveragePooling1D())

    # Shared trunk: 512 -> 256 -> 128 -> 64 units, each followed by 10% dropout.
    main_l = Dropout(0.1)(Dense(512,kernel_initializer='normal',activation='relu') (main_l))    
 #   main_l = (BatchNormalization()(main_l))
    main_l = Dropout(0.1)(Dense(256,kernel_initializer='normal',activation='relu') (main_l))    
    main_l = Dropout(0.1)(Dense(128,kernel_initializer='normal',activation='relu') (main_l))    
    main_l = Dropout(0.1)(Dense(64,kernel_initializer='normal',activation='relu') (main_l))    
  #  main_l=  (GaussianNoise(0.2)(main_l))

    # the output layer: three independent linear heads on the shared trunk.
    output = Dense(1, activation="linear") (main_l)
    output2 = Dense(1, activation="linear") (main_l)
    output3 = Dense(1, activation="linear") (main_l)
    optimizer = Adam(lr=lr, decay=decay)
    model = Model([name, item_desc, brand_name , item_condition, 
                   num_vars, desc_len, name_len, subcat_0, subcat_1, subcat_2], [output,output2,output3])
    model.compile(loss =root_mean_squared_logarithmic_error, optimizer = optimizer,metrics=['mae','mse'])
    #model.compile(loss =root_mean_squared_logarithmic_error, optimizer = optimizer,metrics=['mae','mse'])

    return model

# Build a throwaway model only to print its layer summary, then drop the
# reference; the model that is actually trained is created in a later cell.
model = get_model()
model.summary()
del model

In [None]:

# Hyper-parameters for the training run.
BATCH_SIZE = 1500
epochs = 2


def exp_decay(init, fin, steps):
    """Decay value derived from the start/end learning rates and step count."""
    return (init/fin)**(1/(steps-1)) - 1


# Total number of full batches over all epochs.
steps = (len(X_train['name']) // BATCH_SIZE) * epochs
lr_init = 0.005
lr_fin = 0.001
lr_decay = exp_decay(lr_init, lr_fin, steps)

# Train all three heads together; each head gets its own target array.
model = get_model(lr=lr_init, decay=lr_decay)
model.fit(
    X_train,
    [Y_train, Y_train2, Y_train3],
    epochs=epochs,
    batch_size=BATCH_SIZE,
    validation_data=(X_dev, [Y_dev, Y_dev2, Y_dev3]),
    verbose=1,
)

In [None]:

#Y_dev_preds_rnn3 = model.predict(X_dev, batch_size=BATCH_SIZE)
#Y_dev_preds_rnn3[0]
#print(" RMSLE error:", rmsle(Y_dev, Y_dev_preds_rnn3))


In [None]:
# Predict on the test inputs; `preds` is indexed per output head below.
preds = model.predict(
    X_test,
    batch_size=BATCH_SIZE,
    verbose=1,
)

In [None]:


# Undo each head's target transform so all three predictions are prices:
# head 0 was trained on log1p(price), head 1 on raw price, head 2 on log10(price).
preds1 = np.expm1(preds[0])
preds2 = preds[1]
preds3 = np.power(10, preds[2])

# Plain average of the three heads.
final = np.array((preds1 + preds2 + preds3) / 3)


def _save_submission(prices, path):
    """Build a (test_id, price) frame, print it, write it to ``path``, return it."""
    frame = pd.DataFrame({
        "test_id": test_df.test_id,
        "price": prices.reshape(-1),
    })
    print(frame)
    frame.to_csv(path, index=False)
    return frame


# Averaged prediction first, then the log10 head on its own.
submission = _save_submission(final, "./submisionmed.csv")
submission10 = _save_submission(preds3, "./submision10.csv")

print("done")