In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
%matplotlib inline

In [None]:
%%time
from datetime import datetime
# Wall-clock timestamp taken before the data is loaded.
start_real = datetime.now()

# Read the tab-separated train and test files.
train_df = pd.read_table('/kaggle/input/dataset2/train.tsv')
test_df = pd.read_table('/kaggle/input/dataset2/test.tsv')

# Drop every training row whose price is below 3.0.
train_df = train_df.drop(index=train_df.loc[train_df['price'] < 3.0].index)
train_df.shape

In [None]:
%%time
def wordCount(text):
    """Count the single-space-separated tokens in ``text``.

    Args:
        text: Value to measure. Usually a string; anything else (for example
            NaN or None coming from a missing cell) is treated as empty.

    Returns:
        int: 0 when ``text`` is not a string or equals the placeholder
        'No description yet'; otherwise ``len(text.split(" "))``. Splitting on
        a literal space means an empty string counts as one token and
        consecutive spaces produce empty tokens that are counted too.
    """
    # Explicit type check instead of a bare ``except`` so that unrelated
    # errors (and KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(text, str) or text == 'No description yet':
        return 0
    return len(text.split(" "))

# Word counts of the item name, for train and test.
train_df['name_len'] = train_df['name'].apply(wordCount)
test_df['name_len'] = test_df['name'].apply(wordCount)

# Word counts of the item description, for train and test.
train_df['desc_len'] = train_df['item_description'].apply(wordCount)
test_df['desc_len'] = test_df['item_description'].apply(wordCount)

In [None]:
%%time
# Regression target: log(1 + price).
train_df["target"] = np.log1p(train_df["price"])

In [None]:
%%time

def split_cat(text):
    """Split a '/'-delimited category path into its levels.

    Args:
        text: Category string such as "Men/Tops/T-shirts". Non-string values
            (for example NaN for a missing category) are accepted.

    Returns:
        For a string: the list produced by ``text.split("/")``, padded on the
        right with "No Label" so it always has at least three entries. Paths
        with more than three levels are returned in full.
        For anything else: the tuple ("No Label", "No Label", "No Label").
    """
    no_label = ("No Label", "No Label", "No Label")

    # Explicit type check instead of a bare ``except``.
    if not isinstance(text, str):
        return no_label

    levels = text.split("/")
    # Pad short paths so a three-way unpack of the result cannot fail.
    levels.extend(no_label[len(levels):])
    return levels

# Transpose the per-row category levels into three columns (train).
train_df['subcat_0'], train_df['subcat_1'], train_df['subcat_2'] = zip(
    *train_df['category_name'].apply(split_cat)
)

# Same for the test set.
test_df['subcat_0'], test_df['subcat_1'], test_df['subcat_2'] = zip(
    *test_df['category_name'].apply(split_cat)
)

In [None]:
%%time

# Every brand value seen in train or test, collected before missing values are filled.
full_set = pd.concat([train_df, test_df])
all_brands = set(full_set.brand_name.values)

# Mark absent brands with the literal 'missing'.
train_df['brand_name'] = train_df['brand_name'].fillna('missing')
test_df['brand_name'] = test_df['brand_name'].fillna('missing')

# Number of rows whose brand is 'missing' before any recovery is attempted.
train_premissing = int((train_df['brand_name'] == 'missing').sum())
test_premissing = int((test_df['brand_name'] == 'missing').sum())

def brandfinder(line):
    """Choose the brand value for one row from its brand and item name.

    Args:
        line: Row whose first element (position 0) is the brand name and whose
            second element (position 1) is the item name.

    Returns:
        * the item name, when the brand is 'missing' and at least one
          space-separated word of the name is in ``all_brands``;
        * the item name, when the whole name is itself in ``all_brands``
          (this applies even if a brand was already present);
        * the unchanged brand otherwise, including when the name is not a
          string (for example NaN).

    Note:
        Reads the module-level set ``all_brands``.
        NOTE(review): on a word match the full item name is returned, not the
        matching word -- confirm this is intended.
    """
    brand = line.iloc[0]
    name = line.iloc[1]

    # A non-string name (NaN/None) cannot be split or matched; keep the brand.
    if not isinstance(name, str):
        return brand

    if brand == 'missing' and any(word in all_brands for word in name.split(' ')):
        return name

    if name in all_brands:  # exact match of the whole name
        return name

    return brand

# Re-derive the brand for every row from its (brand_name, name) pair.
train_df['brand_name'] = train_df[['brand_name', 'name']].apply(brandfinder, axis='columns')
test_df['brand_name'] = test_df[['brand_name', 'name']].apply(brandfinder, axis='columns')

# Report how many 'missing' training brands were replaced.
train_found = train_premissing - int((train_df['brand_name'] == 'missing').sum())
print(train_premissing, train_found, sep="\n")

In [None]:
%%time
import gc

# Hold out 1% of the training rows as a development set (fixed seed).
train_dfs, dev_dfs = train_test_split(
    train_df, train_size=0.99, test_size=0.01, random_state=123
)
n_devs = len(dev_dfs)
print(n_devs)

In [None]:
%%time
# Stack train, dev and test rows (in that order) into one frame.
full_df = pd.concat([train_dfs, dev_dfs, test_df], axis=0)

# Replace NaN values with the literal 'missing'.
def fill_missing_values(df):
    """Fill gaps in the text columns of ``df`` and return the same frame.

    NaN entries of ``category_name``, ``brand_name`` and ``item_description``
    become 'missing'; descriptions equal to 'No description yet' are also
    rewritten to 'missing'. The columns are reassigned on ``df`` itself.
    """
    for column in ('category_name', 'brand_name', 'item_description'):
        df[column] = df[column].fillna(value='missing')

    df['item_description'] = df['item_description'].replace(
        'No description yet', 'missing'
    )
    return df

# Apply the missing-value fill to the combined frame.
full_df = full_df.pipe(fill_missing_values)

In [None]:
%%time
le = LabelEncoder()

# (source column, destination column): the full category path is encoded into
# a new 'category' column; the remaining columns are encoded in place.
for source_col, encoded_col in (
    ('category_name', 'category'),
    ('brand_name', 'brand_name'),
    ('subcat_0', 'subcat_0'),
    ('subcat_1', 'subcat_1'),
    ('subcat_2', 'subcat_2'),
):
    # Refit the shared encoder on each column before transforming it.
    full_df[encoded_col] = le.fit(full_df[source_col]).transform(full_df[source_col])

del le, source_col, encoded_col
gc.collect()

In [None]:
%%time
from tensorflow.keras.preprocessing.text import Tokenizer

print("transforming...\n")

# Lower-cased text columns; descriptions and names are reused below.
descriptions_lower = full_df['item_description'].str.lower()
names_lower = full_df['name'].str.lower()
categories_lower = full_df['category_name'].str.lower()

# One flat array of all text used to fit the tokenizer vocabulary.
raw_text = np.hstack([descriptions_lower, names_lower, categories_lower])

print(raw_text.shape)

tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)

print('text for sequences\n')

# Convert descriptions and names to sequences of token ids.
full_df['seq_item_description'] = tok_raw.texts_to_sequences(descriptions_lower)
full_df['seq_name'] = tok_raw.texts_to_sequences(names_lower)

del tok_raw, descriptions_lower, names_lower, categories_lower
gc.collect()

In [None]:
# Constants used by the RNN model.
MAX_NAME_SEQ = 10
MAX_ITEM_DESC_SEQ = 75
MAX_CATEGORY_SEQ = 8

# Size of the text vocabulary for the Embedding layers, which must exceed the
# largest token id that can appear. `Series.max()` on a column of lists
# compares the lists lexicographically and returns a single sequence, so the
# largest id has to be taken per sequence first and then across the column.
# `default=0` covers empty sequences.
MAX_TEXT = max(
    full_df.seq_name.map(lambda seq: max(seq, default=0)).max(),
    full_df.seq_item_description.map(lambda seq: max(seq, default=0)).max(),
) + 100  # same +100 headroom as before; +1 is the strict minimum

# For each encoded / integer feature: largest value + 1.
MAX_CATEGORY = full_df.category.max() + 1
MAX_BRAND = full_df.brand_name.max() + 1
MAX_CONDITION = full_df.item_condition_id.max() + 1
MAX_DESC_LEN = full_df.desc_len.max() + 1
MAX_NAME_LEN = full_df.name_len.max() + 1

MAX_SUBCAT_0 = full_df.subcat_0.max() + 1
MAX_SUBCAT_1 = full_df.subcat_1.max() + 1
MAX_SUBCAT_2 = full_df.subcat_2.max() + 1

In [None]:
%%time
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_rnn_data(dataset):
    """Build the dict of named input arrays from ``dataset``.

    Token-id sequences are padded/truncated by ``pad_sequences`` to
    ``MAX_NAME_SEQ`` and ``MAX_ITEM_DESC_SEQ``. ``brand_name``, ``category``,
    ``item_condition`` and the ``subcat_*`` entries are built from a single
    column (1-D); ``num_vars``, ``desc_len`` and ``name_len`` are built from
    one-column frames (2-D).
    """
    def flat(column):
        # Array built from the column as a Series.
        return np.array(dataset[column])

    def column_matrix(column):
        # Array built from a one-column DataFrame.
        return np.array(dataset[[column]])

    return {
        # item name token ids
        'name': pad_sequences(dataset['seq_name'], maxlen=MAX_NAME_SEQ),
        # item description token ids
        'item_desc': pad_sequences(dataset['seq_item_description'], maxlen=MAX_ITEM_DESC_SEQ),
        # brand label
        'brand_name': flat('brand_name'),
        # label of the full '/'-separated category path
        'category': flat('category'),
        # item condition
        'item_condition': flat('item_condition_id'),
        # shipping flag
        'num_vars': column_matrix('shipping'),
        # word count of the description
        'desc_len': column_matrix('desc_len'),
        # word count of the name
        'name_len': column_matrix('name_len'),
        # labels of category levels 0, 1 and 2
        'subcat_0': flat('subcat_0'),
        'subcat_1': flat('subcat_1'),
        'subcat_2': flat('subcat_2'),
    }

In [None]:
# Row counts of the three parts that were stacked into full_df.
n_trains, n_devs, n_tests = len(train_dfs), len(dev_dfs), len(test_df)
print(n_trains)

# Slice full_df back into train / dev / test by position.
train = full_df.iloc[:n_trains]
dev = full_df.iloc[n_trains:n_trains + n_devs]
test = full_df.iloc[n_trains + n_devs:]

In [None]:
# Model inputs and (n, 1) target array for the training rows.
X_train = get_rnn_data(train)
Y_train = train["target"].to_numpy().reshape((len(train), 1))

In [None]:
# Model inputs and (n, 1) target array for the development rows.
X_dev = get_rnn_data(dev)
Y_dev = dev["target"].to_numpy().reshape((len(dev), 1))

In [None]:
# Model inputs for the test rows.
X_test = get_rnn_data(test)

# Drop the name of the original training frame and run a garbage collection.
del train_df
gc.collect()

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,Dropout,Dense,Embedding,Flatten
from tensorflow.keras.layers import concatenate, GRU
from tensorflow.keras.optimizers import Adam

In [None]:
# Seed NumPy's global random number generator.
np.random.seed(123)

def rmsle(Y,Y_pred):
    """Return the root of the mean squared difference between ``Y_pred`` and ``Y``.

    Both arrays must have identical shapes (checked with ``assert``).
    """
    assert Y.shape == Y_pred.shape
    residual = Y_pred - Y
    return np.sqrt(np.mean(residual ** 2))

In [None]:
def new_rnn_model(lr=0.001):
    """Build and compile the price-regression network.

    Args:
        lr: Value passed to ``Adam(learning_rate=...)``.

    Returns:
        A compiled Keras ``Model`` (loss 'mse') with ten named inputs: name,
        item_desc, brand_name, item_condition, num_vars, desc_len, name_len,
        subcat_0, subcat_1, subcat_2.

    Input widths for name / item_desc / num_vars are read from the global
    ``X_train``; embedding vocabulary sizes come from the global ``MAX_*``
    constants.
    """
    def scalar_input(label):
        # One value per sample.
        return Input(shape=[1], name=label)

    # Input layers
    name_in = Input(shape=[X_train["name"].shape[1]], name="name")
    desc_in = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand_in = scalar_input("brand_name")
    condition_in = scalar_input("item_condition")
    num_vars_in = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")

    name_len_in = scalar_input("name_len")
    desc_len_in = scalar_input("desc_len")

    subcat_0_in = scalar_input("subcat_0")
    subcat_1_in = scalar_input("subcat_1")
    subcat_2_in = scalar_input("subcat_2")

    # Embedding layers
    name_vec = Embedding(MAX_TEXT, 20)(name_in)
    desc_vec = Embedding(MAX_TEXT, 60)(desc_in)
    brand_vec = Embedding(MAX_BRAND, 10)(brand_in)
    condition_vec = Embedding(MAX_CONDITION, 5)(condition_in)
    desc_len_vec = Embedding(MAX_DESC_LEN, 5)(desc_len_in)
    name_len_vec = Embedding(MAX_NAME_LEN, 5)(name_len_in)

    subcat_0_vec = Embedding(MAX_SUBCAT_0, 10)(subcat_0_in)
    subcat_1_vec = Embedding(MAX_SUBCAT_1, 10)(subcat_1_in)
    subcat_2_vec = Embedding(MAX_SUBCAT_2, 10)(subcat_2_in)

    # Recurrent units: GRU over the description and over the name
    desc_state = GRU(16)(desc_vec)
    name_state = GRU(8)(name_vec)

    # Concatenate the flattened embeddings, GRU outputs and numeric input
    hidden = concatenate([
        Flatten()(brand_vec),
        Flatten()(condition_vec),
        Flatten()(desc_len_vec),
        Flatten()(name_len_vec),
        Flatten()(subcat_0_vec),
        Flatten()(subcat_1_vec),
        Flatten()(subcat_2_vec),
        desc_state,
        name_state,
        num_vars_in,  # passed through unchanged
    ])

    # Fully connected stack of 512, 256, 128 and 64 units, each followed by dropout.
    for units in (512, 256, 128, 64):
        # Dropout is instantiated before Dense, matching the nested-call
        # evaluation order of Dropout(...)(Dense(...)(x)).
        dropout = Dropout(0.1)
        hidden = dropout(
            Dense(units, kernel_initializer='normal', activation='relu')(hidden))

    # Output layer
    output = Dense(1, activation='linear')(hidden)

    # Assemble and compile the model
    model = Model(
        inputs=[name_in, desc_in, brand_in, condition_in, num_vars_in,
                desc_len_in, name_len_in, subcat_0_in, subcat_1_in, subcat_2_in],
        outputs=output
    )

    model.compile(loss='mse', optimizer=Adam(learning_rate=lr))

    return model

# Build a throwaway model only to print its layer summary, then release it.
model = new_rnn_model()
model.summary()

del model
gc.collect()

In [None]:
%%time

BATCH_SIZE = 512*2
epochs=3

# Learning-rate decay coefficient derived from the initial/final rates and the
# number of optimizer steps (batches per epoch * epochs).
exp_decay = lambda init,fin,steps:(init/fin)**(1/(steps-1))-1
steps = int(len(X_train["name"])/BATCH_SIZE)* epochs
lr_init =0.005
lr_fin= 0.001
lr_decay = exp_decay(lr_init,lr_fin,steps)

# Build the model. `new_rnn_model` accepts only `lr`, so the model is created
# with the initial learning rate; `lr_decay` is not applied in this cell.
# NOTE(review): the following cell repeats this training with a
# LearningRateScheduler callback; this cell may be redundant.
rnn_model = new_rnn_model(lr=lr_init)
# Train
rnn_model.fit(X_train,Y_train,
             epochs=epochs,
             batch_size=BATCH_SIZE,
             validation_data=(X_dev,Y_dev),
             verbose=1)

In [None]:
%%time
from tensorflow.keras.callbacks import LearningRateScheduler

# Batch size and number of epochs
BATCH_SIZE = 512 * 2
epochs = 3

lr_init = 0.005  # learning rate for the first epoch
lr_fin = 0.001   # learning rate for the last epoch

# Per-step decay coefficient and step count; not used by the epoch-level
# schedule below, kept so these names stay defined for any code that reads them.
exp_decay = lambda init, fin, steps: (init / fin) ** (1 / (steps - 1)) - 1
steps = int(len(X_train["name"]) / BATCH_SIZE) * epochs
lr_decay = exp_decay(lr_init, lr_fin, steps)

# Constant per-epoch multiplier that takes the rate from lr_init (epoch 0) to
# lr_fin (last epoch). max(..., 1) avoids a division by zero when epochs == 1.
lr_epoch_factor = (lr_fin / lr_init) ** (1 / max(epochs - 1, 1))


def exp_decay_schedule(epoch, lr):
    """Return the learning rate for ``epoch`` (0-based).

    The rate decays geometrically from ``lr_init`` to ``lr_fin`` over the run.
    ``lr`` (the optimizer's current rate) is accepted because the callback
    passes it, but the result is computed from ``epoch`` alone.

    Multiplying ``lr`` by ``lr_decay`` here would be wrong: ``lr_decay`` is a
    very small per-step coefficient, so the rate would shrink by orders of
    magnitude at every epoch, starting with the first.
    """
    return lr_init * lr_epoch_factor ** epoch


# Build the model with the initial learning rate
rnn_model = new_rnn_model(lr=lr_init)

# Callback that sets the learning rate at the start of every epoch
lr_scheduler = LearningRateScheduler(exp_decay_schedule)

# Train the model
rnn_model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=BATCH_SIZE,
    validation_data=(X_dev, Y_dev),
    verbose=1,
    callbacks=[lr_scheduler]  # apply the schedule
)


In [None]:
# Predict on the development set and report the error against its targets.
Y_dev_preds_rnn = rnn_model.predict(X_dev, batch_size=BATCH_SIZE)
dev_score = rmsle(Y_dev, Y_dev_preds_rnn)

print(dev_score)

In [None]:
# Predict on the test set and undo the log1p transform applied to the target.
rnn_preds = np.expm1(
    rnn_model.predict(X_test, batch_size=BATCH_SIZE, verbose=1)
)

# Release the trained model.
del rnn_model
gc.collect()