## Score

- local CV: 0.1701
- private LB score: 0.39762

## References

- [Mercari Golf: 0.3875 CV in 75 LOC, 1900 s](https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s)



In [1]:
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error
import datetime

In [2]:
if "google.colab" in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content/drive/MyDrive/kaggle/kaggle-mercari-price-suggestion-challenge/notebook

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/kaggle/kaggle-mercari-price-suggestion-challenge/notebook


In [3]:
class Config:
    def __init__(self, debug):
        self.project_name = "mercari-price-suggestion-challenge"
        self.debug = debug
        self.train_nrows = 100 if self.debug else None
        self.test_nrows = 100 if self.debug else None
        self.num_kfolds = 2 if self.debug else 20
        if "kaggle_web_client" in sys.modules:
            self.submission_filepath = "submission.csv"
        else:
            self.submission_filepath = "../output/submission_%s.csv" % datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
        if self.debug:
            self.submission_filepath = self.submission_filepath.replace(".csv", "_debug.csv")
config = Config(debug=False)

In [4]:
train_df = pd.read_csv("../input/mercari-price-suggestion-challenge/train.tsv", sep="\t", nrows=config.train_nrows)
test_df = pd.read_csv("../input/mercari-price-suggestion-challenge/test_stg2.tsv", sep="\t", nrows=config.test_nrows)

In [5]:
train_df = train_df[train_df["price"] > 0].reset_index(drop=True)

In [6]:
def preprocess(df):
    df["name"] = df["name"].fillna("") + " " + df["brand_name"].fillna("")
    df["text"] = df["item_description"].fillna("") + " " + df["name"] + " " + df["category_name"].fillna("")
    return df[["name", "text", "shipping", "item_condition_id"]]

In [7]:
y_train = train_df["price"].values
train_df = preprocess(train_df)
test_df = preprocess(test_df)

In [8]:
df = pd.concat([train_df, test_df])

In [9]:
tfidf_mapping = {
    "name": TfidfVectorizer(max_features=10**5, token_pattern="\w+").fit(df["name"]),
    "text": TfidfVectorizer(max_features=10**5, token_pattern="\w+", ngram_range=(1,2)).fit(df["text"]),
}

In [10]:
def transform(df):
    x1 = tfidf_mapping["name"].transform(df["name"])
    x2 = tfidf_mapping["text"].transform(df["text"])
    x3 = df[["shipping", "item_condition_id"]].values
    return x1, x2, x3

In [11]:
X_train = transform(train_df)
X_test = transform(test_df)

In [12]:
def build_model(input_shapes):
    x_name_in = tf.keras.Input(shape=input_shapes["name"], name="name", sparse=True)
    x_text_in = tf.keras.Input(shape=input_shapes["text"], name="text", sparse=True)
    x_numeric_in = tf.keras.Input(shape=input_shapes["numeric"], name="numeric")
    x_name = tf.keras.layers.Dense(96, activation="relu")(x_name_in)
    x_text = tf.keras.layers.Dense(96, activation="relu")(x_text_in)
    x_numeric = x_numeric_in
    x = tf.keras.layers.Concatenate()([x_name, x_text, x_numeric])
    x = tf.keras.layers.Dense(64, activation="relu")(x)
    x = tf.keras.layers.Dense(64, activation="relu")(x)
    output = tf.keras.layers.Dense(1)(x)
    model = tf.keras.Model(inputs=[x_name_in, x_text_in, x_numeric_in], outputs=[output])
    model.compile(
        loss="mean_squared_error",
        optimizer=tf.optimizers.Adam(lr=3e-3)
    )
    return model

In [13]:
input_shapes = {
    "name": (X_train[0].shape[1], ),
    "text": (X_train[1].shape[1], ),
    "numeric": (X_train[2].shape[1], ),
}

In [14]:
for i in range(2):
    X_train[i].sort_indices()
    X_test[i].sort_indices()

In [15]:
scaler = StandardScaler()
y_log_train = scaler.fit_transform(np.log1p(y_train).reshape(-1, 1))

In [16]:
kf = KFold(n_splits=config.num_kfolds, shuffle=True, random_state=777)
oof = np.zeros(len(y_train))
y_preda_list = []
for train_index, valid_index in kf.split(y_train):
    model = build_model(input_shapes=input_shapes)
    for i in range(3):
        # TODO(nishimori-m): batch sizeを段階的に大きくする理由のページを見つける
        model.fit(
            [_[train_index] for _ in X_train],
            y_log_train[train_index],
            validation_data=([_[valid_index] for _ in X_train], y_log_train[valid_index]),
            batch_size=2**(11+i),
            epochs=1,
            verbose=1
        )
    oof[valid_index] = model.predict([_[valid_index] for _ in X_train]).flatten()
    y_preda_list.append(model.predict(X_test).flatten())

    print()























In [17]:
oof2 = np.expm1(
    scaler.inverse_transform(oof.reshape(-1, 1))
)
print("valid rmsle: %.4f" % mean_squared_log_error(y_train, oof2))

valid rmsle: 0.1701


In [18]:
y_preda = np.expm1(
    scaler.inverse_transform(
        np.mean(y_preda_list, axis=0)
    )
)

In [19]:
submission_df = pd.read_csv("../input/mercari-price-suggestion-challenge/sample_submission_stg2.csv", nrows=config.test_nrows)

In [20]:
submission_df["price"] = y_preda

In [21]:
print(f"Save to {config.submission_filepath}")
submission_df.to_csv(config.submission_filepath, index=False)

Save to ../output/submission_2021-04-04_024913.csv
