In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

import unicodedata

import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Load both competition splits; the first CSV column is used as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

- categoryは全部one-hotで表現
- yearで2030年以降のものがある
    - おそらくタイポ
    - -1000してあげるのが良い
    - lightgbm使うのでその他の処理は行わない
- conditionは1~6で評価
- odometer
    - 絶対値を取る
    - 1はnullにしてあげる(元々odometer=1のデータはない)
    - 1e6を超える場合は10で割る
- cylinders, transmission, typeのotherはnullにする

In [None]:
def preprocess(df):
    """Clean the raw listing columns and return a numeric feature frame.

    The input is copied first, so the caller's DataFrame is left untouched
    and the function can safely be re-run on the same frame.

    Steps:
      * year: values above 2030 are shifted back by 1000.
      * condition: mapped to an ordinal score from 1 ('salvage') to 6 ('new').
      * odometer: absolute value is taken; a magnitude of exactly 1 becomes
        missing; values above 1e6 are floor-divided by 10; remaining gaps
        are filled with the column mean of the rows passed in.
      * size: NFKC-normalised, lower-cased, dash variants unified, then
        mapped to 1..4; non-string values become missing.
      * categorical columns: NFKC-normalised and lower-cased; 'other' in
        cylinders / transmission / type becomes missing; then one-hot encoded.
      * every column without missing values is cast to int.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'year', 'condition', 'odometer', 'size' and the ten
        categorical columns listed in ``category_columns`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned numeric columns and one-hot columns.

    Raises
    ------
    KeyError
        If 'condition' or 'size' contains a label that has no mapping.
    """
    # Work on a copy: the steps below overwrite columns, and doing that on the
    # caller's frame would make a second call fail on already-converted data.
    df = df.copy()

    # year: anything past 2030 is moved back by a millennium.
    df.loc[df['year'] > 2030, 'year'] -= 1000

    # condition: ordinal encoding; missing values stay missing, unknown labels fail.
    condition_dict = {'excellent':4, 'fair':2, 'good':3, 'like new':5, 'salvage':1, 'new':6}
    unknown_conditions = set(df['condition'].dropna().unique()) - condition_dict.keys()
    if unknown_conditions:
        raise KeyError(
            f"unknown condition label(s): {sorted(map(str, unknown_conditions))}"
        )
    df['condition'] = df['condition'].map(condition_dict)

    # odometer: sign is dropped, 1 is treated as missing, oversized readings
    # are scaled down once by a factor of ten.
    odometer = df['odometer'].abs()
    odometer = odometer.mask(odometer == 1)
    odometer = odometer.where(odometer <= 1e6, odometer // 10)
    # The mean is taken over whatever rows were passed in.
    df['odometer'] = odometer.fillna(odometer.mean())

    def normalize_text(value):
        """Fold width/compatibility variants (NFKC) and lower-case the text."""
        return unicodedata.normalize('NFKC', value).lower()

    # size: ordinal encoding after unifying the dash characters NFKC leaves alone.
    size_dict = {'mid-size':3, 'full-size':4, 'sub-compact':1, 'compact':2}
    df['size'] = [
        size_dict[normalize_text(v).replace('ー', '-').replace('−', '-')]
        if isinstance(v, str) else None
        for v in df['size'].values
    ]

    # categorical data: normalise spelling variants; non-strings become missing.
    category_columns = ['region', 'manufacturer', 'cylinders', 'fuel', 'title_status', 'transmission', 'drive', 'type', 'paint_color', 'state']
    for col in category_columns:
        df[col] = [
            normalize_text(v) if isinstance(v, str) else None
            for v in df[col].values
        ]

    # Treat 'other' as missing so it does not get a one-hot column of its own.
    for col in ['cylinders', 'transmission', 'type']:
        df[col] = [v if v != 'other' else None for v in df[col].values]

    df = pd.get_dummies(df, columns=category_columns)  # one-hot vectorize

    # Columns that still contain missing values keep their dtype; all others become int.
    complete_columns = [col for col in df.columns if df[col].notna().all()]
    return df.astype({col: int for col in complete_columns})

In [None]:
# Stack train on top of test (fresh 0..n-1 index) so both get identical
# encoding, and leave the target column out of the feature frame.
combined = pd.concat([train, test], ignore_index=True)
df = combined.drop(columns=['price'])
df_processed = preprocess(df)
df_processed.head()

In [None]:
# The processed frame is ordered train-first, so the number of training rows
# is the split point between the two feature matrices.
n_train = len(train)
features = df_processed.columns

X, X_test = df_processed[:n_train].values, df_processed[n_train:].values
y = train['price'].values

In [None]:
n_splits = 4
random_state = 42


def build_model(n_features):
    """Build and compile the fully connected regression network.

    Seven ReLU hidden layers (1024-512-256-128-128-64-16) feed a single
    linear output unit; the loss is mean absolute percentage error.
    A Dropout(0.1) after the 512-unit layer is currently disabled.

    Only the first layer declares its input shape: Sequential infers the
    rest from the preceding layer.
    """
    hidden_units = (1024, 512, 256, 128, 128, 64, 16)
    layers = [
        tf.keras.layers.Dense(hidden_units[0], activation='relu', input_shape=(n_features,))
    ]
    layers += [tf.keras.layers.Dense(units, activation='relu') for units in hidden_units[1:]]
    layers.append(tf.keras.layers.Dense(1))

    model = tf.keras.Sequential(layers)
    model.compile(optimizer='adam', loss='mean_absolute_percentage_error')
    return model


folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
oof = np.zeros(len(train))          # out-of-fold prediction for every training row
predictions = np.zeros(len(test))   # test prediction averaged over the folds

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print("Fold {}".format(fold_+1))
    X_train, X_valid = X[trn_idx], X[val_idx]
    y_train, y_valid = y[trn_idx], y[val_idx]

    # Build the model.
    # Drop the Keras global state left by the previous fold's model so it does
    # not pile up across folds, then re-seed so every fold starts identically.
    tf.keras.backend.clear_session()
    tf.random.set_seed(random_state)
    model = build_model(X.shape[1])

    # NOTE: early stopping monitors the same fold that is scored below, so the
    # reported CV score is an optimistic estimate.
    early_stopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
    model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10000, batch_size=32, callbacks=[early_stopping])

    oof[val_idx] = model.predict(X_valid).flatten()
    predictions += model.predict(X_test).flatten() / folds.n_splits

print("CV score: {:<8.5f}".format(mean_absolute_percentage_error(y, oof)))

In [None]:
# Put the out-of-fold predictions next to the original training rows for inspection.
oof_df = train.assign(oof=oof)
oof_df.head(5)

In [None]:
# Overall out-of-fold MAPE, followed by actual price (x) against OOF prediction (y).
oof_mape = mean_absolute_percentage_error(y, oof)
print(oof_mape)
plt.scatter(y, oof)

In [None]:
# Submission file: one "id,price" row per test listing, written without a header line.
sub_df = pd.DataFrame({"id": test.index, "price": predictions})
sub_df.to_csv('output/sub003.csv', index=False, header=False)
sub_df