In [88]:
package_path = '/content/drive/MyDrive/pip/install/'
import sys
sys.path.append(package_path)  

In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf

import category_encoders as ce

In [93]:
df_station = pd.read_csv('/content/drive/MyDrive/Python/民泊サービスの宿泊料金予測/station_list.csv')
df_train = pd.read_csv('/content/drive/MyDrive/Python/民泊サービスの宿泊料金予測/train_data.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Python/民泊サービスの宿泊料金予測/test_data.csv')

## 前処理

In [94]:
    #緯度経度データの距離を行列で計算する関数(単位：km)
    def make_distance_list(df1, df2):

        lat1 = np.expand_dims(df1["latitude"].values * np.pi / 180, axis=1)
        lon1 = np.expand_dims(df1["longitude"].values * np.pi / 180, axis=1)
        lat2 = np.expand_dims(df2["latitude"].values * np.pi / 180, axis=0)
        lon2 = np.expand_dims(df2["longitude"].values * np.pi / 180, axis=0)

        distance_list = 6378.137 * np.arccos(
            np.cos(lat1) * np.cos(lat2) * np.cos(lon2 - lon1) + np.sin(lat1) * np.sin(lat2)
        )

        return distance_list

In [95]:
def preprocessing(df):
    #欠損値の補完、ひと月当たりのレビューデータなし、という仮定で0を補完
    df['reviews_per_month'].fillna(0, inplace=True)

    #欠損値の補完、最後に閲覧された月のデータなし、という仮定で仮の日程'2100-01-01'で補完
    df['last_review_fillna'] = df['last_review'].fillna('2100-01-01')

    #日付データを文字列から日付型へ変換
    from datetime import datetime as dt
    df['last_review_D'] = df['last_review_fillna'].apply(lambda x: dt.strptime(x, '%Y-%m-%d') if x == x and x != 0 else 0) 
    
    def add_station_distance(df):

        # 物件と各駅との距離のリストの作成
        distance_list = make_distance_list(df, df_station)

        # 最寄り駅と最寄り駅までの距離(対数化)、〇km圏内の駅数の列を追加(徒歩5分=４００ｍ)
        df["nearest_station"] = [
            df_station["station_name"][i] for i in np.argmin(distance_list, axis=1)
        ]
        df["distance_nearest_station"] = np.log1p(np.min(distance_list, axis=1))
        df["number_of_stations_500m"] = (distance_list < 0.5).sum(axis=1)
        df["number_of_stations_1000m"] = (distance_list < 1.0).sum(axis=1)
        df["number_of_stations_1500m"] = (distance_list < 1.5).sum(axis=1)

        return df
    df = add_station_distance(df)

    #datetime
    def last_review_split(df):
        date = pd.to_datetime(df['last_review'])
        df['last_review_y'] = date.dt.year
        df['last_review_m'] = date.dt.month
        df['last_review_d'] = date.dt.day
        return df
    
    df = last_review_split(df)

    def add_dis_day(df):
        latest_date = df.loc[:,['last_review_D']].query('last_review_D != "2100-01-01"').sort_values('last_review_D').tail(1)
        df['diff_from_newdate']= (df['last_review_D']- latest_date.iat[0,0]).dt.days
        return df

    df = add_dis_day(df)

#last_reviewが欠損だったものにnew_placeフラグつける
    df['new_place'] = df['diff_from_newdate'].where(df['diff_from_newdate'] > 1 , 0)
    df['new_place'] = df['new_place'].where(df['diff_from_newdate'] < 1 , 1)
    df['diff_from_newdate'] = df['diff_from_newdate'].where(df['diff_from_newdate'] < 1 , min(df['diff_from_newdate']))

    
    #review数のクラス作成
    def make_review_class(df):
        review_class = []
        for i in range(len(df)):
            if df['number_of_reviews'][i] == 0:
                review_class.append(0)
            elif df['number_of_reviews'][i] <=  3:
                review_class.append(1)
            elif df['number_of_reviews'][i] <=  10:
                review_class.append(2)
            elif df['number_of_reviews'][i] <=  30:
                review_class.append(3)
            elif df['number_of_reviews'][i] <=  100:
                review_class.append(4)
            elif df['number_of_reviews'][i] <=  200:
                review_class.append(5)
            elif df['number_of_reviews'][i] <=  300:
                review_class.append(6)
            elif df['number_of_reviews'][i] <=  400:
                review_class.append(7)
            elif df['number_of_reviews'][i] >  400:
                review_class.append(8)
        return(review_class)



    #距離と値段は対数化
    df['distance_nearest_station'] = np.log1p(df['distance_nearest_station'])

    #availability_365 ==0 を, 0を除く平均値に置換
    def availability_365_replace(df):
        df['availability_365'].replace(0, df.query('availability_365 != 0')['availability_365'].mean(), inplace =True)
    
    #関数まとめて実行
    review_class = make_review_class(df)
    df['review_class'] = review_class

    #距離と値段は対数化
    df['distance_nearest_station'] = np.log1p(df['distance_nearest_station'])
    #df['y']= np.log1p(df['y'])

    #availability_365 ==0 を。0を除く平均値に置換
    availability_365_replace(df)

    return df

In [96]:
df_train = preprocessing(df_train.copy())
df_test = preprocessing(df_test.copy())

In [97]:
df_train

Unnamed: 0,id,name,host_id,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,...,distance_nearest_station,number_of_stations_500m,number_of_stations_1000m,number_of_stations_1500m,last_review_y,last_review_m,last_review_d,diff_from_newdate,new_place,review_class
0,1,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,242899459,Koto Ku,35.68185,139.80310,Entire home/apt,1,55,2020-04-25,...,0.248480,1,3,6,2020.0,4.0,25.0,-4,0,4
1,2,Downtown Tokyo Iriya next to Ueno,308879948,Taito Ku,35.72063,139.78536,Entire home/apt,6,72,2020-03-25,...,0.126571,1,3,11,2020.0,3.0,25.0,-35,0,4
2,3,"Japan Style,Private,Affordable,4min to Sta.",300877823,Katsushika Ku,35.74723,139.82349,Entire home/apt,1,18,2020-03-23,...,0.241621,1,2,5,2020.0,3.0,23.0,-37,0,3
3,4,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,236935461,Shibuya Ku,35.68456,139.68077,Entire home/apt,1,2,2020-04-02,...,0.333844,0,3,9,2020.0,4.0,2.0,-27,0,1
4,5,LICENSED SHINJUKU HOUSE: Heart of the action!,243408889,Shinjuku Ku,35.69840,139.70467,Entire home/apt,1,86,2020-01-30,...,0.191922,4,11,20,2020.0,1.0,30.0,-90,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9985,9986,Shinjuku Kabukicho2 / Shinjuku 3min walk,59104733,Shinjuku Ku,35.69728,139.70321,Entire home/apt,1,1,2019-12-09,...,0.215239,4,14,20,2019.0,12.0,9.0,-142,0,1
9986,9987,Hostel just 1 min from Sta. / Dorm with curtain,131595566,Taito Ku,35.70407,139.79180,Shared room,1,0,,...,0.103141,1,9,16,,,,-1257,1,0
9987,9988,Hostel just 5 min from Sta./4 bed room with bath,147026065,Taito Ku,35.71501,139.79417,Private room,1,5,2019-12-10,...,0.160043,1,5,11,2019.0,12.0,10.0,-141,0,2
9988,9989,SHITARA HOUSE dormitory B,316273494,Katsushika Ku,35.74672,139.82925,Shared room,1,1,2020-01-01,...,0.145978,1,2,3,2020.0,1.0,1.0,-119,0,1


In [None]:
'''''
#distanceの外れ値除去
def overdata_drop(df, df_colum):
    df_overdata_drop = df[
            (abs(df_colum - np.mean(df_colum)) /
            np.std(df_colum) <= 5)
            ].reset_index()
    return(df_overdata_drop)

#平均値から標準偏差の5倍以内の値に収まるもののみに絞る
df_train = overdata_drop(df_train, df_train['distance_nearest_station'])

,'last_review_y', 'last_review_m','last_review_d'
'''''

In [98]:
df_train.columns.to_list

<bound method IndexOpsMixin.tolist of Index(['id', 'name', 'host_id', 'neighbourhood', 'latitude', 'longitude',
       'room_type', 'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'availability_365', 'y', 'last_review_fillna',
       'last_review_D', 'nearest_station', 'distance_nearest_station',
       'number_of_stations_500m', 'number_of_stations_1000m',
       'number_of_stations_1500m', 'last_review_y', 'last_review_m',
       'last_review_d', 'diff_from_newdate', 'new_place', 'review_class'],
      dtype='object')>

In [99]:
#y切り分け&不要な列削除
y = np.log1p(df_train['y'])
df_train = df_train[['name',  'neighbourhood', 'latitude',
       'longitude', 'room_type', 'minimum_nights', 'number_of_reviews',
       'reviews_per_month', 'availability_365', 'nearest_station', 
       'distance_nearest_station',
       'number_of_stations_500m', 'number_of_stations_1000m',
       'number_of_stations_1500m', 'diff_from_newdate', 'review_class', 'new_place','last_review_y', 'last_review_m','last_review_d']]

df_test = df_test[['name', 'neighbourhood', 'latitude',
       'longitude', 'room_type', 'minimum_nights', 'number_of_reviews',
       'reviews_per_month', 'availability_365',
       'nearest_station', 'distance_nearest_station',
       'number_of_stations_500m', 'number_of_stations_1000m',
       'number_of_stations_1500m', 'diff_from_newdate', 'review_class', 'new_place','last_review_y', 'last_review_m','last_review_d']]

## ターゲットエンコーディング

In [100]:
# エンコーディングするカラム
train_columns = ['neighbourhood', 'latitude',
       'longitude', 'room_type', 'minimum_nights', 'number_of_reviews',
       'reviews_per_month', 'availability_365',
       'nearest_station', 'distance_nearest_station',
       'number_of_stations_500m', 'number_of_stations_1000m',
       'number_of_stations_1500m','diff_from_newdate', 'review_class', 'new_place','last_review_y', 'last_review_m','last_review_d']

In [101]:
X = df_train[train_columns]
target = 'y'

In [102]:
from xfeat import SelectCategorical, LabelEncoder, Pipeline, ConcatCombination, SelectNumerical, \
    ArithmeticCombinations, TargetEncoder, aggregation, GBDTFeatureSelector, GBDTFeatureExplorer
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb
from sklearn.model_selection import KFold
#説明変数の作成
test_X = df_test[train_columns]

train_target = pd.concat([X,y],axis=1)

# ターゲットエンコーディング
fold = KFold(n_splits=5, shuffle=False)
train = X.copy()
test = test_X.copy()
for col in train_columns:
  encoder = TargetEncoder(
    input_cols=[col], 
    target_col=target,
    fold=fold,
    output_suffix="_re"
    )

  encoded_df = encoder.fit_transform(train_target)
  train = pd.concat([train,encoded_df[f'{col}_re']],axis=1)
  encoded_df = encoder.transform(test)
  test = pd.concat([test,encoded_df[f'{col}_re']],axis=1)

train.drop(train_columns,axis=1,inplace=True)
test.drop(train_columns,axis=1,inplace=True)

In [103]:
train

Unnamed: 0,neighbourhood_re,latitude_re,longitude_re,room_type_re,minimum_nights_re,number_of_reviews_re,reviews_per_month_re,availability_365_re,nearest_station_re,distance_nearest_station_re,number_of_stations_500m_re,number_of_stations_1000m_re,number_of_stations_1500m_re,diff_from_newdate_re,review_class_re,new_place_re,last_review_y_re,last_review_m_re,last_review_d_re
0,9.179082,8.014005,0.000000,9.691519,9.518808,9.784691,9.369725,9.237979,8.968616,0.0,9.378152,9.367355,9.300555,9.821251,9.456187,9.439100,9.489693,9.458744,9.522040
1,9.630799,9.089027,9.506065,9.691519,9.547290,9.293954,9.506588,8.812992,9.469703,0.0,9.378152,9.367355,9.487567,9.594514,9.456187,9.439100,9.489693,9.491207,9.522040
2,9.298553,0.000000,0.000000,9.691519,9.518808,9.266273,9.502138,9.094287,9.516634,0.0,9.378152,9.200798,9.353912,9.535786,9.502687,9.439100,9.489693,9.491207,9.420814
3,9.659943,9.205328,0.000000,9.691519,9.518808,9.413851,9.557873,9.252357,9.497591,0.0,9.375417,9.367355,9.435309,9.315775,9.424093,9.439100,9.489693,9.458744,9.380711
4,9.458015,9.412414,0.000000,9.691519,9.518808,9.043648,9.314409,9.937243,9.581439,0.0,9.563484,9.679751,9.521624,9.343613,9.456187,9.439100,9.489693,9.472988,9.362209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9985,9.470027,8.768107,8.960761,9.688184,9.523464,9.471119,8.855638,9.164186,9.614103,0.0,9.516604,9.481636,9.475307,9.256715,9.452789,9.454168,9.300866,9.319425,9.426195
9986,9.634534,0.000000,0.000000,8.326459,9.523464,9.536443,9.536443,9.609543,9.527649,0.0,9.389566,9.621884,9.622276,9.536044,9.536443,9.536443,9.536443,9.536443,9.536443
9987,9.634534,10.418961,9.369223,8.996490,9.523464,9.382821,8.931836,9.980226,9.558345,0.0,9.389566,9.574012,9.467807,9.093595,9.438819,9.454168,9.300866,9.319425,9.432110
9988,9.304132,9.599879,8.530702,8.326459,9.523464,9.471119,9.296099,9.457688,9.750326,0.0,9.389566,9.214396,9.075549,9.368849,9.452789,9.454168,9.501163,9.502798,9.359220


In [104]:
test

Unnamed: 0,neighbourhood_re,latitude_re,longitude_re,room_type_re,minimum_nights_re,number_of_reviews_re,reviews_per_month_re,availability_365_re,nearest_station_re,distance_nearest_station_re,number_of_stations_500m_re,number_of_stations_1000m_re,number_of_stations_1500m_re,diff_from_newdate_re,review_class_re,new_place_re,last_review_y_re,last_review_m_re,last_review_d_re
0,9.560202,9.210332,0.000000,8.983179,9.520463,9.561696,9.561696,9.583185,9.376181,0.0,9.368854,9.647779,9.681545,0.000000,9.561696,9.561696,9.561696,9.561696,9.561696
1,9.339548,7.184541,6.965722,9.687423,9.504455,9.450046,9.316484,9.800898,9.259079,0.0,9.368854,9.647779,9.444707,9.684566,9.495826,9.442838,9.489450,9.472114,9.421894
2,9.167317,0.000000,0.000000,9.687423,9.520463,9.555924,9.304836,9.595128,9.260350,0.0,9.620730,9.508310,9.581572,9.506792,9.495826,9.442838,9.489450,9.481612,9.390822
3,9.339548,7.407761,7.587707,9.687423,9.520463,9.514255,9.685014,0.000000,9.458493,0.0,9.547010,9.541806,9.581572,9.488126,9.495826,9.442838,9.489450,9.481612,9.480375
4,9.623190,9.575654,7.695147,9.687423,8.856764,9.422085,9.266382,9.536205,9.429628,0.0,9.547010,9.193756,9.499081,9.478191,9.437350,9.442838,9.296703,9.340559,9.491879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4991,9.671859,0.000000,0.000000,9.687423,9.504455,9.514162,9.624582,9.308474,9.639092,0.0,9.368854,9.371562,9.449127,9.914557,9.458489,9.442838,9.296703,9.357863,9.492459
4992,9.623190,9.681980,0.000000,8.342688,9.504455,9.410439,9.359107,9.062694,8.989362,0.0,9.381046,9.488564,9.499081,6.788232,9.420542,9.442838,9.154658,9.186099,9.513351
4993,9.466136,0.000000,9.083012,9.687423,9.520463,9.440544,9.779424,9.463426,9.238536,0.0,9.544608,9.652158,9.556799,9.532809,9.495826,9.442838,9.489450,9.480234,9.447379
4994,9.466136,8.986617,7.990300,8.342688,9.504455,9.424924,9.414881,9.708068,9.592109,0.0,9.544608,9.750666,9.510443,9.337693,9.458489,9.442838,9.489450,9.474077,9.353681


## category_encoder

In [105]:
df_obj = df_train.select_dtypes(include='object')
df_obj.columns.to_list

<bound method IndexOpsMixin.tolist of Index(['name', 'neighbourhood', 'room_type', 'nearest_station'], dtype='object')>

In [106]:
#category_encoder
# Eoncodeしたい列をリストで指定。
list_cols = ['neighbourhood', 'room_type', 'nearest_station']
# 序数をカテゴリに付与して変換
ce_oe = ce.OneHotEncoder(cols=list_cols,handle_unknown='impute')
df_train = ce_oe.fit_transform(df_train)

In [107]:
df_test = ce_oe.transform(df_test)

## ターゲットとOne-Hotを結合＆train_test_split


In [108]:
df_train_concat = pd.concat([df_train, train], axis=1)
df_test_concat = pd.concat([df_test, test], axis=1)

In [109]:
# 分割
x_train, x_test, t_train, t_test = train_test_split(df_train_concat, y, test_size=0.3, random_state=2022)

In [110]:
x_train

Unnamed: 0,name,neighbourhood_1,neighbourhood_2,neighbourhood_3,neighbourhood_4,neighbourhood_5,neighbourhood_6,neighbourhood_7,neighbourhood_8,neighbourhood_9,...,distance_nearest_station_re,number_of_stations_500m_re,number_of_stations_1000m_re,number_of_stations_1500m_re,diff_from_newdate_re,review_class_re,new_place_re,last_review_y_re,last_review_m_re,last_review_d_re
2063,Sky Lounge Residence /Skytree view / whole floor,0,0,0,0,0,1,0,0,0,...,0.0,9.372211,9.375837,9.515700,9.548837,9.409240,9.447798,9.493052,9.462950,9.543793
1822,30min Shinjuku/Ginza/Asakusa+5m to station(2Bed R,0,0,0,0,0,0,0,0,0,...,0.0,9.539546,9.200798,9.135735,9.454310,9.502687,9.439100,9.489693,9.491207,9.351873
5079,★Monthly Rental Apt★Newly Built 1 min to sta. ...,0,0,0,0,0,0,1,0,0,...,0.0,9.526211,9.476003,9.424865,10.461216,9.415867,9.435346,9.292113,9.545616,9.364957
2215,6 min to sta! 2bathrooms ! 3bedrooms! Ueno/Sen...,0,1,0,0,0,0,0,0,0,...,0.0,9.372211,9.375837,9.467793,9.352835,9.467105,9.447798,9.493052,9.471920,9.436845
9652,R301) Shinjuku(Kagurazaka) *FREE WIFI*,0,0,0,0,1,0,0,0,0,...,0.0,9.389566,9.514627,9.462103,9.360025,9.504891,9.454168,9.501163,9.475126,9.438457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6384,Near Ueno/Asakusa#6min to Subway#40min airport,0,0,0,0,0,0,0,0,0,...,0.0,9.554267,9.544126,9.488914,9.345093,9.464573,9.437779,9.482084,9.469309,9.382020
4720,Entire137sm/9LDK/Haneda20m/Shinagawa/Yokohama/...,0,0,0,0,0,0,0,1,0,...,0.0,9.388401,9.177813,8.907483,9.443879,9.436057,9.435346,9.481257,9.468835,9.498250
173,Twin bed ! Easy to shinjuku! beside YoyogiPark...,0,0,0,1,0,0,0,0,0,...,0.0,9.378152,9.367355,9.567266,9.555702,9.556117,9.556117,9.556117,9.556117,9.556117
1244,Shibuya 10 minutes free Wi-fi Sakurashinmachi ...,0,0,0,0,0,0,0,0,1,...,0.0,9.378152,9.204018,9.135735,9.413388,9.456187,9.439100,9.285909,9.379788,9.519369


## テキスト処理

In [111]:
import MeCab
tagger = MeCab.Tagger('-Owakati')
import re
import mojimoji

In [112]:
#テキスト前処理
def wakati(df):
    wakati = []
    for text in df['name']:
        text = text.lower()
        text = mojimoji.zen_to_han(text)
        text = text.replace('\u3000', '')
        text = text.replace('\n', '')
        text = re.sub('|【|】|,|/|\|<|>|☆|★|[|]|✣|･|♦|❤|&|▶|▼|☀️|●|◆|✤|✦|▲|◎|♪|r"*"|、', '', text)
        text = re.sub('\d+', '0', text)
        text = tagger.parse(text).strip()
        wakati.append(text)
    return(wakati)

In [113]:
#分かち書き実行
wakati_train = wakati(x_train)
wakati_x_test  =wakati(x_test)
wakati_test = wakati(df_test_concat)

In [114]:
from sklearn.feature_extraction.text import TfidfVectorizer
#vectorizer = TfidfVectorizer()

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [115]:
#TF-IDF取得 or BoW取得
tfidf_train = vectorizer.fit_transform(wakati_train).toarray()
tfidf_x_test = vectorizer.transform(wakati_x_test).toarray()
tfidf_test = vectorizer.transform(wakati_test).toarray()

In [116]:
#indexに宿泊施設＋値の高い順に並び替えてxワード分だけ残す
def tf_idf_x(x_train,tfidf, x):
    tfidf = pd.DataFrame(tfidf, columns=vectorizer.get_feature_names()).transpose()
    tfidf['sum'] = tfidf.sum(axis=1)
    tfidf = tfidf.sort_values('sum', ascending=False).head(x)
    tfidf = tfidf.drop('sum', axis=1).transpose().reset_index()
    tfidf = tfidf.where(tfidf == 0, 1)
    return(tfidf)

In [117]:
#単語数指定
tfidf_train= tf_idf_x(x_train,tfidf_train, 100)
tfidf_x_test= tf_idf_x(x_test,tfidf_x_test, 100)
tfidf_test= tf_idf_x(df_test_concat,tfidf_test, 100)



In [118]:
#key列を挿入して結合
tfidf_train['key'] = list(range(len(tfidf_train)))
x_train['key'] = list(range(len(x_train)))

tfidf_x_test['key'] = list(range(len(tfidf_x_test)))
x_test['key'] = list(range(len(x_test)))

tfidf_test['key'] = list(range(len(tfidf_test)))
df_test_concat['key'] = list(range(len(df_test_concat)))

x_train = x_train.merge(tfidf_train, how='inner', on='key')
x_test = x_test.merge(tfidf_x_test, how='inner', on='key')
df_test_concat = df_test_concat.merge(tfidf_test, how='inner', on='key')

In [119]:
x_train.drop(columns=['name','index'], axis=1, inplace=True)
x_test.drop(columns=['name','index'], axis=1, inplace=True)
df_test_concat.drop(columns=['name','index'], axis=1, inplace=True)

In [120]:
x_train

Unnamed: 0,neighbourhood_1,neighbourhood_2,neighbourhood_3,neighbourhood_4,neighbourhood_5,neighbourhood_6,neighbourhood_7,neighbourhood_8,neighbourhood_9,neighbourhood_10,...,minute,guest,women,东京,ldk,最大,no,monthly,metro,羽田
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6989,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6990,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6991,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# right_gbm

In [121]:
import lightgbm as lgb

train_set = lgb.Dataset(x_train, t_train)
valid_set = lgb.Dataset(x_test, t_test)

params = {
    "objective": "regression",
    "boosting_type": "gbdt",
    "learning_rate": 0.01,
    "max_depth": -1,
    "max_leaves": 6,
    "early_stopping_rounds": 1000,
    "n_estimators": 1000000,
    "random_state": 0,
    "metric": "rmse",
    "verbosity": -1,
}
from sklearn.model_selection import KFold

models = []
scores = []
iterations = []

kf = KFold(n_splits=10, shuffle=True, random_state=0)
for tr_idx, va_idx in kf.split(x_train):
    X_tr, X_va = x_train.iloc[tr_idx], x_train.iloc[va_idx]
    y_tr, y_va = t_train.iloc[tr_idx], t_train.iloc[va_idx]

    trains = lgb.Dataset(X_tr, y_tr)
    valids = lgb.Dataset(X_va, y_va)

    model = lgb.train(
        params,
        trains,
        valid_names=["train", "valid"],
        valid_sets=[trains, valids],
        verbose_eval=500,
    )

    iterations.append(model.best_iteration)
    scores.append(model.best_score["valid"]["rmse"])
    models.append(model)

#print("\nDone.")

model = lgb.train(
    params = params,
    train_set = train_set,
    valid_sets = [train_set, valid_set],
    num_boost_round = 1000,
)

lgb_train_pred  = model.predict(x_train)
lgb_test_pred    = model.predict(x_test)



[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
[11317]	training's rmse: 0.462824	valid_1's rmse: 0.757495
[11318]	training's rmse: 0.462816	valid_1's rmse: 0.757495
[11319]	training's rmse: 0.462798	valid_1's rmse: 0.757491
[11320]	training's rmse: 0.462786	valid_1's rmse: 0.757475
[11321]	training's rmse: 0.462763	valid_1's rmse: 0.757473
[11322]	training's rmse: 0.462746	valid_1's rmse: 0.757461
[11323]	training's rmse: 0.46274	valid_1's rmse: 0.757466
[11324]	training's rmse: 0.462731	valid_1's rmse: 0.757465
[11325]	training's rmse: 0.462718	valid_1's rmse: 0.757468
[11326]	training's rmse: 0.462705	valid_1's rmse: 0.757467
[11327]	training's rmse: 0.46269	valid_1's rmse: 0.757482
[11328]	training's rmse: 0.462684	valid_1's rmse: 0.757486
[11329]	training's rmse: 0.462664	valid_1's rmse: 0.757452
[11330]	training's rmse: 0.462655	valid_1's rmse: 0.757446
[11331]	training's rmse: 0.46264	valid_1's rmse: 0.757446
[11332]	training's rmse: 0.462626	valid_1's rmse: 0.757445
[11333]	train

In [124]:
# RMSLE カスタム評価関数 #####################
import tensorflow.keras.backend as K
msle = tf.keras.metrics.MeanSquaredLogarithmicError()

## 関数版
def root_mean_squared_logarithmic_error(y_true, y_pred):
  return K.sqrt(msle(y_true, y_pred))

#真数, RMLSE
def return_y(lgb_pred, t):
    lgb_pred = np.exp(lgb_pred)
    lgb_pred[lgb_pred < 0] = 0
    y_true = np.exp(t)
    y_true[y_true < 0] = 0
    RMLSE = root_mean_squared_logarithmic_error(y_true, lgb_pred)

    return(RMLSE)


RMLSE_train = return_y(lgb_train_pred, t_train)
RMLSE_test = return_y(lgb_test_pred, t_test)

In [125]:
print('学習データ:',RMLSE_train)
print('テストデータ:',RMLSE_test)

学習データ: tf.Tensor(0.4126505, shape=(), dtype=float32)
テストデータ: tf.Tensor(0.6073771, shape=(), dtype=float32)


# 変数重要度 dalex

In [35]:
'''
import dalex as dx
from dalex._explainer.helper import verbose_cat
'''

'\nimport dalex as dx\nfrom dalex._explainer.helper import verbose_cat\n'

In [36]:
'''
model_exp = dx.Explainer(model, x_train, t_train)
mp_model = model_exp.model_parts()
mp_model.plot()
'''

'\nmodel_exp = dx.Explainer(model, x_train, t_train)\nmp_model = model_exp.model_parts()\nmp_model.plot()\n'

In [37]:
#mp_model.result.sort_values('dropout_loss', ascending=True)

# 推論、submission作成

In [38]:
pred = model.predict(df_test_concat)

from numpy.core.fromnumeric import mean
pred = np.exp(pred)
pred[pred < 0] = mean(pred)

pred = pd.Series(pred)

df_test_id = pd.read_csv('/content/drive/MyDrive/Python/民泊サービスの宿泊料金予測/test_data.csv')
df_test_id = pd.Series(df_test_id['id'])
id_pred = pd.concat([df_test_id, pred], axis=1)
id_pred.columns = ['id', 'y']
id_pred.to_csv('submission.csv', index=False)