In [2]:
import pickle
import os
import numpy as np

from sklearn.metrics import r2_score
import pandas as pd


# モデルの読み込み

In [3]:
oe = pickle.load(open(os.path.join('./flask_app/model/ordinalencoder.pkl'),'rb'))
rf = pickle.load(open(os.path.join('./flask_app/model/regressor.pkl'),'rb'))

In [4]:
#データの読み込み
train_data = pd.read_csv("../data/train_data.csv",index_col=0)
model_data = pd.read_csv('../data/model_data.csv',index_col=0)
info_data = pd.read_csv('../data/bike_info.csv',index_col=0)
#サンプルサイズが少ないモデルを抽出
all_data = info_data.merge(model_data,on='model_number',how='left')
brand_count = all_data.groupby('brand').size().sort_values(ascending=True)
#下位10メーカーを抽出(上のcountデータがすでに照準のため上から９つを抽出する)
low_10 = brand_count.index[0:10]
model_data = model_data[~model_data['brand'].isin(low_10)]
#このメーカーデータを使ってtrainデータに結合
train_data = train_data.merge(model_data,on='model_number',how='left')
#先ほど外れ値に指定したデータはnullになるため、dropnaをすればok
train_data.dropna(inplace=True)
test_data = pd.read_csv('../data/test_data.csv',index_col=0)
test_data=test_data.merge(model_data,on='model_number',how='left')
test_data.dropna(inplace=True)
#ダミーエンコーディング
#エンコーディングしたい列
encoding_target = ['color','brand','type']
train_data=pd.get_dummies(train_data,drop_first=True,columns=encoding_target)
#ラベルエンコーディングを反映
test_data[encoding_target] = oe.transform(test_data[encoding_target].values)
X_test=test_data.iloc[:,2:].drop(columns=['name','model_number','store_id']).values
y_test = test_data['cost'].values

In [5]:
rf

In [6]:
#チューニングをしたrandomforestモデルが持って来れる
predict_test = rf.predict(X_test)
r2_score(y_true=y_test,y_pred=predict_test)

0.9920257500573733

In [7]:
test_data

Unnamed: 0,id,cost,model_number,model_year,color,mileage,guarantee_period,store_id,brand,name,type,displacement
0,2100005197021,128000,EBJ-SE53J,2011.0,7.0,9937.0,0.25,64,9.0,ｱｸｼｽﾄﾘｰﾄ,2.0,125.0
1,2100006010725,148000,BA-AF58,2004.0,11.0,9317.0,0.25,75,2.0,ｽﾞｰﾏｰ,2.0,50.0
2,2100006174472,598000,2BK-MC49,2020.0,9.0,1000.0,5.00,50,2.0,ﾚﾌﾞﾙ250,0.0,250.0
3,2100005171830,458000,2BJ-JC79,2021.0,7.0,4349.0,1.00,61,2.0,CB125R,4.0,125.0
4,2100006114140,158000,JBH-AF77,2019.0,4.0,22.0,1.00,21,2.0,ｼﾞｮﾙﾉ,2.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1117,2100006064599,138000,2BH-AF79,2016.0,7.0,4610.0,0.25,21,2.0,ﾀｸﾄﾍﾞｰｼｯｸ,2.0,50.0
1118,2100004942059,648000,JBK-BJ250A,2017.0,0.0,12514.0,3.00,43,5.0,ｴｽﾄﾚﾔ,4.0,250.0
1119,2100005245517,2098000,8BL-EJ11A,2022.0,7.0,1446.0,7.00,10,7.0,ﾊﾔﾌﾞｻ,3.0,1300.0
1120,2100006294828,498000,BC-RH01J,2007.0,11.0,2506.0,0.25,3,9.0,SR400,4.0,400.0


In [8]:
test_data['guarantee_period'].dtype

dtype('float64')

In [9]:
X_test

array([[2.011e+03, 7.000e+00, 9.937e+03, ..., 9.000e+00, 2.000e+00,
        1.250e+02],
       [2.004e+03, 1.100e+01, 9.317e+03, ..., 2.000e+00, 2.000e+00,
        5.000e+01],
       [2.020e+03, 9.000e+00, 1.000e+03, ..., 2.000e+00, 0.000e+00,
        2.500e+02],
       ...,
       [2.022e+03, 7.000e+00, 1.446e+03, ..., 7.000e+00, 3.000e+00,
        1.300e+03],
       [2.007e+03, 1.100e+01, 2.506e+03, ..., 9.000e+00, 4.000e+00,
        4.000e+02],
       [2.015e+03, 1.100e+01, 2.710e+03, ..., 3.000e+00, 0.000e+00,
        7.500e+02]])

In [10]:
data=pd.Series(data=[1,2000,111,2020,'限定',8049,2,1,'HONDA','CBR1000RR','スポーツ/ツアラー','1000'],index=all_data.columns)
df_t=pd.DataFrame(columns=all_data.columns)
df_t.loc[1]=data

In [11]:
df_t

Unnamed: 0,id,cost,model_number,model_year,color,mileage,guarantee_period,store_id,brand,name,type,displacement
1,1,2000,111,2020,限定,8049,2,1,HONDA,CBR1000RR,スポーツ/ツアラー,1000


In [12]:
df_t[encoding_target]=oe.transform(df_t[encoding_target].values)

In [13]:
df_t

Unnamed: 0,id,cost,model_number,model_year,color,mileage,guarantee_period,store_id,brand,name,type,displacement
1,1,2000,111,2020,8.0,8049,2,1,2.0,CBR1000RR,3.0,1000


In [14]:

X_df_t = df_t.iloc[:,2:].drop(columns=['name','model_number','store_id']).values

In [15]:
X_df_t

array([[2020, 8.0, 8049, 2, 2.0, 3.0, '1000']], dtype=object)

In [16]:
rf.predict(X_df_t)

array([2354808.64580937])

In [17]:

np_t = np.array(['赤','HONDA','スポーツ/ツアラー'])
np_t=np_t.reshape(1,-1)
np_t=oe.transform(np_t)

In [18]:
np_t[0][0]

6.0

In [19]:
oe.categories_

[array(['オレンジ', 'ピンク', '白', '紫', '緑', '茶', '赤', '銀', '限定', '青', '黄', '黒'],
       dtype=object),
 array(['BMW', 'Ducati', 'HONDA', 'Harley-Davidson', 'Husqvarna',
        'KAWASAKI', 'KTM', 'SUZUKI', 'Triumph', 'YAMAHA'], dtype=object),
 array(['アメリカン/クルーザー', 'オフロード/モタード', 'スクーター/ビッグスクーター', 'スポーツ/ツアラー',
        'ネイキッド'], dtype=object)]

In [20]:
[(i,oe.categories_[0][i])for i,oe.categories_[0][i] in enumerate(oe.categories_[0])]

[(0, 'オレンジ'),
 (1, 'ピンク'),
 (2, '白'),
 (3, '紫'),
 (4, '緑'),
 (5, '茶'),
 (6, '赤'),
 (7, '銀'),
 (8, '限定'),
 (9, '青'),
 (10, '黄'),
 (11, '黒')]

In [33]:
oe.inverse_transform([[1,1,1]])[0][2]

'オフロード/モタード'