In [1217]:
#ライブラリのimportを行います
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.stats import norm
import seaborn as sns
import unicodedata
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [1218]:
train.head()

Unnamed: 0,id,region,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,price
0,0,nashville,1949,bmw,excellent,6 cylinders,gas,115148,clean,manual,rwd,mid-size,convertible,orange,,27587
1,1,state college,2013,toyota,fair,8 cylinders,gas,172038,clean,automatic,rwd,full-size,sedan,silver,pa,4724
2,2,wichita,1998,ford,good,6 cylinders,gas,152492,clean,automatic,fwd,full-size,SUV,silver,ks,10931
3,3,albany,2014,ford,excellent,4 cylinders,gas,104118,clean,manual,fwd,mid-size,SUV,blue,ny,16553
4,4,redding,2005,ford,excellent,6 cylinders,gas,144554,clean,manual,fwd,mid-size,sedan,red,ca,5158


In [1219]:
missing_count = train['state'].isnull().sum()
print("欠損値の数:", missing_count)

欠損値の数: 3304


In [1220]:
test['cylinders'].value_counts()

6 cylinders     11697
4 cylinders      9998
8 cylinders      5613
other              78
5 cylinders        61
10 cylinders       57
3 cylinders        24
12 cylinders        9
Name: cylinders, dtype: int64

In [1221]:
test.head()

Unnamed: 0,id,region,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,27532,western slope,2015,chevrolet,excellent,4 cylinders,gas,92553,clean,automatic,fwd,full-size,SUV,red,
1,27533,roseburg,2013,nissan,like new,4 cylinders,gas,134385,salvage,automatic,fwd,mid-size,sedan,black,or
2,27534,akron / canton,2011,volkswagen,good,4 cylinders,gas,102489,clean,automatic,fwd,full-size,sedan,black,oh
3,27535,denver,2016,jeep,excellent,6 cylinders,diesel,64310,clean,automatic,4wd,mid-size,SUV,red,co
4,27536,hickory / lenoir,1999,honda,excellent,8 cylinders,gas,180839,rebuilt,automatic,4wd,mid-size,SUV,silver,nc


In [1222]:
train['state'].value_counts()

ca    2702
ny    1841
fl    1650
nj    1413
tx    1274
pa     983
oh     849
va     766
mi     739
az     706
co     705
nc     636
ri     631
wi     626
tn     595
or     577
mn     521
dc     427
id     425
il     409
ma     389
nv     369
ia     366
in     345
wa     317
nm     287
ct     278
md     274
ok     267
mo     262
ks     252
ga     251
mt     249
sc     244
ky     229
al     152
vt     138
hi     123
ne     115
ut     111
ak     108
de     101
nd      98
la      85
nh      85
sd      77
me      68
ar      37
wv      36
wy      24
ms      16
Name: state, dtype: int64

In [1223]:
train["price"] = np.log1p(np.log1p(train["price"]))
# trainデータとtestデータを結合
data = pd.concat([train, test], ignore_index=True)
data = data.drop("id", axis=1)

In [1224]:
data['size'] = data['size'].str.replace('full-size', 'fullsize')
data['size'] = data['size'].str.replace('fullーsize', 'fullsize')
data['size'] = data['size'].str.replace('full−size', 'fullsize')

data['size'] = data['size'].str.replace('mid-size', 'midsize')
data['size'] = data['size'].str.replace('midーsize', 'midsize')
data['size'] = data['size'].str.replace('mid−size', 'midsize')


data['size'] = data['size'].str.replace('sub-compact','subcompact')
data['size'] = data['size'].str.replace('subーcompact', 'subcompact')

In [1225]:
data['size'].value_counts()

fullsize      29356
midsize       18782
compact        6649
subcompact      282
Name: size, dtype: int64

In [1226]:
# stateの欠損値は"ca"で埋める
data["state"] = data["state"].fillna("ca")

In [1227]:
# LabelEncoderのインスタンスを作成
label_encoder = LabelEncoder()

# カテゴリ変数を数値にエンコード
data['region'] = label_encoder.fit_transform(data['region'])
data['state'] = label_encoder.fit_transform(data['state'])

In [1228]:
# yearの誤植訂正
for i in [2999, 3008, 3017, 3015, 3019, 3011]:
    data.loc[data["year"] == i, "year"] = i - 1000

In [1229]:
# manufacturerを小文字に変換する
data["manufacturer"] = data["manufacturer"].str.lower()
# manufacturer内の文字列を全て半角に変換する関数
def to_half_width(text):
    return ''.join([unicodedata.normalize('NFKC', char) for char in text])

# manufacturerのカラム内の文字列を半角に変換
data["manufacturer"] = data["manufacturer"].apply(to_half_width)
data["manufacturer"].describe()

def to_half_width(text):
    return ''.join([unicodedata.normalize('NFKC', char).lower().replace("а", "a").replace("ѕ", "s").replace("о", "o").replace("α", "a") for char in text])

data["manufacturer"] = data["manufacturer"].apply(to_half_width)

data["manufacturer"].describe()


count     55069
unique       38
top        ford
freq      13890
Name: manufacturer, dtype: object

In [1230]:
# cylindersの変更
cylmap = {'6 cylinders': 6, '8 cylinders': 8, '4 cylinders': 4, 'other': 0,
          '10 cylinders': 10, '12 cylinders': 12, '5 cylinders': 5, '3 cylinders': 3}
data["cylinders"] = data.cylinders.map(cylmap)

In [1231]:
data["cylinders"].value_counts()

6     23201
4     20069
8     11340
0       149
10      117
5       107
3        55
12       31
Name: cylinders, dtype: int64

In [1232]:
# odometerの修正
data.loc[data["odometer"] == -131869, "odometer"] = 131869

# -1はexcelentのodometerの平均値で補完
data["odometer"] = data["odometer"].replace(-1, data[data["condition"] == "excellent"]["odometer"].mean())

In [1233]:
data = data.drop(columns=['region','state'], axis=1)


In [1234]:
# ワンホットエンコーディングする
data = pd.get_dummies(data, columns=["condition", "manufacturer","size","fuel","drive","type","title_status","paint_color","transmission"])

In [1235]:
data["odometer"].value_counts()

110855.052135    388
100674.000000      6
47186.000000       5
100406.000000      5
167061.000000      4
                ... 
85210.000000       1
184820.000000      1
210260.000000      1
129347.000000      1
186965.000000      1
Name: odometer, Length: 47756, dtype: int64

In [1236]:
data.head()

Unnamed: 0,year,cylinders,odometer,price,condition_excellent,condition_fair,condition_good,condition_like new,condition_new,condition_salvage,...,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow,transmission_automatic,transmission_manual,transmission_other
0,1949,6,115148.0,2.418156,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,2013,8,172038.0,2.247138,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,1998,6,152492.0,2.33209,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,2014,4,104118.0,2.371587,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2005,6,144554.0,2.256384,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [1237]:
from sklearn.preprocessing import StandardScaler

# 標準化の対象となる特徴量を選択
selected_features = ["year","cylinders","odometer"]
# StandardScalerのインスタンスを作成
scaler = StandardScaler()
data[selected_features] = scaler.fit_transform(data[selected_features])

In [1238]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
data[selected_features]= scaler.fit_transform(data[selected_features])

In [1239]:
# 再びtrainデータとtestデータに戻す
train = data.iloc[:len(train)]
test = data.iloc[len(train):]

In [1240]:
#priceは予測対象で学習に必要なため別途targetの変数に格納する
y = train["price"]
#odometer, condition, manufacturer以外を削除
X = train.drop(columns=["price"], axis=1)
test = test.drop(columns=["price"], axis=1)
X.head()


Unnamed: 0,year,cylinders,odometer,condition_excellent,condition_fair,condition_good,condition_like new,condition_new,condition_salvage,manufacturer_acura,...,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow,transmission_automatic,transmission_manual,transmission_other
0,0.298077,0.5,0.039083,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0.913462,0.666667,0.058394,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,0.769231,0.5,0.051759,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,0.923077,0.333333,0.035339,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0.836538,0.5,0.049065,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [1241]:
#scikit-learnはsklearnでインポート可能
import lightgbm as lgb
from sklearn.metrics import mean_absolute_percentage_error

In [1242]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
lgb_train = lgb.Dataset(X_train, y_train)

In [1243]:
# ハイパーパラメータを設定
params = {
    'objective': 'regression',   # 目的関数: 回帰
    'metric': 'mse',             # 評価指標: 平均二乗誤差
    'boosting_type': 'gbdt',     # 勾配ブースティング
    'num_leaves': 31,            # ツリーの葉の数
    'learning_rate': 0.05,       # 学習率
}


In [1244]:
# モデルを訓練
num_round = 100  # イテレーション数
model = lgb.train(params, lgb_train, num_round)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 507
[LightGBM] [Info] Number of data points in the train set: 19272, number of used features: 82
[LightGBM] [Info] Start training from score 2.320297


In [1245]:
#predict()で予測できます。()内に予測したいデータを入れることで予測できます。
pred = model.predict(X_test)

In [1246]:
#予測の中身確認
pred = np.exp(np.exp(pred)-1)-1
y_test = np.exp(np.exp(y_test)-1)-1
print(pred[:5])

[ 9483.24650544  3737.15267167 11089.19009849  8661.65176631
 24640.91306312]


In [1247]:
#MAPEの評価はmean_absolute_percentage_error(正解データ, 予測データ)で可能です。
score = mean_absolute_percentage_error(y_test, pred)
print(score*100)

51.59296046576852


In [1248]:
predict = model.predict(test)

In [1249]:
#submit_sample.csvを読み込みます。
submit = pd.read_csv("submit_sample.csv", header=None)
submit.head()

Unnamed: 0,0,1
0,27532,2.338542
1,27533,2.290321
2,27534,2.277996
3,27535,2.401745
4,27536,2.29649


In [1250]:
#submit_sampleのpriceの数値部分を予測データpredictに変更する
submit[1] = predict

In [1251]:
#確認してみましょう
submit.head()

Unnamed: 0,0,1
0,27532,2.333874
1,27533,2.281147
2,27534,2.279242
3,27535,2.416945
4,27536,2.304136


In [1252]:
#submission.csvでデータを保存(提出様式はindex=False, header=Noneとなります。)
submit.to_csv("submit_sample.csv", index=False, header=None)