In [180]:
#ライブラリのimportを行います
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unicodedata
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [181]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [182]:
# trainデータとtestデータを結合
data = pd.concat([train, test], ignore_index=True)
data.info()
data = data.drop("id", axis=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55069 entries, 0 to 55068
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            55069 non-null  int64  
 1   region        55069 non-null  object 
 2   year          55069 non-null  int64  
 3   manufacturer  55069 non-null  object 
 4   condition     55069 non-null  object 
 5   cylinders     55069 non-null  object 
 6   fuel          52335 non-null  object 
 7   odometer      55069 non-null  int64  
 8   title_status  54384 non-null  object 
 9   transmission  55069 non-null  object 
 10  drive         55069 non-null  object 
 11  size          55069 non-null  object 
 12  type          54384 non-null  object 
 13  paint_color   55069 non-null  object 
 14  state         48736 non-null  object 
 15  price         27532 non-null  float64
dtypes: float64(1), int64(3), object(12)
memory usage: 6.7+ MB


In [183]:
# stateの欠損値は"ca"で埋める
data["state"] = data["state"].fillna("ca")

In [184]:
# LabelEncoderのインスタンスを作成
label_encoder = LabelEncoder()

# カテゴリ変数を数値にエンコード
data['region'] = label_encoder.fit_transform(data['region'])
data['state'] = label_encoder.fit_transform(data['state'])

In [185]:
# yearの誤植訂正
for i in [2999, 3008, 3017, 3015, 3019, 3011]:
    data.loc[data["year"] == i, "year"] = i - 1000

In [186]:
# manufacturerを小文字に変換する
data["manufacturer"] = data["manufacturer"].str.lower()
# manufacturer内の文字列を全て半角に変換する関数
def to_half_width(text):
    return ''.join([unicodedata.normalize('NFKC', char) for char in text])

# manufacturerのカラム内の文字列を半角に変換
data["manufacturer"] = data["manufacturer"].apply(to_half_width)
data["manufacturer"].describe()

def to_half_width(text):
    return ''.join([unicodedata.normalize('NFKC', char).lower().replace("а", "a").replace("ѕ", "s").replace("о", "o").replace("α", "a") for char in text])

data["manufacturer"] = data["manufacturer"].apply(to_half_width)

data["manufacturer"].describe()

count     55069
unique       38
top        ford
freq      13890
Name: manufacturer, dtype: object

In [187]:
data["cylinders"].value_counts()

6 cylinders     23201
4 cylinders     20069
8 cylinders     11340
other             149
10 cylinders      117
5 cylinders       107
3 cylinders        55
12 cylinders       31
Name: cylinders, dtype: int64

In [188]:
# cylindersの変更
cylmap = {'6 cylinders': 6, '8 cylinders': 8, '4 cylinders': 4, 'other': 0,
          '10 cylinders': 10, '12 cylinders': 12, '5 cylinders': 5, '3 cylinders': 3}
data["cylinders"] = train.cylinders.map(cylmap)


In [189]:
# odometerの修正
data[data["odometer"] == -131869] = 131869

# -1はexcelentのodometerの平均値で補完
data["odometer"] = data["odometer"].replace(-1, data[data["condition"] == "excellent"]["odometer"].mean())

In [190]:
data['size'] = data['size'].str.replace('full-size', 'fullsize').replace('fullーsize', 'fullsize').replace('full−size', 'fullsize').replace('mid-size', 'midsize').replace('midーsize', 'midsize').replace('mid−size', 'midsize').replace('sub-compact','subcompact').replace('subーcompact', 'subcompact')

In [191]:
# ワンホットエンコーディングする
data = pd.get_dummies(data, columns=["condition", "manufacturer","size","fuel","drive","type","title_status","paint_color","transmission"])

In [192]:
# 再びtrainデータとtestデータに戻す
train = data.iloc[:len(train)]
test = data.iloc[len(train):]

In [193]:
y = train["price"]
X = train.drop(["price"], axis=1)
test = test.drop(["price"], axis=1)

In [194]:
#scikit-learnはsklearnでインポート可能
import lightgbm as lgb
from sklearn.metrics import mean_absolute_percentage_error

In [195]:
# カテゴリ変数をカテゴリ型に変換
categorical_features = ['region', 'state']
for col in categorical_features:
    data[col] = data[col].astype('category')

In [196]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
lgb_train = lgb.Dataset(X_train, y_train)

In [197]:
# ハイパーパラメータを設定
params = {
    'objective': 'regression',   # 目的関数: 回帰
    'metric': 'mse',             # 評価指標: 平均二乗誤差
    'boosting_type': 'gbdt',     # 勾配ブースティング
    'num_leaves': 31,            # ツリーの葉の数
    'learning_rate': 0.05,       # 学習率
}


In [198]:
# モデルを訓練
num_round = 100  # イテレーション数
model = lgb.train(params, lgb_train, num_round)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 784
[LightGBM] [Info] Number of data points in the train set: 24778, number of used features: 85
[LightGBM] [Info] Start training from score 13444.939987


In [199]:
#predict()で予測できます。()内に予測したいデータを入れることで予測できます。
pred = model.predict(X_test)
#予測の中身確認
print(pred[:5])

[ 9599.24544609 19022.88151544 10877.7799988  16674.54035978
 21167.89248856]


In [200]:
# 小数点は四捨五入する
# 0.5以上なら切り上げ、0.5未満なら切り捨て
pred = np.round(pred)
print(pred[:10])

[ 9599. 19023. 10878. 16675. 21168. 15319. 13059. 10562. 13683. 30788.]


In [201]:
#MAPEの評価はmean_absolute_percentage_error(正解データ, 予測データ)で可能です。
score = mean_absolute_percentage_error(y_test, pred)
print(score*100)

65.99551735386746


In [204]:
predict = model.predict(test)

In [207]:
#submit_sample.csvを読み込みます。
submit = pd.read_csv("submit_sample.csv", header=None)
submit.head()
#submit_sampleのpriceの数値部分を予測データpredictに変更する
predict = np.round(predict)
submit[1] = predict
#確認してみましょう
submit.head()

Unnamed: 0,0,1
0,27532,13185.0
1,27533,8526.0
2,27534,7365.0
3,27535,31749.0
4,27536,10708.0


In [208]:
#submission.csvでデータを保存(提出様式はindex=False, header=Noneとなります。)
submit.to_csv("submit_sample.csv", index=False, header=None)