In [858]:
import pandas as pd
import numpy as np
import math
import unicodedata

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

In [859]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
n = (train.state.unique()[0])

In [860]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27532 entries, 0 to 27531
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            27532 non-null  int64 
 1   region        27532 non-null  object
 2   year          27532 non-null  int64 
 3   manufacturer  27532 non-null  object
 4   condition     27532 non-null  object
 5   cylinders     27532 non-null  object
 6   fuel          26293 non-null  object
 7   odometer      27532 non-null  int64 
 8   title_status  27076 non-null  object
 9   transmission  27532 non-null  object
 10  drive         27532 non-null  object
 11  size          27532 non-null  object
 12  type          27076 non-null  object
 13  paint_color   27532 non-null  object
 14  state         24228 non-null  object
 15  price         27532 non-null  int64 
dtypes: int64(4), object(12)
memory usage: 3.4+ MB


In [861]:
train.describe(include="O")

Unnamed: 0,region,manufacturer,condition,cylinders,fuel,title_status,transmission,drive,size,type,paint_color,state
count,27532,27532,27532,27532,26293,27076,27532,27532,27532,27076,27532,24228
unique,372,125,6,8,5,6,3,3,9,13,12,51
top,central NJ,ford,excellent,6 cylinders,gas,clean,automatic,fwd,full-size,sedan,black,ca
freq,1069,6166,15219,11504,23546,22365,20497,14602,14585,9259,10143,2702


In [862]:
train.describe()

Unnamed: 0,id,year,odometer,price
count,27532.0,27532.0,27532.0,27532.0
mean,13765.5,2007.832958,116019.9,13468.724829
std,7947.948142,15.758976,64516.42,10962.14984
min,0.0,1921.0,-131869.0,1004.0
25%,6882.75,2005.0,78367.75,5759.0
50%,13765.5,2011.0,111444.5,9206.0
75%,20648.25,2014.0,150856.2,19128.0
max,27531.0,3017.0,2946000.0,96818.0


In [863]:
train['size'] = train['size'].str.replace('full-size', 'fullsize').replace('fullーsize', 'fullsize').replace('full−size', 'fullsize').replace('mid-size', 'midsize').replace('midーsize', 'midsize').replace('mid−size', 'midsize').replace('sub-compact','subcompact').replace('subーcompact', 'subcompact')
test['size'] = test['size'].str.replace('full-size', 'fullsize').replace('fullーsize', 'fullsize').replace('full−size', 'fullsize').replace('mid-size', 'midsize').replace('midーsize', 'midsize').replace('mid−size', 'midsize').replace('sub-compact','subcompact').replace('subーcompact', 'subcompact')

In [864]:
def to_half_width(text):
    return ''.join([unicodedata.normalize('NFKC', char).lower().replace("а", "a").replace("ѕ", "s").replace("о", "o").replace("α", "a") for char in text])

train["manufacturer"] = train["manufacturer"].apply(to_half_width)
test["manufacturer"] = test["manufacturer"].apply(to_half_width)

In [865]:
train.describe(include="O")

Unnamed: 0,region,manufacturer,condition,cylinders,fuel,title_status,transmission,drive,size,type,paint_color,state
count,27532,27532,27532,27532,26293,27076,27532,27532,27532,27076,27532,24228
unique,372,38,6,8,5,6,3,3,4,13,12,51
top,central NJ,ford,excellent,6 cylinders,gas,clean,automatic,fwd,fullsize,sedan,black,ca
freq,1069,6943,15219,11504,23546,22365,20497,14602,14636,9259,10143,2702


In [866]:
train.condition.unique()

array(['excellent', 'fair', 'good', 'like new', 'salvage', 'new'],
      dtype=object)

In [867]:
conmap = {"salvage": 0, "fair": 1, "good": 2, "excellent": 3, "like_new": 4, "new": 5}
train["condition"] = train.condition.map(conmap)
test["condition"] = test.condition.map(conmap)


cylmap = {'6 cylinders': 6, '8 cylinders': 8, '4 cylinders': 4, 'other': n,
       '10 cylinders': 10, '12 cylinders': 12, '5 cylinders': 5, '3 cylinders': 3}
train["cylinders"] = train.cylinders.map(cylmap)
test["cylinders"] = test.cylinders.map(cylmap)


In [868]:
train = train.fillna({"cylinders": train.cylinders.mean()})
test = test.fillna({"cylinders": train.cylinders.mean()})

In [869]:
l = {"dc": [9856, 68], "nj": [1195, 8722], "ri": [1018, 1544], "ma": [839, 10554], "ct": [738, 5543],
     "md": [594, 12405], "de": [460, 2488], "ny": [4141, 54554], "fl": [350, 65757], "pa": [282, 46054],
     "oh": [282, 44825], "ca": [239, 163694], "il": [231, 57913], "hi": [211, 10931], "va": [202, 42774],
     "nc": [196, 53819], "in": [180, 36419], "mi": [174, 96713], "ga": [168, 59425], "tn": [153, 42144],
     "sc": [153, 32020], "nh": [147, 9346], "ky": [109, 40407], "wi": [105, 65496], "la": [104, 52378],
     "wa": [101, 71297], "tx": [96, 268596], "al": [94, 52420], "mo": [87, 69706], "wv": [77, 24230],
     "vt": [67, 9616], "mn": [66, 86935], "ms": [63, 48431], "az": [56, 113990], "ar": [56, 53178],
     "ok": [54, 69898], "ia": [54, 56272], "co": [48, 104093], "me": [43, 35379], "or": [39, 98378],
     "ks": [34, 82278], "ut": [33, 84896], "nv": [24, 110571], "ne": [23, 77347], "id": [18, 83568],
     "nm": [16, 121590], "sd": [10, 77115], "nd": [9, 70698], "mt": [6, 147039], "wy": [5, 97813], "ak": [1, 665384]}


In [870]:
train['state_pd'] = n
test['state_pd'] = n
for i in range(train.id.shape[0]):
    # print(i)
    if type(train["state"][i]) == str:
        train['state_pd'][i] = l[train.state[i]][0]
        
for i in range(test.id.shape[0]):
    # print(i)
    if type(test.state[i]) == str:
        test.state_pd[i] = l[test.state[i]][0]
    # print(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['state_pd'][i] = l[train.state[i]][0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.state_pd[i] = l[test.state[i]][0]


In [871]:
train['state_area'] = n
test['state_area'] = n
for i in range(train.id.shape[0]):
    # print(i)
    if type(train["state"][i]) == str:
        train['state_area'][i] = l[train.state[i]][1]
        
for i in range(test.id.shape[0]):
    # print(i)
    if type(test.state[i]) == str:
        test.state_area[i] = l[test.state[i]][1]
    # print(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['state_area'][i] = l[train.state[i]][1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.state_area[i] = l[test.state[i]][1]


In [872]:
train['state_pop'] = n
test['state_pop'] = n
for i in range(train.id.shape[0]):
    # print(i)
    if type(train["state"][i]) == str:
        train['state_pop'][i] = l[train.state[i]][0] * l[train.state[i]][1]
        
for i in range(test.id.shape[0]):
    # print(i)
    if type(test.state[i]) == str:
        test.state_pop[i] = l[test.state[i]][0] * l[test.state[i]][1]
    # print(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['state_pop'][i] = l[train.state[i]][0] * l[train.state[i]][1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.state_pop[i] = l[test.state[i]][0] * l[test.state[i]][1]


In [873]:
train["region"] = train["region"].apply(lambda x: x.replace("/", "nn").replace(",", " "))
test["region"] = test["region"].apply(lambda x: x.replace("/", "nn").replace(",", " "))

In [874]:
test.shape

(27537, 18)

In [875]:
import category_encoders as ce
# リストの作成
l_list = ["region", "state"]

ce_oe = ce.OrdinalEncoder(cols=l_list,handle_unknown='impute')
#文字を序数に変換
train = ce_oe.fit_transform(train)
test = ce_oe.fit_transform(test)
#値を1の始まりから0の始まりにする
for i in l_list:
    train[i] = train[i] - 1
    test[i] = test[i] - 1

for i in l_list:
    train[i] = train[i].astype("int")
    test[i] = test[i].astype("int")

col = ["condition", "manufacturer", "size", "fuel", "drive", "type", "title_status", "paint_color", "transmission"]
train = pd.get_dummies(train, columns=col)
test = pd.get_dummies(test, columns=col)

In [876]:
train = train.fillna({"state_pd": train.state_pd.mean()})
test = test.fillna({"state_pd": test.state_pd.mean()})

In [877]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27532 entries, 0 to 27531
Data columns (total 99 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          27532 non-null  int64  
 1   region                      27532 non-null  int32  
 2   year                        27532 non-null  int64  
 3   cylinders                   27532 non-null  float64
 4   odometer                    27532 non-null  int64  
 5   state                       27532 non-null  int32  
 6   price                       27532 non-null  int64  
 7   state_pd                    27532 non-null  float64
 8   state_area                  24228 non-null  float64
 9   state_pop                   24228 non-null  float64
 10  condition_0.0               27532 non-null  uint8  
 11  condition_1.0               27532 non-null  uint8  
 12  condition_2.0               27532 non-null  uint8  
 13  condition_3.0               275

In [878]:
cols = ["state_area", "state_pop"]

In [879]:
nu = 3
for i in range(2, nu):
    for c in cols:
        train[f"{c}{i}"] = train[c] ** i
        test[f"{c}{i}"] = test[c] ** i

In [880]:
train = train.query("year <= 2023 and odometer >= 0")
# test = test.query("year <= 2023 and odometer >= 0")
train.describe()

Unnamed: 0,id,region,year,cylinders,odometer,state,price,state_pd,state_area,state_pop,...,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow,transmission_automatic,transmission_manual,transmission_other,state_area2,state_pop2
count,27263.0,27263.0,27263.0,27263.0,27263.0,27263.0,27263.0,27263.0,23989.0,23989.0,...,27263.0,27263.0,27263.0,27263.0,27263.0,27263.0,27263.0,27263.0,23989.0,23989.0
mean,13767.439974,101.023989,2007.69266,5.691808,117151.5,20.7563,13455.268239,740.191137,82709.842636,29266190.0,...,0.00088,0.051462,0.156219,0.13157,0.0011,0.744672,0.223453,0.031875,12390320000.0,4158418000000000.0
std,7951.260409,77.947394,10.110576,1.509441,63792.11,16.232529,10940.538436,1522.491695,74495.839943,57463430.0,...,0.029658,0.220942,0.36307,0.338029,0.033155,0.436053,0.416567,0.17567,33257020000.0,1.343564e+16
min,0.0,0.0,1921.0,3.0,22.0,0.0,1004.0,1.0,68.0,489065.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4624.0,239184600000.0
25%,6879.5,40.0,2005.0,4.0,79506.0,6.0,5756.5,96.0,42774.0,4996464.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1829615000.0,24964650000000.0
50%,13772.0,84.0,2011.0,6.0,112046.0,17.0,9207.0,239.0,65496.0,10422790.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4289726000.0,108634600000000.0
75%,20652.5,147.0,2014.0,6.0,151381.5,31.0,19117.0,740.049777,104093.0,23014950.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10835350000.0,529687900000000.0
max,27531.0,371.0,2022.0,12.0,2946000.0,51.0,96818.0,9856.0,665384.0,225908100.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,442735900000.0,5.103448e+16


In [881]:
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

In [882]:
test.shape

(27537, 100)

In [883]:
col = ["year", "odometer", "condition", "state_pd"]
col2 = ["region", "manufacturer", "condition", "cylinders", "fuel", "title_status", "transmission", "drive", "size", "type", "paint_color", "state"]
col += col2
# X = train[col]
X = train.drop(columns=["price", "id"])
test = test.drop(columns=["id"])
y = train["price"]

In [884]:
test.shape

(27537, 99)

In [891]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
lgb_train = lgb.Dataset(X_train, y_train)
# ハイパーパラメータを設定
params = {
    'objective': 'regression',   # 目的関数: 回帰
    'metric': 'mse',             # 評価指標: 平均二乗誤差
    'boosting_type': 'gbdt',     # 勾配ブースティング
    'num_leaves': 31,            # ツリーの葉の数
    'learning_rate': 0.05,       # 学習率
}
# モデルを訓練
num_round = 560  # イテレーション数
model = lgb.train(params, lgb_train, num_round)
pred = model.predict(X_test)
score = mean_absolute_percentage_error(y_test, pred)
print(score*100)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1072
[LightGBM] [Info] Number of data points in the train set: 18266, number of used features: 88
[LightGBM] [Info] Start training from score 13379.011004
64.54713625825389


In [892]:
submit = pd.read_csv("submit_sample.csv", header=None)
submit.head()

Unnamed: 0,0,1
0,27532,18546.172992
1,27533,5899.888298
2,27534,6883.536308
3,27535,27638.250807
4,27536,13572.478035


In [893]:
pred = model.predict(test)
submit[1] = pred

In [894]:
submit.to_csv("submit_sample.csv", index=False, header=None)

In [895]:
pred

array([15200.42211518,  7233.34199844,  7731.8402472 , ...,
       12961.48136902, 10355.62776304, 10069.12171361])