In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('used_car_train_20200313.csv')
df_predict=pd.read_csv('used_car_testB_20200421.csv')

Loading data...


In [3]:
df_train['notRepairedDamage'].value_counts()
df_train['notRepairedDamage'].replace('-', np.nan, inplace=True)

In [4]:
y = df_train['price']#target
X = df_train.drop(['price','SaleID','name','seller','offerType','notRepairedDamage'], axis=1)
X_predict = df_predict.drop(['SaleID','name','seller','offerType','notRepairedDamage'], axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
train_data = lgb.Dataset(data=X_train,label=y_train)
test_data = lgb.Dataset(data=X_test,label=y_test)

In [5]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [6]:
from hyperopt import fmin, tpe, hp, partial

# 自定义hyperopt的参数空间
space = {"max_depth": hp.randint("max_depth", 15),
         "num_trees": hp.randint("num_trees", 300),
         'learning_rate': hp.uniform('learning_rate', 1e-3, 5e-1),
         "bagging_fraction": hp.randint("bagging_fraction", 5),
         "num_leaves": hp.randint("num_leaves", 6),
         }

def argsDict_tranform(argsDict, isPrint=False):
    argsDict["max_depth"] = argsDict["max_depth"] + 5
    argsDict['num_trees'] = argsDict['num_trees'] + 150
    argsDict["learning_rate"] = argsDict["learning_rate"] * 0.02 + 0.05
    argsDict["bagging_fraction"] = argsDict["bagging_fraction"] * 0.1 + 0.5
    argsDict["num_leaves"] = argsDict["num_leaves"] * 3 + 10
    if isPrint:
        print(argsDict)
    else:
        pass

    return argsDict

In [7]:
from sklearn.metrics import mean_squared_error

def lightgbm_factory(argsDict):
    argsDict = argsDict_tranform(argsDict)

    params = {'nthread': -1,  # 进程数
              'max_depth': argsDict['max_depth'],  # 最大深度
              'num_trees': argsDict['num_trees'],  # 树的数量
              'eta': argsDict['learning_rate'],  # 学习率
              'bagging_fraction': argsDict['bagging_fraction'],  # bagging采样数
              'num_leaves': argsDict['num_leaves'],  # 终点节点最小样本占比的和
              'objective': 'regression',
              'feature_fraction': 0.7,  # 样本列采样
              'lambda_l1': 0,  # L1 正则化
              'lambda_l2': 0,  # L2 正则化
              'bagging_seed': 100,  # 随机种子,light中默认为100
              }
    #rmse
    params['metric'] = ['rmse']

    model_lgb = lgb.train(params, train_data, num_boost_round=300, valid_sets=[test_data],early_stopping_rounds=100)

    return get_tranformer_score(model_lgb)

def get_tranformer_score(tranformer):

    model = tranformer
    prediction = model.predict(X_test, num_iteration=model.best_iteration)

    return mean_squared_error(y_test, prediction)

In [8]:
# 开始使用hyperopt进行自动调参
algo = partial(tpe.suggest, n_startup_jobs=1)
best = fmin(lightgbm_factory, space, algo=algo, max_evals=20, pass_expr_memo_ctrl=None)

  0%|                                                                           | 0/20 [00:00<?, ?trial/s, best loss=?]




Auto-choosing col-wise multi-threading, the overhead of testing was 0.013786 seconds.                                  
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info]                                                                                                      
Total Bins 4984                                                                                                        
[LightGBM] [Info]                                                                                                      
Number of data points in the train set: 150000, number of used features: 25                                            
[LightGBM] [Info]                                                                                                      
Start training from score 5923.327333                                                                                  
[1]	valid_0's rmse: 7149.95                                                                           




[18]	valid_0's rmse: 3597.38
[19]	valid_0's rmse: 3487.31                                                                                           
[20]	valid_0's rmse: 3381.71                                                                                           
[21]	valid_0's rmse: 3283                                                                                              
[22]	valid_0's rmse: 3192.67                                                                                           
[23]	valid_0's rmse: 3109.05                                                                                           
[24]	valid_0's rmse: 3029.37                                                                                           
[25]	valid_0's rmse: 2953.4                                                                                            
[26]	valid_0's rmse: 2883.79                                                                                           
[27]	valid_




[16]	valid_0's rmse: 4154.53                                                                                           
[17]	valid_0's rmse: 4027.22                                                                                           
[18]	valid_0's rmse: 3905.95                                                                                           
[19]	valid_0's rmse: 3793.55                                                                                           
[20]	valid_0's rmse: 3685.01                                                                                           
[21]	valid_0's rmse: 3580.26                                                                                           
[22]	valid_0's rmse: 3484.41                                                                                           
[23]	valid_0's rmse: 3395.45                                                                                           
[24]	valid_0's rmse: 3308.22            




[15]	valid_0's rmse: 3991.25                                                                                           
[16]	valid_0's rmse: 3847.33                                                                                           
[17]	valid_0's rmse: 3712.87                                                                                           
[18]	valid_0's rmse: 3584.73                                                                                           
[19]	valid_0's rmse: 3466.06                                                                                           
[20]	valid_0's rmse: 3353.91                                                                                           
[21]	valid_0's rmse: 3248.37                                                                                           
[22]	valid_0's rmse: 3150.38                                                                                           
[23]	valid_0's rmse: 3057.97            




[16]	valid_0's rmse: 3920.95
[17]	valid_0's rmse: 3786.87                                                                                           
[18]	valid_0's rmse: 3658.52                                                                                           
[19]	valid_0's rmse: 3538.64                                                                                           
[20]	valid_0's rmse: 3426.58                                                                                           
[21]	valid_0's rmse: 3320.01                                                                                           
[22]	valid_0's rmse: 3221.01                                                                                           
[23]	valid_0's rmse: 3126.52                                                                                           
[24]	valid_0's rmse: 3037.34                                                                                           
[25]	valid_




[15]	valid_0's rmse: 3977.32                                                                                           
[16]	valid_0's rmse: 3833.19                                                                                           
[17]	valid_0's rmse: 3698.57                                                                                           
[18]	valid_0's rmse: 3570.33                                                                                           
[19]	valid_0's rmse: 3451.25                                                                                           
[20]	valid_0's rmse: 3339.6                                                                                            
[21]	valid_0's rmse: 3234.38                                                                                           
[22]	valid_0's rmse: 3134.61                                                                                           
[23]	valid_0's rmse: 3041.08            




[16]	valid_0's rmse: 3725                                                                                              
[17]	valid_0's rmse: 3586.71                                                                                           
[18]	valid_0's rmse: 3457.28                                                                                           
[19]	valid_0's rmse: 3337.75                                                                                           
[20]	valid_0's rmse: 3224.2                                                                                            
[21]	valid_0's rmse: 3116.14                                                                                           
[22]	valid_0's rmse: 3016.16                                                                                           
[23]	valid_0's rmse: 2921.39                                                                                           
[24]	valid_0's rmse: 2832.25            




[15]	valid_0's rmse: 3650.51                                                                                           
[16]	valid_0's rmse: 3503.72                                                                                           
[17]	valid_0's rmse: 3366.71                                                                                           
[18]	valid_0's rmse: 3238.95                                                                                           
[19]	valid_0's rmse: 3120                                                                                              
[20]	valid_0's rmse: 3010.37                                                                                           
[21]	valid_0's rmse: 2908.62                                                                                           
[22]	valid_0's rmse: 2813.35                                                                                           
[23]	valid_0's rmse: 2725.15            




[15]	valid_0's rmse: 3977.94
[16]	valid_0's rmse: 3837.56                                                                                           
[17]	valid_0's rmse: 3707.54                                                                                           
[18]	valid_0's rmse: 3584.05                                                                                           
[19]	valid_0's rmse: 3471.18                                                                                           
[20]	valid_0's rmse: 3363.94                                                                                           
[21]	valid_0's rmse: 3262.04                                                                                           
[22]	valid_0's rmse: 3166.78                                                                                           
[23]	valid_0's rmse: 3079.54                                                                                           
[24]	valid_




[14]	valid_0's rmse: 3812.06                                                                                           
[15]	valid_0's rmse: 3654.68                                                                                           
[16]	valid_0's rmse: 3507.88                                                                                           
[17]	valid_0's rmse: 3370.88                                                                                           
[18]	valid_0's rmse: 3243.08                                                                                           
[19]	valid_0's rmse: 3124.06                                                                                           
[20]	valid_0's rmse: 3014.17                                                                                           
[21]	valid_0's rmse: 2912.35                                                                                           
[22]	valid_0's rmse: 2817.02            




[16]	valid_0's rmse: 3861.35                                                                                           
[17]	valid_0's rmse: 3728.42                                                                                           
[18]	valid_0's rmse: 3601.7                                                                                            
[19]	valid_0's rmse: 3483.28                                                                                           
[20]	valid_0's rmse: 3373.27                                                                                           
[21]	valid_0's rmse: 3268.7                                                                                            
[22]	valid_0's rmse: 3171.37                                                                                           
[23]	valid_0's rmse: 3080.31                                                                                           
[24]	valid_0's rmse: 2993.91            




[15]	valid_0's rmse: 3743.23
[16]	valid_0's rmse: 3598.12                                                                                           
[17]	valid_0's rmse: 3462.04                                                                                           
[18]	valid_0's rmse: 3334.13                                                                                           
[19]	valid_0's rmse: 3215.26                                                                                           
[20]	valid_0's rmse: 3105.66                                                                                           
[21]	valid_0's rmse: 3002.84                                                                                           
[22]	valid_0's rmse: 2906.22                                                                                           
[23]	valid_0's rmse: 2815.96                                                                                           
[24]	valid_




[15]	valid_0's rmse: 4054.88
[16]	valid_0's rmse: 3909.64                                                                                           
[17]	valid_0's rmse: 3772.79                                                                                           
[18]	valid_0's rmse: 3642.57                                                                                           
[19]	valid_0's rmse: 3520.11                                                                                           
[20]	valid_0's rmse: 3405.01                                                                                           
[21]	valid_0's rmse: 3296.18                                                                                           
[22]	valid_0's rmse: 3192.82                                                                                           
[23]	valid_0's rmse: 3096.81                                                                                           
[24]	valid_




[15]	valid_0's rmse: 3971.76                                                                                           
[16]	valid_0's rmse: 3826.23                                                                                           
[17]	valid_0's rmse: 3689.69                                                                                           
[18]	valid_0's rmse: 3559.61                                                                                           
[19]	valid_0's rmse: 3439.43                                                                                           
[20]	valid_0's rmse: 3324.74                                                                                           
[21]	valid_0's rmse: 3216.93                                                                                           
[22]	valid_0's rmse: 3116.05                                                                                           
[23]	valid_0's rmse: 3020.22            




[17]	valid_0's rmse: 3681.3                                                                                            
[18]	valid_0's rmse: 3554.26                                                                                           
[19]	valid_0's rmse: 3437.17                                                                                           
[20]	valid_0's rmse: 3328.41                                                                                           
[21]	valid_0's rmse: 3224.5                                                                                            
[22]	valid_0's rmse: 3127.39                                                                                           
[23]	valid_0's rmse: 3036.06                                                                                           
[24]	valid_0's rmse: 2949.33                                                                                           
[25]	valid_0's rmse: 2867.46            




[15]	valid_0's rmse: 3839.03                                                                                           
[16]	valid_0's rmse: 3693.13                                                                                           
[17]	valid_0's rmse: 3556.31                                                                                           
[18]	valid_0's rmse: 3427.72                                                                                           
[19]	valid_0's rmse: 3310.19                                                                                           
[20]	valid_0's rmse: 3199.15                                                                                           
[21]	valid_0's rmse: 3093.69                                                                                           
[22]	valid_0's rmse: 2996.75                                                                                           
[23]	valid_0's rmse: 2904.48            




[16]	valid_0's rmse: 3718.56                                                                                           
[17]	valid_0's rmse: 3589.29                                                                                           
[18]	valid_0's rmse: 3466.74                                                                                           
[19]	valid_0's rmse: 3355.13                                                                                           
[20]	valid_0's rmse: 3247.96                                                                                           
[21]	valid_0's rmse: 3148.16                                                                                           
[22]	valid_0's rmse: 3055.39                                                                                           
[23]	valid_0's rmse: 2970.55                                                                                           
[24]	valid_0's rmse: 2888.93            




[14]	valid_0's rmse: 4018.88                                                                                           
[15]	valid_0's rmse: 3863.47                                                                                           
[16]	valid_0's rmse: 3717.36                                                                                           
[17]	valid_0's rmse: 3579.07                                                                                           
[18]	valid_0's rmse: 3449.72                                                                                           
[19]	valid_0's rmse: 3330.26                                                                                           
[20]	valid_0's rmse: 3216.8                                                                                            
[21]	valid_0's rmse: 3108.87                                                                                           
[22]	valid_0's rmse: 3009.04            




[14]	valid_0's rmse: 4019.23                                                                                           
[15]	valid_0's rmse: 3863.82                                                                                           
[16]	valid_0's rmse: 3717.71                                                                                           
[17]	valid_0's rmse: 3579.43                                                                                           
[18]	valid_0's rmse: 3450.07                                                                                           
[19]	valid_0's rmse: 3330.61                                                                                           
[20]	valid_0's rmse: 3217.15                                                                                           
[21]	valid_0's rmse: 3109.21                                                                                           
[22]	valid_0's rmse: 3009.37            




[12]	valid_0's rmse: 4282.4                                                                                            
[13]	valid_0's rmse: 4104.01                                                                                           
[14]	valid_0's rmse: 3940.42                                                                                           
[15]	valid_0's rmse: 3784.32                                                                                           
[16]	valid_0's rmse: 3637.74                                                                                           
[17]	valid_0's rmse: 3499.73                                                                                           
[18]	valid_0's rmse: 3370.78                                                                                           
[19]	valid_0's rmse: 3252.28                                                                                           
[20]	valid_0's rmse: 3139.82            

In [9]:
RMSE = lightgbm_factory(best)
print('best :', best)
print('best param after transform :')
best_param_after_transform=argsDict_tranform(best,isPrint=True)
best_param_after_transform
print('rmse of the best lightgbm:', np.sqrt(RMSE))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4984
[LightGBM] [Info] Number of data points in the train set: 150000, number of used features: 25
[LightGBM] [Info] Start training from score 5923.327333
[1]	valid_0's rmse: 7144.22
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 6809.3
[3]	valid_0's rmse: 6500.7
[4]	valid_0's rmse: 6201.07
[5]	valid_0's rmse: 5916.67
[6]	valid_0's rmse: 5657.62
[7]	valid_0's rmse: 5407.8
[8]	valid_0's rmse: 5169.29
[9]	valid_0's rmse: 4955.96
[10]	valid_0's rmse: 4745.34
[11]	valid_0's rmse: 4545.87
[12]	valid_0's rmse: 4358.27
[13]	valid_0's rmse: 4181.26
[14]	valid_0's rmse: 4019.23
[15]	valid_0's rmse: 3863.82
[16]	valid_0's rmse: 3717.71
[17]	valid_0's rmse: 3579.43




[18]	valid_0's rmse: 3450.07
[19]	valid_0's rmse: 3330.61
[20]	valid_0's rmse: 3217.15
[21]	valid_0's rmse: 3109.21
[22]	valid_0's rmse: 3009.37
[23]	valid_0's rmse: 2914.93
[24]	valid_0's rmse: 2826.3
[25]	valid_0's rmse: 2743.15
[26]	valid_0's rmse: 2664.43
[27]	valid_0's rmse: 2590.63
[28]	valid_0's rmse: 2523.22
[29]	valid_0's rmse: 2459.26
[30]	valid_0's rmse: 2400.29
[31]	valid_0's rmse: 2344.71
[32]	valid_0's rmse: 2291.9
[33]	valid_0's rmse: 2242.51
[34]	valid_0's rmse: 2196.91
[35]	valid_0's rmse: 2153.33
[36]	valid_0's rmse: 2114.49
[37]	valid_0's rmse: 2073.44
[38]	valid_0's rmse: 2039.56
[39]	valid_0's rmse: 2003.66
[40]	valid_0's rmse: 1971.96
[41]	valid_0's rmse: 1941.23
[42]	valid_0's rmse: 1912.63
[43]	valid_0's rmse: 1885.63
[44]	valid_0's rmse: 1863.08
[45]	valid_0's rmse: 1839.03
[46]	valid_0's rmse: 1815.81
[47]	valid_0's rmse: 1795.52
[48]	valid_0's rmse: 1776.04
[49]	valid_0's rmse: 1756.18
[50]	valid_0's rmse: 1737.89
[51]	valid_0's rmse: 1721.03
[52]	valid_0's 

In [16]:
# specify your configurations as a dict
params = best_param_after_transform

In [21]:
#print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                #num_boost_round=5000,
                valid_sets=lgb_eval,
                #early_stopping_rounds=5
               )

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4984
[LightGBM] [Info] Number of data points in the train set: 150000, number of used features: 25
[LightGBM] [Info] Start training from score 5923.327333


ValueError: For early stopping, at least one dataset and eval metric is required for evaluation

In [18]:
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_predict, num_iteration=gbm.best_iteration)

Starting predicting...


In [19]:
predict=pd.DataFrame(df_predict,columns=[ 'SaleID'])
predict['price']=pd.DataFrame(y_pred)

In [20]:
predict

Unnamed: 0,SaleID,price
0,200000,1315.649133
1,200001,1917.496998
2,200002,8458.509027
3,200003,1065.770604
4,200004,1972.450433
...,...,...
49995,249995,6569.679661
49996,249996,18357.471991
49997,249997,5590.169703
49998,249998,4888.097709


In [15]:
predict.to_csv('submit.csv',index = False)