In [186]:
import pandas as pd
import numpy as np
import re
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer


In [2]:
pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

Collecting https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
  Using cached https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [328]:
df = pd.read_csv('trainCAR.csv')

In [329]:
df

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
188528,188528,Cadillac,Escalade ESV Platinum,2017,49000,Gasoline,420.0HP 6.2L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,White,Beige,None reported,Yes,27500
188529,188529,Mercedes-Benz,AMG C 43 AMG C 43 4MATIC,2018,28600,Gasoline,385.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,White,Black,At least 1 accident or damage reported,Yes,30000
188530,188530,Mercedes-Benz,AMG GLC 63 Base 4MATIC,2021,13650,Gasoline,469.0HP 4.0L 8 Cylinder Engine Gasoline Fuel,7-Speed A/T,White,Black,None reported,Yes,86900
188531,188531,Audi,S5 3.0T Prestige,2022,13895,Gasoline,3.0L,1-Speed Automatic,Daytona Gray Pearl Effect,Black,None reported,,84900


In [51]:
profile = ProfileReport(df, title = 'used car')

In [5]:
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [330]:
df.isna().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [331]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


In [332]:
def preprocessing(df):
    df['fuel_type'].fillna('Missing', inplace = True)
    df['accident'].fillna('Missing', inplace = True)
    df['clean_title'].fillna('Missing', inplace = True)
    df['cylinder_number'] = df['engine'].str.extract('(\d+)(?=\s*Cylinder)')
    df['Horse_power']= df['engine'].str.extract('(\d+\.?\d*)\s*HP')
    df.drop(columns='engine', axis=1,inplace = True )
    df.drop(columns='int_col', axis=1,inplace = True )
    df.drop(columns='ext_col', axis=1,inplace = True )
    df.drop(columns='model', axis=1,inplace = True )
    current_year = datetime.now().year
    df['Vehicle_Age'] = current_year - df['model_year']
    df["fuel_type"]=np.where(df["fuel_type"].isin(["E85 Flex Fuel","Diesel","Hybrid","Gasoline"]),"Gasoline","Electric")
    luxury_brands =  ['Rolls-Royce',
                'Bentley',
                'Ferrari',
                'Lamborghini',
                'Aston Martin',
                'Porsche',
                'McLaren',
                'Bugatti',
                'Maybach']
    premium_brands = ['Mercedes-Benz',
                        'BMW',
                        'Audi',
                        'Genesis',
                        'Tesla',
                        'Jaguar',
                        'Lexus',
                        'Cadillac',
                        'Volvo',
                        'Polestar',
                        'Alfa Romeo',
                        'Maserati',
                        'Rivian']
    
    df['Is_Luxury_Brand'] = df['brand'].apply(lambda x: 3 if x in luxury_brands else (2 if x in premium_brands else 1))
    df['cylinder_number']=df['cylinder_number'].astype(float)
    df['Horse_power']=df['Horse_power'].astype(float)
    df['Horse_power']=df['Horse_power'].fillna(df['Horse_power'].median())
    df['cylinder_number']=df['cylinder_number'].fillna(df['cylinder_number'].median())
    df.drop(columns='model_year', axis=1,inplace = True )

In [333]:
df_cleaned = df.copy()

In [334]:
df_cleaned

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
188528,188528,Cadillac,Escalade ESV Platinum,2017,49000,Gasoline,420.0HP 6.2L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,White,Beige,None reported,Yes,27500
188529,188529,Mercedes-Benz,AMG C 43 AMG C 43 4MATIC,2018,28600,Gasoline,385.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,White,Black,At least 1 accident or damage reported,Yes,30000
188530,188530,Mercedes-Benz,AMG GLC 63 Base 4MATIC,2021,13650,Gasoline,469.0HP 4.0L 8 Cylinder Engine Gasoline Fuel,7-Speed A/T,White,Black,None reported,Yes,86900
188531,188531,Audi,S5 3.0T Prestige,2022,13895,Gasoline,3.0L,1-Speed Automatic,Daytona Gray Pearl Effect,Black,None reported,,84900


In [335]:
preprocessing(df_cleaned)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['fuel_type'].fillna('Missing', inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['accident'].fillna('Missing', inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

In [336]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               188533 non-null  int64  
 1   brand            188533 non-null  object 
 2   milage           188533 non-null  int64  
 3   fuel_type        188533 non-null  object 
 4   transmission     188533 non-null  object 
 5   accident         188533 non-null  object 
 6   clean_title      188533 non-null  object 
 7   price            188533 non-null  int64  
 8   cylinder_number  188533 non-null  float64
 9   Horse_power      188533 non-null  float64
 10  Vehicle_Age      188533 non-null  int64  
 11  Is_Luxury_Brand  188533 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 17.3+ MB


In [337]:
df_cleaned

Unnamed: 0,id,brand,milage,fuel_type,transmission,accident,clean_title,price,cylinder_number,Horse_power,Vehicle_Age,Is_Luxury_Brand
0,0,MINI,213000,Gasoline,A/T,None reported,Yes,4200,4.0,172.0,17,1
1,1,Lincoln,143250,Gasoline,A/T,At least 1 accident or damage reported,Yes,4999,8.0,252.0,22,1
2,2,Chevrolet,136731,Gasoline,A/T,None reported,Yes,13900,8.0,320.0,22,1
3,3,Genesis,19500,Gasoline,Transmission w/Dual Shift Mode,None reported,Yes,45000,8.0,420.0,7,2
4,4,Mercedes-Benz,7388,Gasoline,7-Speed A/T,None reported,Yes,97500,4.0,208.0,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...
188528,188528,Cadillac,49000,Gasoline,Transmission w/Dual Shift Mode,None reported,Yes,27500,8.0,420.0,7,2
188529,188529,Mercedes-Benz,28600,Gasoline,8-Speed A/T,At least 1 accident or damage reported,Yes,30000,6.0,385.0,6,2
188530,188530,Mercedes-Benz,13650,Gasoline,7-Speed A/T,None reported,Yes,86900,8.0,469.0,3,2
188531,188531,Audi,13895,Gasoline,1-Speed Automatic,None reported,Missing,84900,6.0,329.0,2,2


In [338]:
X = pd.get_dummies(df_cleaned, dtype=int)
X = X.drop('price', axis=1)
X = X.drop('id', axis =1)
X = X.drop('brand_smart', axis =1)
X = X.drop('brand_Polestar', axis =1)

y = df_cleaned['price']

In [339]:
print(X.columns.tolist())

['milage', 'cylinder_number', 'Horse_power', 'Vehicle_Age', 'Is_Luxury_Brand', 'brand_Acura', 'brand_Alfa', 'brand_Aston', 'brand_Audi', 'brand_BMW', 'brand_Bentley', 'brand_Bugatti', 'brand_Buick', 'brand_Cadillac', 'brand_Chevrolet', 'brand_Chrysler', 'brand_Dodge', 'brand_FIAT', 'brand_Ferrari', 'brand_Ford', 'brand_GMC', 'brand_Genesis', 'brand_Honda', 'brand_Hummer', 'brand_Hyundai', 'brand_INFINITI', 'brand_Jaguar', 'brand_Jeep', 'brand_Karma', 'brand_Kia', 'brand_Lamborghini', 'brand_Land', 'brand_Lexus', 'brand_Lincoln', 'brand_Lotus', 'brand_Lucid', 'brand_MINI', 'brand_Maserati', 'brand_Maybach', 'brand_Mazda', 'brand_McLaren', 'brand_Mercedes-Benz', 'brand_Mercury', 'brand_Mitsubishi', 'brand_Nissan', 'brand_Plymouth', 'brand_Pontiac', 'brand_Porsche', 'brand_RAM', 'brand_Rivian', 'brand_Rolls-Royce', 'brand_Saab', 'brand_Saturn', 'brand_Scion', 'brand_Subaru', 'brand_Suzuki', 'brand_Tesla', 'brand_Toyota', 'brand_Volkswagen', 'brand_Volvo', 'fuel_type_Electric', 'fuel_type_

In [340]:
X.shape

(188533, 119)

In [341]:
y.shape

(188533,)

In [342]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [343]:
y_train.shape

(131973,)

In [344]:
X_train

Unnamed: 0,milage,cylinder_number,Horse_power,Vehicle_Age,Is_Luxury_Brand,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,...,transmission_Single-Speed Fixed Gear,transmission_Transmission Overdrive Switch,transmission_Transmission w/Dual Shift Mode,transmission_Variable,transmission_–,accident_At least 1 accident or damage reported,accident_Missing,accident_None reported,clean_title_Missing,clean_title_Yes
69007,119000,6.0,245.0,13,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
68561,61117,6.0,329.0,3,2,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
22502,29140,6.0,310.0,7,2,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
4493,134602,6.0,295.0,15,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
32131,70700,4.0,181.0,10,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,89106,8.0,350.0,17,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
103694,57300,6.0,320.0,5,2,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
131932,31500,6.0,420.0,3,3,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
146867,186000,6.0,268.0,13,2,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [345]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 131973 entries, 69007 to 121958
Columns: 119 entries, milage to clean_title_Yes
dtypes: float64(2), int64(117)
memory usage: 120.8 MB


In [346]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((131973, 119), (56560, 119), (131973,), (56560,))

In [347]:
std_scaler = StandardScaler()
X_train_scale=std_scaler.fit_transform(X_train)
X_test_scale=std_scaler.transform(X_test)

In [348]:
X_train_scale

array([[ 1.06809471, -0.21244264, -0.91626903, ..., -1.81264242,
        -0.35662917,  0.35662917],
       [-0.09413524, -0.21244264, -0.11158232, ...,  0.55168079,
         2.80403309, -2.80403309],
       [-0.73619987, -0.21244264, -0.29359479, ...,  0.55168079,
        -0.35662917,  0.35662917],
       ...,
       [-0.68881355, -0.21244264,  0.76016162, ...,  0.55168079,
        -0.35662917,  0.35662917],
       [ 2.41338446, -0.21244264, -0.69593814, ...,  0.55168079,
        -0.35662917,  0.35662917],
       [-1.20104768, -0.21244264, -0.11158232, ...,  0.55168079,
         2.80403309, -2.80403309]])

In [304]:
params = {
    'application': 'regression', # for binary classification
#     'num_class' : 1, # used for multi-classes
    'boosting': 'gbdt', # traditional gradient boosting decision tree
    'num_iterations': 100, 
    'learning_rate': 0.01,
    'num_leaves': 62,
    'device': 'cpu', # you can use GPU to achieve faster learning
    'max_bin': 250, # Small number of bins may reduce training accuracy but can deal with over-fitting
    'lambda_l1': 5, # L1 regularization
    'lambda_l2': 10, # L2 regularization
    'metric' : 'mean_squared_error',
    'subsample_for_bin': 200, # number of samples for constructing bins
    'subsample': 1, # subsample ratio of the training instance
    'colsample_bytree': 0.8, # subsample ratio of columns when constructing the tree
    'min_split_gain': 0.5, # minimum loss reduction required to make further partition on a leaf node of the tree
    'min_child_weight': 1, # minimum sum of instance weight (hessian) needed in a leaf
    'min_child_samples': 5 # minimum number of data needed in a leaf
}

In [306]:
# Initiate classifier to use
mdl = lgb.LGBMRegressor(boosting_type= 'gbdt', 
          objective = 'regression', 
          n_jobs = 5, 
          silent = True,
          max_bin = params['max_bin'], 
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'], 
          min_split_gain = params['min_split_gain'], 
          min_child_weight = params['min_child_weight'], 
          min_child_samples = params['min_child_samples'],
          callbacks=[lgb.log_evaluation()])


In [307]:
gridParams = {
    'learning_rate': [0.01],
    'n_estimators': [500],
    'metric' : ['l2'],
    'max_depth' :[3,5,7,11],
    'num_leaves': [10,30,60,70], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
    'objective' : ['regression'],
    'random_state' : [500]
    }

grid = GridSearchCV(mdl, gridParams, verbose=1, cv=3, n_jobs=-1)
# Run the grid
grid.fit(X_train_scale, y_train)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040143 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.090891 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.327252 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.164587 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.223894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n









[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030329 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.101767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.093598 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094330 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.111435 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073103 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.092735 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795










[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795










[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025596 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024932 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042887 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027572 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026035 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024528 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026909 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n







[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023294 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031265 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054131 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033841 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044809 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025774 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049282 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025089 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795










[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061822 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068312 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795


[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044396 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027897 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023252 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056480 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.115126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784








[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039993 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026698 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023627 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043788 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n









[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 131973, number of used features: 119
[LightGBM] [Info] Start training from score 43940.411311






{'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': 3, 'metric': 'l2', 'n_estimators': 500, 'num_leaves': 10, 'objective': 'regression', 'random_state': 500}
0.12931167277756242


In [349]:
lgb_train = lgb.Dataset(X_train_scale, label = y_train)

In [376]:
lgb_params ={
    'boosting_type': 'gbdt',
    'learning_rate': 0.01, 
    'n_estimators': 500, 
    'num_leaves': 50, 
    'max_depth' : 6,
    'objective': 'regression', 
    'random_state': 500,
    'reg_lambda' : 15
    }


In [377]:
lgbm = lgb.train(lgb_params, lgb_train, 100)





[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032241 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 827
[LightGBM] [Info] Number of data points in the train set: 131973, number of used features: 103
[LightGBM] [Info] Start training from score 43940.411311




In [378]:
y_pred = lgbm.predict(X_test_scale)
y_pred[:5]

array([54581.54126954, 31507.0150202 , 36351.61373954, 22672.20387931,
       36419.61422901])

In [379]:
real_val = y_test[:5]
real_val

111355    37999
182258    30000
14147     26772
79313     24999
101160    47995
Name: price, dtype: int64

In [380]:
rmse = mean_squared_error(y_test, y_pred, squared=False)

In [381]:
rmse

66453.00854105878

In [356]:
df_test = pd.read_csv('testCAR.csv')

In [357]:
df_test

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...
125685,314218,Mercedes-Benz,GL-Class GL 450 4MATIC,2014,83315,Gasoline,362.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Black,None reported,Yes
125686,314219,Audi,Q7 55 Prestige,2019,29336,Gasoline,3.0 Liter Turbo,Automatic,White,Black,None reported,
125687,314220,Audi,A6 3.0T Premium Plus,2012,77634,Gasoline,333.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,A/T,Black,Black,None reported,Yes
125688,314221,Audi,Q7 3.0T Premium,2012,112000,Gasoline,333.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,A/T,Black,Black,None reported,Yes


In [358]:
preprocessing(df_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['fuel_type'].fillna('Missing', inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['accident'].fillna('Missing', inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

In [359]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125690 entries, 0 to 125689
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               125690 non-null  int64  
 1   brand            125690 non-null  object 
 2   milage           125690 non-null  int64  
 3   fuel_type        125690 non-null  object 
 4   transmission     125690 non-null  object 
 5   accident         125690 non-null  object 
 6   clean_title      125690 non-null  object 
 7   cylinder_number  125690 non-null  float64
 8   Horse_power      125690 non-null  float64
 9   Vehicle_Age      125690 non-null  int64  
 10  Is_Luxury_Brand  125690 non-null  int64  
dtypes: float64(2), int64(4), object(5)
memory usage: 10.5+ MB


In [360]:
df_test_final = pd.get_dummies(df_test, dtype=int)

In [361]:
X_test_final = df_test_final.drop('id',axis = 1)

In [362]:
X_test_final.shape

(125690, 119)

In [363]:
X_test_scale_final=std_scaler.transform(X_test_final)

In [364]:
print(X_test_final.columns.tolist())

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064302 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356
['milage', 'cylinder_number', 'Horse_power', 'Vehicle_Age', 'Is_Luxury_Brand', 'brand_Acura', 'brand_Alfa', 'brand_Aston', 'brand_Audi', 'brand_BMW', 'brand_Bentley', 'brand_Bugatt

In [365]:
X_test_scale_final.shape

(125690, 119)

array([16408.62887801, 70059.99003574, 55703.26006916, ...,
       29320.61837508, 20993.54215447, 38705.00865424])

In [366]:
ypred_lgbm = lgbm.predict(X_test_scale_final)

In [367]:
df_test['price'] = ypred_lgbm

In [368]:
df_test

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.088219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43963.627356
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047316 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.146211 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

Unnamed: 0,id,brand,milage,fuel_type,transmission,accident,clean_title,cylinder_number,Horse_power,Vehicle_Age,Is_Luxury_Brand,price
0,188533,Land,98000,Gasoline,6-Speed A/T,None reported,Yes,4.0,240.0,9,1,17988.923086
1,188534,Land,9142,Gasoline,8-Speed A/T,None reported,Yes,6.0,395.0,4,1,69673.621558
2,188535,Ford,28121,Gasoline,10-Speed Automatic,None reported,Missing,6.0,328.0,2,1,60604.642221
3,188536,Audi,61258,Gasoline,Automatic,None reported,Missing,6.0,328.0,8,2,30431.379283
4,188537,Audi,59000,Gasoline,A/T,None reported,Yes,4.0,252.0,6,2,30489.232468
...,...,...,...,...,...,...,...,...,...,...,...,...
125685,314218,Mercedes-Benz,83315,Gasoline,7-Speed A/T,None reported,Yes,6.0,362.0,10,2,28071.434747
125686,314219,Audi,29336,Gasoline,Automatic,None reported,Missing,6.0,328.0,5,2,48517.333113
125687,314220,Audi,77634,Gasoline,A/T,None reported,Yes,6.0,333.0,12,2,28596.878983
125688,314221,Audi,112000,Gasoline,A/T,None reported,Yes,6.0,333.0,12,2,18347.530875


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046698 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43921.274795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 87982, number of used features: 119
[LightGBM] [Info] Start training from score 43936.331784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.140263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

In [369]:
submission = df_test[['id']]

In [370]:
submission['price'] = df_test['price']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['price'] = df_test['price']


In [371]:
submission

Unnamed: 0,id,price
0,188533,17988.923086
1,188534,69673.621558
2,188535,60604.642221
3,188536,30431.379283
4,188537,30489.232468
...,...,...
125685,314218,28071.434747
125686,314219,48517.333113
125687,314220,28596.878983
125688,314221,18347.530875


In [373]:
submission.to_csv('submissionLGBM5.csv',index=None)

In [375]:
pd.read_csv('submissionLGBM5.csv')

Unnamed: 0,id,price
0,188533,17988.923086
1,188534,69673.621558
2,188535,60604.642221
3,188536,30431.379283
4,188537,30489.232468
...,...,...
125685,314218,28071.434747
125686,314219,48517.333113
125687,314220,28596.878983
125688,314221,18347.530875


In [383]:
submission[:5]

Unnamed: 0,id,price
0,188533,17988.923086
1,188534,69673.621558
2,188535,60604.642221
3,188536,30431.379283
4,188537,30489.232468
