In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, RobustScaler, MinMaxScaler, MaxAbsScaler, PolynomialFeatures, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, HuberRegressor, Lars, LassoLars, TheilSenRegressor,\
                                    LassoLarsIC, OrthogonalMatchingPursuit, PassiveAggressiveRegressor, RANSACRegressor, SGDRegressor
                            
from sklearn.decomposition import PCA, TruncatedSVD
                                    
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, IsolationForest, \
                                BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

In [16]:
final_train = pd.read_csv('train_ready.csv')
final_test = pd.read_csv('test_ready.csv')

In [17]:
final_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 7000 non-null   int64  
 1   Brand              7000 non-null   object 
 2   VehicleModel       7000 non-null   object 
 3   ManufacturingYear  7000 non-null   int64  
 4   Type               7000 non-null   object 
 5   rating             7000 non-null   float64
 6   color              7000 non-null   object 
 7   Duty               7000 non-null   int64  
 8   fuel               7000 non-null   object 
 9   CylinderCount      7000 non-null   int64  
 10  type of gear       7000 non-null   object 
 11  capacity           7000 non-null   object 
 12  Odometer           7000 non-null   int64  
 13  airbags            7000 non-null   int64  
 14  price              7000 non-null   int64  
dtypes: float64(1), int64(7), object(7)
memory usage: 820.4+ KB


In [18]:
final_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 7000 non-null   int64  
 1   Brand              7000 non-null   object 
 2   VehicleModel       7000 non-null   object 
 3   ManufacturingYear  7000 non-null   int64  
 4   Type               7000 non-null   object 
 5   rating             7000 non-null   float64
 6   color              7000 non-null   object 
 7   Duty               7000 non-null   int64  
 8   fuel               7000 non-null   object 
 9   CylinderCount      7000 non-null   int64  
 10  type of gear       7000 non-null   object 
 11  capacity           7000 non-null   object 
 12  Odometer           7000 non-null   int64  
 13  airbags            7000 non-null   int64  
 14  price              7000 non-null   int64  
dtypes: float64(1), int64(7), object(7)
memory usage: 820.4+ KB


In [19]:
X = final_train.drop(['price', 'ID'], axis=1)
y = final_train['price']

cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(exclude='object').columns

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
    # Create a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

num_transformer = Pipeline([
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('cat', cat_transformer, cat_features),
    ('num', num_transformer, num_features)
])

x_train_preprocessed = preprocessor.fit_transform(x_train)
x_test_preprocessed = preprocessor.transform(x_test)

x_train_preprocessed = pd.DataFrame(x_train_preprocessed)

In [21]:
# Linear Regression
lr = LinearRegression()
lr_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('lr', lr)
])
lr_pipe.fit(x_train, y_train)
lr_pred = lr_pipe.predict(x_test)
lr_score = mean_absolute_error(y_test, lr_pred)
print(f'Linear Regression MAE: {lr_score}')

Linear Regression MAE: 7497.792361904292


In [22]:
# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('dt', dt)
])
dt_pipe.fit(x_train, y_train)
dt_pred = dt_pipe.predict(x_test)
dt_score = mean_absolute_error(y_test, dt_pred)
print(f'Decision Tree MAE: {dt_score}')

Decision Tree MAE: 7064.74


In [23]:
# Random Forest
rf = RandomForestRegressor(random_state=42)
rf_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', rf)
])
rf_pipe.fit(x_train, y_train)
rf_pred = rf_pipe.predict(x_test)
rf_score = mean_absolute_error(y_test, rf_pred)
print(f'Random Forest MAE: {rf_score}')

Random Forest MAE: 5361.895073282313


In [24]:
# Extra Trees
et = ExtraTreesRegressor(random_state=42)
et_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('et', et)
])
et_pipe.fit(x_train, y_train)
et_pred = et_pipe.predict(x_test)
et_score = mean_absolute_error(y_test, et_pred)
print(f'Extra Trees MAE: {et_score}')

Extra Trees MAE: 5431.793599999999


In [25]:
# Bagging
bag = BaggingRegressor(random_state=42)
bag_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('bag', bag)
])
bag_pipe.fit(x_train, y_train)
bag_pred = bag_pipe.predict(x_test)
bag_score = mean_absolute_error(y_test, bag_pred)
print(f'Bagging MAE: {bag_score}')

Bagging MAE: 5605.4004583333335


In [26]:
# AdaBoost
ada = AdaBoostRegressor(random_state=42)
ada_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('ada', ada)
])
ada_pipe.fit(x_train, y_train)
ada_pred = ada_pipe.predict(x_test)
ada_score = mean_absolute_error(y_test, ada_pred)
print(f'AdaBoost MAE: {ada_score}')

AdaBoost MAE: 8285.872533433176


In [27]:
# Gradient Boosting
gb = GradientBoostingRegressor(random_state=42)
gb_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('gb', gb)
])
gb_pipe.fit(x_train, y_train)
gb_pred = gb_pipe.predict(x_test)
gb_score = mean_absolute_error(y_test, gb_pred)
print(f'Gradient Boosting MAE: {gb_score}')

Gradient Boosting MAE: 6486.445359506321


In [28]:
# Voting
vot = VotingRegressor([('lr', lr), ('dt', dt), ('rf', rf), ('et', et), ('bag', bag), ('ada', ada), ('gb', gb)])
vot_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('vot', vot)
])
vot_pipe.fit(x_train, y_train)
vot_pred = vot_pipe.predict(x_test)
vot_score = mean_absolute_error(y_test, vot_pred)
print(f'Voting MAE: {vot_score}')


Voting MAE: 5863.435464195449


In [29]:
submission = pd.DataFrame()
submission['ID'] = final_test['ID']
submission['price'] = rf_pipe.predict(final_test.drop('ID', axis=1))

In [30]:
submission.to_csv('submission1.csv', index=False)

In [31]:
# Stacking
stack = StackingRegressor([('lr', lr), ('dt', dt), ('rf', rf), ('et', et), ('bag', bag), ('ada', ada), ('gb', gb)])
stack_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('stack', stack)
])
stack_pipe.fit(x_train, y_train)
stack_pred = stack_pipe.predict(x_test)
stack_score = mean_absolute_error(y_test, stack_pred)
print(f'Stacking MAE: {stack_score}')

Stacking MAE: 5271.000418999875


In [32]:
submission = pd.DataFrame()
submission['ID'] = final_test['ID']
submission['price'] = stack_pipe.predict(final_test.drop('ID', axis=1))

submission.to_csv('submission2.csv', index=False)

In [33]:
# SVR
svr = SVR()
svr_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('svr', svr)
])
svr_pipe.fit(x_train, y_train)
svr_pred = svr_pipe.predict(x_test)
svr_score = mean_absolute_error(y_test, svr_pred)
print(f'SVR MAE: {svr_score}')


SVR MAE: 9192.654481933048


In [34]:
# KNN
knn = KNeighborsRegressor()
knn_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('knn', knn)
])
knn_pipe.fit(x_train, y_train)
knn_pred = knn_pipe.predict(x_test)
knn_score = mean_absolute_error(y_test, knn_pred)
print(f'KNN MAE: {knn_score}')

KNN MAE: 7355.9784285714295


In [35]:
# Isolation Forest
iso = IsolationForest()
iso_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('iso', iso)
])
iso_pipe.fit(x_train, y_train)
iso_pred = iso_pipe.predict(x_test)
iso_score = mean_absolute_error(y_test, iso_pred)
print(f'Isolation Forest MAE: {iso_score}')

Isolation Forest MAE: 14554.460714285715


So till now the best performing models are:
1. **Stacking Reressor** with 5076 MAE
2. **Random Forest** with 5361 MAE
3. **Extra Trees** with 5431 MAE
4. **Bagging Regressor** with 5605 MAE

In [36]:
rf = RandomForestRegressor(random_state=42)
et = ExtraTreesRegressor(random_state=42)
bag = BaggingRegressor(random_state=42)
lgb = LGBMRegressor(n_estimators=1000, verbosity=0)
cb = CatBoostRegressor(iterations=1000, verbose=0)
xgb = XGBRegressor(n_estimators=1000, verbosity=0)
voting = VotingRegressor([('rf', rf), ('et', et), ('bag', bag),('xgb', xgb)])

stack = StackingRegressor([('voting', voting), ('rf', rf), ('et', et), ('bag', bag), ('lgb', lgb), ('cb', cb), ('xgb', xgb)], final_estimator=lr)
stack_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('stack', stack)
])
stack_pipe.fit(x_train, y_train)
stack_pred = stack_pipe.predict(x_test)
stack_score = mean_absolute_error(y_test, stack_pred)
print(f'Stacking MAE: {stack_score}')

  File "c:\ProgramData\Anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 245, in _count_physical_cores
    raise ValueError(


Stacking MAE: 5076.795878029337


In [37]:
submission = pd.DataFrame()
submission['ID'] = final_test['ID']
submission['price'] = stack_pipe.predict(final_test.drop('ID', axis=1))

submission.to_csv('submission5.csv', index=False)

In [38]:
# Voting
vot = VotingRegressor([('rf', rf), ('et', et)])
vot_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('vot', vot)
])
vot_pipe.fit(x_train, y_train)
vot_pred = vot_pipe.predict(x_test)
vot_score = mean_absolute_error(y_test, vot_pred)
print(f'Voting MAE: {vot_score}')


Voting MAE: 5282.622563545918


In [39]:
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
et = ExtraTreesRegressor(random_state=42)
bag = BaggingRegressor(random_state=42)
lgb = LGBMRegressor(n_estimators=1000, verbosity=0)
cb = CatBoostRegressor(iterations=1000, verbose=0)
xgb = XGBRegressor(n_estimators=1000, verbosity=0)
voting = VotingRegressor([('rf', rf), ('et', et), ('xgb', xgb), ('lgb', lgb), ('cb', cb)])

stack = StackingRegressor([('voting', voting), ('dt', dt), ('rf', rf), ('et', et), ('bag', bag), ('lgb', lgb), ('cb', cb), ('xgb', xgb)], final_estimator=lr)
stack_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('stack', stack)
])
stack_pipe.fit(x_train, y_train)
stack_pred = stack_pipe.predict(x_test)
stack_score = mean_absolute_error(y_test, stack_pred)
print(f'Stacking MAE: {stack_score}')

Stacking MAE: 5075.099352118044


In [40]:
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
et = ExtraTreesRegressor(random_state=42)
bag = BaggingRegressor(random_state=42)
lgb = LGBMRegressor(n_estimators=1500, verbosity=0)
cb = CatBoostRegressor(iterations=1500, verbose=0)
xgb = XGBRegressor(n_estimators=1500, verbosity=0)
voting = VotingRegressor([('rf', rf), ('et', et)])

stack = StackingRegressor([('voting', voting), ('dt', dt), ('rf', rf), ('et', et), ('bag', bag), ('lgb', lgb), ('cb', cb), ('xgb', xgb)], final_estimator=lr)
stack_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('stack', stack)
])
stack_pipe.fit(x_train, y_train)
stack_pred = stack_pipe.predict(x_test)
stack_score = mean_absolute_error(y_test, stack_pred)
print(f'Stacking MAE: {stack_score}')

Stacking MAE: 5071.718291975172


In [41]:
submission = pd.DataFrame()
submission['ID'] = final_test['ID']
submission['price'] = stack_pipe.predict(final_test.drop('ID', axis=1))

submission.to_csv('submission6.csv', index=False)

In [42]:
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
et = ExtraTreesRegressor(random_state=42)
bag = BaggingRegressor(random_state=42)
lgb = LGBMRegressor(n_estimators=1500)
cb = CatBoostRegressor(iterations=1500)
gbr = GradientBoostingRegressor(random_state=42, n_estimators=100, learning_rate=0.1)

voting = VotingRegressor([('rf', rf), ('et', et)])

stack = StackingRegressor([('voting', voting), ('dt', dt), ('rf', rf), ('et', et), ('bag', bag), ('lgb', lgb), ('cb', cb)], final_estimator=gbr)
stack_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('stack', stack)
])
stack_pipe.fit(x_train, y_train)
stack_pred = stack_pipe.predict(x_test)
stack_score = mean_absolute_error(y_test, stack_pred)
print(f'Stacking MAE: {stack_score}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000857 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 900
[LightGBM] [Info] Number of data points in the train set: 5600, number of used features: 152
[LightGBM] [Info] Start training from score 14369.717857
Learning rate set to 0.038657
0:	learn: 11621.9290537	total: 6.9ms	remaining: 10.3s
1:	learn: 11523.5989800	total: 13.2ms	remaining: 9.87s
2:	learn: 11430.9476484	total: 19.8ms	remaining: 9.88s
3:	learn: 11328.6824322	total: 26.7ms	remaining: 9.99s
4:	learn: 11223.3443929	total: 33ms	remaining: 9.87s
5:	learn: 11128.7622633	total: 39.6ms	remaining: 9.85s
6:	learn: 11056.1056945	total: 46.2ms	remaining: 9.85s
7:	learn: 10992.8722905	total: 52.7ms	remaining: 9.84s
8:	learn: 10920.2142675	total: 60.1ms	remaining: 9.96s
9:	learn: 10852.1056322	total: 66.5ms	remaining: 9.91s
10:	learn: 10787.3251049	total: 74.6ms	remaining: 10.1s
11:	learn: 10730.49276

In [43]:
# stack 1
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
et = ExtraTreesRegressor(random_state=42)
bag = BaggingRegressor(random_state=42)
lgb = LGBMRegressor(n_estimators=1500, verbosity=0)
cb = CatBoostRegressor(iterations=1500, verbose=0)
xgb = XGBRegressor(n_estimators=1500, verbosity=0)
voting = VotingRegressor([('rf', rf), ('et', et)])

stack1 = StackingRegressor([('voting', voting), ('dt', dt), ('rf', rf), ('et', et), ('bag', bag), ('lgb', lgb), ('cb', cb), ('xgb', xgb)], final_estimator=lr)

# stack 2
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
et = ExtraTreesRegressor(random_state=42)
bag = BaggingRegressor(random_state=42)
lgb = LGBMRegressor(n_estimators=1000, verbosity=0)
cb = CatBoostRegressor(iterations=1000, verbose=0)
xgb = XGBRegressor(n_estimators=1000, verbosity=0)
voting = VotingRegressor([('rf', rf), ('et', et), ('xgb', xgb), ('lgb', lgb), ('cb', cb)])

stack2 = StackingRegressor([('voting', voting), ('dt', dt), ('rf', rf), ('et', et), ('bag', bag), ('lgb', lgb), ('cb', cb), ('xgb', xgb)], final_estimator=lr)

# stack 3

stacl3 = StackingRegressor([('stack1', stack1), ('stack2', stack2)], final_estimator=lr)
stack_pipe3 = Pipeline([
    ('preprocessor', preprocessor),
    ('stack', stack)
])
stack_pipe3.fit(x_train, y_train)
stack_pred = stack_pipe3.predict(x_test)
stack_score = mean_absolute_error(y_test, stack_pred)
print(f'Stacking MAE: {stack_score}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000642 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 900
[LightGBM] [Info] Number of data points in the train set: 5600, number of used features: 152
[LightGBM] [Info] Start training from score 14369.717857
Learning rate set to 0.038657
0:	learn: 11621.9290537	total: 7.8ms	remaining: 11.7s
1:	learn: 11523.5989800	total: 14ms	remaining: 10.5s
2:	learn: 11430.9476484	total: 21.3ms	remaining: 10.6s
3:	learn: 11328.6824322	total: 27.3ms	remaining: 10.2s
4:	learn: 11223.3443929	total: 33.9ms	remaining: 10.1s
5:	learn: 11128.7622633	total: 40.6ms	remaining: 10.1s
6:	learn: 11056.1056945	total: 46.6ms	remaining: 9.94s
7:	learn: 10992.8722905	total: 53.5ms	remaining: 9.98s
8:	learn: 10920.2142675	total: 59.9ms	remaining: 9.93s
9:	learn: 10852.1056322	total: 66.3ms	remaining: 9.88s
10:	learn: 10787.3251049	total: 73.2ms	remaining: 9.91s
11:	learn: 10730.49276

In [45]:
reg = VotingRegressor([
    ("forest", RandomForestRegressor(random_state = 42, n_estimators = 145)),
    ("extra", ExtraTreesRegressor(random_state = 42, n_estimators = 140)),
])

reg_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('reg', reg)
])
reg_pipe.fit(x_train, y_train)
reg_pred = reg_pipe.predict(x_test)
reg_score = mean_absolute_error(y_test, reg_pred)
print(f'Voting MAE: {reg_score}')

Voting MAE: 5281.652097190946


In [46]:
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state = 42, n_estimators = 145)
et = ExtraTreesRegressor(random_state = 42, n_estimators = 140)
bag = BaggingRegressor(random_state=42)
lgb = LGBMRegressor(n_estimators=1500, verbosity=0)
cb = CatBoostRegressor(iterations=1500, verbose=0)
xgb = XGBRegressor(n_estimators=1500, verbosity=0)
voting = VotingRegressor([
    ("rf", RandomForestRegressor(random_state = 42, n_estimators = 145)),
    ("et", ExtraTreesRegressor(random_state = 42, n_estimators = 140)),
])

stack = StackingRegressor([('voting', voting), ('dt', dt), ('rf', rf), ('et', et), ('bag', bag), ('lgb', lgb), ('cb', cb), ('xgb', xgb)], final_estimator=lr)
stack_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('stack', stack)
])
stack_pipe.fit(x_train, y_train)
stack_pred = stack_pipe.predict(x_test)
stack_score = mean_absolute_error(y_test, stack_pred)
print(f'Stacking MAE: {stack_score}')

Stacking MAE: 5071.033139383548


In [47]:
submission = pd.DataFrame()
submission['ID'] = final_test['ID']
submission['price'] = stack_pipe.predict(final_test.drop('ID', axis=1))

submission.to_csv('submission7.csv', index=False)

In [49]:
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state = 42, n_estimators = 145)
et = ExtraTreesRegressor(random_state = 42, n_estimators = 140)
bag = BaggingRegressor(n_estimators=1000, random_state=42)
lgb = LGBMRegressor(n_estimators=1500, verbosity=0)
cb = CatBoostRegressor(iterations=1500, verbose=0)
xgb = XGBRegressor(n_estimators=1500, verbosity=0)
voting = VotingRegressor([
    ("rf", RandomForestRegressor(random_state = 42, n_estimators = 145)),
    ("et", ExtraTreesRegressor(random_state = 42, n_estimators = 140)),
])

stack = StackingRegressor([('voting', voting), ('dt', dt), ('rf', rf), ('et', et), ('bag', bag), ('lgb', lgb), ('cb', cb), ('xgb', xgb)], final_estimator=lr)
stack_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('stack', stack)
])
stack_pipe.fit(x_train, y_train)
stack_pred = stack_pipe.predict(x_test)
stack_score = mean_absolute_error(y_test, stack_pred)
print(f'Stacking MAE: {stack_score}')

Stacking MAE: 5066.42973698983


In [50]:
submission = pd.DataFrame()
submission['ID'] = final_test['ID']
submission['price'] = stack_pipe.predict(final_test.drop('ID', axis=1))

submission.to_csv('submission8.csv', index=False)

In [52]:
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state = 42, n_estimators = 145)
et = ExtraTreesRegressor(random_state = 42, n_estimators = 140)
bag = BaggingRegressor(n_estimators=1000, random_state=42)
lgb = LGBMRegressor(n_estimators=1500, verbosity=0)
cb = CatBoostRegressor(iterations=1500, verbose=0)
xgb = XGBRegressor(n_estimators=1500, verbosity=0)
voting = VotingRegressor([
    ("rf", RandomForestRegressor(random_state = 42, n_estimators = 145)),
    ("et", ExtraTreesRegressor(random_state = 42, n_estimators = 140)),
])

stack1 = StackingRegressor([('voting', voting), ('dt', dt), ('rf', rf), ('et', et), ('bag', bag), ('lgb', lgb), ('cb', cb), ('xgb', xgb)], final_estimator=lr)
stack2 = StackingRegressor([('voting', voting), ('dt', dt), ('rf', rf), ('et', et), ('bag', bag), ('lg', lgb), ('cb', cb), ('xgb', xgb)], final_estimator=stack1)
stack_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('stack', stack2)
])
stack_pipe.fit(x_train, y_train)
stack_pred = stack_pipe.predict(x_test)
stack_score = mean_absolute_error(y_test, stack_pred)
print(f'Stacking MAE: {stack_score}')

Stacking MAE: 5240.783966432438


In [53]:
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state = 42, n_estimators = 145)
et = ExtraTreesRegressor(random_state = 42, n_estimators = 140)
bag = BaggingRegressor(n_estimators=1300, random_state=42)
lgb = LGBMRegressor(n_estimators=1500, verbosity=0)
cb = CatBoostRegressor(iterations=1500, verbose=0)
xgb = XGBRegressor(n_estimators=1500, verbosity=0)
voting = VotingRegressor([
    ("rf", RandomForestRegressor(random_state = 42, n_estimators = 145)),
    ("et", ExtraTreesRegressor(random_state = 42, n_estimators = 140)),
])

stack = StackingRegressor([('voting', voting), ('dt', dt), ('rf', rf), ('et', et), ('bag', bag), ('lgb', lgb), ('cb', cb), ('xgb', xgb)], final_estimator=lr)
stack_pipe4 = Pipeline([
    ('preprocessor', preprocessor),
    ('stack', stack)
])
stack_pipe4.fit(x_train, y_train)
stack_pred4 = stack_pipe4.predict(x_test)
stack_score4 = mean_absolute_error(y_test, stack_pred4)
print(f'Stacking MAE: {stack_score4}')

Stacking MAE: 5065.320675904615


In [54]:
submission = pd.DataFrame()
submission['ID'] = final_test['ID']
submission['price'] = stack_pipe4.predict(final_test.drop('ID', axis=1))

submission.to_csv('submission10.csv', index=False)