In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
                                                                
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor, AdaBoostRegressor,\
                            GradientBoostingRegressor, VotingRegressor, StackingRegressor, HistGradientBoostingRegressor

from sklearn.pipeline import Pipeline

In [30]:
train = pd.read_csv('train_ready_zakaria.csv')
test = pd.read_csv('test_ready_zakaria.csv')

In [31]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Brand              3000 non-null   object 
 1   VehicleModel       3000 non-null   object 
 2   ManufacturingYear  3000 non-null   int64  
 3   Type               3000 non-null   object 
 4   rating             3000 non-null   float64
 5   color              3000 non-null   object 
 6   Duty               3000 non-null   float64
 7   fuel               3000 non-null   object 
 8   CylinderCount      3000 non-null   int64  
 9   type of gear       3000 non-null   object 
 10  Odometer           3000 non-null   int64  
 11  airbags            3000 non-null   int64  
 12  Engine Volume      3000 non-null   float64
 13  Engine Type        3000 non-null   object 
 14  ID                 3000 non-null   int64  
dtypes: float64(3), int64(5), object(7)
memory usage: 351.7+ KB


In [32]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         7000 non-null   int64  
 1   Brand              7000 non-null   object 
 2   VehicleModel       7000 non-null   object 
 3   ManufacturingYear  7000 non-null   int64  
 4   Type               7000 non-null   object 
 5   rating             7000 non-null   float64
 6   color              7000 non-null   object 
 7   Duty               7000 non-null   float64
 8   fuel               7000 non-null   object 
 9   CylinderCount      7000 non-null   int64  
 10  type of gear       7000 non-null   object 
 11  Odometer           7000 non-null   int64  
 12  airbags            7000 non-null   int64  
 13  price              7000 non-null   int64  
 14  Engine Volume      7000 non-null   float64
 15  Engine Type        7000 non-null   object 
dtypes: float64(3), int64(6),

In [33]:
X = train.drop(['price', 'Unnamed: 0', 'rating', 'color'], axis=1)
y = train['price']

cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(exclude='object').columns

In [34]:
cat_features, num_features

(Index(['Brand', 'VehicleModel', 'Type', 'fuel', 'type of gear', 'Engine Type'], dtype='object'),
 Index(['ManufacturingYear', 'Duty', 'CylinderCount', 'Odometer', 'airbags',
        'Engine Volume'],
       dtype='object'))

In [35]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(exclude='object').columns

cat_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

num_transformer = Pipeline([
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('cat', cat_transformer, cat_features),
    ('num', num_transformer, num_features)
])

x_train_preprocessed = preprocessor.fit_transform(x_train)
x_test_preprocessed = preprocessor.transform(x_test)

x_train_preprocessed = pd.DataFrame(x_train_preprocessed)

In [37]:
# Random Forest
rf = RandomForestRegressor(random_state=42, n_estimators=145)
rf_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', rf)
])
rf_pipe.fit(x_train, y_train)
rf_pred = rf_pipe.predict(x_test)
rf_score = mean_absolute_error(y_test, rf_pred)
print(f'Random Forest MAE: {rf_score}')

Random Forest MAE: 4991.706226278442


In [None]:
et = ExtraTreesRegressor(n_estimators = 140, random_state=42)
et_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('et', et)
])
et_pipe.fit(x_train, y_train)
et_pred = et_pipe.predict(x_test)
et_score = mean_absolute_error(y_test, et_pred)
print(f'Extra Trees MAE: {et_score}')

Extra Trees MAE: 5330.619663265306


In [38]:
# Random Forest
rf = RandomForestRegressor(random_state=42)
rf_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', rf)
])
rf_pipe.fit(x_train, y_train)
rf_pred = rf_pipe.predict(x_test)
rf_score = mean_absolute_error(y_test, rf_pred)
print(f'Random Forest MAE: {rf_score}')

Random Forest MAE: 4987.405340960885


In [40]:
to_predict = test.drop(['ID', 'rating', 'color'], axis=1)

In [41]:
to_predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Brand              3000 non-null   object 
 1   VehicleModel       3000 non-null   object 
 2   ManufacturingYear  3000 non-null   int64  
 3   Type               3000 non-null   object 
 4   Duty               3000 non-null   float64
 5   fuel               3000 non-null   object 
 6   CylinderCount      3000 non-null   int64  
 7   type of gear       3000 non-null   object 
 8   Odometer           3000 non-null   int64  
 9   airbags            3000 non-null   int64  
 10  Engine Volume      3000 non-null   float64
 11  Engine Type        3000 non-null   object 
dtypes: float64(2), int64(4), object(6)
memory usage: 281.4+ KB


In [43]:
submission = pd.DataFrame()
submission['ID'] = test['ID']
submission['price'] = rf_pipe.predict(to_predict)
submission.to_csv('late_submission_zakaria_rf_dropping_cols.csv', index=False)

In [44]:
# Random Forest
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', rf)
])
rf_pipe.fit(x_train, y_train)
rf_pred = rf_pipe.predict(x_test)
rf_score = mean_absolute_error(y_test, rf_pred)
print(f'Random Forest MAE: {rf_score}')

Random Forest MAE: 4987.405340960885


In [14]:
submission = pd.DataFrame()
submission['ID'] = test['ID']
submission['price'] = rf_pipe.predict(test.drop('ID', axis=1))
submission.to_csv('late_submission_zakaria_rf.csv', index=False)

In [150]:
# Extra Trees
et = ExtraTreesRegressor(random_state=42)
et_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('et', et)
])
et_pipe.fit(x_train, y_train)
et_pred = et_pipe.predict(x_test)
et_score = mean_absolute_error(y_test, et_pred)
print(f'Extra Trees MAE: {et_score}')

Extra Trees MAE: 5340.067375


In [151]:
voting = VotingRegressor([
    ('rf', rf),
    ('et', et)
])
voting_pipe2 = Pipeline([
    ('preprocessor', preprocessor),
    ('voting', voting)
])
voting_pipe2.fit(x_train, y_train)
voting_pred2 = voting_pipe2.predict(x_test)
voting_score2 = mean_absolute_error(y_test, voting_pred2)
print(f'Voting Regressor MAE: {voting_score2}')

Voting Regressor MAE: 5061.747388290816


In [152]:
# Random Forest
rf = RandomForestRegressor(random_state=42)
rf_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', rf)
])
rf_pipe.fit(x_train, y_train)
rf_pred = rf_pipe.predict(x_test)
rf_score = mean_absolute_error(y_test, rf_pred)
print(f'Random Forest MAE: {rf_score}')

Random Forest MAE: 5059.394404438775


In [153]:
et = ExtraTreesRegressor(n_estimators = 140, random_state=42)
et_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('et', et)
])
et_pipe.fit(x_train, y_train)
et_pred = et_pipe.predict(x_test)
et_score = mean_absolute_error(y_test, et_pred)
print(f'Extra Trees MAE: {et_score}')

Extra Trees MAE: 5330.619663265306


In [154]:
# Voting Regressor
voting = VotingRegressor([
    ('rf', rf),
    ('et', et)
])
voting_pipe3 = Pipeline([
    ('preprocessor', preprocessor),
    ('voting', voting)
])
voting_pipe3.fit(x_train, y_train)
voting_pred3 = voting_pipe3.predict(x_test)
voting_score3 = mean_absolute_error(y_test, voting_pred3)
print(f'Voting Regressor MAE: {voting_score3}')

Voting Regressor MAE: 5060.198907423469


In [167]:
# stacking
voting = VotingRegressor([
    ('rf', RandomForestRegressor(random_state=42)),
    ('et', ExtraTreesRegressor(random_state=42, n_estimators=140))
])
dt = DecisionTreeRegressor(random_state=42)
bag = BaggingRegressor(n_estimators=1300, random_state=42)
lgb = LGBMRegressor(n_estimators=1500, verbosity=0)
cb = CatBoostRegressor(iterations=1500, verbose=0)
xgb = XGBRegressor(n_estimators=1500, verbosity=0)

stacking = StackingRegressor([
    ('voting', voting),
    ('dt', dt),
    ('rf',rf),
    ('et',  et),
    ('bag', bag),
    ('lgb', lgb),
    ('cb', cb),
    ('xgb', xgb)
], final_estimator=LinearRegression())

stacking_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('stacking', stacking)
])
stacking_pipe.fit(x_train, y_train)
stacking_pred = stacking_pipe.predict(x_test)
stacking_score = mean_absolute_error(y_test, stacking_pred)
print(f'Stacking Regressor MAE: {stacking_score}')

Stacking Regressor MAE: 4909.757368808491


In [168]:
submission = pd.DataFrame()
submission['ID'] = test['ID']
submission['price'] = stacking_pipe.predict(test.drop('ID', axis=1))
submission.to_csv('submission_zakaria_2.csv', index=False)

In [170]:
# stacking
voting = VotingRegressor([
    ('rf', RandomForestRegressor(random_state=42)),
    ('et', ExtraTreesRegressor(random_state=42, n_estimators=140))
])
dt = DecisionTreeRegressor(random_state=42)
bag = BaggingRegressor(n_estimators=1300, random_state=42)
lgb = LGBMRegressor(n_estimators=1500, verbosity=0)
cb = CatBoostRegressor(iterations=1500, verbose=0)
xgb = XGBRegressor(n_estimators=1500, verbosity=0)
hstgb = GradientBoostingRegressor()

stacking = StackingRegressor([
    ('voting', voting),
    ('dt', dt),
    ('rf',rf),
    ('et',  et),
    ('bag', bag),
    ('lgb', lgb),
    ('cb', cb),
    ('xgb', xgb), 
    ('hstgb', hstgb)
], final_estimator=LinearRegression())

stacking_pipe2 = Pipeline([
    ('preprocessor', preprocessor),
    ('stacking', stacking)
])
stacking_pipe2.fit(x_train, y_train)
stacking_pred2 = stacking_pipe2.predict(x_test)
stacking_score2 = mean_absolute_error(y_test, stacking_pred2)
print(f'Stacking Regressor MAE: {stacking_score2}')

Stacking Regressor MAE: 4900.675839867733


In [171]:
submission = pd.DataFrame()
submission['ID'] = test['ID']
submission['price'] = stacking_pipe2.predict(test.drop('ID', axis=1))
submission.to_csv('submission_zakaria_3_last_hope.csv', index=False)

In [47]:
# stacking
rf = RandomForestRegressor(random_state=42)
et = ExtraTreesRegressor(random_state=42, n_estimators=140)

voting = VotingRegressor([
    ('rf', rf),
    ('et', et)
])
dt = DecisionTreeRegressor(random_state=42)
bag = BaggingRegressor(n_estimators=1300, random_state=42)
lgb = LGBMRegressor(n_estimators=1500, verbosity=0)
cb = CatBoostRegressor(iterations=1500, verbose=0)
xgb = XGBRegressor(n_estimators=1500, verbosity=0)
gb = GradientBoostingRegressor()
ada = AdaBoostRegressor()

stacking = StackingRegressor([
    ('voting', voting),
    ('dt', dt),
    ('rf',rf),
    ('et',  et),
    ('bag', bag),
    ('lgb', lgb),
    ('cb', cb),
    ('xgb', xgb), 
    ('gb', gb),
    ('ada', ada)
], final_estimator=LinearRegression())

stacking_pipe2 = Pipeline([
    ('preprocessor', preprocessor),
    ('stacking', stacking)
])
stacking_pipe2.fit(x_train, y_train)
stacking_pred2 = stacking_pipe2.predict(x_test)
stacking_score2 = mean_absolute_error(y_test, stacking_pred2)
print(f'Stacking Regressor MAE: {stacking_score2}')

  File "c:\ProgramData\Anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 245, in _count_physical_cores
    raise ValueError(


Stacking Regressor MAE: 4865.056608565888


In [48]:
submission = pd.DataFrame()
submission['ID'] = test['ID']
submission['price'] = stacking_pipe2.predict(test.drop(['ID','rating', 'color'], axis=1))
submission.to_csv('stacking_data_improved.csv', index=False)

In [166]:
submissions = pd.DataFrame()
submissions['ID'] = test['ID']
submissions['price'] = stacking_pipe.predict(test.drop('ID', axis=1))
submissions.to_csv('zakaria_basic_stacking.csv', index=False)

In [None]:
cat_features2 = X.select_dtypes(include='object').columns
num_features2 = X.select_dtypes(exclude='object').columns

In [None]:
ohe = OneHotEncoder(handle_unknown='ignore')
X_encoded = pd.DataFrame(ohe.fit_transform(X[cat_features2]).toarray())

X_processed = pd.concat([X[num_features2], X_encoded], axis=1)

# Now you can safely convert the processed data to float32
X_processed = X_processed.astype(np.float32)
y = y.astype(np.float32)

X_processed.columns = X_processed.columns.astype(str)

In [None]:
offset = int(X_processed.shape[0] * 0.8)
X_train, y_train = X_processed[:offset], y[:offset]
X_test, y_test = X_processed[offset:], y[offset:]

In [None]:
import lazypredict
from lazypredict.Supervised import LazyRegressor

lazy_reg = LazyRegressor(verbose=0,ignore_warnings=False, custom_metric=mean_absolute_error)


models,predictions = lazy_reg.fit(X_train, X_test, y_train, y_test)

  0%|          | 0/42 [00:00<?, ?it/s]

 17%|█▋        | 7/42 [00:06<00:15,  2.19it/s]

ElasticNetCV model failed to execute
Gram matrix passed in via 'precompute' parameter did not pass validation when a single element was checked - please check that it was computed properly. For element (267,268) we computed -4.475705146789551 but the user-supplied value was -4.475739002227783.


 26%|██▌       | 11/42 [01:34<04:26,  8.61s/it]


KeyboardInterrupt: 

In [None]:
models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken,mean_absolute_error
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestRegressor,0.29,0.56,7642.6,12.76,5083.9
ExtraTreesRegressor,0.24,0.53,7931.84,23.34,5320.67
LGBMRegressor,0.21,0.51,8072.09,0.72,5560.46
HistGradientBoostingRegressor,0.2,0.51,8090.1,13.88,5600.52
XGBRegressor,0.19,0.5,8189.5,3.3,5704.3
BaggingRegressor,0.17,0.48,8281.97,1.7,5495.73
GradientBoostingRegressor,0.07,0.43,8743.71,5.24,6424.41
BayesianRidge,-0.2,0.26,9928.25,0.74,7704.2
ElasticNet,-0.2,0.26,9935.01,0.27,7721.12
LarsCV,-0.21,0.25,9966.24,2.25,7728.28


In [None]:
# lgbm
lgbm = LGBMRegressor(random_state=42)
lgbm_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('lgbm', lgbm)
])
lgbm_pipe.fit(x_train, y_train)
lgbm_pred = lgbm_pipe.predict(x_test)
lgbm_score = mean_absolute_error(y_test, lgbm_pred)
print(f'LGBM MAE: {lgbm_score}')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000383 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 904
[LightGBM] [Info] Number of data points in the train set: 5600, number of used features: 199
[LightGBM] [Info] Start training from score 14406.612679
LGBM MAE: 9277.984796648303
