In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [2]:
data = pd.read_csv('./data/model_data.csv', index_col=0)
data.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,60437,2016-04-06 10:17:21


## Format Data

**Repeating missing values fill process for all columns.**

In [3]:
percentage_of_missing = data.isnull().sum().sort_values(ascending=False) * 100 / len(data)
percentage_of_missing

notRepairedDamage      19.455079
vehicleType            10.217059
fuelType                8.935956
model                   5.350893
gearbox                 5.339391
dateCrawled             0.000000
kilometer               0.000000
postalCode              0.000000
dateCreated             0.000000
brand                   0.000000
monthOfRegistration     0.000000
powerPS                 0.000000
name                    0.000000
yearOfRegistration      0.000000
abtest                  0.000000
price                   0.000000
offerType               0.000000
seller                  0.000000
lastSeen                0.000000
dtype: float64

In [4]:
for col in data.columns:
    data[col] = np.where(data[col].isna(), 'non-specified', data[col])
percentage_of_missing = data.isnull().sum().sort_values(ascending=False) * 100 / len(data)
percentage_of_missing

dateCrawled            0.0
model                  0.0
postalCode             0.0
dateCreated            0.0
notRepairedDamage      0.0
brand                  0.0
fuelType               0.0
monthOfRegistration    0.0
kilometer              0.0
powerPS                0.0
name                   0.0
gearbox                0.0
yearOfRegistration     0.0
vehicleType            0.0
abtest                 0.0
price                  0.0
offerType              0.0
seller                 0.0
lastSeen               0.0
dtype: float64

Before fitting a model, it is important to transform the variables to a format that provides better interpretation for the model, according to each variable types. Here, we apply the following transformations for each feature type:

**1. Numerical variables: 0-1 normalization.**

Machine learning algorithms tend to perform better, or converge faster, when the different features are on a smaller scale. Before training machine learning models on data, it’s common practice to normalize the data first to potentially get better, faster results. Normalization also makes the training process less sensitive to the scale of the features, resulting in better coefficients after training.

**2. Categorical variables: One Hot Encoding.**

One hot encoding is a technique that we use to represent categorical variables as numerical values in a machine learning model. It allows the use of categorical variables in models that require numerical input. It can improve model performance by providing more information to the model about the categorical variable. It can help to avoid the problem of ordinality, which can occur when a categorical variable has a natural ordering (e.g. “small”, “medium”, “large”).

In [5]:
# removing columns based on data exploration on data-etl.ipynb notebook
cols_to_drop = ['name', 'dateCrawled', 'dateCreated', 'lastSeen', 'postalCode']
data = data.drop(cols_to_drop, axis=1)
data.dtypes

seller                 object
offerType              object
price                  object
abtest                 object
vehicleType            object
yearOfRegistration     object
gearbox                object
powerPS                object
model                  object
kilometer              object
monthOfRegistration    object
fuelType               object
brand                  object
notRepairedDamage      object
dtype: object

### Numerical Variables - Normalization

Using sklearn MinMaxScaler to normalize numerical data.

In [6]:
numerical_columns = ['yearOfRegistration', 'kilometer', 'monthOfRegistration', 'powerPS']
data_numerical = data[numerical_columns].astype(int)
scaler = MinMaxScaler()
data_numerical = scaler.fit_transform(data_numerical)
data_numerical = pd.DataFrame(data_numerical, columns=numerical_columns)
data_numerical.head()

Unnamed: 0,yearOfRegistration,kilometer,monthOfRegistration,powerPS
0,0.490196,1.0,0.0,0.0
1,0.843137,0.827586,0.416667,0.39501
2,0.705882,0.827586,0.666667,0.338877
3,0.647059,1.0,0.5,0.155925
4,0.784314,0.586207,0.583333,0.143451


### Categorical Variables - One Hot Encoding

Using sklearn OneHotEncoder to encode categorical data.

In [7]:
categorical_columns = ['seller', 'offerType', 'abtest', 'vehicleType', 'gearbox', 'model', 'fuelType', 'brand', 'notRepairedDamage']
data_categorical = data[categorical_columns]
encoder = OneHotEncoder(sparse=False)
data_categorical = encoder.fit_transform(data_categorical)
data_categorical = pd.DataFrame(data_categorical, columns=encoder.get_feature_names_out()).astype(int)
data_categorical.head()

Unnamed: 0,seller_gewerblich,seller_privat,offerType_Angebot,offerType_Gesuch,abtest_control,abtest_test,vehicleType_andere,vehicleType_bus,vehicleType_cabrio,vehicleType_coupe,...,brand_sonstige_autos,brand_subaru,brand_suzuki,brand_toyota,brand_trabant,brand_volkswagen,brand_volvo,notRepairedDamage_ja,notRepairedDamage_nein,notRepairedDamage_non-specified
0,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Joining data and adding price column

In [8]:
df = pd.concat([data_numerical, data_categorical], axis=1)
df['price'] = data['price']
df.head()

Unnamed: 0,yearOfRegistration,kilometer,monthOfRegistration,powerPS,seller_gewerblich,seller_privat,offerType_Angebot,offerType_Gesuch,abtest_control,abtest_test,...,brand_subaru,brand_suzuki,brand_toyota,brand_trabant,brand_volkswagen,brand_volvo,notRepairedDamage_ja,notRepairedDamage_nein,notRepairedDamage_non-specified,price
0,0.490196,1.0,0.0,0.0,0,1,1,0,0,1,...,0,0,0,0,1,0,0,0,1,480
1,0.843137,0.827586,0.416667,0.39501,0,1,1,0,0,1,...,0,0,0,0,0,0,1,0,0,18300
2,0.705882,0.827586,0.666667,0.338877,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,1,9800
3,0.647059,1.0,0.5,0.155925,0,1,1,0,0,1,...,0,0,0,0,1,0,0,1,0,1500
4,0.784314,0.586207,0.583333,0.143451,0,1,1,0,0,1,...,0,0,0,0,0,0,0,1,0,3600


## XGBoost

XGBoost is a popular and efficient open-source implementation of the gradient boosted trees algorithm. Gradient boosting is a supervised learning algorithm, which attempts to accurately predict a target variable by combining the estimates of a set of simpler, weaker models. For more informations about XGBoost, [check its documentation](https://xgboost.readthedocs.io/en/stable/).

Before training the models and compare then, we split our dataset into train and test sets.

1. The training set is applied to train, or fit, your model.

2. The test set is needed for an unbiased evaluation of the final model. You shouldn’t use it for fitting or validation.

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('price', axis=1), data['price'], test_size=0.2, random_state=15)
print(f"Train Shape : {X_train.shape} --------- Test Shape : {X_test.shape}")

Train Shape : (292123, 324) --------- Test Shape : (73031, 324)


After the data is splitted into train and test sets, we apply cross-validation, a model validation techniques for assessing how the results of a statistical analysis will generalize to an independent data set. Cross-validation is a resampling method that uses different portions of the data to test and train a model on different iterations. Basically, it splits the already splitted train set into another train and test sets. This is done multiple times, so we can get the average accuracy value, as well as the standard deviation for each metric used to evaluate the model. In this notebook (and in many others regression analysis cases), we use the R-Squared (R²), Mean Absolute Error(MAE) and Mean Squared Error(MSE) metrics to evaluate the fitted model.

In [10]:
scores = {"r2_score" : make_scorer(r2_score), "mae" : make_scorer(mean_absolute_error), "mse" : make_scorer(mean_squared_error)}

In [11]:
model = xgb.XGBRegressor(tree_method="hist", n_estimators = 1000)
result = cross_validate(model, X_train, y_train, cv=5, verbose=0, scoring=scores)

for key, value in result.items():
    print(f"{key} : {np.mean(value)} +- {np.std(value)}")
    
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(f"R2 Score : {r2_score(y_test, pred)}")

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


fit_time : 24.10170135498047 +- 0.6605981437130986
score_time : 0.3957973003387451 +- 0.03399696589831099
test_r2_score : 0.8815876066021738 +- 0.0030328933082487807
test_mae : 1127.0908198771408 +- 8.439390777893637
test_mse : 4118423.929364069 +- 115375.66888408948


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R2 Score : 0.8856918313564333


As we can see in the outputs, the XGBoost model achieve a 0.8815 R-Squared coeficient. It means that, on average, 88.15% of the variability observed in the target variable is explained by the regression model. For the selected test set (obtained in the split before the cross-validation step), the model presetned a R-Squared coefficient of 0,8857, which is close to the average value obtained with cross-validation. This means that the model has a good generalization power while being able to explain well the existing relationships between the explanatory variables and the used-car prices variable.