In [51]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_validate, cross_val_score
from sklearn import metrics

from scipy import sparse
from scipy.sparse import hstack, vstack

import os
import warnings

warnings.filterwarnings("ignore")

### Import data

In [52]:
link = r"D:/clean_df.csv"
df = pd.read_csv(link)

df.head()

Unnamed: 0,maker,model,mileage,manufacture_year,engine_displacement,engine_power,body_type,stk_year,transmission,door_count,seat_count,fuel_type,date_created,date_last_seen,price_eur
0,ford,galaxy,151000.0,2011.0,2000.0,138.12506,compact,,man,5.0,7.0,diesel,2015-11-14 18:10:06.838319+00,2016-01-27 20:40:15.46361+00,10584.75
1,skoda,octavia,143476.0,2012.0,2000.0,108.62262,compact,,man,5.0,5.0,diesel,2015-11-14 18:10:06.853411+00,2016-01-27 20:40:15.46361+00,8882.31
2,bmw,,97676.0,2010.0,1995.0,113.9867,compact,,man,5.0,5.0,diesel,2015-11-14 18:10:06.861792+00,2016-01-27 20:40:15.46361+00,12065.06
3,skoda,fabia,111970.0,2004.0,1200.0,84.48426,compact,,man,5.0,5.0,gasoline,2015-11-14 18:10:06.872313+00,2016-01-27 20:40:15.46361+00,2960.77
4,skoda,fabia,128886.0,2004.0,1200.0,84.48426,compact,,man,5.0,5.0,gasoline,2015-11-14 18:10:06.880335+00,2016-01-27 20:40:15.46361+00,2738.71


### Create man_period, stk_period

In [53]:
df['date_created'] = pd.to_datetime(df['date_created'])
df['year_created'] = df['date_created'].dt.year

df['man_period'] = df['year_created'] - df['manufacture_year']
df['stk_period'] = df['year_created'] - df['stk_year']

In [None]:
df = df.drop(['manufacture_year', 'stk_year', 'date_created', 'date_last_seen', 'year_created'], axis=1)

### Convert columns

In [None]:
class Indicator(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        nonnull_X = np.nan_to_num(X.astype(float), nan=0).astype(int)

        missing_indicator = MissingIndicator()
        indicator_values = missing_indicator.fit_transform(X).astype(int)

        return np.c_[nonnull_X, indicator_values]

In [None]:
numerical_columns = ['engine_power', 'mileage', 'engine_displacement']
conv_to_string_columns = ['door_count', 'seat_count', 'man_period', 'stk_period']
categorical_columns = ['maker', 'model', 'body_type', 'transmission', 'fuel_type']+conv_to_string_columns

In [None]:
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
indicator = Indicator()

full_pipeline = ColumnTransformer([
    ("num", indicator, numerical_columns),
    ("cat", one_hot_encoder , categorical_columns),
])

In [24]:
# def datatype_converter(x):

#     for i in conv_to_string_columns:
#         x[i] = x[i].apply(lambda x: str(x) if not pd.isnull(x) else x)
    
#     for i in categorical_columns:
#         x[i] = x[i].astype('category')
    
#     x[numerical_columns] = x[numerical_columns].apply(pd.to_numeric, downcast="float")
#     x['price_eur'] = x['price_eur'].apply(pd.to_numeric, downcast='float')

#     return x

# df = datatype_converter(df)

### Split into train, val, test sets

In [33]:
X = df.drop('price_eur', axis=1)
y = df['price_eur']

full_pipeline.fit(X)
X_trans = full_pipeline.transform(X)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_trans, y, test_size=0.2, random_state=42)

## Model

### Decision tree

In [32]:
dt_reg = DecisionTreeRegressor(random_state=42)

dt_param_grid = {
    'criterion': ['gini', 'entropy'],  # Splitting criterion
    'splitter': ['best', 'random'],     # Strategy for choosing splits
    'max_depth': [10, 20, 30],     # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],    # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],      # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt', 'log2']  # Number of features to consider for the best split
}

dt_rs = RandomizedSearchCV(
    estimator=dt_reg,
    param_distributions=dt_param_grid,
    scoring = 'neg_mean_squared_error',
    n_iter=30,  # Number of parameter candidate settings to sample
    verbose=2,  # The higher this is, the more messages are outputed
    random_state=42,
    refit = True
)

In [34]:
dt_rs.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END criterion=gini, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, splitter=best; total time=   0.6s
[CV] END criterion=gini, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, splitter=best; total time=   0.5s
[CV] END criterion=gini, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, splitter=best; total time=   0.6s
[CV] END criterion=gini, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, splitter=best; total time=   0.5s
[CV] END criterion=gini, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, splitter=best; total time=   0.5s
[CV] END criterion=gini, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=2, splitter=best; total time=   0.5s
[CV] END criterion=gini, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=2, splitter=best; total time=   0.5s
[CV] EN

ValueError: could not convert string to float: 'mini'

In [44]:
dt_reg.fit(X_train, y_train)

ValueError: could not convert string to float: 'mini'