In [9]:
import sklearn
import pandas as pd
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import svm
from datetime import datetime
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import pickle
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


import joblib

import warnings
warnings.filterwarnings('ignore')

In [10]:
# Load the prepared table; it holds the feature columns plus the 'prijs' target.
# NOTE(review): unpickling can execute arbitrary code — only load data.pkl
# when it comes from a trusted source.
df = pd.read_pickle(filepath_or_buffer="./data.pkl")

In [11]:
# Names of the input (feature) columns. Not referenced elsewhere in the visible
# cells; presumably kept as documentation of the expected schema.
target_names = [
    "titel",
    "bouwjaar",
    "kilometer_stand",
    "vermogen",
    "is_handgeschakeld",
    "is_benzine",
    "upload_datum",
    "apk",
]

# Columns that get mean-imputation followed by standardisation.
numeric_features = ["bouwjaar", "kilometer_stand", "vermogen", "upload_datum", "apk"]

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# 'titel' is given as a bare string (not a list) so the vectorizer receives a
# 1-D column of text. Every column not named here is forwarded untouched.
# NOTE(review): with remainder="passthrough" the output layout depends on
# whatever extra columns the input frame carries — confirm that inference-time
# frames have the same columns as the training frame.
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, numeric_features),
        ("text", CountVectorizer(), "titel"),
    ],
    remainder="passthrough",
)

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Features are every column except the price. The target is taken as a 1-D
# Series for both splits: a single-column DataFrame target makes scikit-learn
# emit a DataConversionWarning when fitting the forest, and that warning is
# hidden by the notebook-wide warning filter.
X_train = df_train.drop(columns=['prijs'])
y_train = df_train['prijs']

X_test = df_test.drop(columns=['prijs'])
y_test = df_test['prijs']
X_test

Unnamed: 0,titel,bouwjaar,kilometer_stand,vermogen,is_handgeschakeld,is_benzine,upload_datum,apk
83,fiat grande punto dynamic,2007,69724.0,78,1.0,1,17,181
299,fiat grande punto active,2006,100976.0,65,1.0,1,10,277
271,fiat grande punto dynamic nette auto airco,2007,177374.0,78,1.0,1,28,181
58,fiat grande punto dynamic,2008,186935.0,77,1.0,1,14,231
422,fiat grande punto dynamic automaat airco sept,2007,74016.0,78,1.0,1,0,78
...,...,...,...,...,...,...,...,...
395,fiat grande punto dynamic airco,2007,224911.0,95,1.0,1,8,81
90,fiat grande punto edizione lusso,2009,223623.0,67,1.0,1,15,181
128,fiat grande punto actual airco,2011,203598.0,84,1.0,0,38,134
4,fiat grande punto active airco rijdt prima,2008,138191.0,65,1.0,1,12,380


In [13]:
# Fit the preprocessing on the training rows only, then apply the fitted
# transform to the held-out rows.
# The inputs are rebuilt from df_train/df_test instead of reading X_train/X_test,
# so re-running this cell never feeds an already-transformed matrix back into
# the preprocessor (the cell is idempotent).
X_train = preprocessor.fit_transform(df_train.drop(columns=['prijs']))
X_test = preprocessor.transform(df_test.drop(columns=['prijs']))

In [14]:
def evaluate_model(y_test, y_pred):
    """Print the mean absolute and mean squared error of a prediction.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Each metric is truncated to an integer with ``int()`` before printing.
    Nothing is returned.
    """
    report = (
        ("Mean absolute error:\t", mean_absolute_error),
        ("Mean squared error:\t", mean_squared_error),
    )
    for label, metric in report:
        print(label, int(metric(y_test, y_pred)))

In [15]:
# Baseline: a single regression tree.
# random_state is fixed (same seed as the train/test split) because tree
# construction permutes features at each split; without a seed the metrics
# printed below can differ from run to run.
dtree = tree.DecisionTreeRegressor(random_state=42)
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)
evaluate_model(y_test, y_pred)

Mean absolute error:	 543
Mean squared error:	 445133


In [16]:
# Random forest of 100 trees.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the printed metrics and the model
# persisted later, are reproducible on Restart & Run All.
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
evaluate_model(y_test, y_pred)

Mean absolute error:	 393
Mean squared error:	 236616


In [18]:
# Persist the fitted forest and the fitted preprocessor as two separate files
# in the working directory. The forest was trained on the preprocessor's
# output, so a consumer has to run RFPreprocessor.transform on raw rows before
# calling predict on the loaded model.
joblib.dump(value=random_forest, filename='RandomForestRegressor')
joblib.dump(value=preprocessor, filename='RFPreprocessor')

['RFPreprocessor']