In [1]:
import sklearn
import numpy as np
import pandas as pd
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import re
import pickle
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the SQLite database, relative to the notebook's working directory.
DB_PATH = '../db.sqlite3'
conn = sqlite3.connect(DB_PATH)

In [3]:
# Load every row and column of the scraper_auto table into a DataFrame.
listings_query = "SELECT * FROM scraper_auto"
df_raw = pd.read_sql_query(listings_query, conn)

In [4]:
# Show the raw table as loaded, before any column selection or filtering.
df_raw

Unnamed: 0,id,kenteken,bouwjaar,kilometer_stand,vermogen,is_handgeschakeld,is_benzine,prijs,url,titel,upload_datum,bron,apk
0,1159,96-GZR-3,2008,117910,77,0,1,1.0,https://www.marktplaats.nl/a/auto-s/fiat/m1554...,Fiat Grande Punto 1.4 Edizione Lusso AUTOMAAT!,2020-05-23 15:01:00,marktplaats,2021-05-22
1,1160,90-ZJ-ZN,2008,169504,78,0,1,2950.0,https://www.marktplaats.nl/a/auto-s/fiat/m1556...,Fiat Grande Punto 1.4-16V Emotion 5dr Automaat...,2020-05-27 13:22:00,marktplaats,2020-10-18
2,1161,03-TR-LV,2007,219718,65,1,1,1950.0,https://www.marktplaats.nl/a/auto-s/fiat/m1558...,"Fiat Grande Punto 1.2 Edizione Cool AIRCO,CRUI...",2020-06-01 16:00:00,marktplaats,2020-12-12
3,1162,61-TF-FP,2006,199908,95,1,1,1895.0,https://www.marktplaats.nl/a/auto-s/fiat/m1558...,Fiat Grande Punto 1.4-16V Emotion AIRCO/APK,2020-06-01 14:11:00,marktplaats,2021-01-17
4,1163,62-ZL-SX,2008,138191,65,1,1,2740.0,https://www.marktplaats.nl/a/auto-s/fiat/m1558...,Fiat Grande Punto 1.2 Active Airco_rijdt prima,2020-06-01 13:56:00,marktplaats,2021-06-28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
716,1910,84ZNGL,2008,149336,77,1,0,3450.0,https://www.autoscout24.nl/aanbod/fiat-grande-...,Fiat Grande Punto1.4 Dynamic *NIEUW BINNEN*,,autoscout,2020-05-28
717,1911,92PDH9,2011,190999,84,1,0,3450.0,https://www.autoscout24.nl/aanbod/fiat-punto-e...,Fiat Punto Evo1.3 M-Jet Dynamic / AIRCO / LMV ...,,autoscout,2021-01-01
718,1912,HJ044H,2009,111000,77,1,1,3450.0,https://www.autoscout24.nl/aanbod/fiat-punto-1...,Fiat Punto1.4 Active,,autoscout,2020-03-01
719,1913,58NKP8,2010,128835,86,1,0,3450.0,https://www.autoscout24.nl/aanbod/fiat-punto-e...,Fiat Punto Evo1.3 M-JET DYNAMIC,,autoscout,2021-02-01


In [13]:
# Columns kept for the rest of the notebook; identifiers and free-text
# columns (id, kenteken, url, titel, bron) are left behind in df_raw.
target_names = [
    "bouwjaar",
    "kilometer_stand",
    "vermogen",
    "prijs",
    "is_handgeschakeld",
    "is_benzine",
    "upload_datum",
    "apk",
]
df = df_raw[target_names]

# Summary statistics of the numeric columns, to spot implausible values.
df.describe()

Unnamed: 0,bouwjaar,kilometer_stand,vermogen,prijs,is_handgeschakeld,is_benzine
count,721.0,721.0,721.0,713.0,721.0,721.0
mean,2009.803051,167935.106796,83.280166,22704480.0,0.973648,0.558946
std,2.886278,66361.033031,13.007979,270737400.0,0.160292,0.496858
min,2005.0,7183.0,51.0,1.0,0.0,0.0
25%,2007.0,120688.0,77.0,2250.0,1.0,0.0
50%,2010.0,174724.0,84.0,2850.0,1.0,1.0
75%,2011.0,211763.0,86.0,3999.0,1.0,1.0
max,2018.0,368661.0,165.0,3445143000.0,1.0,1.0


In [14]:
# Plausibility bounds. All comparisons below are strict, so a row whose value
# equals a bound is dropped, and rows with a missing value in a compared
# column are dropped too (a comparison against NaN is False).
MAX_PRICE = 30000
MIN_PRICE = 1000
MIN_BOUWJAAR = 2000
MAX_VERMOGEN = 500

# One combined mask selects exactly the rows that the three successive
# filters would have kept.
plausible_rows = (
    (df['prijs'] > MIN_PRICE)
    & (df['prijs'] < MAX_PRICE)
    & (df['bouwjaar'] > MIN_BOUWJAAR)
    & (df['vermogen'] < MAX_VERMOGEN)
)

# .copy() makes the result an independent frame, so later column assignments
# are not chained assignments on a slice of df_raw.
df = df[plausible_rows].copy()

# Only the last expression of a cell is rendered, so summarise once, here.
df.describe()

Unnamed: 0,bouwjaar,kilometer_stand,vermogen,prijs,is_handgeschakeld,is_benzine
count,690.0,690.0,690.0,690.0,690.0,690.0
mean,2009.904348,165483.478261,83.313043,3721.226087,0.973913,0.556522
std,2.880717,65140.762338,12.709115,2374.295289,0.15951,0.497155
min,2005.0,7183.0,51.0,1195.0,0.0,0.0
25%,2007.0,118711.75,77.0,2296.0,1.0,0.0
50%,2010.0,172319.5,84.0,2899.5,1.0,1.0
75%,2011.0,209939.75,86.0,3999.0,1.0,1.0
max,2018.0,367084.0,165.0,13900.0,1.0,1.0


In [15]:
# Parse both date columns. DataFrame.assign returns a new frame instead of
# writing into the (possibly sliced) existing one, which avoids chained
# assignment; column order is unchanged.
df = df.assign(
    upload_datum=pd.to_datetime(df['upload_datum']),
    apk=pd.to_datetime(df['apk']),
)

# Mean of each date column expressed as a proleptic Gregorian ordinal; the
# *_to_ordinal helpers use these as the stand-in for rows without a date.
# NOTE(review): the means are taken over the full frame, i.e. before the
# train/test split further down, so test rows influence the imputed value.
average_upload_datum = df['upload_datum'].mean().toordinal()
average_apk = df['apk'].mean().toordinal()

def upload_datum_to_ordinal(date, fallback_ordinal=None, today=None):
    """Return the absolute number of days between ``date`` and a reference day.

    Parameters
    ----------
    date : datetime-like or missing value
        Upload date of a listing. Any value for which ``pd.isna`` is true
        (``pd.NaT``, ``None``, ``np.nan``, ...) counts as missing.
    fallback_ordinal : int, optional
        Ordinal used in place of a missing ``date``. Defaults to the
        module-level ``average_upload_datum``.
    today : date or datetime, optional
        Reference day. Defaults to ``datetime.now()``; pass a fixed value to
        make the result reproducible.

    Returns
    -------
    int
        ``abs(reference ordinal - date ordinal)``.
    """
    reference = (datetime.now() if today is None else today).toordinal()

    # pd.isna also recognises None / NaN, which an identity check against
    # pd.NaT would let through to .toordinal() and crash on.
    if pd.isna(date):
        if fallback_ordinal is None:
            fallback_ordinal = average_upload_datum
        return abs(reference - fallback_ordinal)

    return abs(reference - date.toordinal())
    
def apk_to_ordinal(date, fallback_ordinal=None, today=None):
    """Return the absolute number of days between ``date`` and a reference day.

    Parameters
    ----------
    date : datetime-like or missing value
        Value from the ``apk`` column. Any value for which ``pd.isna`` is
        true (``pd.NaT``, ``None``, ``np.nan``, ...) counts as missing.
    fallback_ordinal : int, optional
        Ordinal used in place of a missing ``date``. Defaults to the
        module-level ``average_apk``.
    today : date or datetime, optional
        Reference day. Defaults to ``datetime.now()``; pass a fixed value to
        make the result reproducible.

    Returns
    -------
    int
        ``abs(reference ordinal - date ordinal)``.
    """
    reference = (datetime.now() if today is None else today).toordinal()

    # pd.isna also recognises None / NaN, which an identity check against
    # pd.NaT would let through to .toordinal() and crash on.
    if pd.isna(date):
        if fallback_ordinal is None:
            fallback_ordinal = average_apk
        return abs(reference - fallback_ordinal)

    # NOTE(review): abs() discards the sign, so a date N days in the past and
    # a date N days in the future produce the same value. If this column is
    # an expiry date, expired and still-valid entries become
    # indistinguishable - confirm that this is intended.
    return abs(reference - date.toordinal())

# Replace both date columns by integer day distances. DataFrame.assign builds
# a new frame rather than writing into the existing (filtered) one, so this
# is not a chained assignment.
df = df.assign(
    upload_datum=df['upload_datum'].apply(upload_datum_to_ordinal).astype(int),
    apk=df['apk'].apply(apk_to_ordinal).astype(int),
)

# def parse_titel(titel):
#     cleaned = re.sub('[^a-zA-Z]+', ' ', titel.lower())
#     stripped = line = re.sub(r'\b\w{1,3}\b', '', cleaned)
    
#     return stripped

# df['titel'] = df['titel'].apply(parse_titel)

In [16]:
# Discard every row that still holds a missing value in any column,
# then display the cleaned frame.
df = df.dropna(axis=0, how='any')
df

Unnamed: 0,bouwjaar,kilometer_stand,vermogen,prijs,is_handgeschakeld,is_benzine,upload_datum,apk
1,2008,169504,78,2950.0,0,1,7,137
2,2007,219718,65,1950.0,1,1,2,192
3,2006,199908,95,1895.0,1,1,2,228
4,2008,138191,65,2740.0,1,1,2,390
5,2006,139057,65,2950.0,1,1,2,290
...,...,...,...,...,...,...,...,...
716,2008,149336,77,3450.0,1,0,13,6
717,2011,190999,84,3450.0,1,0,13,212
718,2009,111000,77,3450.0,1,1,13,94
719,2010,128835,86,3450.0,1,0,13,243


In [17]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# partition repeatable across runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Persist both partitions as pickles next to the notebook.
for split_frame, pickle_path in (
    (df_train, "./train_data.pkl"),
    (df_test, "./test_data.pkl"),
):
    split_frame.to_pickle(pickle_path)