In [2]:
import sklearn
import numpy as np
import pandas as pd
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import re
import pickle
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

In [11]:
# Open the SQLite database that sits one directory above this notebook.
conn = sqlite3.connect(database='../db.sqlite3')

In [14]:
# Pull every row and column of the `scraper_auto` table into a DataFrame.
df_raw = pd.read_sql_query(
    sql="SELECT * FROM scraper_auto",
    con=conn,
)

In [15]:
# Number of rows per distinct value of the `bron` column, most frequent first.
df_raw.loc[:, 'bron'].value_counts()

marktplaats    415
autotrack      269
autoscout      196
Name: bron, dtype: int64

In [5]:
# Columns carried forward from the raw table into the working frame `df`.
target_names = [
    "bouwjaar",
    "kilometer_stand",
    "vermogen",
    "prijs",
    "is_handgeschakeld",
    "is_benzine",
    "upload_datum",
    "apk",
]
df = df_raw.loc[:, target_names]

# Summary statistics of the numeric columns (displayed as the cell output).
df.describe()

Unnamed: 0,bouwjaar,kilometer_stand,vermogen,prijs,is_handgeschakeld,is_benzine
count,852.0,851.0,852.0,843.0,850.0,852.0
mean,2009.516432,169374.957697,82.86385,19204800.0,0.970588,0.586854
std,2.821099,64903.897649,13.232452,249096500.0,0.169057,0.492688
min,2005.0,7183.0,51.0,1.0,0.0,0.0
25%,2007.0,124985.0,77.0,2249.0,1.0,0.0
50%,2009.0,174637.0,84.0,2799.0,1.0,1.0
75%,2011.0,211476.0,86.0,3950.0,1.0,1.0
max,2018.0,368661.0,165.0,3445143000.0,1.0,1.0


In [6]:
# Exclusive plausibility bounds. The unfiltered `prijs` column ranges from 1 up
# to ~3.4e9 (see the describe() output above), so implausible rows are removed.
MAX_PRICE = 30000
MIN_PRICE = 1000
MIN_BOUWJAAR = 2000
MAX_VERMOGEN = 500

# One combined mask instead of three successive re-filterings. A row whose
# compared value is NaN fails the comparison and is therefore dropped too,
# exactly as with the sequential filters.
plausible = (
    (df['prijs'] > MIN_PRICE)
    & (df['prijs'] < MAX_PRICE)
    & (df['bouwjaar'] > MIN_BOUWJAAR)
    & (df['vermogen'] < MAX_VERMOGEN)
)

# .copy() makes `df` an independent frame, so later column assignments do not
# write into a filter-derived frame (the source of SettingWithCopyWarning).
df = df[plausible].copy()

# Only the last expression of a cell is rendered, so describe() is called once.
df.describe()

Unnamed: 0,bouwjaar,kilometer_stand,vermogen,prijs,is_handgeschakeld,is_benzine
count,815.0,814.0,815.0,815.0,813.0,815.0
mean,2009.611043,166563.05774,82.788957,3580.354601,0.97048,0.588957
std,2.821839,63329.683151,12.857741,2237.300992,0.169364,0.492325
min,2005.0,7183.0,51.0,1150.0,0.0,0.0
25%,2007.0,121570.25,77.0,2250.0,1.0,0.0
50%,2010.0,171868.0,84.0,2850.0,1.0,1.0
75%,2011.0,208738.5,86.0,3950.0,1.0,1.0
max,2018.0,367084.0,165.0,13900.0,1.0,1.0


In [7]:
# Parse both date columns. `assign` returns a new frame rather than writing
# into a frame that was produced by filtering, which avoids the
# SettingWithCopyWarning that direct column assignment triggers there.
df = df.assign(
    upload_datum=pd.to_datetime(df['upload_datum']),
    apk=pd.to_datetime(df['apk']),
)

# Mean of each date column as a proleptic Gregorian ordinal. These values are
# the stand-in for missing (NaT) dates in upload_datum_to_ordinal / apk_to_ordinal.
average_upload_datum = df['upload_datum'].mean().toordinal()
average_apk = df['apk'].mean().toordinal()

def upload_datum_to_ordinal(date, fallback_ordinal=None):
    """Return the absolute number of days between today and ``date``.

    Args:
        date: A ``pd.Timestamp`` / ``datetime``, or a missing value
            (``pd.NaT``, ``None``, ``NaN``).
        fallback_ordinal: Proleptic Gregorian ordinal used in place of a
            missing ``date``. When omitted, the module-level
            ``average_upload_datum`` is used.

    Returns:
        int: ``abs(today - date)`` in days, with today taken from
        ``datetime.now().toordinal()``.
    """
    # Evaluate "today" once so both branches use the same reference day.
    today = datetime.now().toordinal()

    # pd.isna also recognises None / NaN / numpy NaT; an identity check against
    # pd.NaT would let those through and fail on `.toordinal()`.
    if pd.isna(date):
        if fallback_ordinal is None:
            fallback_ordinal = average_upload_datum
        return abs(today - fallback_ordinal)

    return abs(today - date.toordinal())
    
def apk_to_ordinal(date, fallback_ordinal=None):
    """Return the absolute number of days between today and the ``apk`` date.

    Args:
        date: A ``pd.Timestamp`` / ``datetime``, or a missing value
            (``pd.NaT``, ``None``, ``NaN``).
        fallback_ordinal: Proleptic Gregorian ordinal used in place of a
            missing ``date``. When omitted, the module-level ``average_apk``
            is used.

    Returns:
        int: ``abs(today - date)`` in days, with today taken from
        ``datetime.now().toordinal()``. Because of ``abs``, past and future
        dates at the same distance give the same value.
    """
    # Evaluate "today" once so both branches use the same reference day.
    today = datetime.now().toordinal()

    # pd.isna also recognises None / NaN / numpy NaT; an identity check against
    # pd.NaT would let those through and fail on `.toordinal()`.
    if pd.isna(date):
        if fallback_ordinal is None:
            fallback_ordinal = average_apk
        return abs(today - fallback_ordinal)

    return abs(today - date.toordinal())

# Replace both date columns by integer day distances from today. `assign`
# builds a new frame, so nothing is written into a filter-derived frame
# (direct column assignment there raises SettingWithCopyWarning).
# NOTE: this step is not idempotent - after it runs the columns hold ints, so
# re-running it requires re-running the date parsing first.
df = df.assign(
    upload_datum=df['upload_datum'].apply(upload_datum_to_ordinal).astype(int),
    apk=df['apk'].apply(apk_to_ordinal).astype(int),
)

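# NOTE: disabled title clean-up. 'titel' is not among the selected columns of
# `df`, so this cannot run as-is; kept for reference only.
# def parse_titel(titel):
#     cleaned = re.sub('[^a-zA-Z]+', ' ', titel.lower())
#     stripped = re.sub(r'\b\w{1,3}\b', '', cleaned)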
    
#     return stripped

# df['titel'] = df['titel'].apply(parse_titel)

In [8]:
# Discard every row that still contains a missing value in any column,
# then display the resulting frame.
df = df.dropna(axis=0, how='any')
df

Unnamed: 0,bouwjaar,kilometer_stand,vermogen,prijs,is_handgeschakeld,is_benzine,upload_datum,apk
1,2008,169504.0,78,2950.0,0.0,1,16,128
2,2007,219718.0,65,1950.0,1.0,1,11,183
3,2006,199908.0,95,1895.0,1.0,1,11,219
4,2008,138191.0,65,2740.0,1.0,1,11,381
5,2006,139057.0,65,2950.0,1.0,1,11,281
...,...,...,...,...,...,...,...,...
847,2011,219116.0,86,2700.0,1.0,0,10,63
848,2010,255591.0,86,1999.0,1.0,0,10,230
849,2008,169343.0,120,4650.0,1.0,1,10,148
850,2011,150050.0,86,2945.0,1.0,0,10,53


In [9]:
# Shuffled 80/20 split of the cleaned frame; the fixed seed makes it repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [10]:
# Persist both partitions as pickle files next to the notebook.
for frame, path in ((df_train, "./train_data.pkl"), (df_test, "./test_data.pkl")):
    frame.to_pickle(path)