In [3]:
import sklearn
import numpy as np
import pandas as pd
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import re
import pickle
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Open the SQLite database file that sits one directory above the notebook's
# working directory.
# NOTE(review): no conn.close() is visible in this part of the notebook —
# confirm the connection is released once the data has been read.
conn = sqlite3.connect(database='../db.sqlite3')

In [5]:
# Read every column of the scraper_auto rows whose `bron` column equals
# 'marktplaats' into a DataFrame.
marktplaats_query = "SELECT * FROM scraper_auto WHERE bron='marktplaats'"
df_raw = pd.read_sql_query(marktplaats_query, conn)

In [6]:
# Columns carried forward from the raw table; everything else is discarded.
target_names = [
    "titel",
    "bouwjaar",
    "kilometer_stand",
    "vermogen",
    "prijs",
    "is_handgeschakeld",
    "is_benzine",
    "upload_datum",
    "apk",
]

# Select all rows, restricted to the columns listed above.
df = df_raw.loc[:, target_names]

# Summary statistics of the numeric columns, shown as the cell output.
df.describe()

Unnamed: 0,bouwjaar,kilometer_stand,vermogen,prijs,is_handgeschakeld,is_benzine
count,428.0,427.0,428.0,418.0,426.0,428.0
mean,2007.841121,179076.058548,80.682243,5013.844498,0.955399,0.733645
std,1.741622,58644.197017,13.946769,48792.727034,0.206669,0.442569
min,2005.0,15570.0,51.0,1.0,0.0,0.0
25%,2006.0,138097.0,75.0,1950.0,1.0,0.0
50%,2007.0,177654.0,78.0,2591.5,1.0,1.0
75%,2009.0,211811.5,84.0,3250.0,1.0,1.0
max,2014.0,368661.0,155.0,999999.0,1.0,1.0


In [7]:
# Plausibility bounds for a listing. The raw summary above shows prices as low
# as 1 and as high as 999999, presumably placeholder values.
MAX_PRICE = 30000
MIN_PRICE = 1000
MIN_BOUWJAAR = 2000
MAX_VERMOGEN = 500

# Every bound is exclusive. A comparison against NaN is False, so rows missing
# prijs, bouwjaar or vermogen are removed here as well.
plausible_rows = (
    (df['prijs'] > MIN_PRICE)
    & (df['prijs'] < MAX_PRICE)
    & (df['bouwjaar'] > MIN_BOUWJAAR)
    & (df['vermogen'] < MAX_VERMOGEN)
)

# Take an explicit copy: later cells assign whole columns on `df`, and doing
# that on a filtered slice raises SettingWithCopyWarning (currently hidden by
# the global warnings filter).
df = df.loc[plausible_rows].copy()

# Summary statistics after filtering, shown as the cell output.
df.describe()

Unnamed: 0,bouwjaar,kilometer_stand,vermogen,prijs,is_handgeschakeld,is_benzine
count,397.0,396.0,397.0,397.0,395.0,397.0
mean,2007.924433,174756.439394,80.282116,2726.130982,0.95443,0.743073
std,1.757906,54998.073599,13.204912,887.528954,0.208814,0.43749
min,2005.0,15570.0,51.0,1150.0,0.0,0.0
25%,2006.0,137210.25,75.0,1995.0,1.0,0.0
50%,2007.0,176590.5,78.0,2690.0,1.0,1.0
75%,2009.0,206965.25,84.0,3250.0,1.0,1.0
max,2014.0,367084.0,155.0,6950.0,1.0,1.0


In [8]:
# Parse both date columns into pandas datetimes; values that are missing
# become NaT.
upload_dates = pd.to_datetime(df['upload_datum'])
df['upload_datum'] = upload_dates
apk_dates = pd.to_datetime(df['apk'])
df['apk'] = apk_dates

# Mean date of each column as a proleptic Gregorian ordinal (day number).
# The conversion functions below use these as the stand-in for missing dates.
average_upload_datum = upload_dates.mean().toordinal()
average_apk = apk_dates.mean().toordinal()

def _days_from_today(ordinal):
    """Return the absolute number of days between today and a day ordinal."""
    return abs(datetime.now().toordinal() - ordinal)


def upload_datum_to_ordinal(date):
    """Convert an upload date into its absolute distance from today, in days.

    Parameters
    ----------
    date : datetime-like, or a missing value (pd.NaT, None, NaN)
        The upload date of one listing.

    Returns
    -------
    int
        ``abs(today - date)`` in days. A missing ``date`` is replaced by the
        module-level ``average_upload_datum`` ordinal.
    """
    # pd.isna covers None and NaN as well as the pd.NaT singleton, so a
    # missing value can never reach .toordinal().
    if pd.isna(date):
        return _days_from_today(average_upload_datum)
    return _days_from_today(date.toordinal())


def apk_to_ordinal(date):
    """Convert an ``apk`` date into its absolute distance from today, in days.

    Parameters
    ----------
    date : datetime-like, or a missing value (pd.NaT, None, NaN)
        The ``apk`` date of one listing.

    Returns
    -------
    int
        ``abs(today - date)`` in days. A missing ``date`` is replaced by the
        module-level ``average_apk`` ordinal.

    Notes
    -----
    Because of ``abs``, a date N days in the past and a date N days in the
    future give the same result.
    """
    if pd.isna(date):
        return _days_from_today(average_apk)
    return _days_from_today(date.toordinal())

# Replace each date column by its distance from today in whole days. Missing
# dates are filled with the column average inside the conversion functions.
days_from_upload = df['upload_datum'].apply(upload_datum_to_ordinal)
df['upload_datum'] = days_from_upload.astype(int)

days_from_apk = df['apk'].apply(apk_to_ordinal)
df['apk'] = days_from_apk.astype(int)

def parse_titel(titel):
    """Normalise a listing title into lowercase words of four or more letters.

    Steps:
      1. lowercase the title and fold accented letters to their ASCII base
         letter, so "Citroën" becomes "citroen" rather than being split at
         the accented character;
      2. replace every run of non-letters with a single space;
      3. delete words of one to three characters (the surrounding spaces are
         left in place).

    Parameters
    ----------
    titel : str, or a missing value
        The raw listing title.

    Returns
    -------
    str
        The cleaned title. A non-string input (None, NaN) is returned
        unchanged, so that a later ``dropna()`` can remove the row instead of
        this function raising on ``.lower()``.
    """
    import unicodedata  # local import: only this function needs it

    if not isinstance(titel, str):
        return titel

    # NFKD splits an accented letter into base letter + combining mark;
    # dropping the marks leaves the plain ASCII letter.
    decomposed = unicodedata.normalize('NFKD', titel.lower())
    unaccented = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    cleaned = re.sub('[^a-zA-Z]+', ' ', unaccented)
    return re.sub(r'\b\w{1,3}\b', '', cleaned)

# Overwrite every listing title with its cleaned form.
cleaned_titles = df['titel'].apply(parse_titel)
df['titel'] = cleaned_titles

In [9]:
# Keep only the rows in which no column is missing a value.
df = df.dropna(axis=0, how='any')

In [10]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
split_frames = train_test_split(df, random_state=42, test_size=0.2)
df_train, df_test = split_frames

In [11]:
# Persist both splits as pickle files in the notebook's working directory.
for split_frame, split_path in (
    (df_train, "./train_data.pkl"),
    (df_test, "./test_data.pkl"),
):
    split_frame.to_pickle(split_path)

In [12]:
# Persist the complete cleaned frame (train and test rows together).
df.to_pickle(path="./data.pkl")