## Naivni Bayesov klasifikator
za napovedovanje cene rabljenega avtomobila.

In [1]:
import pandas as pd

# Load the listings and discretize the continuous columns by rounding:
# price to thousands, power to tens of kW, mileage to hundreds of thousands.
avti = pd.read_csv("../podatki/rabljeni_avtomobili.csv").assign(
    cena=lambda tabela: tabela["cena"].round(-3),
    kilovati=lambda tabela: tabela["kilovati"].round(-1),
    stevilo_kilometrov=lambda tabela: tabela["stevilo_kilometrov"].round(-5),
)

Avtomobile razdelimo na učno in testno množico.

In [34]:
import numpy as np

# Fixed seed so the train/test split (and every number derived from it)
# is the same on each Restart & Run All.
SEME = 42
# Probability with which a row is assigned to the training set.
DELEZ_UCNIH = 0.8

# Boolean mask over the rows of `avti`: True -> training set, False -> test set.
maska = np.random.default_rng(SEME).random(len(avti)) < DELEZ_UCNIH

# Columns used as features by the classifier.
parametri = [
    "ime_znamke",
    "id_modela",
    "menjalnik",
    "vrsta_motorja",
    "tip_modela",
    "kilovati",
    "leto_izdelave",
    "stevilo_kilometrov"
]

# Keep only the features plus the target and drop rows with any missing value.
# A single .loc call selects rows and columns at once (no chained indexing).
stolpci = parametri + ["cena"]
učni_avti = avti.loc[maska, stolpci].dropna()
testni_avti = avti.loc[~maska, stolpci].dropna()

### Izvedba klasifikatorja

In [35]:
# Number of training cars observed at each (rounded) price.
pojavitve_cen = učni_avti.groupby("cena").size()
# Relative frequency of each price among the training cars.
verjetnosti_cen = pojavitve_cen.div(len(učni_avti))

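$p_i$ ... vrednost $i$-tega parametra  
$c$ ... cena

Pogojna verjetnost, da ima $i$-ti parameter vrednost $p_i$, če je cena avtomobila enaka $c$:
$$P(P_i = p_i \mid C = c)$$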

In [44]:
def _pogojne_verjetnosti_parametra(parameter):
    """Build the table of value frequencies of one parameter within each price.

    Rows are the values of `parameter`, columns are the prices. Each cell is
    the number of training cars with that (price, value) pair divided by the
    number of training cars with that price. Pairs that never occur in the
    training set are filled with 1 / len(učni_avti) instead of being left
    missing (presumably so that one unseen pair does not wipe out a product).
    """
    skupne_pojavitve = učni_avti.groupby(["cena", parameter]).size()
    delezi = skupne_pojavitve / pojavitve_cen
    return delezi.unstack().T.fillna(1 / len(učni_avti))


# Stack the per-parameter tables; the outer index level is the parameter name.
pogojne_verjetnosti_parametrov = pd.concat(
    list(map(_pogojne_verjetnosti_parametra, parametri)),
    keys=parametri,
)
pogojne_verjetnosti_parametrov

Unnamed: 0,cena,0.0,1000.0,2000.0,3000.0,4000.0,5000.0,6000.0,7000.0,8000.0,9000.0,...,325000.0,330000.0,339000.0,348000.0,350000.0,370000.0,390000.0,439000.0,450000.0,507000.0
ime_znamke,ALFA ROMEO,0.010526,0.000105,0.000105,0.000105,0.000105,0.003650,0.000105,0.000105,0.000105,0.000105,...,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105
ime_znamke,AUDI,0.021053,0.000105,0.003195,0.000105,0.000105,0.007299,0.000105,0.007067,0.002532,0.003021,...,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105
ime_znamke,Alfa Romeo,0.021053,0.026923,0.006390,0.020619,0.013966,0.040146,0.036212,0.017668,0.012658,0.012085,...,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105
ime_znamke,Alpine,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,...,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105
ime_znamke,Aston Martin,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,...,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
stevilo_kilometrov,1500000.0,0.000105,0.000105,0.000105,0.000105,0.000105,0.003650,0.000105,0.000105,0.000105,0.000105,...,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105
stevilo_kilometrov,1600000.0,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,...,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105
stevilo_kilometrov,2400000.0,0.000105,0.000105,0.000105,0.000105,0.000105,0.003650,0.000105,0.000105,0.000105,0.000105,...,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105
stevilo_kilometrov,2900000.0,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.002532,0.000105,...,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105


In [37]:
def napovej_ceno(avtomobil: pd.Series):
    """Predict the price of a single car.

    Selects the rows of ``pogojne_verjetnosti_parametrov`` whose
    (parameter, value) key matches an entry of ``avtomobil``, multiplies them
    together per price column, weights the result by ``verjetnosti_cen`` and
    normalizes the weights to sum to 1.

    Entries of ``avtomobil`` without a matching row contribute no factor.

    Returns the weighted mean of the prices, rounded to the nearest hundred.
    """
    # (label, value) pairs of this car, in the same shape as the table's index keys.
    pari = set(zip(avtomobil.index, avtomobil))
    ujemanja = pogojne_verjetnosti_parametrov.index.isin(pari)
    izbrane_vrstice = pogojne_verjetnosti_parametrov[ujemanja]

    utezi = izbrane_vrstice.product(numeric_only=True) * verjetnosti_cen
    utezi = utezi / utezi.sum()

    pricakovana_cena = (utezi.index * utezi).sum()
    return pricakovana_cena.round(-2)

def napovej_ceno_join(avtomobil: pd.Series):
    """Predict the price of a single car, selecting table rows with a join.

    Builds an empty frame indexed by the (label, value) pairs of ``avtomobil``
    and inner-joins it with ``pogojne_verjetnosti_parametrov``, so only rows
    whose key appears in both survive. The surviving rows are multiplied
    together per price column, weighted by ``verjetnosti_cen`` and normalized
    to sum to 1.

    Returns the weighted mean of the prices, rounded to the nearest hundred.
    """
    kljuci = pd.MultiIndex.from_arrays(
        [avtomobil.index, avtomobil], names=None
    )
    samo_kljuci = pd.DataFrame(index=kljuci)
    ujemajoce_vrstice = samo_kljuci.join(
        pogojne_verjetnosti_parametrov, how="inner"
    )

    utezi = ujemajoce_vrstice.product() * verjetnosti_cen
    utezi = utezi / utezi.sum()
    return (utezi.index * utezi).sum().round(-2)

In [40]:
# Takes roughly 20 s (presumably the author's measured runtime).
# Only the feature columns are handed to the predictor, so the target `cena`
# is never part of a row that is used for prediction.
testni_avti["napovedana_cena"] = testni_avti[parametri].apply(napovej_ceno, axis=1)

In [39]:
# Takes roughly 40 s (presumably the author's measured runtime).
# Join-based variant of the prediction; it writes to the same column and so
# replaces any values stored there before. Only the feature columns are
# handed to the predictor, so neither the target `cena` nor an existing
# `napovedana_cena` column is part of a row that is used for prediction.
testni_avti["napovedana_cena"] = testni_avti[parametri].apply(napovej_ceno_join, axis=1)

### Rezultati
Napaka napovedi

In [41]:
# Root of the summed squared prediction errors divided by the number of test cars.
odstopanja = testni_avti["napovedana_cena"] - testni_avti["cena"]
vsota_kvadratov = odstopanja.pow(2).sum()
RMSE = (vsota_kvadratov / len(testni_avti)) ** 0.5
RMSE

6927.077910092422

In [42]:
# Show 10 test cars with their actual and predicted price; the fixed
# random_state makes the same rows appear on every run.
testni_avti.sample(10, random_state=42)

Unnamed: 0,ime_znamke,id_modela,menjalnik,vrsta_motorja,tip_modela,kilovati,leto_izdelave,stevilo_kilometrov,cena,napovedana_cena
2152,Audi,164.0,AVTOMATSKI,DIEZEL,KARAVAN,160.0,2016.0,200000.0,22000.0,23500.0
3482,BMW,253.0,ROČNI,DIEZEL,COUPÉ,110.0,2015.0,100000.0,16000.0,18600.0
22280,Volkswagen,2651.0,ROČNI,DIEZEL,KOMBI,80.0,2005.0,200000.0,5000.0,5200.0
969,Audi,145.0,ROČNI,DIEZEL,KARAVAN,100.0,2012.0,200000.0,11000.0,11400.0
20787,Volkswagen,2638.0,AVTOMATSKI,DIEZEL,KARAVAN,110.0,2016.0,200000.0,16000.0,19600.0
1115,Audi,145.0,AVTOMATSKI,DIEZEL,KARAVAN,110.0,2013.0,200000.0,12000.0,16300.0
15301,Renault,2204.0,AVTOMATSKI,DIEZEL,KOMBI,120.0,2017.0,100000.0,18000.0,18900.0
2248,Audi,164.0,AVTOMATSKI,DIEZEL,KARAVAN,170.0,2019.0,100000.0,45000.0,41400.0
6691,Ford,945.0,ROČNI,BENCIN,SUV,70.0,2019.0,0.0,19000.0,18000.0
8569,Jeep,1301.0,AVTOMATSKI,DIEZEL,SUV,150.0,2017.0,100000.0,47000.0,40400.0
