# Szacowanie cen samochodów

In [1]:
import pandas as pd
import sklearn
import coremltools as ct
import numpy as np

## Wczytanie danych

Wczytajmy dane z pliku CSV, pochodzącego z [tego projektu w serwisie Kaggle](https://www.kaggle.com/lepchenkov/usedcarscatalog).

In [2]:
# Read the car listings stored in cars.csv into a DataFrame.
df = pd.read_csv('cars.csv')
# A bare trailing expression renders the frame as a rich table preview.
df

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,...,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,...,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,...,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,...,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,...,True,False,True,True,False,False,False,False,True,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,Chrysler,300,automatic,silver,290000,2000,gasoline,False,gasoline,3.5,...,True,False,False,True,True,False,False,True,True,301
38527,Chrysler,PT Cruiser,mechanical,blue,321000,2004,diesel,False,diesel,2.2,...,True,False,False,True,True,False,False,True,True,317
38528,Chrysler,300,automatic,blue,777957,2000,gasoline,False,gasoline,3.5,...,True,False,False,True,True,False,False,True,True,369
38529,Chrysler,PT Cruiser,mechanical,black,20000,2001,gasoline,False,gasoline,2.0,...,True,False,False,False,False,False,False,False,True,490


Utwórzmy kolumnę z marką i modelem:

In [3]:
# Combine manufacturer and model into a single label such as "Subaru Outback".
df['make_model'] = df['manufacturer_name'] + ' ' + df['model_name']
df

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed,make_model
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,...,True,True,False,True,False,True,True,True,16,Subaru Outback
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,...,False,False,True,True,False,False,False,True,83,Subaru Outback
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,...,False,False,False,False,False,False,True,True,151,Subaru Forester
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,...,False,False,False,False,False,False,False,False,86,Subaru Impreza
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,...,False,True,True,False,False,False,False,True,7,Subaru Legacy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,Chrysler,300,automatic,silver,290000,2000,gasoline,False,gasoline,3.5,...,False,False,True,True,False,False,True,True,301,Chrysler 300
38527,Chrysler,PT Cruiser,mechanical,blue,321000,2004,diesel,False,diesel,2.2,...,False,False,True,True,False,False,True,True,317,Chrysler PT Cruiser
38528,Chrysler,300,automatic,blue,777957,2000,gasoline,False,gasoline,3.5,...,False,False,True,True,False,False,True,True,369,Chrysler 300
38529,Chrysler,PT Cruiser,mechanical,black,20000,2001,gasoline,False,gasoline,2.0,...,False,False,False,False,False,False,False,True,490,Chrysler PT Cruiser


## Analiza danych

Sprawdźmy, jaki model jest najpopularniejszy:

In [4]:
# Count the listings per make/model label; value_counts sorts most frequent first.
df['make_model'].value_counts()

Volkswagen Passat      1423
Opel Astra              751
Volkswagen Golf         707
Audi A6                 687
Ford Mondeo             637
                       ... 
Honda Logo                1
Rover Streetwise          1
Rover 3500                1
Renault Grand Modus       1
Chrysler Aspen            1
Name: make_model, Length: 1157, dtype: int64

Utwórzmy DataFrame zawierający rekordy dotyczące tego modelu i interesujące nas atrybuty:

In [5]:
# Keep only the Volkswagen Passat rows and the columns used for price modelling,
# selecting rows and columns in a single .loc call.
df_passat = df.loc[
    df['make_model'] == 'Volkswagen Passat',
    ['odometer_value', 'year_produced', 'engine_fuel', 'engine_capacity', 'price_usd'],
]
# Summary statistics for the numeric columns of the subset.
df_passat.describe()

Unnamed: 0,odometer_value,year_produced,engine_capacity,price_usd
count,1423.0,1423.0,1423.0,1423.0
mean,294548.590302,2000.494027,1.848138,5106.278454
std,125430.340463,8.213471,0.213392,3870.245879
min,1.0,1980.0,1.3,200.0
25%,218500.0,1993.0,1.8,1900.0
50%,298048.0,2001.0,1.8,4250.0
75%,355000.0,2007.0,2.0,7200.0
max,1000000.0,2018.0,3.2,24500.0


## Uczenie i walidacja modelu

Ponieważ atrybut `engine_fuel` jest tekstowy, aby umożliwić wykorzystanie go do regresji, musimy przeprowadzić tzw. [one-hot encoding](https://en.wikipedia.org/wiki/One-hot).

In [6]:
# One-hot encode the text column(s) of the Passat subset (here: engine_fuel),
# producing one indicator column per category.
df_dummy = pd.get_dummies(data=df_passat)
df_dummy

Unnamed: 0,odometer_value,year_produced,engine_capacity,price_usd,engine_fuel_diesel,engine_fuel_gas,engine_fuel_gasoline,engine_fuel_hybrid-petrol
15077,215000,2009,1.4,7100.0,0,0,1,0
15078,500000,1993,1.8,650.0,0,0,1,0
15082,316000,1996,1.9,2350.0,1,0,0,0
15083,285000,1999,1.9,4650.0,1,0,0,0
15084,375000,1994,1.9,2990.0,1,0,0,0
...,...,...,...,...,...,...,...,...
19297,270000,2007,1.9,7500.0,1,0,0,0
19298,210000,2002,1.9,8400.0,1,0,0,0
19301,270000,1997,1.8,2500.0,0,0,1,0
19303,1000000,1992,1.8,900.0,0,0,1,0


Podzielmy ramkę na zbiór uczący i testowy.

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Every column except the target is a model input. A list (rather than a set)
# keeps the column order stable between runs and is a valid DataFrame indexer;
# recent pandas versions reject sets as indexers.
input_columns = [column for column in df_dummy.columns if column != 'price_usd']

# Fixed seed so the split, and the metrics computed from it, are repeatable
# under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df_dummy[input_columns],
    df_dummy['price_usd'],
    random_state=42,
)

Sprawdźmy, jak radzi sobie regresja liniowa…

In [8]:
from sklearn.linear_model import LinearRegression

# Baseline: fit a linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out split.
predictions = model.predict(X_test)
print('MAE: ', mean_absolute_error(y_test, predictions))
# RMSE is the square root of the MSE. Computing it explicitly avoids the
# `squared=False` argument, which is deprecated and absent from newer
# scikit-learn releases.
print('RMSE: ', np.sqrt(mean_squared_error(y_test, predictions)))

MAE:  1087.188509623915
RMSE:  1467.347338006194


…oraz algorytm [lasu losowego](https://en.wikipedia.org/wiki/Random_forest).

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor; the fixed seed makes the fitted trees (and therefore
# the reported metrics) repeatable between runs.
# NOTE: this rebinds `model`, so later cells that use `model` see the forest.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split.
predictions = model.predict(X_test)
print('MAE: ', mean_absolute_error(y_test, predictions))
# RMSE is the square root of the MSE. Computing it explicitly avoids the
# `squared=False` argument, which is deprecated and absent from newer
# scikit-learn releases.
print('RMSE: ', np.sqrt(mean_squared_error(y_test, predictions)))

MAE:  636.7733172387194
RMSE:  920.8638066035548


## Wygenerowanie modelu Core ML

Czas na wygenerowanie modelu Core ML przy pomocy [coremltools](https://coremltools.readme.io/docs).

In [11]:
# Save the Passat subset (the data the model was trained on) to disk.
df_passat.to_csv('df_passat.csv')

# Convert the most recently fitted `model` to Core ML. The feature names are
# taken from the training frame so their order is exactly the column order the
# model was fitted on.
coreml_model = ct.converters.sklearn.convert(
    model,
    input_features=list(X_train.columns),
    output_feature_names='price_usd',
)
coreml_model.save('RandomForestPassatPriceUsd.mlmodel')

