## Import Bibliotek

In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

import eli5

## Upload danych

In [2]:
# Read the car price records and replace every missing value with the -1 sentinel.
df = pd.read_csv('car_prices.csv').fillna(-1)
df.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj0fm227318,fl,4.1,14872.0,gray,black,enterprise veh exchange/rental,13700,14300,Tue Jun 02 2015 02:15:00 GMT-0700 (PDT)
1,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj5fm219943,fl,4.4,16633.0,silver,black,sixt rent a car llc,13600,13600,Tue Jun 02 2015 02:15:00 GMT-0700 (PDT)
2,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj5fm221322,fl,4.8,18384.0,silver,black,sixt rent a car llc,13550,13500,Tue Jun 02 2015 02:15:00 GMT-0700 (PDT)
3,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj5fm273601,fl,2.8,339.0,black,black,florida auto financial group,14300,12900,Tue Jun 02 2015 02:30:00 GMT-0700 (PDT)
4,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj6fm218641,fl,4.9,12655.0,red,black,enterprise veh exchange/rental,13850,14500,Tue May 26 2015 02:15:00 GMT-0700 (PDT)


## Cechy

In [3]:
# Numeric columns are used as-is; text columns are label-encoded below.
num_feats = ['odometer','year','condition']
obj_feats = ['make','body','transmission','color','interior']

# factorize() maps each distinct value of a column to an integer code.
for feat in obj_feats:
    df["{}_cat".format(feat)] = df[feat].factorize()[0]

# Build the encoded-column names from obj_feats instead of scanning df.columns
# for the substring "_cat": the scan would also pick up stale *_cat columns left
# over from an earlier run with a different obj_feats, or any unrelated column
# whose name merely contains "_cat".
cat_feats = ["{}_cat".format(feat) for feat in obj_feats]
feats = cat_feats + num_feats
df.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,...,interior,seller,mmr,sellingprice,saledate,make_cat,body_cat,transmission_cat,color_cat,interior_cat
0,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj0fm227318,fl,4.1,14872.0,...,black,enterprise veh exchange/rental,13700,14300,Tue Jun 02 2015 02:15:00 GMT-0700 (PDT),0,0,0,0,0
1,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj5fm219943,fl,4.4,16633.0,...,black,sixt rent a car llc,13600,13600,Tue Jun 02 2015 02:15:00 GMT-0700 (PDT),0,0,0,1,0
2,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj5fm221322,fl,4.8,18384.0,...,black,sixt rent a car llc,13550,13500,Tue Jun 02 2015 02:15:00 GMT-0700 (PDT),0,0,0,1,0
3,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj5fm273601,fl,2.8,339.0,...,black,florida auto financial group,14300,12900,Tue Jun 02 2015 02:30:00 GMT-0700 (PDT),0,0,0,2,0
4,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj6fm218641,fl,4.9,12655.0,...,black,enterprise veh exchange/rental,13850,14500,Tue May 26 2015 02:15:00 GMT-0700 (PDT),0,0,0,3,0


## Pierwszy model - drzewo decyzyjne i las losowy

In [4]:
# Baseline: a shallow decision tree on all encoded + numeric features.
X = df[feats]
y = df["sellingprice"]

# random_state pins the tie-breaking between equally good splits, so the tree
# (and the MAE below) is the same on every Restart & Run All; the XGBoost cell
# already fixes its seed the same way.
model = DecisionTreeRegressor(max_depth=5, random_state=0)
model.fit(X, y)
y_pred = model.predict(X)

# In-sample error: predictions are made on the same rows the model was fitted on.
mean_absolute_error(y, y_pred)


4273.5898273119055

In [5]:
# Same features and target as the decision tree, fitted with a random forest.
X = df[feats]
y = df["sellingprice"]

# The forest bootstraps rows and samples features at random; without a fixed
# random_state the reported MAE changes from run to run. Seeded with 0 to match
# the XGBoost cell.
model = RandomForestRegressor(max_depth=5, random_state=0)
model.fit(X, y)
y_pred = model.predict(X)

# In-sample error: predictions are made on the same rows the model was fitted on.
mean_absolute_error(y, y_pred)


4204.228560689692

## Drugi model - xgboost

In [6]:
# Gradient-boosted trees on the same feature set and target.
X = df[feats]
y = df["sellingprice"]

model = xgb.XGBRegressor(
    max_depth=5,
    n_estimators=50,
    learning_rate=0.3,
    random_state=0,
)
model.fit(X, y)

# Score on the rows used for fitting (in-sample MAE).
y_pred = model.predict(X)
mean_absolute_error(y, y_pred)

2610.037748873653

## Jak usprawnić model:

 
 1 - feature engineering - tworzymy cechy z już istniejących (np. wersje wyposażenia)
 
 2 - feature engineering - cechy numeryczne zagregowane po markach, latach
 
 3 - dobór modelu i optymalizacja parametrów, walidacja krzyżowa
 
 4 - prognoza na logarytmie (zmiana rozkładu)

## Ważność cech 

In [7]:
# Display the feature weights eli5 reports for the current `model`,
# labelled with the column names in `feats`.
eli5.show_weights(model, feature_names=feats)

Weight,Feature
0.3592,odometer
0.2275,year
0.1287,body_cat
0.1219,make_cat
0.092,condition
0.0499,interior_cat
0.0114,color_cat
0.0095,transmission_cat


# ---> idziemy do Tableau --->

# -------------------------------------------------------------

## Uproszczony model do deploymentu:

In [8]:
# Simplified model: numeric features only, matching the three inputs that
# price_prediction builds its frame from.
X = df[num_feats]
y = df["sellingprice"]

# Fixed seed so that re-running the notebook re-creates the same tree before
# it is deployed (tie-breaking between equal splits is otherwise random).
model = DecisionTreeRegressor(max_depth=5, random_state=0)
model.fit(X, y)
y_pred = model.predict(X)

# In-sample error: predictions are made on the same rows the model was fitted on.
mean_absolute_error(y, y_pred)

4813.227554932031

## Funkcja i deployment

In [10]:
def price_prediction(_arg1,_arg2,_arg3):
    """Predict selling prices from odometer, year and condition.

    Each argument may be a scalar or a sequence; the three inputs are stacked
    column-wise, so sequences must have equal length.

    Parameters
    ----------
    _arg1 : odometer value(s)
    _arg2 : year value(s)
    _arg3 : condition value(s)

    Returns
    -------
    list
        One prediction per input row, as returned by ``model.predict``.

    Notes
    -----
    Uses the module-level ``model``; it is expected to have been fitted on the
    columns 'odometer', 'year', 'condition' in that order.
    """
    input_data = np.column_stack([_arg1, _arg2, _arg3])

    X = pd.DataFrame(input_data, columns=['odometer','year','condition'])

    # Cast to float first so None/missing entries become NaN, then replace them
    # with the same -1 sentinel the training data uses. Filling after the cast
    # keeps the frame numeric instead of object-typed.
    X = X.astype(float).fillna(-1)

    # Pass the DataFrame itself (not .values): the model was fitted on named
    # columns, and a bare ndarray makes scikit-learn warn about missing feature
    # names on every call.
    result = model.predict(X)

    return result.tolist()

In [12]:
# Quick local check before deploying: odometer=100000, year=2015, condition=1.
price_prediction(100000,2015,1)



[5808.224585436194]

In [13]:
from tabpy.tabpy_tools.client import Client

# Connect to the TabPy server on localhost:9004 and publish the function under
# the endpoint name 'price_prediction'; override=True is passed so that an
# existing deployment with the same name is replaced.
client = Client('http://localhost:9004/')
client.deploy(
    'price_prediction',
    price_prediction,
    'price_prediction',
    override=True,
)

# ---> idziemy do Tableau --->

# -------------------------------------------------------------

## Mapujemy rynek

In [14]:
# Append the rows to be forecast to the training data

df_test = pd.read_csv('full.csv')
# Encode missing feature values with the same -1 sentinel as the training rows;
# otherwise a gap would be -1 at fit time but NaN at predict time.
# 'sellingprice' is deliberately left untouched: the prediction cell selects
# the rows to forecast by a null sellingprice.
df_test = df_test.fillna({col: -1 for col in df_test.columns if col != "sellingprice"})

df_train = pd.read_csv('car_prices.csv')
df_train = df_train.fillna(-1)

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating the row labels of both source files.
df = pd.concat([df_train, df_test], ignore_index=True)

df.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj0fm227318,fl,4.1,14872.0,gray,black,enterprise veh exchange/rental,13700.0,14300.0,Tue Jun 02 2015 02:15:00 GMT-0700 (PDT)
1,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj5fm219943,fl,4.4,16633.0,silver,black,sixt rent a car llc,13600.0,13600.0,Tue Jun 02 2015 02:15:00 GMT-0700 (PDT)
2,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj5fm221322,fl,4.8,18384.0,silver,black,sixt rent a car llc,13550.0,13500.0,Tue Jun 02 2015 02:15:00 GMT-0700 (PDT)
3,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj5fm273601,fl,2.8,339.0,black,black,florida auto financial group,14300.0,12900.0,Tue Jun 02 2015 02:30:00 GMT-0700 (PDT)
4,2015,volkswagen,Jetta,SE PZEV w/Connectivity,sedan,automatic,3vwd17aj6fm218641,fl,4.9,12655.0,red,black,enterprise veh exchange/rental,13850.0,14500.0,Tue May 26 2015 02:15:00 GMT-0700 (PDT)


In [15]:
# Build the features

num_feats = ['odometer','year','condition']
obj_feats = ['make','body']

# factorize() runs on the combined frame, so a given value gets the same
# integer code in the training rows and in the rows to be forecast.
for feat in obj_feats:
    df["{}_cat".format(feat)] = df[feat].factorize()[0]

# Build the encoded-column names from obj_feats instead of scanning df.columns
# for the substring "_cat": the scan would also pick up any other column whose
# name contains "_cat" (e.g. one left over from a previous encoding pass).
cat_feats = ["{}_cat".format(feat) for feat in obj_feats]
feats = cat_feats + num_feats
feats

['make_cat', 'body_cat', 'odometer', 'year', 'condition']

In [16]:
# Fit on rows with a known price, predict the rest, and save the result

# A null sellingprice marks a row that still needs a forecast.
df_train = df[df["sellingprice"].notna()].copy()
df_test = df[df["sellingprice"].isna()].copy()

X_train, y_train = df_train[feats], df_train["sellingprice"]
X_test = df_test[feats]

model = xgb.XGBRegressor(
    max_depth=5,
    n_estimators=50,
    learning_rate=0.3,
    random_state=0,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Write the predictions into the (copied) forecast rows and export them
# without the index column.
df_test["sellingprice"] = y_pred
df_test.to_csv("predictions.csv", index=False)