In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
def load_data(path='cleaned_data.csv'):
    """Read the cleaned dataset from a CSV file into a DataFrame.

    ``DataFrame.from_csv`` was deprecated in pandas 0.21 and removed in
    pandas 1.0. ``read_csv`` with ``index_col=0`` and ``parse_dates=True``
    is its documented equivalent: the first CSV column becomes the index.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to ``'cleaned_data.csv'`` in the
        current working directory, which is what the original code read.

    Returns
    -------
    pandas.DataFrame
        The file contents, indexed by the first column of the CSV.
    """
    return pd.read_csv(path, index_col=0, parse_dates=True)

In [3]:
# split the data
def split_data(df):
    """Split ``df`` into train/test feature frames and ``price`` targets.

    The split is delegated to ``train_test_split`` with a fixed
    ``random_state`` of 100, so repeated calls on the same frame yield the
    same partition.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y)`` where the ``*_x`` items are
        the partitions without the ``price`` column and the ``*_y`` items
        are the corresponding ``price`` columns.
    """
    target = 'price'
    train_part, test_part = train_test_split(df, random_state=100)
    return (
        train_part.drop([target], axis=1),
        train_part[target],
        test_part.drop([target], axis=1),
        test_part[target],
    )

In [4]:
# generate encoders
def generate_encoders(df):
    """Fit one ``LabelEncoder`` per categorical column of ``df``.

    Returns
    -------
    tuple
        Four fitted encoders, in this order: ``vehicleType``, ``fuelType``,
        ``model``, ``brand``.
    """
    categorical_columns = ('vehicleType', 'fuelType', 'model', 'brand')
    # LabelEncoder.fit returns the encoder itself, so each fitted instance
    # can be collected directly.
    return tuple(LabelEncoder().fit(df[column]) for column in categorical_columns)

In [5]:
# Encode the data
def encode_data(df, lev, lef, lem, leb):
    """Return a copy of ``df`` with its categorical columns encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``vehicleType``, ``fuelType``, ``model`` and
        ``brand`` columns. It is not modified.
    lev, lef, lem, leb : objects with a ``transform`` method
        Encoders applied to ``vehicleType``, ``fuelType``, ``model`` and
        ``brand`` respectively.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which those four columns hold the encoders'
        ``transform`` output.
    """
    encoded = df.copy()
    column_encoders = (
        ('vehicleType', lev),
        ('fuelType', lef),
        ('model', lem),
        ('brand', leb),
    )
    for column, encoder in column_encoders:
        encoded[column] = encoder.transform(encoded[column])
    return encoded

In [6]:
# Load the dataset, then split it into train/test features and price targets.
df = load_data()
tr_x, tr_y, ts_x, ts_y = split_data(df)
# The encoders are fitted on the full frame rather than only the training
# split, so every category that occurs in either split is known to them.
lev, lef, lem, leb = generate_encoders(df)
# Encoded copy of the training features; tr_x itself is left unmodified.
tr_x_ = encode_data(tr_x, lev, lef, lem, leb)

In [7]:
# Preview the first rows of the loaded (unencoded) frame.
df.head()

Unnamed: 0,price,vehicleType,powerPS,model,kilometer,fuelType,brand,isAutomatic,damageRepaired,age
5,650,limousine,102,3er,150000,benzin,bmw,0.0,0.0,21.833333
6,2200,cabrio,109,2_reihe,150000,benzin,peugeot,0.0,1.0,13.0
10,2000,limousine,105,3_reihe,150000,benzin,mazda,0.0,1.0,12.666667
11,2799,kombi,140,passat,150000,diesel,volkswagen,0.0,0.0,11.666667
14,17999,suv,190,navara,70000,diesel,nissan,0.0,1.0,6.416667


In [10]:
# Train Logic
# Fit a random forest on the encoded training features. The target is
# log1p-transformed, so the score printed at the end of this cell is the
# coefficient of determination measured in log-price space, on the same
# data the model was trained on.
#
# max_features=1.0 (consider every feature at each split) replaces the
# 'auto' alias: for regressors 'auto' meant exactly that, but the alias was
# deprecated in scikit-learn 1.1 and removed in 1.3.
# random_state is pinned so that re-running the notebook reproduces the
# same forest and therefore the same scores.
clf = RandomForestRegressor(n_estimators=500, max_features=1.0, random_state=100,
                            verbose=50, n_jobs=-1)
print('Training Started')
clf.fit(tr_x_, np.log1p(tr_y))
print('Training Finished')
clf.score(tr_x_, np.log1p(tr_y))

Training Started
building tree 1 of 500
building tree 2 of 500
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.5s
building tree 3 of 500
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.5s
building tree 4 of 500
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    1.1s
building tree 5 of 500
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.1s
building tree 6 of 500
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.6s
building tree 7 of 500
building tree 8 of 500[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    1.6s

[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    2.1s
building tree 9 of 500
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.1s
building tree 10 of 500
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.6s
building tree 12 of 500[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.6s
building tree 11 of 500

[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]:

0.95819727479793892

In [11]:
# Evaluate on the held-out split: encode the test features with the same
# encoders used for training, then score the fitted model against the
# log1p-transformed test prices (the model was fitted on log1p of the
# training prices, so both sides are compared in the same space).
ts_x_ = encode_data(ts_x, lev, lef, lem, leb)
clf.score(ts_x_, np.log1p(ts_y))

[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done   6 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done   7 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  11 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  13 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  15 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Do

0.8362681007971452