In [60]:
import pandas as pd

df = pd.read_csv('./../data/toyota.csv')
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,GT86,2016,16000,Manual,24089,Petrol,265,36.2,2.0
1,GT86,2017,15995,Manual,18615,Petrol,145,36.2,2.0
2,GT86,2015,13998,Manual,27469,Petrol,265,36.2,2.0
3,GT86,2017,18998,Manual,14736,Petrol,150,36.2,2.0
4,GT86,2017,17498,Manual,36284,Petrol,145,36.2,2.0


In [61]:
features = ['year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']

#### Check and count same data

In [62]:
for column in features:
    print(f'{column} ({df[column].nunique()}/{df[column].count()}): \n{df[column].unique()}\n')

year (23/6738): 
[2016 2017 2015 2020 2013 2019 2018 2014 2012 2005 2003 2004 2001 2008
 2007 2010 2011 2006 2009 2002 1999 2000 1998]

transmission (4/6738): 
['Manual' 'Automatic' 'Semi-Auto' 'Other']

mileage (5699/6738): 
[24089 18615 27469 ... 36154 60700 45128]

fuelType (4/6738): 
['Petrol' 'Other' 'Hybrid' 'Diesel']

tax (29/6738): 
[265 145 150 260 200 250 140 135 235 300 125  20 160 165 205 240 330 325
   0  30 120 155 115 190  10 305 565 555 540]

mpg (81/6738): 
[ 36.2  33.2  32.8  39.8  85.6  36.7  72.4  39.2  78.5  58.9  53.3  62.8
  56.5  57.7  37.2  34.5  35.3  40.4  51.1  57.6  60.1  55.4  42.2  43.5
  53.   49.6  52.3  48.7  42.8  47.1  38.2  31.4  32.1  30.4  47.9  78.
  76.3  58.   65.7  55.   86.   66.   80.7  51.4  67.3  74.3  42.9  54.3
  50.4  37.7  44.8  76.4  70.6  61.4  46.3  65.6  83.1  68.8  51.3  40.9
  68.9  69.   67.   41.5   6.   94.1  64.2 235.  217.3 134.5  44.1  45.6
  30.7  38.7   2.8  30.1  34.9  31.   27.2  23.9  29.7]

engineSize (16/6738): 
[2. 

In [63]:
X = df[features]
y = df['price']

In [64]:
X.head()

Unnamed: 0,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,2016,Manual,24089,Petrol,265,36.2,2.0
1,2017,Manual,18615,Petrol,145,36.2,2.0
2,2015,Manual,27469,Petrol,265,36.2,2.0
3,2017,Manual,14736,Petrol,150,36.2,2.0
4,2017,Manual,36284,Petrol,145,36.2,2.0


In [65]:
y.head()

0    16000
1    15995
2    13998
3    18998
4    17498
Name: price, dtype: int64

#### Convert to numeric

In [66]:
tranmission_convert = pd.get_dummies(X['transmission']).astype(int)
tranmission_convert = tranmission_convert.drop('Other', axis=1)

fultype_convert = pd.get_dummies(X['fuelType']).astype(int)
fultype_convert = fultype_convert.drop('Other', axis=1)

In [67]:
tranmission_convert.head()

Unnamed: 0,Automatic,Manual,Semi-Auto
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0


In [68]:
fultype_convert.head()

Unnamed: 0,Diesel,Hybrid,Petrol
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


#### Delete column

In [69]:
X = X.drop(['transmission', 'fuelType'], axis=1)
X.head()

Unnamed: 0,year,mileage,tax,mpg,engineSize
0,2016,24089,265,36.2,2.0
1,2017,18615,145,36.2,2.0
2,2015,27469,265,36.2,2.0
3,2017,14736,150,36.2,2.0
4,2017,36284,145,36.2,2.0


#### Replace column

In [70]:
X = pd.concat([X, tranmission_convert, fultype_convert], axis=1)
X.head()

Unnamed: 0,year,mileage,tax,mpg,engineSize,Automatic,Manual,Semi-Auto,Diesel,Hybrid,Petrol
0,2016,24089,265,36.2,2.0,0,1,0,0,0,1
1,2017,18615,145,36.2,2.0,0,1,0,0,0,1
2,2015,27469,265,36.2,2.0,0,1,0,0,0,1
3,2017,14736,150,36.2,2.0,0,1,0,0,0,1
4,2017,36284,145,36.2,2.0,0,1,0,0,0,1


#### Change to Dataframe

In [71]:
import numpy as np

X = np.array(X)
y = np.array(y)

pd.DataFrame({
    'year': X[:, 0],
    'mileage': X[:, 1],
    'tax': X[:, 2],
    'mpg': X[:, 3],
    'engineSize': X[:, 4],
    'Automatic': X[:, 5],
    'Manual': X[:, 6],
    'Semi-Auto': X[:, 7],
    'Diesel': X[:, 8],
    'Hybrid': X[:, 9],
    'Petrol': X[:, 10]
}).head()

Unnamed: 0,year,mileage,tax,mpg,engineSize,Automatic,Manual,Semi-Auto,Diesel,Hybrid,Petrol
0,2016.0,24089.0,265.0,36.2,2.0,0.0,1.0,0.0,0.0,0.0,1.0
1,2017.0,18615.0,145.0,36.2,2.0,0.0,1.0,0.0,0.0,0.0,1.0
2,2015.0,27469.0,265.0,36.2,2.0,0.0,1.0,0.0,0.0,0.0,1.0
3,2017.0,14736.0,150.0,36.2,2.0,0.0,1.0,0.0,0.0,0.0,1.0
4,2017.0,36284.0,145.0,36.2,2.0,0.0,1.0,0.0,0.0,0.0,1.0


In [72]:
from sklearn.neighbors import KNeighborsRegressor as KNN

K = 1

model = KNN(n_neighbors=K)
model.fit(X, y)

In [73]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [74]:
def convert_gbp_to_idr(pounds):
    clear_number = float("{:.2f}".format(pounds))
    return '{:,.0f}'.format(clear_number * 19281)

def convert_idr_to_gpb(idr):
    return idr / 19281

In [75]:
from sklearn.metrics import r2_score

y_pred = model.predict(X_test)
print(f'R-Squared: {r2_score(y_test, y_pred)}')

print('\nHasil Prediksi:')
print(f'£ {y_pred[0]}')
print(f'Rp. {convert_gbp_to_idr(y_pred[0])}')

pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
}).head()

R-Squared: 0.9994204077453455

Hasil Prediksi:
£ 11295.0
Rp. 217,778,895


Unnamed: 0,Actual,Predicted
0,11295,11295.0
1,18995,18995.0
2,8950,8950.0
3,9995,9995.0
4,15990,15990.0
