1. Membuat Data Frame

In [544]:
import pandas as pd

df = pd.read_csv('./resource/toyota.csv')
features = ['year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']

df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,GT86,2016,16000,Manual,24089,Petrol,265,36.2,2.0
1,GT86,2017,15995,Manual,18615,Petrol,145,36.2,2.0
2,GT86,2015,13998,Manual,27469,Petrol,265,36.2,2.0
3,GT86,2017,18998,Manual,14736,Petrol,150,36.2,2.0
4,GT86,2017,17498,Manual,36284,Petrol,145,36.2,2.0


2. Melihat Data yang Sama

In [545]:
for column in [features[0], features[1], features[3], features[6]]:
    print(f'{column} ({df[column].nunique()}/{df[column].count()}): \n{df[column].unique()}\n')

year (23/6738): 
[2016 2017 2015 2020 2013 2019 2018 2014 2012 2005 2003 2004 2001 2008
 2007 2010 2011 2006 2009 2002 1999 2000 1998]

transmission (4/6738): 
['Manual' 'Automatic' 'Semi-Auto' 'Other']

fuelType (4/6738): 
['Petrol' 'Other' 'Hybrid' 'Diesel']

engineSize (16/6738): 
[2.  1.8 1.2 1.6 1.4 2.5 2.2 1.5 1.  1.3 0.  2.4 3.  2.8 4.2 4.5]



3. Pemisahan Variabel

In [546]:
X = df[features]
y = df['price']

In [547]:
X.head()

Unnamed: 0,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,2016,Manual,24089,Petrol,265,36.2,2.0
1,2017,Manual,18615,Petrol,145,36.2,2.0
2,2015,Manual,27469,Petrol,265,36.2,2.0
3,2017,Manual,14736,Petrol,150,36.2,2.0
4,2017,Manual,36284,Petrol,145,36.2,2.0


In [548]:
y.head()

0    16000
1    15995
2    13998
3    18998
4    17498
Name: price, dtype: int64

4. Konversi Data Kategorial Menjadi Numerik

In [549]:
tranmission_convert = pd.get_dummies(X['transmission']).astype(int)

fultype_convert = pd.get_dummies(X['fuelType']).astype(int)

In [550]:
tranmission_convert.head()

Unnamed: 0,Automatic,Manual,Other,Semi-Auto
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,0,1,0,0


In [551]:
fultype_convert.head()

Unnamed: 0,Diesel,Hybrid,Other,Petrol
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


5. Menghapus Kolom yang Sudah Tidak Diperlukan

In [552]:
X = X.drop(['transmission', 'fuelType'], axis=1)
X.head()

Unnamed: 0,year,mileage,tax,mpg,engineSize
0,2016,24089,265,36.2,2.0
1,2017,18615,145,36.2,2.0
2,2015,27469,265,36.2,2.0
3,2017,14736,150,36.2,2.0
4,2017,36284,145,36.2,2.0


6. Mengabungkan Data yang Sudah Dikonversi

In [553]:
X = pd.concat([X, tranmission_convert, fultype_convert], axis=1)
X

Unnamed: 0,year,mileage,tax,mpg,engineSize,Automatic,Manual,Other,Semi-Auto,Diesel,Hybrid,Other.1,Petrol
0,2016,24089,265,36.2,2.0,0,1,0,0,0,0,0,1
1,2017,18615,145,36.2,2.0,0,1,0,0,0,0,0,1
2,2015,27469,265,36.2,2.0,0,1,0,0,0,0,0,1
3,2017,14736,150,36.2,2.0,0,1,0,0,0,0,0,1
4,2017,36284,145,36.2,2.0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6733,2011,30000,20,58.9,1.0,1,0,0,0,0,0,0,1
6734,2011,36154,125,50.4,1.3,0,1,0,0,0,0,0,1
6735,2012,46000,125,57.6,1.4,0,1,0,0,1,0,0,0
6736,2011,60700,125,50.4,1.3,0,1,0,0,0,0,0,1


7. Mengubah data

In [554]:
import numpy as np

X = np.array(X)
y = np.array(y)

pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,2016.0,24089.0,265.0,36.2,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2017.0,18615.0,145.0,36.2,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2015.0,27469.0,265.0,36.2,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2017.0,14736.0,150.0,36.2,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2017.0,36284.0,145.0,36.2,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6733,2011.0,30000.0,20.0,58.9,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6734,2011.0,36154.0,125.0,50.4,1.3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
6735,2012.0,46000.0,125.0,57.6,1.4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
6736,2011.0,60700.0,125.0,50.4,1.3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


8. Inisiasi Model dan Training Data

In [555]:
from sklearn.neighbors import KNeighborsRegressor as KNN

K = 3

model = KNN(n_neighbors=K)
model.fit(X, y)

9. Membuat Dataset Training dan Test

In [556]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

10. Evaluasi Model

In [557]:
from sklearn.metrics import r2_score

y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.5603960081591537