In [257]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

### Read Data

In [268]:
df = pd.read_csv("scrape_residents.csv", encoding = "ISO-8859-1")
df.head()

Unnamed: 0,Address,Price,Details
0,"Sleman, Sleman",Rp 689.500.000,2 Kamar tidur ...
1,"Gamping, Sleman",Rp 700.000.000,2 Kamar tidur ...
2,"Ngaglik, Sleman",Rp 689.500.000,2 Kamar tidur ...
3,"Bantul, Bantul",Rp 875.000.000,3 Kamar tidur ...
4,"Maguwoharjo, Sleman",Rp 270.000.000,2 Kamar tidur ...


In [269]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236 entries, 0 to 235
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Address  236 non-null    object
 1   Price    236 non-null    object
 2   Details  236 non-null    object
dtypes: object(3)
memory usage: 5.7+ KB


### Preprocessing

In [270]:
# Split address by comma
df[['Kab', 'Kota']] = df['Address'].str.split(r",", expand=True)
df = df.drop(['Address'], axis = 1)
df.head()

Unnamed: 0,Price,Details,Kab,Kota
0,Rp 689.500.000,2 Kamar tidur ...,Sleman,Sleman
1,Rp 700.000.000,2 Kamar tidur ...,Gamping,Sleman
2,Rp 689.500.000,2 Kamar tidur ...,Ngaglik,Sleman
3,Rp 875.000.000,3 Kamar tidur ...,Bantul,Bantul
4,Rp 270.000.000,2 Kamar tidur ...,Maguwoharjo,Sleman


In [271]:
def cleaning(text):
    text = re.sub(r"[.,?]", " ", text) # Remove titik
    text = re.sub(r"[Rp, m², Lahan]", " ", text) # Remove 'Rp'
    text = re.sub(r"\s+", "", text) # Remove whitespace
    return text

In [272]:
# Remove character from column price and details
df['Price'] = df['Price'].apply(cleaning)
df['Details'] = df['Details'].apply(cleaning)
df.head()

Unnamed: 0,Price,Details,Kab,Kota
0,689500000,2Krtidur45Bgu93,Sleman,Sleman
1,700000000,2Krtidur76Bgu108,Gamping,Sleman
2,689500000,2Krtidur45Bgu93,Ngaglik,Sleman
3,875000000,3Krtidur82Bgu112,Bantul,Bantul
4,270000000,2Krtidur78Bgu94,Maguwoharjo,Sleman


In [273]:
# Split details
df[['KT', 'Bangunan', 'Lahan']] = df['Details'].str.split(r"Krtidur|Bgu", expand=True)
df = df.drop(['Details'], axis = 1)
df.head()

Unnamed: 0,Price,Kab,Kota,KT,Bangunan,Lahan
0,689500000,Sleman,Sleman,2,45,93
1,700000000,Gamping,Sleman,2,76,108
2,689500000,Ngaglik,Sleman,2,45,93
3,875000000,Bantul,Bantul,3,82,112
4,270000000,Maguwoharjo,Sleman,2,78,94


### Handling Missing Value

In [274]:
df.isnull().sum()

Price        0
Kab          0
Kota        16
KT           0
Bangunan     0
Lahan        0
dtype: int64

In [275]:
df['Kota'] = df['Kota'].fillna('Kota')
df.isnull().sum()

Price       0
Kab         0
Kota        0
KT          0
Bangunan    0
Lahan       0
dtype: int64

### MinMaxScaler

StandardScaler is useful for the features that follow a Normal distribution. Therefore, it makes mean = 0 and scales the data to unit variance. MinMaxScaler may be used when the upper and lower boundaries are well known from domain knowledge.

In [276]:
X = df.drop(['Kab','Kota', 'Price'], axis = 1 )
X.head()

Unnamed: 0,KT,Bangunan,Lahan
0,2,45,93
1,2,76,108
2,2,45,93
3,3,82,112
4,2,78,94
...,...,...,...
231,2,45,94
232,2,36,113
233,2,36,114
234,3,60,90


In [279]:
y= df['Price']
y.head()

0      689500000
1      700000000
2      689500000
3      875000000
4      270000000
         ...    
231    470000000
232    390000000
233    390000000
234    375000000
235    368000000
Name: Price, Length: 236, dtype: object

In [282]:
scaler = MinMaxScaler()
X_scaled=scaler.fit_transform(X)
X_scaled

array([[0.04166667, 0.0347667 , 0.03416264],
       [0.04166667, 0.063129  , 0.03973264],
       [0.04166667, 0.0347667 , 0.03416264],
       [0.08333333, 0.06861848, 0.04121797],
       [0.04166667, 0.06495883, 0.03453398],
       [0.04166667, 0.03934126, 0.03713331],
       [0.29166667, 0.16925892, 0.02933531],
       [0.04166667, 0.021043  , 0.02302265],
       [0.04166667, 0.03019213, 0.02190865],
       [0.04166667, 0.02653248, 0.01819532],
       [0.08333333, 0.11802379, 0.03787597],
       [0.04166667, 0.02653248, 0.02636465],
       [0.04166667, 0.02653248, 0.02599332],
       [0.33333333, 0.14913083, 0.0471593 ],
       [0.04166667, 0.02653248, 0.02190865],
       [0.08333333, 0.063129  , 0.02933531],
       [0.125     , 0.1820677 , 0.07760861],
       [0.04166667, 0.05306496, 0.02190865],
       [0.08333333, 0.07593779, 0.04084664],
       [0.04166667, 0.02653248, 0.01819532],
       [0.08333333, 0.04849039, 0.03676198],
       [0.08333333, 0.0347667 , 0.03193465],
       [0.

### Split Data

In [283]:
X_train,X_test, y_train,y_test=train_test_split(X_scaled, y,test_size=0.2, random_state=42)

In [284]:
print("Data train =", len(X_train))
print("Data test =", len(X_test))

Data train = 188
Data test = 48


### Random Forest

Random forest regression is used to solve a variety of business problems where the company needs to predict a continuous value: Predict future prices/costs.

In [285]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=1000)

# Training 
DT = model.fit(X_train, y_train)

# Evaluation
y_pred = DT.predict(X_test)

In [286]:
# Accuracy
print("Accuracy =", model.score(X_test, y_test)*100)

Accuracy = 84.17710945225397


### Prediction

In [287]:
# Making an prediction
prediction = model.predict(X_test)
print(prediction)

[1.91349200e+09 2.34488300e+09 1.44735900e+09 2.36447867e+08
 2.03610200e+09 9.78375000e+08 9.93794500e+08 5.85749250e+08
 1.49021450e+09 7.45565110e+08 4.66992114e+08 4.64801500e+08
 4.16923667e+08 1.33760320e+10 7.79004000e+08 7.20770500e+08
 2.36447867e+08 8.81841000e+08 1.83153250e+09 1.96950350e+09
 9.18982971e+08 1.49012000e+09 5.72883294e+08 6.52013828e+08
 5.40721661e+08 5.23378500e+08 6.31076296e+08 3.99244250e+08
 9.68850077e+08 9.93115077e+08 4.06787917e+08 3.70210757e+08
 1.18664518e+09 2.14326350e+09 7.28021000e+08 9.44764181e+08
 6.42215300e+09 1.97695350e+09 6.00858857e+08 8.03592962e+08
 6.22572489e+08 1.07160990e+09 2.87239214e+08 1.07558527e+09
 3.62466549e+08 5.58269120e+08 1.00206000e+09 3.23898150e+08]


In [300]:
def cek(x, y, z):
    input("Masukkan jumlah Kamar Tidur =", x)
    input("Masukkan Luas Bangunan =", y)
    input("Masukkan Luas Lahan =", z)

In [301]:
cek(2,36,60)

TypeError: Kernel.raw_input() takes from 1 to 2 positional arguments but 3 were given