### Read Files

In [10]:
import pandas as pd # lakukan import library untuk read file

In [11]:
data = pd.read_excel('HARGA RUMAH JAKSEL.xlsx',sheet_name = 'Sheet1',skiprows = 1)
data.head()

Unnamed: 0,HARGA,LT,LB,JKT,JKM,GRS,KOTA
0,28000000000,1100,700,5,6,ADA,JAKSEL
1,19000000000,824,800,4,4,ADA,JAKSEL
2,4700000000,500,400,4,3,ADA,JAKSEL
3,4900000000,251,300,5,4,ADA,JAKSEL
4,28000000000,1340,575,4,5,ADA,JAKSEL


### EDA (Exploratory Data Analysis)

In [12]:
%pip install plotly==5.11.0 # Lakukan instalasi untuk plotly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting plotly==5.11.0
  Downloading plotly-5.11.0-py2.py3-none-any.whl (15.3 MB)
[K     |████████████████████████████████| 15.3 MB 29.7 MB/s 
Installing collected packages: plotly
  Attempting uninstall: plotly
    Found existing installation: plotly 5.5.0
    Uninstalling plotly-5.5.0:
      Successfully uninstalled plotly-5.5.0
Successfully installed plotly-5.11.0


In [13]:
import plotly.express as px

In [14]:
pd.DataFrame(data.HARGA.describe()).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HARGA,1001.0,17474720000.0,20795480000.0,430000000.0,6750000000.0,13500000000.0,20000000000.0,250000000000.0


In [15]:
px.histogram(data_frame=data,x='LT',y='HARGA',title='Harga Rumah Jakarta Selatan Berdasarkan Luas Tanah',histfunc='avg',nbins=10,color_discrete_sequence=['#54B435'])

In [16]:
px.histogram(data_frame=data,x='LB',y='HARGA',title='Harga Rumah Jakarta Selatan Berdasarkan Luas Bangunan',histfunc='avg',nbins=10,color_discrete_sequence=['#749F82'])

In [17]:
px.pie(data_frame=data.GRS.value_counts().reset_index(),values='GRS',names='index',title='Persentase Banyaknya Garasi pada Rumah Jaksel',color_discrete_sequence=px.colors.sequential.RdBu)

In [18]:
pd.DataFrame(data.JKT.describe()).T
# Rata-rata jumlah kamar tidur di Jakarta Selatan adalah 4
# Setidaknya ada 1 kamar tidur
# Paling banyak yaitu 27 kamar tidur

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
JKT,1001.0,4.457542,2.004606,1.0,4.0,4.0,5.0,27.0


In [19]:
data_kamarTidur = px.pd.DataFrame(data.JKT.value_counts().reset_index().rename(columns={'index':'Jumlah Kamar Tidur','JKT':'Banyaknya'}))
px.bar(data_frame=data_kamarTidur,x='Jumlah Kamar Tidur',y='Banyaknya',title='Banyaknya Kamar Tidur pada Rumah Jaksel',color_discrete_sequence=['#F5B041'])

In [20]:
pd.DataFrame(data.JKM.describe()).T
# Rata-rata jumlah kamar mandi di Jakarta Selatan adalah 3
# Setidaknya ada 1 kamar mandi
# Paling banyak yaitu 27 kamar mandi

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
JKM,1001.0,3.94006,1.903261,1.0,3.0,4.0,4.0,27.0


In [21]:
data_kamarMandi = px.pd.DataFrame(data.JKM.value_counts().reset_index().rename(columns={'index':'Jumlah Kamar Mandi','JKM':'Banyaknya'}))
px.bar(data_frame=data_kamarMandi,x='Jumlah Kamar Mandi',y='Banyaknya',title='Banyaknya Kamar Mandi pada Rumah Jaksel',color_discrete_sequence=['#F5B041'])

In [22]:
data.KOTA.value_counts()
# hanya terdapat 1 jenis kota yang sama

JAKSEL    1001
Name: KOTA, dtype: int64

### Preprocessing

In [23]:
# karena hanya terdapat 1 jenis kota saja pada data.KOTA maka kita tidak membutuhkannya
data.drop(columns='KOTA',inplace=True,axis=1)

In [24]:
# Karena pada machine learning tidak bisa membaca data bertipe object atau string maka kita harus mengubahnya menjadi data bertipe numerik

data.GRS = data.GRS.map({'ADA':1,'TIDAK ADA':0})

In [25]:
# Hasil akhir dari data yang sudah diubah menjadi numerik dan penghapusan kolom KOTA
data.head()

Unnamed: 0,HARGA,LT,LB,JKT,JKM,GRS
0,28000000000,1100,700,5,6,1
1,19000000000,824,800,4,4,1
2,4700000000,500,400,4,3,1
3,4900000000,251,300,5,4,1
4,28000000000,1340,575,4,5,1


In [26]:
data.shape

(1001, 6)

In [27]:
# Membuang outliers atau anomali pada data

data = data[(data.LT>data.LT.quantile(0.01))&(data.LT<data.LT.quantile(0.99))]
data = data[(data.LB>data.LB.quantile(0.01))&(data.LB<data.LB.quantile(0.99))]
data = data[(data.JKT>data.JKT.quantile(0.01))&(data.JKT<data.JKT.quantile(0.99))]
data = data[(data.JKM>data.JKM.quantile(0.01))&(data.JKM<data.JKM.quantile(0.99))]
data = data[(data.HARGA>data.HARGA.quantile(0.01))&(data.HARGA<data.HARGA.quantile(0.99))]

In [28]:
data.shape

(823, 6)

### Prediksi

In [29]:
# Import Library
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [30]:
# Memisahkan label target dengan variabel lainnya
X = data.iloc[:,1:]
X

Unnamed: 0,LT,LB,JKT,JKM,GRS
0,1100,700,5,6,1
1,824,800,4,4,1
2,500,400,4,3,1
3,251,300,5,4,1
4,1340,575,4,5,1
...,...,...,...,...,...
993,169,215,4,4,1
996,488,550,6,5,1
997,209,270,4,4,1
998,692,400,4,3,0


In [31]:
y = data.iloc[:,0]
y

0      28000000000
1      19000000000
2       4700000000
3       4900000000
4      28000000000
          ...     
993     3500000000
996    16000000000
997     4500000000
998    29000000000
999     1700000000
Name: HARGA, Length: 823, dtype: int64

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [33]:
model = RandomForestRegressor()
model.fit(X_train,y_train)
hasil = model.predict(X_test)

In [34]:
# model kita sudah selesai dibuat dan kita sudah mendapatkan hasil prediksi dari model kita
hasil

array([1.82210000e+10, 3.25040000e+09, 6.50888333e+09, 5.49650000e+09,
       1.55175000e+10, 2.19350000e+10, 2.10100000e+09, 1.26040000e+10,
       4.66040000e+09, 3.05140000e+10, 2.23920000e+10, 1.57655000e+10,
       2.40310000e+10, 1.38950000e+10, 1.53175000e+10, 1.88333500e+10,
       1.53852250e+10, 1.44730000e+10, 7.03550000e+09, 3.79245000e+10,
       3.24810000e+09, 1.37438571e+10, 9.64200000e+09, 2.73450000e+10,
       1.07660000e+10, 2.37286000e+10, 2.16070000e+10, 1.29034000e+10,
       4.96230000e+09, 1.12752000e+10, 2.55100000e+10, 2.59090000e+10,
       2.33260000e+10, 2.33320000e+10, 1.95636667e+10, 2.29208000e+10,
       1.20355000e+10, 5.21925000e+10, 4.54760000e+09, 1.24025000e+10,
       6.73760000e+09, 1.25355000e+10, 2.29250000e+10, 1.87620000e+10,
       9.15003333e+09, 3.79245000e+10, 2.25950000e+10, 5.72770000e+09,
       1.21495000e+10, 2.29890000e+09, 4.83650000e+10, 2.63755000e+10,
       6.80423333e+09, 1.87370000e+10, 1.50658000e+10, 1.64973333e+10,
      

In [35]:
def prediksi(LT,LB,JKT,JKM,GRS):
    predict = pd.DataFrame()
    predict['LT'] = [LT]
    predict['LB'] = [LB]
    predict['JKT'] = [JKT]
    predict['JKM'] = [JKM]
    predict['GRS'] = [GRS]
    print(model.predict(predict))

In [36]:
prediksi(500,600,2,2,1)

[1.590595e+10]


In [37]:
import  pickle
with open('prediksi_harga_rumah.pkl','wb') as file:
    pickle.dump(model,file)

In [38]:
%%writefile app.py

import pickle
import pandas as pd
import streamlit as st

with open ("prediksi_harga_rumah.pkl", "rb") as f:
    model = pickle.load(f)

def prediksi(LT,LB,JKT,JKM,GRS):
    predict = pd.DataFrame()
    predict['Luas Tanah'] = [LT]
    predict['Luas Bangunan'] = [LB]
    predict['Jumlah Kamar Tidur'] = [JKT]
    predict['Jumlah Kamar Mandi'] = [JKM]
    predict['Garasi'] = [GRS]
    return(model.predict(predict)[0])   

lt = st.number_input("Luas Tanah")
lb = st.number_input("Luas Bangunan")
jkt = st.number_input("Jumlah Kamar Tidur")
jkm = st.number_input("Jumlah Kamar Mandi")
opt = st.selectbox(
    'Garasi',
    ('Ada', 'Tidak ada')
)

grs = 0
if(opt == 'Ada') :
    grs = 1
elif (opt == 'Tidak ada'):
    grs = 0
    
    
if(st.button('Predict')) :
    st.write("Harga rumah impianmu adalah Rp{:,}".format(prediksi(lt,lb,jkt,jkm,grs)))

Writing app.py


In [39]:
%%writefile requirements.txt
streamlit

Writing requirements.txt


In [40]:
import sklearn
sklearn.__version__

'1.0.2'