In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
data = pd.read_excel('HARGA RUMAH JAKSEL.xlsx', sheet_name='Sheet1',skiprows=1)
data.head(10)

Unnamed: 0,HARGA,LT,LB,JKT,JKM,GRS,KOTA
0,28000000000,1100,700,5,6,ADA,JAKSEL
1,19000000000,824,800,4,4,ADA,JAKSEL
2,4700000000,500,400,4,3,ADA,JAKSEL
3,4900000000,251,300,5,4,ADA,JAKSEL
4,28000000000,1340,575,4,5,ADA,JAKSEL
5,10000000000,460,300,4,4,ADA,JAKSEL
6,7600000000,278,350,4,4,ADA,JAKSEL
7,5250000000,511,300,3,2,ADA,JAKSEL
8,670000000,70,69,3,2,TIDAK ADA,JAKSEL
9,480000000,66,42,2,1,TIDAK ADA,JAKSEL


In [3]:
pd.DataFrame(data.HARGA.describe()).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HARGA,1001.0,17474720000.0,20795480000.0,430000000.0,6750000000.0,13500000000.0,20000000000.0,250000000000.0


## BIVARIAT

In [4]:
px.histogram(
    data_frame=data, x ='LT', y = 'HARGA',
    nbins=5,histfunc='avg',color_discrete_sequence=['black'],
    title='Perbandingan Harga Dengan Luas Tanah'
            )

In [5]:
px.histogram(data_frame=data,x='LB',y='HARGA',
             title='Harga Rumah Jakarta Selatan Berdasarkan Luas Bangunan',
             histfunc='avg',
             nbins=4,color_discrete_sequence=['#749F82']
             )

## UNIVARIAT

In [6]:
pd.DataFrame(data['GRS'].value_counts().reset_index())

Unnamed: 0,GRS,count
0,ADA,779
1,TIDAK ADA,222


In [7]:
px.pie(data_frame=data['GRS'].value_counts().reset_index(), 
       values = 'count',
       names = 'GRS',
       title = 'Jumlah Rumah Jaksel yang Ada Garasi'
       )

In [8]:
pd.DataFrame(data['JKT'].describe()).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
JKT,1001.0,4.457542,2.004606,1.0,4.0,4.0,5.0,27.0


In [9]:
dataKamarTidur = pd.DataFrame(
    data['JKT'].value_counts().reset_index().rename(columns={'JKT' :'JKT','count': 'Banyak Rumah'})
    )
px.bar(
    data_frame=dataKamarTidur, x = 'JKT', y = 'Banyak Rumah',
    title = 'Jumlah Kamar Tidur Rumah Jaksel'
)

In [10]:
dataKamarMandi = pd.DataFrame(
    data['JKM'].value_counts().reset_index().rename(columns={'count':'Banyak Rumah'})
  )
px.bar(
    data_frame=dataKamarMandi, y = 'Banyak Rumah', x = 'JKM',
    title = 'Jumlah Kamar Mandi Rumah Jaksel'
)

In [11]:
data['KOTA'].value_counts()

KOTA
JAKSEL    1001
Name: count, dtype: int64

## PREPROCESSING

In [12]:
data.drop(columns='KOTA',inplace=True,axis=1)

In [13]:
data.GRS = data.GRS.map({'ADA':1,'TIDAK ADA':0})

In [22]:
data.head()

Unnamed: 0,HARGA,LT,LB,JKT,JKM,GRS
0,28000000000,1100,700,5,6,1
1,19000000000,824,800,4,4,1
2,4700000000,500,400,4,3,1
3,4900000000,251,300,5,4,1
4,28000000000,1340,575,4,5,1


In [24]:
data.shape

(823, 6)

In [25]:
data = data[(data.LT>data.LT.quantile(0.01))&(data.LT<data.LT.quantile(0.99))]
data = data[(data.LB>data.LB.quantile(0.01))&(data.LB<data.LB.quantile(0.99))]
data = data[(data.JKT>data.JKT.quantile(0.01))&(data.JKT<data.JKT.quantile(0.99))]
data = data[(data.JKM>data.JKM.quantile(0.01))&(data.JKM<data.JKM.quantile(0.99))]
data = data[(data.HARGA>data.HARGA.quantile(0.01))&(data.HARGA<data.HARGA.quantile(0.99))]

## PREDICTION

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [29]:
X = data.iloc[:,1:]
X

Unnamed: 0,LT,LB,JKT,JKM,GRS
1,824,800,4,4,1
3,251,300,5,4,1
4,1340,575,4,5,1
5,460,300,4,4,1
6,278,350,4,4,1
...,...,...,...,...,...
985,235,310,5,4,1
987,187,337,6,4,1
993,169,215,4,4,1
996,488,550,6,5,1


In [30]:
y = data.iloc[:,0]
y

1      19000000000
3       4900000000
4      28000000000
5      10000000000
6       7600000000
          ...     
985     6400000000
987     6750000000
993     3500000000
996    16000000000
997     4500000000
Name: HARGA, Length: 434, dtype: int64

In [16]:
X = data.drop('HARGA' , axis=1)
y = data['HARGA']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [18]:
data = data.fillna(0)

In [19]:
model = RandomForestRegressor()
model.fit(X_train,y_train)
hasil = model.predict(X_test)

In [31]:
hasil

array([1.76480000e+10, 3.08550000e+09, 6.37898333e+09, 5.40000000e+09,
       1.52740000e+10, 2.17750000e+10, 2.12440000e+09, 1.20150000e+10,
       4.36020000e+09, 3.08640000e+10, 2.24510000e+10, 1.69710000e+10,
       2.53647500e+10, 1.39800000e+10, 1.67120000e+10, 1.87040000e+10,
       1.59213500e+10, 1.42470000e+10, 6.74070000e+09, 3.87145000e+10,
       3.31520000e+09, 1.28005000e+10, 9.72150000e+09, 2.65410000e+10,
       1.10960000e+10, 2.37789000e+10, 2.27980000e+10, 1.19838333e+10,
       4.89610000e+09, 1.11380000e+10, 2.46186250e+10, 3.05910000e+10,
       2.48260000e+10, 2.37410000e+10, 1.91550000e+10, 2.39740000e+10,
       1.21510000e+10, 5.31916667e+10, 4.08630000e+09, 1.20880000e+10,
       6.73740000e+09, 1.21330000e+10, 2.14380000e+10, 1.98755000e+10,
       8.94180000e+09, 3.87145000e+10, 2.35750000e+10, 5.70560000e+09,
       1.18622500e+10, 2.20280000e+09, 5.33200000e+10, 2.45640000e+10,
       6.58810000e+09, 1.68670000e+10, 1.63741333e+10, 1.63982083e+10,
      

In [32]:
def prediksi(LT,LB,JKT,JKM,GRS):
    predict = pd.DataFrame()
    predict['LT'] = [LT]
    predict['LB'] = [LB]
    predict['JKT'] = [JKT]
    predict['JKM'] = [JKM]
    predict['GRS'] = [GRS]
    print(model.predict(predict))

In [33]:
prediksi(500,600,2,2,1)

[1.75188e+10]


In [34]:
import  pickle
with open('prediksi_harga_rumah.pkl','wb') as file:
    pickle.dump(model,file)

In [21]:
import sklearn
sklearn.__version__

'1.3.1'