In [1]:
import pandas as pd
import numpy as np

In [2]:
data = [
    {'luas': 1000, 'harga': 1000, 'kota': 'Jakarta'},
    {'luas': 2000, 'harga': 2000, 'kota': 'Jakarta'},
    {'luas': 3000, 'harga': 3000, 'kota': 'Jakarta'},
    {'luas': 4000, 'harga': 4000, 'kota': 'Jakarta'},
    {'luas': 5000, 'harga': 5000, 'kota': 'Jakarta'},
    {'luas': 1000, 'harga': 500, 'kota': 'Bogor'},
    {'luas': 2000, 'harga': 1000, 'kota': 'Bogor'},
    {'luas': 3000, 'harga': 1500, 'kota': 'Bogor'},
    {'luas': 4000, 'harga': 2000, 'kota': 'Bogor'},
    {'luas': 5000, 'harga': 2500, 'kota': 'Bogor'},
    {'luas': 1000, 'harga': 2000, 'kota': 'Tangsel'},
    {'luas': 2000, 'harga': 4000, 'kota': 'Tangsel'},
    {'luas': 3000, 'harga': 6000, 'kota': 'Tangsel'},
    {'luas': 4000, 'harga': 8000, 'kota': 'Tangsel'},
    {'luas': 5000, 'harga': 10000, 'kota': 'Tangsel'}
]
df = pd.DataFrame(data)
df

Unnamed: 0,luas,harga,kota
0,1000,1000,Jakarta
1,2000,2000,Jakarta
2,3000,3000,Jakarta
3,4000,4000,Jakarta
4,5000,5000,Jakarta
5,1000,500,Bogor
6,2000,1000,Bogor
7,3000,1500,Bogor
8,4000,2000,Bogor
9,5000,2500,Bogor


<hr>

### 1. Dummy Var Pandas

In [5]:
dfDummy = pd.get_dummies(df['kota'])
dfDummy

Unnamed: 0,Bogor,Jakarta,Tangsel
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,1,0,0


In [10]:
dfNew = pd.concat([df, dfDummy], sort=False, axis='columns')
dfNew = dfNew.drop(['kota'], axis=1)
dfNew

Unnamed: 0,luas,harga,Bogor,Jakarta,Tangsel
0,1000,1000,0,1,0
1,2000,2000,0,1,0
2,3000,3000,0,1,0
3,4000,4000,0,1,0
4,5000,5000,0,1,0
5,1000,500,1,0,0
6,2000,1000,1,0,0
7,3000,1500,1,0,0
8,4000,2000,1,0,0
9,5000,2500,1,0,0


In [11]:
dfNew.corr()

Unnamed: 0,luas,harga,Bogor,Jakarta,Tangsel
luas,1.0,0.62361,1.136868e-17,1.136868e-17,0.0
harga,0.6236096,1.0,-0.5345225,-0.1336306,0.668153
Bogor,1.136868e-17,-0.534522,1.0,-0.5,-0.5
Jakarta,1.136868e-17,-0.133631,-0.5,1.0,-0.5
Tangsel,0.0,0.668153,-0.5,-0.5,1.0


In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
model = LinearRegression()
model.fit(dfNew[['luas', 'Bogor', 'Tangsel']], dfNew['harga'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [16]:
m = model.coef_
c = model.intercept_
print(m)
print(c)

[ 1.16666667e+00 -1.50000000e+03  3.00000000e+03]
-500.00000000000364


$ y = -500 + 1.16 \times luas - 1500 \times Bogor + 3000 \times Tangsel $

In [18]:
# Rumah 1000m2 jakarta, harga ?
harga_pred = model.predict(dfNew[['luas', 'Bogor', 'Tangsel']])
dfNew['harga_pred'] = harga_pred
dfNew

Unnamed: 0,luas,harga,Bogor,Jakarta,Tangsel,harga_pred
0,1000,1000,0,1,0,666.666667
1,2000,2000,0,1,0,1833.333333
2,3000,3000,0,1,0,3000.0
3,4000,4000,0,1,0,4166.666667
4,5000,5000,0,1,0,5333.333333
5,1000,500,1,0,0,-833.333333
6,2000,1000,1,0,0,333.333333
7,3000,1500,1,0,0,1500.0
8,4000,2000,1,0,0,2666.666667
9,5000,2500,1,0,0,3833.333333


<hr>

### 2. Label Encoder + One Hot Encoder Sklearn

In [20]:
from sklearn.preprocessing import LabelEncoder

In [25]:
label = LabelEncoder()
dfUji = df
dfUji['kota'] = label.fit_transform(dfUji['kota'])
print(label.classes_)
dfUji

[0 1 2]


Unnamed: 0,luas,harga,kota
0,1000,1000,1
1,2000,2000,1
2,3000,3000,1
3,4000,4000,1
4,5000,5000,1
5,1000,500,0
6,2000,1000,0
7,3000,1500,0
8,4000,2000,0
9,5000,2500,0


In [42]:
x = dfUji[['kota', 'luas']]
y = dfUji['harga']
y

0      1000
1      2000
2      3000
3      4000
4      5000
5       500
6      1000
7      1500
8      2000
9      2500
10     2000
11     4000
12     6000
13     8000
14    10000
Name: harga, dtype: int64

In [29]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [43]:
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough'
)

x = np.array(ct.fit_transform(x), dtype=np.int64)
x

array([[   0,    1,    0, 1000],
       [   0,    1,    0, 2000],
       [   0,    1,    0, 3000],
       [   0,    1,    0, 4000],
       [   0,    1,    0, 5000],
       [   1,    0,    0, 1000],
       [   1,    0,    0, 2000],
       [   1,    0,    0, 3000],
       [   1,    0,    0, 4000],
       [   1,    0,    0, 5000],
       [   0,    0,    1, 1000],
       [   0,    0,    1, 2000],
       [   0,    0,    1, 3000],
       [   0,    0,    1, 4000],
       [   0,    0,    1, 5000]], dtype=int64)

In [44]:
model = LinearRegression()
model.fit(x, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [45]:
model.predict([[0,1,0,1000]])

array([666.66666667])

In [46]:
df['harga_pred'] = model.predict(x)
df

Unnamed: 0,luas,harga,kota,harga_pred
0,1000,1000,1,666.666667
1,2000,2000,1,1833.333333
2,3000,3000,1,3000.0
3,4000,4000,1,4166.666667
4,5000,5000,1,5333.333333
5,1000,500,0,-833.333333
6,2000,1000,0,333.333333
7,3000,1500,0,1500.0
8,4000,2000,0,2666.666667
9,5000,2500,0,3833.333333


<hr>

### 3. Label Encoder (Tanpa One Hot Encoder Sklearn)

In [48]:
dfTes = df[['harga', 'kota', 'luas']]
label = LabelEncoder()
dfUji = df
dfTes['kota'] = label.fit_transform(dfTes['kota'])
print(label.classes_)
dfTes

[0 1 2]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,harga,kota,luas
0,1000,1,1000
1,2000,1,2000
2,3000,1,3000
3,4000,1,4000
4,5000,1,5000
5,500,0,1000
6,1000,0,2000
7,1500,0,3000
8,2000,0,4000
9,2500,0,5000


In [50]:
modelTes = LinearRegression()
modelTes.fit(dfTes[['kota', 'luas']], dfTes['harga'])
dfTes['pred'] = modelTes.predict(dfTes[['kota', 'luas']])
dfTes

Unnamed: 0,harga,kota,luas,pred
0,1000,1,1000,1166.666667
1,2000,1,2000,2333.333333
2,3000,1,3000,3500.0
3,4000,1,4000,4666.666667
4,5000,1,5000,5833.333333
5,500,0,1000,-1083.333333
6,1000,0,2000,83.333333
7,1500,0,3000,1250.0
8,2000,0,4000,2416.666667
9,2500,0,5000,3583.333333
