In [1]:
import pandas as pd
import numpy as np

In [2]:
file_url = "https://raw.githubusercontent.com/emanhamed/Houses-dataset/master/Houses%20Dataset/HousesInfo.txt"
columns = ["bedroom", "bathroom", "area", "zipcodes", "price"]
df = pd.read_csv(file_url, sep=" ", names=columns)
df

Unnamed: 0,bedroom,bathroom,area,zipcodes,price
0,4,4.0,4053,85255,869500
1,4,3.0,3343,36372,865200
2,3,4.0,3923,85266,889000
3,5,5.0,4022,85262,910000
4,3,4.0,4116,85266,971226
...,...,...,...,...,...
530,5,2.0,2066,94531,399900
531,4,3.5,9536,94531,460000
532,3,2.0,2014,94531,407000
533,4,3.0,2312,94531,419000


In [3]:
df.dtypes

Unnamed: 0,0
bedroom,int64
bathroom,float64
area,int64
zipcodes,int64
price,int64


In [4]:
df["zipcodes"].value_counts()

Unnamed: 0_level_0,count
zipcodes,Unnamed: 1_level_1
92276,100
93510,60
93446,54
92880,49
94501,41
91901,32
92677,26
94531,22
85255,12
96019,12


In [5]:
temp_df = df[["zipcodes", "price"]]
temp_df.groupby("zipcodes").mean().astype("int32")

Unnamed: 0_level_0,price
zipcodes,Unnamed: 1_level_1
36372,865200
60002,267966
60016,261950
60046,289900
62025,384950
62034,209900
62088,119000
62214,180450
62234,191042
62249,339900


In [6]:
zip_codes, counts = np.unique(df["zipcodes"], return_counts=True)

In [7]:
len(df)

535

In [8]:
df

Unnamed: 0,bedroom,bathroom,area,zipcodes,price
0,4,4.0,4053,85255,869500
1,4,3.0,3343,36372,865200
2,3,4.0,3923,85266,889000
3,5,5.0,4022,85262,910000
4,3,4.0,4116,85266,971226
...,...,...,...,...,...
530,5,2.0,2066,94531,399900
531,4,3.5,9536,94531,460000
532,3,2.0,2014,94531,407000
533,4,3.0,2312,94531,419000


In [9]:
for zipcode, count in zip(zip_codes,counts):
  if count<25:
    idxs = df[df["zipcodes"] ==zipcode].index
    df.drop(idxs, inplace=True)

In [10]:
df

Unnamed: 0,bedroom,bathroom,area,zipcodes,price
30,5,3.0,2520,93446,789000
32,3,2.0,1802,93446,365000
39,3,3.0,2146,93446,455000
80,4,2.5,2464,91901,599000
81,2,2.0,1845,91901,529800
...,...,...,...,...,...
499,4,4.0,3000,93446,1495000
500,3,2.0,2330,93446,599900
501,3,2.5,1339,93446,344900
502,3,2.0,1472,93446,309995


In [11]:
len(df)

362

In [12]:
np.unique(df["zipcodes"], return_counts=True)

(array([91901, 92276, 92677, 92880, 93446, 93510, 94501]),
 array([ 32, 100,  26,  49,  54,  60,  41]))

In [13]:
df["zipcodes"].value_counts()

Unnamed: 0_level_0,count
zipcodes,Unnamed: 1_level_1
92276,100
93510,60
93446,54
92880,49
94501,41
91901,32
92677,26


In [14]:
temp_df = df[["zipcodes", "price"]]
temp_df.groupby("zipcodes").mean().astype("int64")

Unnamed: 0_level_0,price
zipcodes,Unnamed: 1_level_1
91901,732171
92276,124416
92677,1051038
92880,559302
93446,581401
93510,628142
94501,814594


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)

print(train_df.shape)
print(test_df.shape)

(271, 5)
(91, 5)


In [17]:
m = train_df["price"].max()

train_y = train_df["price"]/ m
test_y = test_df["price"]/ m

In [18]:
max(list(train_y))

1.0

In [19]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()

In [20]:
numeric_columns = ["bedroom", "bathroom", "area"]
train_numeric = minmax.fit_transform(train_df[numeric_columns])


In [21]:
test_numeric = minmax.transform(test_df[numeric_columns])

In [22]:
from sklearn.preprocessing import LabelBinarizer
lbl = LabelBinarizer()

In [23]:
lbl.classes_

AttributeError: 'LabelBinarizer' object has no attribute 'classes_'

In [24]:
train_categorical = lbl.fit_transform(train_df[["zipcodes"]])
test_categorical = lbl.transform(test_df[["zipcodes"]])

In [25]:
lbl.classes_

array([91901, 92276, 92677, 92880, 93446, 93510, 94501])

In [26]:
test_categorical[:2]

array([[0, 0, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0]])

In [27]:
print(train_numeric.shape)
print(train_categorical.shape)

(271, 3)
(271, 7)


In [28]:
train_x = np.hstack([train_numeric,train_categorical])
train_x.shape

(271, 10)

In [29]:
test_x = np.hstack([test_numeric,test_categorical])
test_x.shape

(91, 10)

In [30]:
from sklearn.linear_model import LinearRegression

In [31]:
est = LinearRegression()

In [32]:
est.fit(train_x, train_y)

In [33]:
est.score(train_x, train_y)

0.48965479259085

In [34]:
est.score(test_x, test_y)

0.7746030953741593

In [35]:
predicted = est.predict(test_x)

In [36]:
for i, j in zip(predicted, test_y):
  print(int(i*m), "-->", int(j*m))

327344 --> 510000
872288 --> 669000
743909 --> 979000
690988 --> 768999
856350 --> 649900
626395 --> 539900
1291249 --> 1895000
610142 --> 545000
153842 --> 189000
427409 --> 529900
51426 --> 68500
699014 --> 347000
276691 --> 214900
1105273 --> 1045000
483757 --> 520000
118447 --> 69900
184801 --> 69900
115911 --> 248500
631880 --> 689000
804363 --> 668000
1353721 --> 1430000
274596 --> 218999
624213 --> 599000
564827 --> 559000
314175 --> 179900
177556 --> 136900
-52160 --> 63500
632888 --> 553000
96107 --> 115000
595778 --> 599900
292062 --> 245900
729755 --> 699000
1258995 --> 1499999
185920 --> 179000
467525 --> 515000
570084 --> 570000
675542 --> 739000
947586 --> 679000
495511 --> 639000
394459 --> 485000
129175 --> 122499
277872 --> 125000
402577 --> 875000
373748 --> 395000
784000 --> 695000
426341 --> 699500
593240 --> 739000
969448 --> 749999
252470 --> 397500
1537147 --> 1200000
-94048 --> 67900
581410 --> 647000
596918 --> 557000
10803 --> 79900
927942 --> 695000
473814 --