In [12]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from preprocess import DataProcessor

In [13]:
# Columns carried forward from the cleaned dataset: address fields, physical
# attributes of the property, and the `price` target. The order listed here is
# the column order of the resulting frame.
cols = [
    "street", "ward", "district", "city",
    "size", "property_legal_document",
    "bed_rooms", "toilets", "floors",
    "price",
    "house_type",
]

# Read the cleaned CSV and immediately narrow it to the columns above; any
# other column present in the file is dropped.
df = pd.read_csv('saved_data/df_clean.csv')[cols]

df.head()

Unnamed: 0,street,ward,district,city,size,property_legal_document,bed_rooms,toilets,floors,price,house_type
0,Tôn Đức Thắng,Hòa Minh,Liên Chiểu,Đà Nẵng,60.8,1,4,1,2,5.8,1
1,Nguyễn Văn Thoại,Bắc Phú Mỹ,Ngũ Hành Sơn,Đà Nẵng,110.0,1,6,5,4,13.3,1
2,Xuân Thiều 8,Hòa Hiệp Nam,Liên Chiểu,Đà Nẵng,80.0,1,2,1,2,3.6,1
3,Lê Duy Đình,An Khê,Thanh Khê,Đà Nẵng,76.0,1,4,4,4,7.2,1
4,Lê Duẩn,Tân Chính,Thanh Khê,Đà Nẵng,108.3,1,4,3,4,15.5,1


In [14]:
# Features are every column except the target; the target is `price`.
X = df.drop(columns=['price'])
y = df['price']

# Hold out 10% of the rows for testing; the fixed random_state makes the
# shuffle reproducible between runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# One shape per line, in the order: full X, full y, train X, train y,
# test X, test y.
print(
    *(part.shape for part in (X, y, train_X, train_y, test_X, test_y)),
    sep='\n',
)
train_X.head()

(5520, 10)
(5520,)
(4968, 10)
(4968,)
(552, 10)
(552,)


Unnamed: 0,street,ward,district,city,size,property_legal_document,bed_rooms,toilets,floors,house_type
4665,Lê Thị Hà,Tân Xuân,Hóc Môn,Hồ Chí Minh,80.0,1,3,3,3,1
336,Võ Như Hưng,Mỹ An,Ngũ Hành Sơn,Đà Nẵng,92.0,9,3,2,2,1
4630,Châu Văn Liêm,An Lạc,Ninh Kiều,Cần Thơ,69.0,1,5,5,4,1
4355,A3 Quang Trung,Phú Thứ,Cái Răng,Cần Thơ,120.0,1,5,6,3,1
2916,Quang Dũng,Vĩnh Trung,Thanh Khê,Đà Nẵng,50.0,1,3,3,2,3


In [15]:
# Build the project's DataProcessor (imported from the local `preprocess`
# module) and call `scale` with the training features and training target only;
# the test split is not passed to it.
# NOTE(review): presumably `scale` learns the encoding/scaling parameters that
# `transform` applies below — confirm against preprocess.DataProcessor.
processor = DataProcessor()
processor.scale(train_X, train_y)
# Persist the processor object to disk right after `scale`.
# NOTE(review): assumes the `saved_models/` directory already exists unless
# `save` creates it — confirm.
processor.save('saved_models/processor.pkl')

# Apply the same processor to both splits. In the displayed output every
# column, including the text address fields, comes back as a float and the
# original row labels are kept.
scaled_train_df = processor.transform(train_X)
scaled_test_df = processor.transform(test_X)

scaled_train_df.head()

Unnamed: 0,street,ward,district,city,size,property_legal_document,bed_rooms,toilets,floors,house_type
4665,-1.108516,-1.497725,-1.841108,1.750851,0.107497,0.123381,-0.090157,0.152288,0.485824,0.832956
336,0.196962,1.479461,0.819971,-0.221011,0.512415,1.041508,-0.090157,-0.695881,-0.920688,0.832956
4630,0.704575,1.431798,0.404731,-0.605881,-0.263678,0.123381,1.768352,1.848626,1.892337,0.832956
4355,0.084307,-0.709135,-1.717542,-0.605881,1.457224,0.123381,1.768352,2.696795,0.485824,0.832956
2916,-1.30732,-0.360524,-0.929962,-0.221011,-0.904799,0.123381,-0.090157,0.152288,-0.920688,-1.196906


In [16]:
# Re-attach the untransformed target to each scaled feature frame. `.values`
# hands pandas a plain array, so prices are matched to rows by position rather
# than by the shuffled row labels of train_y / test_y. `.assign` returns a new
# frame instead of writing a column into the object that `processor.transform`
# returned (and that the previous cell displayed), so re-running this cell is
# safe and cannot trigger a chained-assignment warning.
scaled_train_df = scaled_train_df.assign(price=train_y.values)
scaled_test_df = scaled_test_df.assign(price=test_y.values)

# DataFrame.to_csv does not create missing parent directories; make sure the
# output folder exists so the cell also works on a fresh checkout.
os.makedirs('split_data', exist_ok=True)

# index=False: the row labels are not written to the files.
scaled_train_df.to_csv('split_data/train.csv', index=False)
scaled_test_df.to_csv('split_data/test.csv', index=False)

scaled_train_df.head()

Unnamed: 0,street,ward,district,city,size,property_legal_document,bed_rooms,toilets,floors,house_type,price
4665,-1.108516,-1.497725,-1.841108,1.750851,0.107497,0.123381,-0.090157,0.152288,0.485824,0.832956,3.1
336,0.196962,1.479461,0.819971,-0.221011,0.512415,1.041508,-0.090157,-0.695881,-0.920688,0.832956,5.8
4630,0.704575,1.431798,0.404731,-0.605881,-0.263678,0.123381,1.768352,1.848626,1.892337,0.832956,17.5
4355,0.084307,-0.709135,-1.717542,-0.605881,1.457224,0.123381,1.768352,2.696795,0.485824,0.832956,6.1
2916,-1.30732,-0.360524,-0.929962,-0.221011,-0.904799,0.123381,-0.090157,0.152288,-0.920688,-1.196906,2.6
