In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

import warnings
import pickle

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and runtime warnings raised by any
# library used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting is visible in this part of the notebook — confirm it is still needed.
%matplotlib inline

In [2]:
def read_json_df(path):
    """Load apartment listings from JSON and spread their tags into columns.

    Each record is expected to provide the scalar fields ``id``, ``floor``,
    ``beds``, ``price`` and ``size_m2`` plus a ``tags`` list whose items are
    mappings with ``tag_category`` and ``tag_value`` keys.

    Parameters
    ----------
    path : str, path-like or file-like
        Location of the JSON data; handed unchanged to ``pandas.read_json``.

    Returns
    -------
    pandas.DataFrame
        One row per listing, in file order: the five scalar columns followed
        by one column per tag category holding that listing's tag value.
        Listings without a given category get NaN in that column. When no
        listing has any tag, only the five scalar columns are returned.

    Raises
    ------
    KeyError
        If a required column, or a ``tag_category``/``tag_value`` key, is missing.
    """
    listings = pd.read_json(path)
    df_apartments = listings[['id', 'floor', 'beds', 'price', 'size_m2']]

    # One row per (listing, tag). Listings whose tag list is empty or missing
    # explode to NaN and are dropped here; the left merge below restores them.
    exploded = (
        listings[['id', 'tags']]
        .explode('tags')
        .dropna(subset=['tags'])
        .reset_index(drop=True)
    )
    if exploded.empty:
        # Nothing to pivot: return the scalar columns alone instead of relying
        # on how pivot_table/merge happen to treat an empty frame.
        return df_apartments.copy()

    tags_long = pd.DataFrame({
        'id': exploded['id'],
        'tag_category': exploded['tags'].map(lambda tag: tag['tag_category']),
        'tag_value': exploded['tags'].map(lambda tag: tag['tag_value']),
    })

    # aggfunc='first' keeps the first value when a listing repeats a category.
    tags_wide = tags_long.pivot_table(index='id',
                                      columns='tag_category',
                                      values='tag_value',
                                      aggfunc='first').reset_index()

    return df_apartments.merge(tags_wide, on='id', how='left')


# Load the for-sale listings into a flat frame (one row per apartment).
df = read_json_df(path="for_sale_apartments.json")
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,floor,beds,price,size_m2,families,light_trail,parks,quiet_street,religious,school,secular
0,d3dbVQErVCp,6,4,5290000,106,1.0,3.0,3.0,5.0,2.0,1.0,1.0
1,co745SgbSvo,4,2,4000000,81,2.0,3.0,3.0,1.0,2.0,1.0,1.0
2,hZ5vNRFKDr8,5,3,9500000,160,4.0,1.0,2.0,3.0,5.0,5.0,5.0
3,bMWmA4bbRo3,7,4,4980000,116,3.0,3.0,2.0,3.0,1.0,1.0,2.0
4,hTGedzNQG2f,13,4,7250000,105,1.0,5.0,1.0,3.0,2.0,2.0,5.0


In [3]:
# Every column except the listing identifier is treated as a numeric feature.
numerical_features = df.drop(columns=["id"]).columns

In [4]:
# Standardise features to zero mean and unit variance (library defaults, spelled out).
numerical_transformer = StandardScaler(copy=True, with_mean=True, with_std=True)

In [5]:
# Apply the scaler to the numeric columns; columns not listed (here "id")
# are dropped, which is the transformer's default made explicit.
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, numerical_features)],
    remainder='drop',
)

In [6]:
# Fit the preprocessor on the full frame and display the transformed array.
# The call fits `preprocessor` in place, so the object pickled afterwards
# carries the fitted state.
# NOTE(review): `numerical_features` is every column except "id", so `price`
# is scaled as a feature too — confirm that is intended.
preprocessor.fit_transform(df)

array([[ 0.80261186,  0.03954489,  0.11938669, ..., -0.59878354,
        -0.83507186, -1.07398861],
       [ 0.26123038, -1.34452615, -0.05478728, ..., -0.59878354,
        -0.83507186, -1.07398861],
       [ 0.53192112, -0.65249063,  0.68781493, ...,  1.20677914,
         2.69809781,  1.92034664],
       ...,
       [-0.00946037,  1.42361593, -0.0142817 , ...,  1.20677914,
         0.04822056, -0.3254048 ],
       [-0.00946037, -0.65249063, -0.41258652, ...,  1.20677914,
         0.04822056, -0.3254048 ],
       [-0.82153259,  0.73158041, -0.0142817 , ...,  1.20677914,
         0.04822056, -0.3254048 ]])

In [7]:
# Persist the fitted preprocessor so it can be reloaded elsewhere.
# NOTE(review): only unpickle this file from a trusted source and with a
# compatible scikit-learn version.
with open('for_sale_preprocessor.pkl', mode='wb') as pkl_file:
    pickle.dump(preprocessor, pkl_file)