In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from IPython.display import display

import warnings
import pickle

warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:
def read_json_df(path):
    """Load apartment listings from a JSON file into one wide DataFrame.

    Every record must provide the scalar fields ``id``, ``floor``, ``beds``,
    ``price`` and ``size_m2`` plus a ``tags`` field. ``tags`` is exploded, so
    it is expected to hold a list of mappings with the keys ``tag_category``
    and ``tag_value``; records whose ``tags`` is empty or null simply
    contribute no tag rows.

    Parameters
    ----------
    path : str, path-like or file-like
        Source handed directly to ``pandas.read_json``.

    Returns
    -------
    pandas.DataFrame
        One row per input record: the five base columns followed by one
        column per tag category (holding the first ``tag_value`` seen for
        that id/category pair). Listings lacking a category get NaN there.
        If no record carries any tag, only the base columns are returned.

    Raises
    ------
    KeyError
        If a base column or ``tags`` is missing, or a tag mapping lacks
        ``tag_category`` / ``tag_value``.
    """
    # Named `raw` (not `json`) so the stdlib module name is never shadowed.
    raw = pd.read_json(path)
    apartments = raw[['id', 'floor', 'beds', 'price', 'size_m2']]

    # One row per (listing, tag); listings without tags drop out here and are
    # re-attached with NaN tag columns by the left merge below.
    tags_long = raw.explode('tags').dropna(subset=['tags'])

    # Nothing to pivot: return the base columns rather than relying on the
    # behaviour of pivot_table/merge on an empty frame.
    if tags_long.empty:
        return apartments.reset_index(drop=True)

    tags_long = tags_long.assign(
        tag_category=tags_long['tags'].map(lambda tag: tag['tag_category']),
        tag_value=tags_long['tags'].map(lambda tag: tag['tag_value']),
    )

    # Long -> wide: one column per tag category, first value wins on duplicates.
    tags_wide = tags_long.pivot_table(
        index='id',
        columns='tag_category',
        values='tag_value',
        aggfunc='first',
    ).reset_index()

    return apartments.merge(tags_wide, on='id', how='left')


# Load the rental listings (base fields plus one column per tag category)
# and preview the first five rows.
df = read_json_df(path="for_rent_apartments.json")
df.head(n=5)

Unnamed: 0,id,floor,beds,price,size_m2,families,light_trail,parks,quiet_street,religious,school,secular
0,ieVNwuOX98B,2,3,10000,70,2.0,1.0,1.0,1.0,2.0,2.0,5.0
1,kRYP2kIMVho,3,5,12300,120,1.0,5.0,2.0,1.0,1.0,1.0,1.0
2,bk4LlhcSrQI,14,4,13000,170,1.0,3.0,4.0,1.0,1.0,1.0,5.0
3,cTRw7SDpf1E,0,3,4800,85,3.0,5.0,3.0,3.0,2.0,2.0,2.0
4,N0okrcZMZ88,1,1,3200,19,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [4]:
# Every column except the `id` key is fed to the numeric transformer.
numerical_features = df.drop(columns=["id"]).columns

In [5]:
# Scaler applied to the numeric feature columns; StandardScaler is used with
# its default arguments.
numerical_transformer = StandardScaler()

In [6]:
# Single-step preprocessor: the 'num' step applies `numerical_transformer` to
# the columns listed in `numerical_features`. No `remainder` is given, so the
# ColumnTransformer default applies to all other columns (here only `id`).
preprocessor = ColumnTransformer(
    [('num', numerical_transformer, numerical_features)]
)

In [7]:
# Fit the preprocessor on the full frame and display the transformed array.
# The returned array is not stored; the lasting effect of this call is that
# `preprocessor` is now fitted, which is the object pickled further down.
preprocessor.fit_transform(df)

array([[-0.24360446, -0.17568178, -0.03773616, ..., -0.59701748,
        -0.27776431,  1.15964766],
       [-0.02920649,  1.21112129, -0.02777266, ..., -1.18385129,
        -0.90178647, -1.27532192],
       [ 2.32917119,  0.51771976, -0.02474029, ..., -1.18385129,
        -0.90178647,  1.15964766],
       ...,
       [-0.45800243, -0.17568178, -0.0645943 , ...,  1.16348396,
        -0.27776431, -0.66657952],
       [ 1.90037525, -0.17568178, -0.05722997, ...,  1.16348396,
         1.59430216,  1.15964766],
       [-0.24360446, -1.56248485, -0.07282502, ...,  1.16348396,
         0.97028001,  0.55090526]])

In [8]:
# Persist the preprocessor object so it can be reloaded outside this notebook.
with open('for_rent_preprocessor.pkl', mode='wb') as pkl_out:
    pickle.dump(preprocessor, pkl_out, protocol=pickle.DEFAULT_PROTOCOL)