# Setup

In [None]:
import locale
import numpy as np
import pandas as pd
from datetime import datetime

In [None]:
import plotly.express as px

In [None]:
# Switch every locale category to the user's default settings (taken from the
# environment). locale.currency(), used further down, formats according to
# whatever locale is active here, so its output depends on the machine.
locale.setlocale(locale.LC_ALL, '')

# Data

In [None]:
# Path of the input file; it is read below as line-delimited JSON.
store = "../data/20200914.jl"

In [None]:
%%time
df = pd.read_json(store, lines=True)

In [None]:
%%time
df = df.set_index("id")

In [None]:
%%time
contract = df["contract"].apply(pd.Series)
df = df.join(contract, rsuffix="_contract")
del df["contract"]

In [None]:
%%time
advertiser = df["advertiser"].apply(pd.Series)
agency = advertiser["agency"].apply(pd.Series)
supervisor = advertiser["supervisor"].apply(pd.Series)
df = df.join(supervisor, rsuffix="_agency").join(supervisor, rsuffix="_supervisor")
del df["advertiser"]

In [None]:
%%time
properties = df["properties"].apply(lambda r: r[0]).apply(pd.Series)
df = df.join(properties, rsuffix="_properties")
del df["properties"]

In [None]:
%%time
location = df["location"].apply(pd.Series)
df = df.join(location, rsuffix="_location")
del df["location"]

In [None]:
%%time
costs = df["price_properties"].apply(pd.Series)
df = df.join(costs, rsuffix="_costs")
del df["price_properties"]

In [None]:
%%time
surfaceConstitution = df["surfaceConstitution"].apply(pd.Series)
surfaceConstitutionElements = surfaceConstitution["surfaceConstitutionElements"].dropna().apply(lambda r: r[0]).apply(pd.Series)
df = df.join(surfaceConstitutionElements, rsuffix="_surfaceConstitutionElements")
del df["surfaceConstitution"]

In [None]:
%%time
to_drop = """name value imageUrl label type_agency displayName_supervisor imageUrl_supervisor label_supervisor type_supervisor
typology category multimedia nation region macrozone microzone marker constitution percentage surfaceType phoneUrl phoneUrl_supervisor 
zoom province city formattedPrice formattedPriceTop visible floor_surfaceConstitutionElements""".split()
df = df.drop(columns=to_drop)
df = df.dropna(axis=1, how="all")

# Data types

In [None]:
# Convert the textual "surface" column to floats: strip the " m²" unit, remove
# "." characters, then turn "," into "." (presumably values look like
# "1.234,5 m²" -- confirm against the data).
# regex=False forces every pattern to be matched literally. Without it the
# result depends on the pandas version, because "." is a wildcard whenever the
# pattern is interpreted as a regular expression.
df["surfaceValue"] = (
    df["surface"]
    .str.replace(" m²", "", regex=False)
    .str.replace(".", "", regex=False)
    .str.replace(",", ".", regex=False)
    .astype(float)
)

In [None]:
# Format each "price_costs" value as a currency string using the active locale,
# with digit grouping enabled.
df["currency"] = df["price_costs"].map(
    lambda amount: locale.currency(amount, grouping=True)
)

# Store

In [None]:
%%time
today = datetime.today().strftime("%Y%m%d")
df.reset_index().to_feather(f"../data/{today}.feather")