In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
def get_typespeiffic_price(df, property_subtype):
    """Average unit price per city for listings of a single property subtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``property_subtype``, ``city`` and
        ``unit_property_price``.
    property_subtype : object
        Value compared for equality against the ``property_subtype`` column;
        only matching rows contribute to the averages.

    Returns
    -------
    pandas.Series
        Mean ``unit_property_price`` of the matching rows, indexed by ``city``.
        Cities without a matching row are absent from the result.
    """
    is_requested_subtype = df['property_subtype'] == property_subtype
    matching_prices = df.loc[is_requested_subtype, ['city', 'unit_property_price']]
    per_city = matching_prices.groupby(by=['city'])
    return per_city['unit_property_price'].mean()

def get_statistics(filename = '../data/DataSet_LakasP.csv'):
    """Build a per-city summary table of listing prices.

    The CSV is expected to provide the columns ``city``, ``price_created_at``,
    ``property_area`` and ``property_subtype``. A per-listing unit price is
    derived as ``price_created_at / property_area``.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``city`` with the columns

        * ``City`` - copy of the index values,
        * ``Avg. m2 price`` - mean unit price over all listings of the city,
        * ``Min`` / ``Max`` - smallest / largest ``price_created_at``,
        * ``df_panel_price`` / ``df_brick_price`` - mean unit price of the
          'prefabricated panel flat (for sale)' / 'brick flat (for sale)'
          listings,
        * ``Panel – brick diff`` - absolute difference of the two means.

        Cities lacking panel or brick listings get NaN in the affected columns.
    """
    df = pd.read_csv(filename)
    df['unit_property_price'] = df.price_created_at / df.property_area
    df_city_gr = df[['city', 'price_created_at', 'unit_property_price']].groupby(by='city')

    df_panel_price = get_typespeiffic_price(df, 'prefabricated panel flat (for sale)')
    df_brick_price = get_typespeiffic_price(df, 'brick flat (for sale)')

    # Every column is passed as a city-indexed Series so the constructor aligns
    # them on the union of cities. Raw arrays (``.values`` or a bare Index) are
    # matched by position and raise ValueError as soon as one city has neither
    # a panel nor a brick listing.
    statistics = pd.DataFrame({
        'Avg. m2 price': df_city_gr['unit_property_price'].mean(),
        'Min': df_city_gr['price_created_at'].min(),
        'Max': df_city_gr['price_created_at'].max(),
        'df_panel_price': df_panel_price,
        'df_brick_price': df_brick_price,
        'Panel – brick diff': (df_panel_price - df_brick_price).abs(),
    })
    # Derive the City column from the final index so it always matches the rows.
    statistics.insert(0, 'City', statistics.index)
    return statistics

In [5]:
# Display the per-city summary table built from the default CSV path.
get_statistics()

Unnamed: 0_level_0,City,Avg. m2 price,Min,Max,df_panel_price,df_brick_price,Panel – brick diff
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Budapest I.,Budapest I.,0.564487,1.0,89.9,0.527283,0.564709,0.037426
Budapest II.,Budapest II.,0.529692,1.0,95.0,0.501896,0.53188,0.029984
Budapest III.,Budapest III.,0.372534,2.1,96.0,0.307941,0.478615,0.170673
Budapest IV.,Budapest IV.,0.303773,1.0,68.0,0.284946,0.331745,0.046799
Budapest IX.,Budapest IX.,0.440449,2.0,96.0,0.329711,0.449685,0.119974
Budapest V.,Budapest V.,0.728948,0.5,88.0,0.845024,0.728535,0.116489
Budapest VI.,Budapest VI.,0.524832,1.0,92.4,0.536202,0.524853,0.011349
Budapest VII.,Budapest VII.,0.442014,2.0,88.0,0.430283,0.441934,0.011651
Budapest VIII.,Budapest VIII.,0.348262,2.4,95.0,0.281381,0.353757,0.072377
Budapest X.,Budapest X.,0.272447,2.5,56.9,0.250766,0.291144,0.040378


In [12]:
def get_data_cleaning_helper(filename = '../data/DataSet_LakasP.csv'):
    """Collect listings with a missing ``property_subtype`` next to city-level reference prices.

    Reads the CSV, derives each listing's unit price
    (``price_created_at / property_area``) and attaches, per city, the mean
    unit price of the 'prefabricated panel flat (for sale)' and
    'brick flat (for sale)' listings to every row whose subtype is null.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        One row per listing with a null ``property_subtype``, keeping the
        original row index, with the columns ``City``, ``Property_subtype``,
        ``Property_area``, ``Price_created_at``, ``Avg. m2 price`` (the
        listing's own unit price), ``Avg. m2 price panel`` and
        ``Avg. m2 price brick``.

    Raises
    ------
    AssertionError
        If any ``price_created_at`` value is missing.
    """
    listings = pd.read_csv(filename)
    assert not listings.price_created_at.isnull().any()
    listings['unit_property_price'] = listings.price_created_at / listings.property_area

    # City-indexed reference prices for the two known subtypes.
    city_reference = pd.DataFrame({
        'unit_property_price_panel': get_typespeiffic_price(listings, 'prefabricated panel flat (for sale)'),
        'unit_property_price_brick': get_typespeiffic_price(listings, 'brick flat (for sale)'),
    })

    untyped = listings[listings.property_subtype.isnull()].join(city_reference, on='city')

    # Output label -> source column, in the order the columns should appear.
    source_of = {
        'City': 'city',
        'Property_subtype': 'property_subtype',
        'Property_area': 'property_area',
        'Price_created_at': 'price_created_at',
        'Avg. m2 price': 'unit_property_price',
        'Avg. m2 price panel': 'unit_property_price_panel',
        'Avg. m2 price brick': 'unit_property_price_brick',
    }
    return pd.DataFrame({label: untyped[column] for label, column in source_of.items()})

In [14]:
df = get_data_cleaning_helper()


def get_diff(x):
    """Absolute gap between each row's own m2 price and the reference column ``x`` of ``df``."""
    return (df['Avg. m2 price'] - df[x]).abs()


# Flag a listing as panel when its unit price is strictly closer to the city's
# panel average than to the brick average.
# NOTE(review): a NaN gap (city without a panel or brick average) compares as
# False, so such rows end up with is_panel == False - confirm this is intended.
df['is_panel'] = get_diff('Avg. m2 price panel').lt(get_diff('Avg. m2 price brick'))
df

Unnamed: 0,City,Property_subtype,Property_area,Price_created_at,Avg. m2 price,Avg. m2 price panel,Avg. m2 price brick,is_panel
51,Budapest XX.,,65.0,15.5,0.238462,0.221074,0.263274,True
119,Budapest XX.,,55.0,9.5,0.172727,0.221074,0.263274,True
314,Budapest IX.,,50.0,12.9,0.258000,0.329711,0.449685,True
359,Budapest XXII.,,50.0,13.0,0.260000,0.291089,0.337936,True
455,Budapest XIV.,,50.0,13.2,0.264000,0.304399,0.400750,True
465,Budapest X.,,50.0,11.6,0.232000,0.250766,0.291144,True
476,Budapest X.,,65.0,10.9,0.167692,0.250766,0.291144,True
607,Budapest XVIII.,,50.0,9.9,0.198000,0.232965,0.322584,True
827,Budapest XIX.,,55.0,10.7,0.194545,0.249429,0.295512,True
913,Budapest VIII.,,60.0,12.7,0.211667,0.281381,0.353757,True
