In [1]:
import pandas as pd
import numpy as np
import re
import random

In [2]:
data_location = 'https://github.com/DHDSDesafios/DH_DS_desafio_2_properati/raw/master/data/properatid2caba.zip'
data = pd.read_csv(data_location, compression='zip', encoding="utf-8")

In [3]:
data.isnull().sum()

Id_caso                           0
property_type                     0
place_with_parent_names           0
country_name                      0
state_name                        0
lat-lon                        6810
lat                            6810
lon                            6810
price                          1756
currency                       1756
price_aprox_local_currency     1756
price_aprox_usd                1532
floor                         26480
rooms                         11341
expenses                      23009
description                       0
title                             0
Pais                              0
Zona                              0
Partido_barrio                  336
Localidad                     28266
Obs_localidad                 29380
place_name                        0
geonames_id                       0
price_usd_per_m2               7227
price_per_m2                   2512
surface_total_in_m2            5527
surface_covered_in_m2       

In [4]:
# imputamos los nulos de mt2 totales con los valores de mt2 cubiertos y lo mismo para las columnas de precio:

super_cub_no_nulo = data.surface_covered_in_m2.notnull()
data.loc[super_cub_no_nulo, "surface_total_in_m2"] = data.loc[super_cub_no_nulo, "surface_covered_in_m2"]



In [5]:
# media de precios por barrios

data.groupby("place_name")["price_usd_per_m2"].mean().round(2)

place_name
Abasto              2057.05
Agronomía           2088.12
Almagro             2753.35
Balvanera           1983.00
Barracas            1977.81
                     ...   
Villa Riachuelo     1110.69
Villa Santa Rita    1889.19
Villa Soldati        736.93
Villa Urquiza       2502.35
Villa del Parque    2191.73
Name: price_usd_per_m2, Length: 62, dtype: float64

In [6]:
# armamos un data con los nulos de precio y solo algunas columnas

precio_nulo = data.price_usd_per_m2.isnull()
x = data.loc[precio_nulo,["Id_caso", "place_name", "price_usd_per_m2"]]


In [7]:
x.set_index("Id_caso", inplace = True)

In [8]:
# cantidad de nulos por barrios

data_porc_nulos = pd.DataFrame(x["place_name"].value_counts())
data_porc_nulos.rename(columns = {"place_name":"nulos"}, inplace = True)
data_porc_todos = pd.DataFrame(data.place_name.value_counts())

data_porc_todos
data_porc = pd.concat([data_porc_nulos, data_porc_todos], axis = 1)
data_porc["porc"] = (data_porc.nulos/data_porc.place_name * 100).round(2)
data_porc.sort_values(by = ["porc"], ascending = False)

Unnamed: 0,nulos,place_name,porc
Catalinas,3,4,75.00
Parque Chas,29,44,65.91
Villa Soldati,8,14,57.14
Versalles,29,55,52.73
Villa Santa Rita,29,56,51.79
...,...,...,...
Villa Ortuzar,9,85,10.59
Boedo,88,852,10.33
Palermo Hollywood,48,474,10.13
Las Cañitas,14,140,10.00


In [9]:
# menor al 25% de nulos imputamos por la media:

palermo_viejo_nulos = x.place_name == "Palermo Viejo"
x.loc[palermo_viejo_nulos,:] = 3065.84

las_cañitas_nulos = x.place_name == "Las Cañitas"
x.loc[las_cañitas_nulos,:] = 3478.26

Palermo_Hollywood_nulos = x.place_name == "Palermo Hollywood"
x.loc[Palermo_Hollywood_nulos,:] = 3186.76

Boedo_nulos = x.place_name == "Boedo"
x.loc[Boedo_nulos,:] = 10637.29

Villa_Ortuzar_nulos =  x.place_name == "Villa Ortuzar"
x.loc[Villa_Ortuzar_nulos,:] = 2339.69

Parque_Centenario_nulos =  x.place_name == "Parque Centenario"
x.loc[Parque_Centenario_nulos,:] = 2267.19

Centro_Microcentro_nulos =  x.place_name == "Centro / Microcentro"
x.loc[Centro_Microcentro_nulos,:] = 2548.98

Barracas_nulos =  x.place_name == "Barracas"
x.loc[Barracas_nulos,:] = 1977.81

San_Telmo_nulos = x.place_name == "San Telmo"
x.loc[San_Telmo_nulos,:] = 2295.08

Villa_Crespo_nulos = x.place_name == "Villa Crespo"
x.loc[Villa_Crespo_nulos,:] = 2519.02

Abasto_nulos = x.place_name == "Abasto"
x.loc[Abasto_nulos,:] = 2057.05

Villa_General_Mitre = x.place_name == "Villa General Mitre"
x.loc[Villa_General_Mitre,:] = 1800.70

Boca_nulos = x.place_name == "Boca"
x.loc[Boca_nulos,:] = 2073.90

Constitucion_nulos = x.place_name == "Constitución"
x.loc[Constitucion_nulos,:] = 1783.07

Caballito_nulos = x.place_name == "Caballito"
x.loc[Caballito_nulos,:] = 2471.56

Belgrano_nulos = x.place_name == "Belgrano"
x.loc[Belgrano_nulos,:] = 3155.18

Coghlan_nulos = x.place_name ==  "Coghlan"
x.loc[Coghlan_nulos,:] = 2528.63

Congreso_nulos = x.place_name ==  "Congreso"
x.loc[Congreso_nulos,:] = 2610.95

Flores_nulos = x.place_name == "Flores"
x.loc[Flores_nulos,:] = 2160.70

Palermo_Soho_nulos = x.place_name == "Palermo Soho"
x.loc[Palermo_Soho_nulos,:] = 3216.49

Barrio_Norte_nulos = x.place_name == "Barrio Norte"
x.loc[Barrio_Norte_nulos,:] = 3085.71

Puerto_Madero_nulos = x.place_name == "Puerto Madero"
x.loc[Puerto_Madero_nulos,:] = 6018.91

Chacarita_nulos = x.place_name == "Chacarita"
x.loc[Chacarita_nulos,:] = 2187.16

Palermo_Chico_nulos = x.place_name == "Palermo Chico"
x.loc[Palermo_Chico_nulos,:] = 4814.62

Recoleta_nulos = x.place_name == "Recoleta"
x.loc[Recoleta_nulos,:] = 3378.38

Saavedra_nulos = x.place_name == "Saavedra"
x.loc[Saavedra_nulos,:] = 2538.66

In [10]:
# armamos df con los valores imputados para concatenar al data original

a = x.drop("place_name",axis=1)
a.rename(columns = {"price_usd_per_m2": "precio_imputado"}, inplace=True)

In [11]:
a

Unnamed: 0_level_0,precio_imputado
Id_caso,Unnamed: 1_level_1
3,
97,2295.08
120,3155.18
137,
157,
...,...
121148,2160.70
121149,2471.56
121150,
121151,2160.70


In [12]:
# seteamos índices para concatenar

data.set_index("Id_caso", inplace = True)

In [13]:
data_imputac = pd.concat([data, a], axis = 1)

In [14]:
data_imputac.price_usd_per_m2.isnull().sum()

7227

In [15]:
# reemplazamos los valores imputados en la columna precio/mt2

# 'price_per_m2'

precios_no_nulos = data_imputac.precio_imputado.notnull()
data_imputac.loc[precios_no_nulos, "price_usd_per_m2"] = data_imputac.loc[precios_no_nulos, "precio_imputado"]

In [16]:
data_imputac.price_usd_per_m2.isnull().sum()

3861

In [17]:
data.price_per_m2.isnull().sum()

2512

In [18]:
data = data_imputac
data.price_usd_per_m2.isnull().sum()

3861

In [19]:
data.price_usd_per_m2.isnull().sum()

3861

In [20]:
data.surface_total_in_m2.isnull().sum()


539

In [21]:
data.surface_covered_in_m2.isnull().sum()

1617

In [22]:
caba_place_name_mask = data.place_name != 'Capital Federal'
surface_total_in_m2_notnull = data.surface_total_in_m2.notnull()
surface_covered_mask_notnull = data.surface_covered_in_m2.notnull()
price_usd_per_m2_mask_notnull = data.price_usd_per_m2.notnull()

In [23]:
data = data.loc[caba_place_name_mask & surface_covered_mask_notnull & surface_total_in_m2_notnull & price_usd_per_m2_mask_notnull]

In [24]:
data.shape

(24050, 36)

In [25]:
data.isnull().sum()

property_type                     0
place_with_parent_names           0
country_name                      0
state_name                        0
lat-lon                        5719
lat                            5719
lon                            5719
price                          1004
currency                       1004
price_aprox_local_currency     1004
price_aprox_usd                 895
floor                         22093
rooms                          9170
expenses                      18407
description                       0
title                             0
Pais                              0
Zona                              0
Partido_barrio                    0
Localidad                     22999
Obs_localidad                 24050
place_name                        0
geonames_id                       0
price_usd_per_m2                  0
price_per_m2                   1006
surface_total_in_m2               0
surface_covered_in_m2             0
cochera                     

In [26]:
def remove_outlier(df_in, mask, col_name):
    place_name_mask = df_in.place_name == mask
    q1 = df_in.loc[place_name_mask, col_name].quantile(0.25)
    q3 = df_in.loc[place_name_mask, col_name].quantile(0.75)
    iqr = q3-q1 #Interquartile range
    fence_low  = q1-1.5*iqr
    fence_high = q3+1.5*iqr
    df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high) & place_name_mask]
    return df_out

In [27]:
data2 = data.iloc[0:0]
place_name_list = data.place_name.unique()
for place_name_n in place_name_list:
    df_final1 = remove_outlier(data, place_name_n,  "price_usd_per_m2")
    data2 = data2.append(df_final1)

In [28]:
data2.shape

(22320, 36)

In [29]:
property_type_dummies = pd.get_dummies(data2.property_type, prefix='prop_type', drop_first = True)

data = pd.concat([data2, property_type_dummies], axis=1)

In [30]:
place_name_dummies = pd.get_dummies(data.place_name, prefix='pn', drop_first = True)

data = pd.concat([data, place_name_dummies], axis=1)

In [31]:
data.shape

(22320, 98)

In [32]:
data.loc[data.cochera.isnull(), 'cochera'] = 0
data.cochera.astype('uint8')

data.loc[data.piscina.isnull(), 'piscina'] = 0
data.piscina.astype('uint8')

data.loc[data.lavadero.isnull(), 'lavadero'] = 0
data.lavadero.astype('uint8')

data.loc[data.parrilla.isnull(), 'parrilla'] = 0
data.parrilla.astype('uint8')

data.loc[data.terraza.isnull(), 'terraza'] = 0
data.terraza.astype('uint8')

data.loc[data.jardin.isnull(), 'jardin'] = 0
data.jardin.astype('uint8')

data.loc[data.balcon == 'Balcon', 'balcon'] = 1
data.loc[data.balcon == 'BALCON', 'balcon'] = 1
data.loc[data.balcon == 'balcon', 'balcon'] = 1
data.loc[data.balcon.isnull(), 'balcon'] = 0
data.balcon.astype('uint8')

data.loc[data.baulera.isnull(), 'baulera'] = 0
data.baulera.astype('uint8')

Id_caso
0         0
2         0
16        0
254       0
557       0
         ..
109935    0
110797    0
22956     0
33674     0
53056     0
Name: baulera, Length: 22320, dtype: uint8

In [33]:
nulos_piso = data.floor.isnull()
data.loc[nulos_piso, "floor" ] = 4

nulos_rooms = data.rooms.isnull()
data.loc[nulos_rooms, "rooms" ] = 2

In [34]:
erase_columns = ['property_type', 'place_with_parent_names', 'country_name', 'price_per_m2','state_name', 'lat-lon', 'lat', 'lon', 'price', 'currency',
       'price_aprox_local_currency', 'expenses', 'description', 'title', 'Pais', 'Zona', 'Partido_barrio',
       'Localidad', 'Obs_localidad', 'place_name']

In [35]:
datad2 = data.drop(data[erase_columns], axis=1)

In [36]:
datad2.columns

Index(['price_aprox_usd', 'floor', 'rooms', 'geonames_id', 'price_usd_per_m2',
       'surface_total_in_m2', 'surface_covered_in_m2', 'cochera', 'piscina',
       'parrilla', 'baulera', 'balcon', 'terraza', 'jardin', 'lavadero',
       'precio_imputado', 'prop_type_apartment', 'prop_type_house',
       'prop_type_store', 'pn_Agronomía', 'pn_Almagro', 'pn_Balvanera',
       'pn_Barracas', 'pn_Barrio Norte', 'pn_Belgrano', 'pn_Boca', 'pn_Boedo',
       'pn_Caballito', 'pn_Centro / Microcentro', 'pn_Chacarita', 'pn_Coghlan',
       'pn_Colegiales', 'pn_Congreso', 'pn_Constitución', 'pn_Flores',
       'pn_Floresta', 'pn_Las Cañitas', 'pn_Liniers', 'pn_Mataderos',
       'pn_Monserrat', 'pn_Monte Castro', 'pn_Nuñez', 'pn_Once', 'pn_Palermo',
       'pn_Palermo Chico', 'pn_Palermo Hollywood', 'pn_Palermo Soho',
       'pn_Palermo Viejo', 'pn_Parque Avellaneda', 'pn_Parque Centenario',
       'pn_Parque Chacabuco', 'pn_Parque Chas', 'pn_Parque Patricios',
       'pn_Paternal', 'pn_Pompeya', 

In [37]:
n = ['surface_total_in_m2', 'surface_covered_in_m2', 'cochera', 'piscina',
       'parrilla', 'baulera', 'balcon', 'terraza', 'jardin', 'lavadero', "price_usd_per_m2"]

datad2[n].isnull().sum()

surface_total_in_m2      0
surface_covered_in_m2    0
cochera                  0
piscina                  0
parrilla                 0
baulera                  0
balcon                   0
terraza                  0
jardin                   0
lavadero                 0
price_usd_per_m2         0
dtype: int64

In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [39]:

model = LinearRegression(fit_intercept=True)

feature_cols = ['surface_total_in_m2', 'surface_covered_in_m2', 'cochera', 'piscina',
       'parrilla', 'baulera', 'balcon', 'terraza', 'jardin', 'lavadero',
       'prop_type_apartment', 'prop_type_house', 'prop_type_store',
       'pn_Agronomía', 'pn_Almagro', 'pn_Balvanera', 'pn_Barracas',
       'pn_Barrio Norte', 'pn_Belgrano', 'pn_Boca', 'pn_Boedo', 'pn_Caballito',
       'pn_Centro / Microcentro', 'pn_Chacarita', 'pn_Coghlan',
       'pn_Colegiales', 'pn_Congreso', 'pn_Constitución', 'pn_Flores',
       'pn_Floresta', 'pn_Las Cañitas', 'pn_Liniers', 'pn_Mataderos',
       'pn_Monserrat', 'pn_Monte Castro', 'pn_Nuñez', 'pn_Once', 'pn_Palermo',
       'pn_Palermo Chico', 'pn_Palermo Hollywood', 'pn_Palermo Soho',
       'pn_Palermo Viejo', 'pn_Parque Avellaneda', 'pn_Parque Centenario',
       'pn_Parque Chacabuco', 'pn_Parque Chas', 'pn_Parque Patricios',
       'pn_Paternal', 'pn_Pompeya', 'pn_Puerto Madero', 'pn_Recoleta',
       'pn_Retiro', 'pn_Saavedra', 'pn_San Cristobal', 'pn_San Nicolás',
       'pn_San Telmo', 'pn_Tribunales', 'pn_Velez Sarsfield', 'pn_Versalles',
       'pn_Villa Crespo', 'pn_Villa Devoto', 'pn_Villa General Mitre',
       'pn_Villa Lugano', 'pn_Villa Luro', 'pn_Villa Ortuzar',
       'pn_Villa Pueyrredón', 'pn_Villa Real', 'pn_Villa Riachuelo',
       'pn_Villa Santa Rita', 'pn_Villa Soldati', 'pn_Villa Urquiza',
       'pn_Villa del Parque', 'floor', 'rooms']
X = datad2[feature_cols]
y = datad2.price_usd_per_m2

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

scaler = StandardScaler()
scaler.fit_transform(Xtrain)

model.fit(Xtrain, ytrain)
ypred = model.predict(Xtest)

print ('MAE:', mean_absolute_error(ytest, ypred).round(2))
print ('MSE:', mean_squared_error(ytest, ypred).round(2))
print ('RMSE:', np.sqrt(mean_squared_error(ytest, ypred)).round(2))
print ('R2:', r2_score(ytest, ypred).round(2))

MAE: 889.79
MSE: 4450185.95
RMSE: 2109.55
R2: 0.45
