# **DESAFIO 3 G2** 
# **Dataset Properati - Modelado de datos - Preliminar.**
---

<a id="section_toc"></a> 
## Tabla de Contenidos

[Intro](#section_intro)

Vacio

$\hspace{.5cm}$[Observaciones generales](#section_og)
  
$\hspace{.9cm}$[Dimension de propiedades](#section_tsv_gdp)

Vacio

$\hspace{.5cm}$[Expresiones regulares](#section_re)

$\hspace{.9cm}$[Principios generales](#section_re_pg)

Vacio

$\hspace{.5cm}$[Borrado de registros duplicados](#section_drd)

$\hspace{.5cm}$[Borrado de registros sin información completa](#section_drii)

Visualizaciones basicas

$\hspace{.5cm}$[Venta departamentos CABA](#section_v_1)

$\hspace{.5cm}$[Venta casas CABA, Buenos Aires e Interior](#section_v_4)


---

<a id="section_intro"></a> 
## Intro

[volver a TOC](#section_toc)

Carga de DataFrame y exploracion inicial

In [1]:
import pandas as pd
import numpy as np
import re
import random

In [2]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data_location = 'https://github.com/DHDSDesafios/DH_DS_desafio_2_properati/raw/master/data/properatid2caba.zip'
data = pd.read_csv(data_location, compression='zip', encoding="utf-8")

In [4]:
pd.set_option('display.max_colwidth', 1500)
pd.set_option('display.max_rows', 400)

<a id="section_og"></a> 
### Observaciones Generales

[volver a TOC](#section_toc)

In [5]:
data.sample(10)

Unnamed: 0,Id_caso,property_type,place_with_parent_names,country_name,state_name,lat-lon,lat,lon,price,currency,...,surface_total_in_m2,surface_covered_in_m2,cochera,piscina,parrilla,baulera,balcon,terraza,jardin,lavadero
23441,93948,apartment,|Argentina|Capital Federal|Belgrano|,Argentina,Capital Federal,"-34.5559198337,-58.4632788696",-34.55592,-58.463279,160000.0,USD,...,55.0,55.0,,,,,balcon,,,1.0
21586,86588,apartment,|Argentina|Capital Federal|Boca|,Argentina,Capital Federal,,,,1771206.0,ARS,...,53.0,47.0,,,,,,,,
15308,61338,apartment,|Argentina|Capital Federal|Coghlan|,Argentina,Capital Federal,,,,119000.0,USD,...,54.0,54.0,,,,,,,,
12551,51318,apartment,|Argentina|Capital Federal|Villa del Parque|,Argentina,Capital Federal,"-34.6077807017,-58.4788192758",-34.607781,-58.478819,75000.0,USD,...,37.0,37.0,,,,,BALCON,,,1.0
13962,56908,apartment,|Argentina|Capital Federal|Paternal|,Argentina,Capital Federal,"-34.602782,-58.465741",-34.602782,-58.465741,78000.0,USD,...,,35.0,,,,1.0,1,,,1.0
15081,60894,apartment,|Argentina|Capital Federal|Abasto|,Argentina,Capital Federal,"-34.5993959,-58.4106029",-34.599396,-58.410603,89500.0,USD,...,46.0,43.0,,,,,,,,1.0
3368,16531,apartment,|Argentina|Capital Federal|Boedo|,Argentina,Capital Federal,"-34.6277267,-58.4225394",-34.627727,-58.422539,135000.0,USD,...,70.0,60.0,,,,,1,,,1.0
6862,28146,apartment,|Argentina|Capital Federal|Palermo|Palermo Soho|,Argentina,Capital Federal,,,,145000.0,USD,...,54.0,45.0,,,,,,,,
914,5914,apartment,|Argentina|Capital Federal|Recoleta|,Argentina,Capital Federal,"-34.5900967,-58.3827598",-34.590097,-58.38276,490000.0,USD,...,112.0,92.0,,,,,,1.0,,1.0
12232,49849,store,|Argentina|Capital Federal|Barrio Norte|,Argentina,Capital Federal,"-34.6004384,-58.3901694",-34.600438,-58.390169,120000.0,USD,...,37.0,37.0,,,,,,,,


In [6]:
data.columns

Index(['Id_caso', 'property_type', 'place_with_parent_names', 'country_name',
       'state_name', 'lat-lon', 'lat', 'lon', 'price', 'currency',
       'price_aprox_local_currency', 'price_aprox_usd', 'floor', 'rooms',
       'expenses', 'description', 'title', 'Pais', 'Zona', 'Partido_barrio',
       'Localidad', 'Obs_localidad', 'place_name', 'geonames_id',
       'price_usd_per_m2', 'price_per_m2', 'surface_total_in_m2',
       'surface_covered_in_m2', 'cochera', 'piscina', 'parrilla', 'baulera',
       'balcon', 'terraza', 'jardin', 'lavadero'],
      dtype='object')

In [7]:
data.shape

(29380, 36)

In [8]:
caba_place_name_mask = data.place_name != 'Capital Federal'
surface_covered_mask_notnull = data.surface_covered_in_m2.notnull()
price_per_m2_mask_notnull = data.price_per_m2.notnull()
price_usd_per_m2_mask_notnull = data.price_usd_per_m2.notnull()

In [9]:
#data = data.loc[caba_place_name_mask & surface_covered_mask_notnull & price_per_m2_mask_notnull]
data = data.loc[caba_place_name_mask & surface_covered_mask_notnull & price_usd_per_m2_mask_notnull]

In [10]:
# crear variables dummies
property_type_dummies = pd.get_dummies(data.property_type, prefix='prop_type')

data = pd.concat([data, property_type_dummies], axis=1)

# imprimimos 5 filas cualquieras
data.sample(n=5)

Unnamed: 0,Id_caso,property_type,place_with_parent_names,country_name,state_name,lat-lon,lat,lon,price,currency,...,parrilla,baulera,balcon,terraza,jardin,lavadero,prop_type_PH,prop_type_apartment,prop_type_house,prop_type_store
7836,32805,apartment,|Argentina|Capital Federal|Caballito|,Argentina,Capital Federal,"-34.6213762,-58.4222105",-34.621376,-58.42221,140000.0,USD,...,1.0,,,,,,0,1,0,0
11122,46079,apartment,|Argentina|Capital Federal|Floresta|,Argentina,Capital Federal,"-34.644747,-58.4857801",-34.644747,-58.48578,82000.0,USD,...,,1.0,1,,,,0,1,0,0
3397,16731,house,|Argentina|Capital Federal|Villa Lugano|,Argentina,Capital Federal,"-34.6624768,-58.4808722",-34.662477,-58.480872,189000.0,USD,...,1.0,,,1.0,1.0,1.0,0,0,1,0
20881,83919,apartment,|Argentina|Capital Federal|Flores|,Argentina,Capital Federal,"-34.6297991751,-58.4638373591",-34.629799,-58.463837,125000.0,USD,...,,,balcon,,,,0,1,0,0
6445,26629,apartment,|Argentina|Capital Federal|Boedo|,Argentina,Capital Federal,"-34.619665,-58.416391",-34.619665,-58.416391,607800.0,USD,...,1.0,,1,,,,0,1,0,0


In [11]:
# crear variables dummies
place_name_dummies = pd.get_dummies(data.place_name, prefix='pn')

data = pd.concat([data, place_name_dummies], axis=1)

# imprimimos 5 filas cualquieras
data.sample(20)

Unnamed: 0,Id_caso,property_type,place_with_parent_names,country_name,state_name,lat-lon,lat,lon,price,currency,...,pn_Villa Lugano,pn_Villa Luro,pn_Villa Ortuzar,pn_Villa Pueyrredón,pn_Villa Real,pn_Villa Riachuelo,pn_Villa Santa Rita,pn_Villa Soldati,pn_Villa Urquiza,pn_Villa del Parque
2624,13019,apartment,|Argentina|Capital Federal|Nuñez|,Argentina,Capital Federal,"-34.5485316,-58.4567197",-34.548532,-58.45672,285000.0,USD,...,0,0,0,0,0,0,0,0,0,0
25788,108682,apartment,|Argentina|Capital Federal|Palermo|Palermo Soho|,Argentina,Capital Federal,,,,90312.0,USD,...,0,0,0,0,0,0,0,0,0,0
6059,26059,apartment,|Argentina|Capital Federal|Boedo|,Argentina,Capital Federal,,,,945810.0,USD,...,0,0,0,0,0,0,0,0,0,0
3324,16380,apartment,|Argentina|Capital Federal|Las Cañitas|,Argentina,Capital Federal,"-34.5676701,-58.4342435",-34.56767,-58.434244,590000.0,USD,...,0,0,0,0,0,0,0,0,0,0
27253,113437,apartment,|Argentina|Capital Federal|Balvanera|,Argentina,Capital Federal,"-34.6073302474,-58.3853199865",-34.60733,-58.38532,255000.0,USD,...,0,0,0,0,0,0,0,0,0,0
16686,69235,apartment,|Argentina|Capital Federal|Belgrano|,Argentina,Capital Federal,"-34.5564151069,-58.4615261795",-34.556415,-58.461526,245000.0,USD,...,0,0,0,0,0,0,0,0,0,0
797,5434,apartment,|Argentina|Capital Federal|Palermo|Palermo Soho|,Argentina,Capital Federal,,,,218000.0,USD,...,0,0,0,0,0,0,0,0,0,0
13410,55179,apartment,|Argentina|Capital Federal|Belgrano|,Argentina,Capital Federal,"-34.5669613563,-58.4420093523",-34.566961,-58.442009,500000.0,USD,...,0,0,0,0,0,0,0,0,0,0
28397,118097,apartment,|Argentina|Capital Federal|Villa Crespo|,Argentina,Capital Federal,,,,386573.0,USD,...,0,0,0,0,0,0,0,0,0,0
11617,48057,apartment,|Argentina|Capital Federal|San Telmo|,Argentina,Capital Federal,"-34.6211839,-58.3731163",-34.621184,-58.373116,1456300.0,ARS,...,0,0,0,0,0,0,0,0,0,0


In [12]:
data.balcon.value_counts()

1         3643
balcon    1674
BALCON     750
Balcon     502
Name: balcon, dtype: int64

In [13]:
data.loc[data.cochera.isnull(), 'cochera'] = 0
data.cochera.astype('uint8')

data.loc[data.piscina.isnull(), 'piscina'] = 0
data.piscina.astype('uint8')

data.loc[data.lavadero.isnull(), 'lavadero'] = 0
data.lavadero.astype('uint8')

data.loc[data.parrilla.isnull(), 'parrilla'] = 0
data.parrilla.astype('uint8')

data.loc[data.terraza.isnull(), 'terraza'] = 0
data.terraza.astype('uint8')

data.loc[data.jardin.isnull(), 'jardin'] = 0
data.jardin.astype('uint8')

data.loc[data.balcon == 'Balcon', 'balcon'] = 1
data.loc[data.balcon == 'BALCON', 'balcon'] = 1
data.loc[data.balcon == 'balcon', 'balcon'] = 1
data.loc[data.balcon.isnull(), 'balcon'] = 0
data.balcon.astype('uint8')

data.loc[data.baulera.isnull(), 'baulera'] = 0
data.baulera.astype('uint8')

0        0
1        0
3        0
4        0
5        0
        ..
29374    0
29375    0
29376    0
29378    0
29379    0
Name: baulera, Length: 20869, dtype: uint8

In [14]:
data.loc[:, ['baulera','piscina']].sample(20)

Unnamed: 0,baulera,piscina
20267,0.0,0.0
10702,0.0,1.0
21948,1.0,0.0
26585,0.0,0.0
4399,0.0,0.0
12744,0.0,0.0
12281,0.0,0.0
23961,0.0,0.0
12418,1.0,1.0
4431,0.0,0.0


In [15]:
erase_columns = ['Id_caso', 'property_type', 'place_with_parent_names', 'country_name',
       'state_name', 'lat-lon', 'lat', 'lon', 'price', 'currency',
       'price_aprox_local_currency', 'floor', 'rooms',
       'expenses', 'description', 'title', 'Pais', 'Zona', 'Partido_barrio',
       'Localidad', 'Obs_localidad', 'place_name']

In [16]:
datad2 = data.drop(data[erase_columns], axis=1)

In [17]:
datad2.columns

Index(['price_aprox_usd', 'geonames_id', 'price_usd_per_m2', 'price_per_m2',
       'surface_total_in_m2', 'surface_covered_in_m2', 'cochera', 'piscina',
       'parrilla', 'baulera', 'balcon', 'terraza', 'jardin', 'lavadero',
       'prop_type_PH', 'prop_type_apartment', 'prop_type_house',
       'prop_type_store', 'pn_Abasto', 'pn_Agronomía', 'pn_Almagro',
       'pn_Balvanera', 'pn_Barracas', 'pn_Barrio Norte', 'pn_Belgrano',
       'pn_Boca', 'pn_Boedo', 'pn_Caballito', 'pn_Catalinas',
       'pn_Centro / Microcentro', 'pn_Chacarita', 'pn_Coghlan',
       'pn_Colegiales', 'pn_Congreso', 'pn_Constitución', 'pn_Flores',
       'pn_Floresta', 'pn_Las Cañitas', 'pn_Liniers', 'pn_Mataderos',
       'pn_Monserrat', 'pn_Monte Castro', 'pn_Nuñez', 'pn_Once', 'pn_Palermo',
       'pn_Palermo Chico', 'pn_Palermo Hollywood', 'pn_Palermo Soho',
       'pn_Palermo Viejo', 'pn_Parque Avellaneda', 'pn_Parque Centenario',
       'pn_Parque Chacabuco', 'pn_Parque Chas', 'pn_Parque Patricios',
     

In [18]:
datad2.sample(10)

Unnamed: 0,price_aprox_usd,geonames_id,price_usd_per_m2,price_per_m2,surface_total_in_m2,surface_covered_in_m2,cochera,piscina,parrilla,baulera,...,pn_Villa Lugano,pn_Villa Luro,pn_Villa Ortuzar,pn_Villa Pueyrredón,pn_Villa Real,pn_Villa Riachuelo,pn_Villa Santa Rita,pn_Villa Soldati,pn_Villa Urquiza,pn_Villa del Parque
1512,645000.0,3430234.0,4387.76,4708.03,147.0,137.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
5947,195600.0,3429153.0,4890.0,4890.0,40.0,40.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
22625,136203.0,3430234.0,3584.29,4005.97,38.0,34.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8230,115000.0,3428113.0,3484.85,3484.85,33.0,33.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
12376,130000.0,3435874.0,2407.41,3823.53,54.0,34.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1887,113000.0,3429703.0,2456.52,2756.1,46.0,41.0,0.0,0.0,1.0,0.0,...,0,0,0,1,0,0,0,0,0,0
4350,350000.0,3435359.0,1268.12,1988.64,276.0,176.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
7157,135000.0,3436077.0,2755.1,3139.53,49.0,43.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9162,157000.0,3430234.0,2573.77,2962.26,61.0,53.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
15219,82000.0,3435874.0,1952.38,2157.89,42.0,38.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
#data.cochera.value_counts()
datad2.loc[:, ['price_aprox_usd', 'price_usd_per_m2', 'surface_covered_in_m2', 'surface_total_in_m2', 'cochera', 'price_per_m2']].corr()

Unnamed: 0,price_aprox_usd,price_usd_per_m2,surface_covered_in_m2,surface_total_in_m2,cochera,price_per_m2
price_aprox_usd,1.0,0.404241,0.57024,0.585183,0.166507,0.238959
price_usd_per_m2,0.404241,1.0,-0.046184,-0.067926,-0.010403,0.5329
surface_covered_in_m2,0.57024,-0.046184,1.0,0.965909,0.051478,-0.036247
surface_total_in_m2,0.585183,-0.067926,0.965909,1.0,0.055866,-0.017616
cochera,0.166507,-0.010403,0.051478,0.055866,1.0,0.001267
price_per_m2,0.238959,0.5329,-0.036247,-0.017616,0.001267,1.0


In [20]:
#f, ax = plt.subplots(figsize=(7, 5))
#ax.heatmap(data.corr(), vmin=-1, vmax=1, center=0, annot=True, cmap=sns.diverging_palette(20, 220, n=200), square=True)

#with sns.axes_style("white"):
#    f, ax = plt.subplots(figsize=(8, 8))
#    ax = sns.heatmap(data.corr(), vmax=.3, annot=True, square=True, cmap="YlGnBu")

In [27]:
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 

In [28]:
feature_cols = ['cochera', 'prop_type_PH', 'prop_type_apartment', 'prop_type_house','prop_type_store', 'pn_Abasto', 'pn_Agronomía', 'pn_Almagro',
       'pn_Balvanera', 'pn_Barracas', 'pn_Barrio Norte', 'pn_Belgrano',
       'pn_Boca', 'pn_Boedo', 'pn_Caballito', 'pn_Catalinas',
       'pn_Centro / Microcentro', 'pn_Chacarita', 'pn_Coghlan',
       'pn_Colegiales', 'pn_Congreso', 'pn_Constitución', 'pn_Flores',
       'pn_Floresta', 'pn_Las Cañitas', 'pn_Liniers', 'pn_Mataderos',
       'pn_Monserrat', 'pn_Monte Castro', 'pn_Nuñez', 'pn_Once', 'pn_Palermo',
       'pn_Palermo Chico', 'pn_Palermo Hollywood', 'pn_Palermo Soho',
       'pn_Palermo Viejo', 'pn_Parque Avellaneda', 'pn_Parque Centenario',
       'pn_Parque Chacabuco', 'pn_Parque Chas', 'pn_Parque Patricios',
       'pn_Paternal', 'pn_Pompeya', 'pn_Puerto Madero', 'pn_Recoleta',
       'pn_Retiro', 'pn_Saavedra', 'pn_San Cristobal', 'pn_San Nicolás',
       'pn_San Telmo', 'pn_Tribunales', 'pn_Velez Sarsfield', 'pn_Versalles',
       'pn_Villa Crespo', 'pn_Villa Devoto', 'pn_Villa General Mitre',
       'pn_Villa Lugano', 'pn_Villa Luro', 'pn_Villa Ortuzar',
       'pn_Villa Pueyrredón', 'pn_Villa Real', 'pn_Villa Riachuelo',
       'pn_Villa Santa Rita', 'pn_Villa Soldati', 'pn_Villa Urquiza',
       'pn_Villa del Parque', 'surface_covered_in_m2', 'surface_total_in_m2']

In [29]:
model = LinearRegression(fit_intercept=True) 

X = datad2[feature_cols] 
y = datad2.price_usd_per_m2
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=1) 

model.fit(Xtrain, ytrain) 

ypred = model.predict(Xtest) 

print ('MAE:', mean_absolute_error(ytest, ypred).round(2)) 
print ('MSE:', mean_squared_error(ytest, ypred).round(2)) 
print ('RMSE:', np.sqrt(mean_squared_error(ytest, ypred)).round(2)) 
print ('R2:', r2_score(ytest, ypred).round(2)) 

MAE: 1114.17
MSE: 14346492.26
RMSE: 3787.68
R2: 0.21


In [24]:
data.shape, datad2.shape

((20869, 101), (20869, 79))