In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from funciones import *

# Display configuration for DataFrames in Jupyter: None removes the column limit
pd.options.display.max_columns = None

# Location of the raw dataset CSV
data_location_raw = './datasets/properatti.csv'

# Location of the CSV that holds the cleaned data
data_location_clean = './datasets/properatti_clean.csv'

# Columns to import with the pandas "category" dtype
dtypes_cat = {
    "operation": "category",
    "property_type": "category",
    "country_name": "category",
    "state_name": "category",
}

# Text columns carried over from the original DataFrame
dtypes_non_cat_orig = {
    "place_name": "str",
    "description": "str",
    "title": "str",
}

# Columns of the cleaned DataFrame
# (note that currency_clean is categorical even though it lives in this map)
dtypes_non_cat_clean = {
    "price_clean": "float",
    "currency_clean": "category",
    "surface_covered_in_m2_clean": "float",
    "price_per_m2_covered_clean": "float",
}

# Merge the three dtype maps into one (their keys do not overlap)
dtypes = {**dtypes_cat, **dtypes_non_cat_orig, **dtypes_non_cat_clean}

# Read the cleaned CSV: only the typed columns plus the saved index column,
# which becomes the DataFrame index
columnas_a_leer = ['Unnamed: 0', *dtypes_cat, *dtypes_non_cat_orig, *dtypes_non_cat_clean]
raw_data = pd.read_csv(
    data_location_clean,
    usecols=columnas_a_leer,
    dtype=dtypes,
    index_col=0,
)

# Work on an independent copy so raw_data is never modified
data = raw_data.copy()

# Forma de resetear el index del dataframe si no se usa el index_col=0
## data_default.reset_index(drop=True, inplace=True)
## data_str.index

In [None]:
data.columns

In [None]:
missing_zero_values_table(data)

In [None]:
data.head()

In [None]:
# Copy `data` and discard every row that has at least one missing value,
# then report what is left with the helper imported from funciones
df1 = data.copy().dropna()
missing_zero_values_table(df1)

In [None]:
print(f"Cantidad de place_name únicos: {len(df1.place_name.unique())}")

In [None]:
# Remove leading and trailing whitespace from every place_name value
df1["place_name"] = df1["place_name"].str.strip()

df1.head()

In [None]:
# Number of records per unique location, most frequent first
place_name_estats = (
    df1.groupby('place_name')['place_name']
    .count()
    .sort_values(ascending=False)
)
place_name_estats

In [None]:
# Forma equivalente de la celda anterior
df1['place_name'].value_counts()

In [None]:
# Unique regions (state_name) present in the data
regiones_unicas = df1['state_name'].unique().tolist()
print(regiones_unicas)

In [None]:
# Make the Buenos Aires region names consistent by shortening the long
# "Buenos Aires" prefix to "Bs.As."
nombre_largo = "Buenos Aires Costa Atlántica"
nombre_corto = "Bs.As. Costa Atlántica"
df1["state_name"] = df1["state_name"].str.replace(nombre_largo, nombre_corto)

df1['state_name'].unique()

In [None]:
# Remove surrounding whitespace from state_name, then count records per region
df1["state_name"] = df1["state_name"].str.strip()
state_name_stats = df1["state_name"].value_counts()

In [None]:
# Remove surrounding whitespace from place_name, then count records per
# location (value_counts sorts from most to least frequent by default)
df1["place_name"] = df1["place_name"].str.strip()
place_name_stats = df1["place_name"].value_counts()
place_name_stats

### CREAMOS UN SUBSET DEL DATA FRAME PARA CAPITAL FEDERAL
Esto permite visualizar la información filtrando la columna place_name dentro de cada estado (state_name).

In [None]:
# Independent copy of the rows whose state_name is exactly "Capital Federal"
es_caba = df1["state_name"] == "Capital Federal"
df1_subset_caba = df1.loc[es_caba].copy()

In [None]:
# Summary statistics of covered surface and price per m2 for every
# place_name that contains "Palermo"
estadisticos = ['count', 'min', 'max', 'mean', 'median', 'std']
filas_palermo = df1_subset_caba[df1_subset_caba['place_name'].str.contains('Palermo')]
filas_palermo.groupby('place_name').agg({
    'surface_covered_in_m2_clean': estadisticos,
    'price_per_m2_covered_clean': estadisticos,
})

In [None]:
# Fold the four Palermo sub-neighbourhoods into plain "Palermo": each has few
# records and their spreads look reasonable
for variante in ("Palermo Soho", "Palermo Hollywood", "Palermo Chico", "Palermo Viejo"):
    df1_subset_caba["place_name"] = df1_subset_caba["place_name"].str.replace(variante, "Palermo")

In [None]:
df1_subset_caba["place_name"].value_counts()

In [None]:
df1_subset_caba.shape

In [None]:
# Drop the records whose neighbourhood is just the generic "Capital Federal"
sin_barrio = df1_subset_caba['place_name'] == "Capital Federal"
indice_indefinido = df1_subset_caba.loc[sin_barrio].index
df1_subset_caba = df1_subset_caba.drop(index=indice_indefinido)

### CREAMOS UN SUBSET DEL DATA FRAME PARA LAS MEDICIONES FUERA DE CAPITAL FEDERAL

In [None]:
# Independent copy of every row located outside Capital Federal
fuera_de_caba = df1["state_name"] != "Capital Federal"
df1_subset_notcaba = df1.loc[fuera_de_caba].copy()
df1_subset_notcaba

### REMOVEMOS OUTLIERS EN BASE AL PRECIO POR M2 EN CABA
Eliminamos las filas cuyos valores de precio por m2 se apartan más de dos desvíos estándar, tanto por encima como por debajo.

In [None]:
# Outlier removal is delegated to remover_ppm2_outliers (imported from
# funciones); the record count is logged before and after the call
registros_caba_antes = len(df1_subset_caba)
print(f"Cantidad de registros antes de sacar los outliers: {registros_caba_antes}")

df2_subset_caba = remover_ppm2_outliers(df1_subset_caba)

registros_caba_despues = len(df2_subset_caba)
print(f"Cantidad de registros LUEGO de sacar los outliers: {registros_caba_despues}")
df2_subset_caba.shape

### REMOVEMOS OUTLIERS EN BASE AL PRECIO POR M2 FUERA DE CABA
Eliminamos las filas cuyos valores de precio por m2 se apartan más de dos desvíos estándar, tanto por encima como por debajo.

In [None]:
# Same outlier removal as for CABA, applied to the rows outside Capital
# Federal; the record count is logged before and after the call
registros_notcaba_antes = len(df1_subset_notcaba)
print(f"Cantidad de registros antes de sacar los outliers: {registros_notcaba_antes}")

df2_subset_notcaba = remover_ppm2_outliers(df1_subset_notcaba)

registros_notcaba_despues = len(df2_subset_notcaba)
print(f"Cantidad de registros LUEGO de sacar los outliers: {registros_notcaba_despues}")
df2_subset_notcaba.shape

### PRESENTACIÓN DE LA INFORMACIÓN

La info se presenta segmentada en dos subgrupos: 

- Inmuebles ubicados en CABA
- Inmuebles ubicados fuera de CABA

### ¿CUANTAS MEDICIONES COMPONEN CADA SUBGRUPO?

### Subgrupo CABA

In [None]:
# Boolean mask (aligned with df2_subset_caba) of the rows in Capital Federal
mask_capital_federal = df2_subset_caba['state_name'].eq('Capital Federal')
observaciones_caba = mask_capital_federal.sum()
print(f"Observaciones en Capital Federal: {observaciones_caba}.")

In [None]:
# Boolean mask (aligned with df2_subset_notcaba) of the rows outside Capital Federal
mask_no_capital_federal = df2_subset_notcaba['state_name'].ne('Capital Federal')
observaciones_fuera = mask_no_capital_federal.sum()
print(f"Observaciones fuera de Capital Federal: {observaciones_fuera}.")

In [None]:
# Sorted array of the distinct neighbourhoods left in the CABA subset
listado_barrios = np.sort(df2_subset_caba['place_name'].unique())
cantidad_barrios = len(listado_barrios)
print(f"Cantidad de Barrios considerados en Capital Federal: {cantidad_barrios}.")

In [None]:
df2_subset_caba['place_name'].values

In [None]:
df2_subset_caba['property_type'].value_counts()

In [None]:
# Verificamos que nos de el valor de un barrio
listado_barrios[0]

In [None]:
df2_subset_caba["place_name"].value_counts()

In [None]:
#x_data = df2_subset_caba['price_per_m2_covered_clean'].values
#x_data

In [None]:
import plotly.graph_objects as go

# Position, within the sorted listado_barrios, of the neighbourhood to plot
barrio_index = 4
barrio = listado_barrios[barrio_index]

# Trace labels and the property_type values they select, in matching order
y_data = ['PH', 'Apartment', 'House', 'Store']
tipos_propiedad = ['PH', 'apartment', 'house', 'store']

# Rows of the chosen neighbourhood inside Capital Federal
en_barrio = mask_capital_federal & (df2_subset_caba['place_name'] == barrio)

# One array of price-per-m2 values for each property type
x_data = [
    df2_subset_caba.loc[
        en_barrio & (df2_subset_caba['property_type'] == tipo),
        'price_per_m2_covered_clean',
    ].values
    for tipo in tipos_propiedad
]

colors = [
    'rgba(93, 164, 214, 0.5)',
    'rgba(255, 144, 14, 0.5)',
    'rgba(44, 160, 101, 0.5)',
    'rgba(255, 65, 54, 0.5)',
]

fig = go.Figure()

# One horizontal box per property type
for valores, etiqueta, color in zip(x_data, y_data, colors):
    caja = go.Box(
        x=valores,
        name=etiqueta,
        boxpoints='suspectedoutliers',
        jitter=0.5,
        pointpos=0,
        fillcolor=color,
        whiskerwidth=0.7,
        line_width=1,
    )
    fig.add_trace(caja)

fig.update_yaxes(type='category')

# Title carries the neighbourhood name on a second line
titulo = f'<b>Boxplot por tipo de propiedad (con Limpieza)</b><br>{listado_barrios[barrio_index]}'
eje_x = {
    "title": 'Precio por metro<sup>2</sup> cubierto (<b>USD</b>)',
    "autorange": True,
    "showgrid": True,
    "zeroline": False,
    "ticks": "outside",
    "gridwidth": 0.5,
}
margenes = {"l": 40, "r": 30, "b": 80, "t": 100}

fig.update_layout(
    title=titulo,
    xaxis=eje_x,
    margin=margenes,
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
    showlegend=True,
)

fig.update_traces(hovertemplate="Precio: %{x:.0f} / m<sup>2</sup>")
fig.show()



#### Histograma por barrio

In [None]:
# One histogram of price per m2 for each neighbourhood; bin edges run from
# 1000 to 9500 in steps of 500
mybins = np.arange(1000, 10000, 500)
df2_subset_caba.hist(
    column='price_per_m2_covered_clean',
    by='place_name',
    bins=mybins,
    grid=True,
    figsize=(30, 40),
)

### CUALES SON LOS 5 BARRIOS DE MAYOR PRECIO POR METRO CUADRADO DENTRO DE CABA

In [None]:
# Neighbourhoods with the highest mean price per covered m2, highest first.
# The section heading asks for the top 5, so keep 5 rows (this previously
# kept 8, unlike the "lowest price" cell, which keeps 5).
df2_top_frequent_caba = (
    df2_subset_caba.groupby(["place_name"])['price_per_m2_covered_clean']
    .agg('mean')
    .sort_values(ascending=False)
    .round(2)
    .head(5)
)
print(df2_top_frequent_caba.to_string())

### CUALES SON LOS 5 BARRIOS DE MENOR PRECIO POR METRO CUADRADO DENTRO DE CABA

In [None]:
# Five neighbourhoods with the lowest mean price per covered m2; only their
# names (the index) are displayed
df2_top_frequent_caba = (
    df2_subset_caba.groupby(["place_name"])['price_per_m2_covered_clean']
    .mean()
    .sort_values()
    .round(2)
    .head(5)
)
df2_top_frequent_caba.index

### CUALES SON LOS 5 BARRIOS CON MAS PUBLICACIONES DENTRO DE CABA

In [None]:
# Five neighbourhoods with the most listings (value_counts already sorts from
# most to least frequent); shown once raw and once as formatted text
df2_top_5_caba = df2_subset_caba["place_name"].value_counts().iloc[:5]
print(df2_top_5_caba)
print("Top 5 de barrios con mayor cantidad de publicaciones:\n" + df2_top_5_caba.to_string())


-----------------------

In [None]:
## ACA TERMINÓ MIGUEL

In [None]:
## ACA EMPEZO FEDE Y DARIO

In [None]:
# Scatter of price per covered m2 (x) against neighbourhood (y) for the CABA
# subset before any quantile trimming.
# This is the first cell that uses `plt`; it relies on matplotlib.pyplot being
# imported in the notebook's first cell rather than in a later one.
plt.style.use("ggplot")

df_price_usd = df1_subset_caba["price_per_m2_covered_clean"]
df_place = df1_subset_caba["place_name"]

# One figure with a single full-size axes
fig, ax = plt.subplots(figsize=(10, 10))

ax.scatter(df_price_usd, df_place, marker='o', color="green", label='serie sin', alpha=0.3)

In [None]:
# Keep only the CABA rows whose price per m2 lies strictly between the 5th
# and the 90th percentile
df_precios = df1_subset_caba["price_per_m2_covered_clean"]

q_superior = df_precios.quantile(0.90)
q_inferior = df_precios.quantile(0.05)

dentro_de_rango = (df_precios > q_inferior) & (df_precios < q_superior)
df_subset_caba_clean = df1_subset_caba[dentro_de_rango]



In [None]:
# Scatter of price per covered m2 (x) against neighbourhood (y) for the
# quantile-trimmed CABA subset
plt.style.use("ggplot")

df_price_usd = df_subset_caba_clean["price_per_m2_covered_clean"]
df_place = df_subset_caba_clean["place_name"]

# One figure with a single full-size axes
fig, ax = plt.subplots(figsize=(15, 15))

ax.scatter(df_price_usd, df_place, marker='o', color="green", label='serie sin', alpha=0.3)


In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline


## df_subset_gba = df1[df1.state_name == "Bs.As. G.B.A. Zona Norte" & df1.state_name == "Bs.As. G.B.A. Zona Sur" & df.state_name == "Bs.As. G.B.A. Zona Oeste"]



## Bs.As. G.B.A. Zona Norte    22518
## Bs.As. G.B.A. Zona Sur       9907
## Bs.As. Costa Atlántica       6924
## Bs.As. G.B.A. Zona Oeste


In [None]:
# One subset per Buenos Aires region, selected by exact state_name match.
# state_name values were stripped of whitespace earlier in the notebook, so
# the labels must not carry a trailing space (the Costa Atlántica one did,
# which made that subset empty).
df_subset_gba_norte = df1[df1.state_name == "Bs.As. G.B.A. Zona Norte"]
df_subset_gba_sur = df1[df1.state_name == "Bs.As. G.B.A. Zona Sur"]
df_subset_gba_oeste = df1[df1.state_name == "Bs.As. G.B.A. Zona Oeste"]
df_subset_costa = df1[df1.state_name == "Bs.As. Costa Atlántica"]

In [None]:
## -------------------------------------------------------------------------Histograma Conurbano Norte---------------------------------------------------------------------------------

In [None]:
# Zona Norte: keep the rows whose price per m2 lies strictly between the
# 10th and the 90th percentile of that region
df_subset_gba_norte = df1[df1.state_name == "Bs.As. G.B.A. Zona Norte"]
precios_norte = df_subset_gba_norte["price_per_m2_covered_clean"]

q_superior = precios_norte.quantile(0.90)
q_inferior = precios_norte.quantile(0.10)

norte_en_rango = (precios_norte > q_inferior) & (precios_norte < q_superior)
df_subset_gba_norte_clean = df_subset_gba_norte[norte_en_rango]

In [None]:
# Scatter of price per covered m2 (x) against locality (y) for trimmed Zona Norte
plt.style.use("ggplot")

df_price_usd_norte = df_subset_gba_norte_clean["price_per_m2_covered_clean"]
df_place_norte = df_subset_gba_norte_clean["place_name"]

# One figure with a single full-size axes
fig, ax = plt.subplots(figsize=(20, 20))

ax.scatter(df_price_usd_norte, df_place_norte, marker='o', color="green", label='serie sin', alpha=0.3)

In [None]:
df_subset_gba_norte_clean["place_name"].value_counts().head(10)


# df_subset_gba_norte_clean[: "price_per_m2_covered_clean"] >

In [None]:
# One frame per Zona Norte locality, selected by exact place_name match
lugares_norte = df_subset_gba_norte_clean["place_name"]

Tigre = df_subset_gba_norte_clean[lugares_norte == "Tigre"]
Nordelta = df_subset_gba_norte_clean[lugares_norte == "Nordelta"]
Pilar = df_subset_gba_norte_clean[lugares_norte == "Pilar"]
Olivos = df_subset_gba_norte_clean[lugares_norte == "Olivos"]
San_Isidro = df_subset_gba_norte_clean[lugares_norte == "San Isidro"]
Martinez = df_subset_gba_norte_clean[lugares_norte == "Martínez"]


In [None]:
# Hand-entered (locality, price) pairs for six Zona Norte localities.
# NOTE(review): the prices are hard-coded; presumably copied from earlier
# describe() outputs — confirm and ideally compute them from the data.
Lista_conurbano = [
    ['Tigre', 2091],
    ['Nordelta', 2634],
    ['Pilar', 1825],
    ['Olivos', 2459],
    ['San Isidro', 2346],
    ['Martinez', 2159],
]

df_conurbano = pd.DataFrame(Lista_conurbano, columns=["Lugar", "Precio"])
df_conurbano

In [None]:
# Bar chart of the Zona Norte figures, ordered from lowest to highest price
hist_conurbano_norte = df_conurbano.sort_values("Precio", ascending=True)
plt.bar(hist_conurbano_norte["Lugar"], hist_conurbano_norte["Precio"])
# Call show(): the bare attribute `plt.show` only echoes the function object
plt.show()

In [None]:
#---------------------------------------------Histograma Conurbano sur-------------------------------------------------------------------

In [None]:
# Zona Sur: keep the rows whose price per m2 lies strictly between the 10th
# and the 90th percentile of that region
df_subset_gba_sur = df1[df1.state_name == "Bs.As. G.B.A. Zona Sur"]
precios_sur = df_subset_gba_sur["price_per_m2_covered_clean"]

q_superior = precios_sur.quantile(0.90)
q_inferior = precios_sur.quantile(0.10)

sur_en_rango = (precios_sur > q_inferior) & (precios_sur < q_superior)
df_subset_gba_sur_clean = df_subset_gba_sur[sur_en_rango]

In [None]:
df_subset_gba_sur_clean.describe()

In [None]:
# Scatter of price per covered m2 (x) against locality (y) for trimmed Zona Sur
plt.style.use("ggplot")

df_price_usd_sur = df_subset_gba_sur_clean["price_per_m2_covered_clean"]
df_place_sur = df_subset_gba_sur_clean["place_name"]

# One figure with a single full-size axes
fig, ax = plt.subplots(figsize=(20, 20))

ax.scatter(df_price_usd_sur, df_place_sur, marker='o', color="blue", label='serie sin', alpha=0.3)

In [None]:
# Rebuild the regional subsets. The Costa Atlántica label must not carry a
# trailing space: state_name values were stripped earlier in the notebook.
df_subset_gba_sur = df1[df1.state_name == "Bs.As. G.B.A. Zona Sur"]
df_subset_gba_oeste = df1[df1.state_name == "Bs.As. G.B.A. Zona Oeste"]
df_subset_costa = df1[df1.state_name == "Bs.As. Costa Atlántica"]

# 90th / 10th percentile of the Zona Sur price per m2
q1 = df_subset_gba_sur["price_per_m2_covered_clean"].quantile(0.90)
q2 = df_subset_gba_sur["price_per_m2_covered_clean"].quantile(0.10)

# Filter with the thresholds computed in this cell (q1/q2) rather than with
# whatever q_superior/q_inferior were left holding by an earlier cell
precios_sur = df_subset_gba_sur["price_per_m2_covered_clean"]
df_subset_gba_sur_clean = df_subset_gba_sur[(precios_sur < q1) & (precios_sur > q2)]
df_subset_gba_sur_clean["place_name"].value_counts().head(6)

In [None]:
# Hand-entered (locality, price) pairs for six Zona Sur localities.
# NOTE(review): the prices are hard-coded; presumably copied from earlier
# describe() outputs — confirm.
lpt = ['La Plata', 1758]
ldz = ['Lomas', 2009]
bfd = ['Banfield', 1932]
lns = ['Lanús', 1706]
age = ['Adrogué', 1942]
tpy = ['Temperley', 1740]

Lista_conurbano_sur = [lpt, ldz, bfd, lns, age, tpy]

# Build the frame from the Zona Sur list defined right above (it used to read
# Lista_conurbano, i.e. the Zona Norte figures)
df_conurbano_sur = pd.DataFrame(Lista_conurbano_sur, columns=["Lugar", "Precio"])
df_conurbano_sur

In [None]:
df_subset_gba_sur_clean[df_subset_gba_sur_clean["place_name"] == "Temperley"].describe()

In [None]:
# Bar chart of the Zona Sur figures, ordered from lowest to highest price
hist_conurbano_sur = df_conurbano_sur.sort_values("Precio", ascending=True)
plt.bar(hist_conurbano_sur["Lugar"], hist_conurbano_sur["Precio"])
# Call show(): the bare attribute `plt.show` only echoes the function object
plt.show()

In [None]:
#---------------------------------------------Histograma Conurbano oeste-------------------------------------------------------------------

In [None]:
# Zona Oeste: keep the rows whose price per m2 lies strictly between the 10th
# and the 90th percentile of that region
df_subset_gba_oeste = df1[df1.state_name == "Bs.As. G.B.A. Zona Oeste"]
precios_oeste = df_subset_gba_oeste["price_per_m2_covered_clean"]

q1 = precios_oeste.quantile(0.90)
q2 = precios_oeste.quantile(0.10)

# Use this region's own thresholds (q1/q2). The filter used to read
# q_superior/q_inferior, which at this point still hold the Zona Sur values.
df_subset_gba_oeste_clean = df_subset_gba_oeste[(precios_oeste < q1) & (precios_oeste > q2)]
df_subset_gba_oeste_clean["place_name"].value_counts().head(6)

In [None]:
# Scatter of price per covered m2 (x) against locality (y) for trimmed Zona Oeste
plt.style.use("ggplot")

df_price_usd_oeste = df_subset_gba_oeste_clean["price_per_m2_covered_clean"]
df_place_oeste = df_subset_gba_oeste_clean["place_name"]

# One figure with a single full-size axes
fig, ax = plt.subplots(figsize=(20, 20))

ax.scatter(df_price_usd_oeste, df_place_oeste, marker='o', color="green", label='serie sin', alpha=0.2)

In [None]:
# NOTE(review): this cell sits in the Zona Oeste part of the notebook but
# re-declares the list named Lista_conurbano_sur and then builds
# `df_conurbano_norte` from `Lista_conurbano`, not from the list assigned just
# above. It looks like a leftover copy-paste — confirm intent or delete.
lpt = ['La Plata', 1758]
ldz = ['Lomas', 2009]
bfd = ['Banfield', 1932]
lns = ['Lanús', 1706]
age = ['Adrogué', 1942]
tpy = ['Temperley', 1740]

Lista_conurbano_sur = [lpt, ldz, bfd, lns, age, tpy]

# Reads Lista_conurbano, which is not defined in this cell
df_conurbano_norte = pd.DataFrame(Lista_conurbano, columns = ["Lugar", "Precio"])
df_conurbano_norte

In [None]:
df_subset_gba_oeste_clean["place_name"].value_counts().head(6)

In [None]:
df_subset_gba_oeste_clean[df_subset_gba_oeste_clean["place_name"] == "Caseros"].describe()

In [None]:
# Hand-entered (locality, price) pairs for six Zona Oeste localities.
# NOTE(review): the prices are hard-coded; presumably copied from earlier
# describe() outputs — confirm.
Lista_conurbano_oeste = [
    ['Ramos', 1966],
    ['Morón', 1781],
    ['Haedo', 1833],
    ['Castelar', 1855],
    ['Ituzaingó', 1589],
    ['Caseros', 1704],
]

df_conurbano_oeste = pd.DataFrame(Lista_conurbano_oeste, columns=["Lugar", "Precio"])
df_conurbano_oeste

In [None]:
# Bar chart of the Zona Oeste figures, ordered from lowest to highest price
hist_conurbano_oeste = df_conurbano_oeste.sort_values("Precio", ascending=True)
plt.bar(hist_conurbano_oeste["Lugar"], hist_conurbano_oeste["Precio"])
# Call show(): the bare attribute `plt.show` only echoes the function object
plt.show()

In [None]:
# ------------------------------------------------------La costa ----------------------------------------------------------------------

In [None]:
# Costa Atlántica: keep the rows whose price per m2 lies strictly between the
# 10th and the 90th percentile of that region
df_subset_costa = df1[df1.state_name == "Bs.As. Costa Atlántica"]
precios_costa = df_subset_costa["price_per_m2_covered_clean"]

q1 = precios_costa.quantile(0.90)
q2 = precios_costa.quantile(0.10)

# Use this region's own thresholds (q1/q2). The filter used to read
# q_superior/q_inferior, which at this point still hold the Zona Sur values.
df_subset_costa_clean = df_subset_costa[(precios_costa < q1) & (precios_costa > q2)]
df_subset_costa_clean["place_name"].value_counts().head(6)

In [None]:
# Scatter of price per covered m2 (x) against locality (y) for trimmed Costa Atlántica
plt.style.use("ggplot")

df_price_usd_costa = df_subset_costa_clean["price_per_m2_covered_clean"]
df_place_costa = df_subset_costa_clean["place_name"]

# One figure with a single full-size axes
fig, ax = plt.subplots(figsize=(20, 20))

ax.scatter(df_price_usd_costa, df_place_costa, marker='o', color="blue", label='serie sin', alpha=0.2)

In [None]:
df_subset_costa_clean["place_name"].value_counts().head(7)

In [None]:
df_subset_costa_clean[df_subset_costa_clean["place_name"] == "Güemes"].describe()

In [None]:
# Hand-entered (locality, price) pairs for six Costa Atlántica localities.
# NOTE(review): the prices are hard-coded; presumably copied from earlier
# describe() outputs — confirm.
Lista_costa = [
    ["MDQ", 1922],
    ['Pinamar', 1943],
    ['Gesell', 1716],
    ['Esmeralda', 1553],
    ['Mitre', 2353],
    ['Güemes', 2525],
]

df_costa_atl = pd.DataFrame(Lista_costa, columns=["Lugar", "Precio"])
df_costa_atl

In [None]:
# Bar chart of the Costa Atlántica figures, ordered from lowest to highest price
hist_costa_atl = df_costa_atl.sort_values("Precio", ascending=True)
plt.bar(hist_costa_atl["Lugar"], hist_costa_atl["Precio"])
# Call show(): the bare attribute `plt.show` only echoes the function object
plt.show()