In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [15]:
dtypes = {
    'country_name': str,
    'expenses': object,
    'id': str,
    'title': str,
    'state_name': str,
    'lat': float,
    'lon': float
}
cols = range(6) + range(8, 18) + [20, 21] + range(23, 27)
df = pd.read_csv('Desktop/Homework/data_gba_total.csv', dtype=dtypes, usecols=cols)

In [16]:
def split_region(x):
    state = x.split('|')[2]
    return state

def split_neighborhood(x):
    state = x.split('|')[3]
    return state

df['state'] = df.place_with_parent_names.apply(split_region)
df['barrio'] = df.place_with_parent_names.apply(split_neighborhood)

In [17]:
df = df.loc[(df['created_on'] >= '2016-08-08'),:]

In [18]:
list_estados = []

def state(row):
    if pd.isnull(row['state']): #Es nula, seteo valor 0 por defecto
        return 0
    if row['state'] in list_estados:
        return list_estados.index(row['state'])+1
    
    list_estados.append(row['state'])
    
    return list_estados.index(row['state'])+1

df['state_n'] = df.apply(state, axis=1)

In [19]:
list_barrio = []

def barrio(row):
    if pd.isnull(row['barrio']): #Es nula, seteo valor 0 por defecto
        return 0
    if row['barrio'] in list_barrio:
        return list_barrio.index(row['barrio'])+1
    
    list_barrio.append(row['barrio'])
    
    return list_barrio.index(row['barrio'])+1

df['barrio_n'] = df.apply(barrio, axis=1)

In [20]:
list_property = []

def property_type(row):
    if pd.isnull(row['property_type']): #Es nula, seteo valor 0 por defecto
        return 0
    if row['property_type'] in list_property:
        return list_property.index(row['property_type'])+1
    
    list_property.append(row['property_type'])
    
    return list_property.index(row['property_type'])+1

df['property_type_n'] = df.apply(property_type, axis=1)

In [21]:
df.shape

(479104, 27)

# Tomo solamente los datos que me interesan

In [22]:
df = df.dropna(subset=['price_aprox_usd'], how='any')

In [23]:
df = df.loc[df['price_aprox_usd']>=5000.0, :]

In [24]:
df = df.loc[:, ['lat', 'lon', 'price_aprox_usd', 'rooms', 'surface_covered_in_m2', 'state_n', 'barrio_n', 'property_type_n', 
                'title']]

In [25]:
df.shape

(424470, 9)

In [26]:
def lowercase(row):
    if pd.isnull(row['title']):
        return row['title']   
    
    return row['title'].lower()

def extra(row, list_of_words):
    if pd.isnull(row['title']):
        return 0
    for word in list_of_words:
        if word in row['title']:
            return 1
    return 0

df['title'] = df.apply(lowercase, axis=1)
df['pileta'] = df.apply((lambda x: extra(x, ['pile'])), axis=1)
df['jardin'] = df.apply((lambda x: extra(x, ['patio', 'jardin', 'parque'])), axis=1)
df['garage'] = df.apply((lambda x: extra(x, ['garage', 'coch'])), axis=1)
df['sotano'] = df.apply((lambda x: extra(x, ['sotano', 'sótano'])), axis=1)
df['baulera'] = df.apply((lambda x: extra(x, ['baúl', 'baulera'])), axis=1)
df['aire'] = df.apply((lambda x: extra(x, ['aire', 'acond'])), axis=1)
df['living'] = df.apply((lambda x: extra(x, ['livin'])), axis=1)
df['cocina'] = df.apply((lambda x: extra(x, ['cocina'])), axis=1)
df['placard'] = df.apply((lambda x: extra(x, ['placard'])), axis=1)
df['duplex'] = df.apply((lambda x: extra(x, ['duplex'])), axis=1)
df['balcon'] = df.apply((lambda x: extra(x, ['balcon', 'balcón'])), axis=1)
df['parrilla'] = df.apply((lambda x: extra(x, ['barbacoa', 'parill', 'grill', ])), axis=1)
df['lavadero'] = df.apply((lambda x: extra(x, ['lavadero'])), axis=1)
df['loft'] = df.apply((lambda x: extra(x, ['loft'])), axis=1)
df['triplex'] = df.apply((lambda x: extra(x, ['triplex'])), axis=1)
df['seguridad'] = df.apply((lambda x: extra(x, ['segur'])), axis=1)
df['deposito'] = df.apply((lambda x: extra(x, ['deposito', 'depósito'])), axis=1)
df['vista'] = df.apply((lambda x: extra(x, ['vista'])), axis=1)
df['playroom'] = df.apply((lambda x: extra(x, ['juegos', 'playroom'])), axis=1)
df['quincho'] = df.apply((lambda x: extra(x, ['quincho'])), axis=1)
df['comedor'] = df.apply((lambda x: extra(x, ['comedor'])), axis=1)
df['terraza'] = df.apply((lambda x: extra(x, ['terraza'])), axis=1)
df['ascensor'] = df.apply((lambda x: extra(x, ['ascen'])), axis=1)
df['chalet'] = df.apply((lambda x: extra(x, ['chale', 'chalé'])), axis=1)
df['reciclar'] = df.apply((lambda x: extra(x, ['recic'])), axis=1)
df['estacionamiento'] = df.apply((lambda x: extra(x, ['estac'])), axis=1)
df['amenities'] = df.apply((lambda x: extra(x, ['ameni'])), axis=1)
df['estrenar'] = df.apply((lambda x: extra(x, ['estrenar'])), axis=1)
df['estudio'] = df.apply((lambda x: extra(x, ['studio'])), axis=1)
df['gimnasio'] = df.apply((lambda x: extra(x, ['gim', 'gym'])), axis=1)

df['ambientes'] = df.apply((lambda x: extra(x, ['monoamb'])), axis=1)



In [27]:
df.groupby('ambientes').count()

Unnamed: 0_level_0,lat,lon,price_aprox_usd,rooms,surface_covered_in_m2,state_n,barrio_n,property_type_n,title,pileta,...,comedor,terraza,ascensor,chalet,reciclar,estacionamiento,amenities,estrenar,estudio,gimnasio
ambientes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,305839,305839,412000,227899,377960,412000,412000,412000,412000,412000,...,412000,412000,412000,412000,412000,412000,412000,412000,412000,412000
1,10948,10948,12470,9667,11892,12470,12470,12470,12470,12470,...,12470,12470,12470,12470,12470,12470,12470,12470,12470,12470


In [28]:
def ambientes(row):
    if pd.isnull(row['title']):
        return 0
    if not ('amb' in row['title']):
        return 0
    
    valor = row['title'][row['title'].index('amb') - 2]
    
    if(valor.isdigit()):
        return int(valor)
    
    return row['ambientes']

df['ambientes'] = df.apply(ambientes, axis=1)


In [29]:
df.groupby('ambientes').count()

Unnamed: 0_level_0,lat,lon,price_aprox_usd,rooms,surface_covered_in_m2,state_n,barrio_n,property_type_n,title,pileta,...,comedor,terraza,ascensor,chalet,reciclar,estacionamiento,amenities,estrenar,estudio,gimnasio
ambientes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,219989,219989,316744,158989,289151,316744,316744,316744,316744,316744,...,316744,316744,316744,316744,316744,316744,316744,316744,316744,316744
1,15741,15741,18070,14092,17311,18070,18070,18070,18070,18070,...,18070,18070,18070,18070,18070,18070,18070,18070,18070,18070
2,27677,27677,30906,23141,28904,30906,30906,30906,30906,30906,...,30906,30906,30906,30906,30906,30906,30906,30906,30906,30906
3,28797,28797,31522,22549,28984,31522,31522,31522,31522,31522,...,31522,31522,31522,31522,31522,31522,31522,31522,31522,31522
4,17293,17293,19061,13569,17778,19061,19061,19061,19061,19061,...,19061,19061,19061,19061,19061,19061,19061,19061,19061,19061
5,4919,4919,5492,3467,5140,5492,5492,5492,5492,5492,...,5492,5492,5492,5492,5492,5492,5492,5492,5492,5492
6,1501,1501,1683,1043,1628,1683,1683,1683,1683,1683,...,1683,1683,1683,1683,1683,1683,1683,1683,1683,1683
7,585,585,661,457,634,661,661,661,661,661,...,661,661,661,661,661,661,661,661,661,661
8,204,204,234,187,225,234,234,234,234,234,...,234,234,234,234,234,234,234,234,234,234
9,81,81,97,72,97,97,97,97,97,97,...,97,97,97,97,97,97,97,97,97,97


In [30]:
del df['title'] #Ya no me sirve

# Creo un set de pruebas como el dado por kaggle

In [31]:
df_test = df.loc[(np.isnan(df['surface_covered_in_m2']))|(np.isnan(df['lat']))|(np.isnan(df['lon']))|(np.isnan(df['rooms']))]

In [32]:
df_test.shape

(226024, 39)

# Relleno de valores NaN del set de pruebas y filtros del dataset

In [33]:
df = df.dropna()

In [34]:
df = df.loc[df['surface_covered_in_m2']>=10.0, :]

# Mínimo 1000 dolares por metro cuadrado y máximo 6000

In [35]:
df = df.loc[((df['price_aprox_usd']/df['surface_covered_in_m2'])>= 1000.0)&
                (df['price_aprox_usd']/df['surface_covered_in_m2']<= 6000.0), :]

In [36]:
df.shape

(177804, 39)

In [37]:
df = df.sample(frac=1) #Shuffle al azar del dataset

In [38]:
max_barrios_prom = df.sort_values('barrio_n', ascending=False).iloc[0, df.columns.get_loc("barrio_n")]

In [39]:
#medias_lat = df.loc[:, ['barrio_n', 'lat']].groupby('barrio_n').median()
#medias_lon = df.loc[:, ['barrio_n', 'lon']].groupby('barrio_n').median()
medias_lat = df.loc[:, ['barrio_n', 'lat']].groupby('barrio_n', as_index=False).median()
medias_lon = df.loc[:, ['barrio_n', 'lon']].groupby('barrio_n', as_index=False).median()

In [40]:
#medias_superficie = df.loc[:, ['property_type_n', 'surface_covered_in_m2']].groupby('property_type_n').median()
#medias_lat = df.loc[:, ['barrio_n', 'lat']].groupby('barrio_n').median()
#medias_lon = df.loc[:, ['barrio_n', 'lon']].groupby('barrio_n').median()

In [41]:
#def superficie_valor(row):
 #   if not(np.isnan(row['surface_covered_in_m2'])):
 #       if row['surface_covered_in_m2']>=10.0:
 #           return row['surface_covered_in_m2']
 #       else:
 #           return medias_superficie.iloc[int(row['property_type_n'])-1, 0]    
        
 #   return medias_superficie.iloc[int(row['property_type_n'])-1, 0]

def lat_valor(row):
    if not(np.isnan(row['lat'])):
        return row['lat']
    
    if int(row['barrio_n'])>max_barrios_prom:
        return df['lat'].median()
    
    return medias_lat.loc[medias_lat['barrio_n']==int(row['barrio_n']),:].iloc[0, 1]
    #return medias_lat.iloc[int(row['barrio_n'])-1, 0]

def lon_valor(row):
    if not(np.isnan(row['lon'])):
        return row['lon']

    if int(row['barrio_n'])>max_barrios_prom:
        return df['lon'].median()   
    return medias_lon.loc[medias_lon['barrio_n']==int(row['barrio_n']),:].iloc[0, 1]

    #return medias_lon.iloc[int(row['barrio_n'])-1, 0]

In [42]:
values = { 
          'rooms': np.trunc(df['rooms'].median()),
         }
df_test = df_test.fillna(value=values)

In [43]:
#df_test['surface_covered_in_m2'] = df_test.apply(superficie_valor, axis=1)
df_test['lat'] = df_test.apply(lat_valor, axis=1)
df_test['lon'] = df_test.apply(lon_valor, axis=1)

In [45]:
random_forest = RandomForestRegressor(n_estimators=100)

In [46]:
df_superficie = df.loc[:, df.columns != 'price_aprox_usd']
superficie = df_superficie['surface_covered_in_m2']
del df_superficie['surface_covered_in_m2']

In [47]:
grid = RandomForestRegressor(n_estimators=100)

In [48]:
grid.fit(df_superficie, superficie)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [49]:
grid.score(df_superficie, superficie)

0.98620907267349756

In [431]:
precios_test = df_test['price_aprox_usd']
del df_test['price_aprox_usd']

df_test_superficies_nan = df_test.loc[np.isnan(df_test['surface_covered_in_m2']), df_test.columns != 'surface_covered_in_m2']

prediccion_superficie = grid.predict(df_test_superficies_nan)
prediccion_superficie_df = pd.DataFrame({'surface_covered_in_m2': prediccion_superficie}, df_test_superficies_nan.index)

In [432]:
df_test.update(prediccion_superficie_df)

In [433]:
df_test

Unnamed: 0,lat,lon,rooms,surface_covered_in_m2,state_n,barrio_n,property_type_n,pileta,jardin,garage,...,terraza,ascensor,chalet,reciclar,estacionamiento,amenities,estrenar,estudio,gimnasio,ambientes
60408,-34.598706,-58.393056,3.0,62.000000,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3
60413,-34.596168,-58.479062,3.0,142.900000,1,4,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60414,-34.618444,-58.375829,3.0,400.000000,1,4,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60415,-34.549864,-58.482403,3.0,39.000000,1,5,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
60416,-34.596168,-58.479062,3.0,72.000000,1,4,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60417,-34.550648,-58.458857,3.0,49.000000,1,6,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
60420,-34.601213,-58.432153,3.0,28.000000,1,9,3,0,1,0,...,0,0,0,0,0,0,0,0,0,1
60421,-34.563978,-58.456565,3.0,60.000000,1,3,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60426,-34.639608,-58.361043,3.0,125.000000,1,13,2,0,0,0,...,0,0,0,0,0,0,0,0,0,3
60430,-34.596168,-58.479062,3.0,110.000000,1,4,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [434]:
df_test['price_aprox_usd'] = precios_test

In [435]:
df.to_csv('Desktop/Homework/dataset.csv')
df_test.to_csv('Desktop/Homework/dataset_pruebas.csv')

# Creo dataset para predecir

In [3]:
properati = pd.read_csv('Desktop/Homework/properati_dataset_testing_noprice.csv')

In [438]:
properati.head()

Unnamed: 0_level_0,id,created_on,property_type,operation,place_name,place_with_parent_names,country_name,state_name,lat-lon,lat,lon,surface_total_in_m2,surface_covered_in_m2,floor,rooms,expenses,description
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
3632,3632,2017-08-24,departamento,venta,Puerto Madero,|Argentina|Capital Federal|Puerto Madero|,Argentina,Capital Federal,"-34.6109877599,-58.3634635778",-34.610988,-58.363464,0.0,,,,,Edificio BA Houses situado frente al Dique 3 d...
3633,3633,2017-08-25,departamento,venta,Buenos Aires Interior,|Argentina|Buenos Aires Interior|,Argentina,Buenos Aires Interior,,,,0.0,,,,,El departamento cuenta con un living-comedor a...
2263404,2263404,2017-08-01,departamento,venta,Palermo Soho,|Argentina|Capital Federal|Palermo|Palermo Soho|,Argentina,Capital Federal,"-34.5893633232,-58.4128798588",-34.589363,-58.41288,53.0,48.0,,,1500.0,IMPECABLE TORRE COY III – DEPA...
2263405,2263405,2017-08-01,departamento,venta,Chacarita,|Argentina|Capital Federal|Chacarita|,Argentina,Capital Federal,,,,39.0,39.0,,,,AMBIENTE DIVISIBLE CON PISOS D...
2263406,2263406,2017-08-01,departamento,venta,Chacarita,|Argentina|Capital Federal|Chacarita|,Argentina,Capital Federal,,,,51.0,51.0,,,,LIVING COMEDOR CON PISOS DE PO...


In [439]:
properati['state'] = properati.place_with_parent_names.apply(split_region)
properati['barrio'] = properati.place_with_parent_names.apply(split_neighborhood)

# Paso a ingles los tipo de propiedad

In [440]:
properati.groupby('property_type').count()

Unnamed: 0_level_0,id,created_on,operation,place_name,place_with_parent_names,country_name,state_name,lat-lon,lat,lon,surface_total_in_m2,surface_covered_in_m2,floor,rooms,expenses,description,state,barrio
property_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
casa,3409,3409,3409,3409,3409,3409,3409,2401,2401,2401,2535,3029,55,1731,75,3409,3409,3409
departamento,9761,9761,9761,9761,9761,9761,9761,7330,7330,7330,8507,9079,1242,5167,2407,9761,9761,9761
ph,996,996,996,996,996,996,996,756,756,756,811,897,71,602,61,996,996,996


In [441]:
df_para_ver_tipos = pd.read_csv('Desktop/Homework/data_gba_total.csv', dtype=dtypes, usecols=cols)

In [442]:
df_para_ver_tipos.groupby('property_type').count()

Unnamed: 0_level_0,country_name,created_on,currency,expenses,floor,geonames_id,lat,lat-lon,lon,operation,...,place_with_parent_names,price,price_aprox_local_currency,price_aprox_usd,price_per_m2,rooms,surface_covered_in_m2,surface_in_m2,surface_total_in_m2,title
property_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PH,48669,144101,130846,3874,7542,125466,124634,124634,124634,144101,...,144101,132168,132168,132168,42286,91076,46619,61109,38707,86727
apartment,431839,1379444,1220754,114012,394859,1169563,1132140,1132140,1132129,1379444,...,1379444,1228704,1228702,1228702,385720,965352,425289,656427,337330,816304
house,274241,833897,694308,12033,18258,619257,549195,549195,549195,833897,...,833897,701426,701426,701426,224084,357266,258852,397424,234632,543895
store,25436,52029,42793,1727,761,44419,40728,40728,40728,52029,...,52029,43391,43391,43391,19154,4668,23294,15310,19794,44216


In [443]:
def pasar_a_ingles(row):
    if row['property_type']=='casa':
        return 'house'
    if row['property_type']=='departamento':
        return 'apartment'
    if row['property_type']=='ph':
        return 'PH' 
    
    return 0

properati['property_type'] = properati.apply(pasar_a_ingles, axis=1)

In [444]:
properati.groupby('property_type').count()

Unnamed: 0_level_0,id,created_on,operation,place_name,place_with_parent_names,country_name,state_name,lat-lon,lat,lon,surface_total_in_m2,surface_covered_in_m2,floor,rooms,expenses,description,state,barrio
property_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
PH,996,996,996,996,996,996,996,756,756,756,811,897,71,602,61,996,996,996
apartment,9761,9761,9761,9761,9761,9761,9761,7330,7330,7330,8507,9079,1242,5167,2407,9761,9761,9761
house,3409,3409,3409,3409,3409,3409,3409,2401,2401,2401,2535,3029,55,1731,75,3409,3409,3409


In [445]:
properati['state_n'] = properati.apply(state, axis=1)
properati['barrio_n'] = properati.apply(barrio, axis=1)
properati['property_type_n'] = properati.apply(property_type, axis=1)

# Tomo solo las columnas que me interesan

In [446]:
properati['title']=properati['description']

In [447]:
properati = properati.loc[:, ['lat', 'lon', 'rooms', 'surface_covered_in_m2', 'state_n', 'barrio_n', 'property_type_n'
               , 'title']]

In [448]:
#properati = properati.loc[:, ['surface_covered_in_m2', 'state_n', 'barrio_n', 'property_type_n'
#               , 'title']]

In [449]:
properati['title'] = properati.apply(lowercase, axis=1)
properati['pileta'] = properati.apply((lambda x: extra(x, ['pile'])), axis=1)
properati['jardin'] = properati.apply((lambda x: extra(x, ['patio', 'jardin', 'parque'])), axis=1)
properati['garage'] = properati.apply((lambda x: extra(x, ['garage', 'coch'])), axis=1)
properati['sotano'] = properati.apply((lambda x: extra(x, ['sotano', 'sótano'])), axis=1)
properati['baulera'] = properati.apply((lambda x: extra(x, ['baúl', 'baulera'])), axis=1)
properati['aire'] = properati.apply((lambda x: extra(x, ['aire', 'acond'])), axis=1)
properati['living'] = properati.apply((lambda x: extra(x, ['livin'])), axis=1)
properati['cocina'] = properati.apply((lambda x: extra(x, ['cocina'])), axis=1)
properati['placard'] = properati.apply((lambda x: extra(x, ['placard'])), axis=1)
properati['duplex'] = properati.apply((lambda x: extra(x, ['duplex'])), axis=1)
properati['balcon'] = properati.apply((lambda x: extra(x, ['balcon', 'balcón'])), axis=1)
properati['parrilla'] = properati.apply((lambda x: extra(x, ['barbacoa', 'parill', 'grill', ])), axis=1)
properati['lavadero'] = properati.apply((lambda x: extra(x, ['lavadero'])), axis=1)
properati['loft'] = properati.apply((lambda x: extra(x, ['loft'])), axis=1)
properati['triplex'] = properati.apply((lambda x: extra(x, ['triplex'])), axis=1)
properati['seguridad'] = properati.apply((lambda x: extra(x, ['segur'])), axis=1)
properati['deposito'] = properati.apply((lambda x: extra(x, ['deposito', 'depósito'])), axis=1)
properati['vista'] = properati.apply((lambda x: extra(x, ['vista'])), axis=1)
properati['playroom'] = properati.apply((lambda x: extra(x, ['juegos', 'playroom'])), axis=1)
properati['quincho'] = properati.apply((lambda x: extra(x, ['quincho'])), axis=1)
properati['comedor'] = properati.apply((lambda x: extra(x, ['comedor'])), axis=1)
properati['terraza'] = properati.apply((lambda x: extra(x, ['terraza'])), axis=1)
properati['ascensor'] = properati.apply((lambda x: extra(x, ['ascen'])), axis=1)
properati['chalet'] = properati.apply((lambda x: extra(x, ['chale', 'chalé'])), axis=1)
properati['reciclar'] = properati.apply((lambda x: extra(x, ['recic'])), axis=1)
properati['estacionamiento'] = properati.apply((lambda x: extra(x, ['estac'])), axis=1)
properati['amenities'] = properati.apply((lambda x: extra(x, ['ameni'])), axis=1)
properati['estrenar'] = properati.apply((lambda x: extra(x, ['estrenar'])), axis=1)
properati['estudio'] = properati.apply((lambda x: extra(x, ['studio'])), axis=1)
properati['gimnasio'] = properati.apply((lambda x: extra(x, ['gim', 'gym'])), axis=1)

properati['ambientes'] = properati.apply((lambda x: extra(x, ['monoamb'])), axis=1)

In [450]:
properati['ambientes'] = properati.apply(ambientes, axis=1)


In [451]:
del properati['title']

In [452]:
properati.head()

Unnamed: 0_level_0,lat,lon,rooms,surface_covered_in_m2,state_n,barrio_n,property_type_n,pileta,jardin,garage,...,terraza,ascensor,chalet,reciclar,estacionamiento,amenities,estrenar,estudio,gimnasio,ambientes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3632,-34.610988,-58.363464,,,1,22,1,0,0,1,...,0,0,0,0,0,0,0,0,0,3
3633,,,,,5,4,1,1,0,1,...,0,0,0,0,0,0,0,0,1,0
2263404,-34.589363,-58.41288,,48.0,1,12,1,0,0,1,...,0,0,0,0,1,0,1,0,0,0
2263405,,,,39.0,1,45,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2263406,,,,51.0,1,45,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [453]:
#properati['surface_covered_in_m2'] = properati.apply(superficie_valor, axis=1)
properati['lat'] = properati.apply(lat_valor, axis=1)
properati['lon'] = properati.apply(lon_valor, axis=1)
properati = properati.fillna(value=values)

In [454]:
properati.head()

Unnamed: 0_level_0,lat,lon,rooms,surface_covered_in_m2,state_n,barrio_n,property_type_n,pileta,jardin,garage,...,terraza,ascensor,chalet,reciclar,estacionamiento,amenities,estrenar,estudio,gimnasio,ambientes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3632,-34.610988,-58.363464,3.0,,1,22,1,0,0,1,...,0,0,0,0,0,0,0,0,0,3
3633,-34.596168,-58.479062,3.0,,5,4,1,1,0,1,...,0,0,0,0,0,0,0,0,1,0
2263404,-34.589363,-58.41288,3.0,48.0,1,12,1,0,0,1,...,0,0,0,0,1,0,1,0,0,0
2263405,-34.586411,-58.450426,3.0,39.0,1,45,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2263406,-34.586411,-58.450426,3.0,51.0,1,45,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [455]:
properati_superficies_nan = properati.loc[np.isnan(properati['surface_covered_in_m2']), properati.columns != 'surface_covered_in_m2']

prediccion_superficie = grid.predict(properati_superficies_nan)
prediccion_superficie_df = pd.DataFrame({'surface_covered_in_m2': prediccion_superficie}, properati_superficies_nan.index)

In [456]:
properati.update(prediccion_superficie_df)

In [457]:
properati.head()

Unnamed: 0_level_0,lat,lon,rooms,surface_covered_in_m2,state_n,barrio_n,property_type_n,pileta,jardin,garage,...,terraza,ascensor,chalet,reciclar,estacionamiento,amenities,estrenar,estudio,gimnasio,ambientes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3632,-34.610988,-58.363464,3.0,114.5,1,22,1,0,0,1,...,0,0,0,0,0,0,0,0,0,3
3633,-34.596168,-58.479062,3.0,63.5,5,4,1,1,0,1,...,0,0,0,0,0,0,0,0,1,0
2263404,-34.589363,-58.41288,3.0,48.0,1,12,1,0,0,1,...,0,0,0,0,1,0,1,0,0,0
2263405,-34.586411,-58.450426,3.0,39.0,1,45,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2263406,-34.586411,-58.450426,3.0,51.0,1,45,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [458]:
properati.to_csv('Desktop/Homework/properati.csv')