# Housing Analysis

This notebook analyzes real estate data from https://www.idealista.com.

## 1. Get the Data!

In [13]:
import datetime
import math
import time
from urllib.parse import urlencode

import pandas as pd

import idealista_api

In [19]:
def query(num_items, max_pages=1):
    '''
    Quick and dirty custom query against the Idealista search API.

    The search filters (country, operation, centre point, radius, ...) are
    hard-coded below. Results are requested page by page, 50 listings per
    page, and stacked into a single DataFrame.

    Parameters
    ----------
    num_items : int or None
        Number of listings wanted. The number of pages requested is
        ceil(num_items / 50), capped by ``max_pages``. ``None`` means
        "no item target": request ``max_pages`` pages.
    max_pages : int, optional
        Upper bound on the number of search requests issued by one call.
        Defaults to 1 so that a plain ``query(n)`` keeps issuing a single
        request, protecting the free-tier quota (100 req/month). Raise it
        explicitly to download more pages.

    Returns
    -------
    pandas.DataFrame
        One row per listing returned by the API, with the per-page row
        number in an ``index`` column and today's date in a ``date``
        column. May hold up to one page more than ``num_items`` because
        whole pages are kept. Empty (columns ``index`` and ``date`` only)
        when no page was requested or the API returned no listings.
    '''
    # Housing parameters
    country = 'es'  # values: es, it, pt
    language = 'es'
    # Bidebieta: 43.32222093197415, -1.940486953608932
    # Lasarte: 43.27044123284123, -2.0200276593105064
    # Hernani: 43.266944705720235, -1.9751387613285116
    # Zarautz: 43.28563835253485, -2.167198244296092
    center = '43.32222093197415,-1.940486953608932'

    # Search parameters
    page_size = 50
    pause_seconds = 1.5  # free tier allows 1 req/sec; stay safely below it

    # Insertion order is the order in which parameters appear in the URL.
    base_params = {
        'operation': 'sale',  # or 'rent'
        'maxItems': str(page_size),
        'order': 'priceDown',
        'center': center,
        'distance': '5000',
        'propertyType': 'homes',
        'sort': 'desc',
        'flat': 'true',
        # 'elevator': 'true',
    }
    endpoint = 'https://api.idealista.com/3.5/' + country + '/search?'

    # Pages needed to cover num_items, never more than max_pages.
    if num_items is None:
        pages = max_pages
    else:
        pages = min(math.ceil(num_items / page_size), max_pages)

    frames = []
    token = None

    # Search in a loop
    # Watch out: free tier: 100 req/month, 1 req/sec
    for page in range(1, pages + 1):  # API pages are numbered from 1
        if token is None:
            # One token is reused for every page instead of requesting a new
            # one per request. NOTE(review): assumes the token stays valid
            # for the duration of the loop - confirm against idealista_api.
            token = idealista_api.get_oauth_token()

        params = dict(base_params, numPage=page, language=language)
        # safe=',' keeps the "lat,lon" centre unescaped, as in a hand-built URL.
        url = endpoint + urlencode(params, safe=',')

        response = idealista_api.search_api(token, url)
        page_df = pd.DataFrame.from_dict(response['elementList'])
        time.sleep(pause_seconds)

        if page_df.empty:
            # No listings on this page: stop instead of spending more quota.
            break
        frames.append(page_df)

    # Concatenate once at the end; pd.concat rejects an empty list.
    df_tot = pd.concat(frames) if frames else pd.DataFrame()
    # reset_index() deliberately keeps the old per-page index as an 'index' column.
    df_tot = df_tot.reset_index()
    df_tot['date'] = datetime.date.today()
    return df_tot

In [None]:
# Run the hard-coded search and persist the raw listings (quick and dirty for now).
num_items = 2500  # requested number of listings, forwarded to query()
df = query(num_items=num_items)
df.to_csv(path_or_buf='./data/test.csv')

## 2. Clean the Data

In [28]:
# Load a previously saved snapshot of listings from disk.
# NOTE(review): the file name suggests flats with elevator within 6000 m,
# collected in week 1 - confirm against how the file was produced.
# To work on the output of the custom-query cell instead, switch to:
#df = pd.read_csv('./data/test.csv')
df = pd.read_csv('./data/Week1_data_idealista_flat_elevatorTrueDistance6000.csv')

In [29]:
# Summarize column names, non-null counts and dtypes to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1155 entries, 0 to 1154
Data columns (total 39 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              1155 non-null   int64  
 1   index                   1155 non-null   int64  
 2   propertyCode            1155 non-null   int64  
 3   thumbnail               1149 non-null   object 
 4   externalReference       877 non-null    object 
 5   numPhotos               1155 non-null   int64  
 6   floor                   1135 non-null   object 
 7   price                   1155 non-null   float64
 8   propertyType            1155 non-null   object 
 9   operation               1155 non-null   object 
 10  size                    1155 non-null   float64
 11  exterior                1155 non-null   bool   
 12  rooms                   1155 non-null   int64  
 13  bathrooms               1155 non-null   int64  
 14  address                 1155 non-null   

In [30]:
# Number of distinct listings; a value below the row count reported by
# df.info() means some propertyCode values appear more than once.
df['propertyCode'].nunique()

1128