# Bibliotecas

In [59]:
import pandas         as pd
import numpy          as np
import seaborn        as sns

import plotly.express as px
import ipywidgets     as widgets

from ipywidgets            import fixed
from matplotlib            import pyplot as plt
from matplotlib            import gridspec
from geopy.geocoders       import Nominatim

In [60]:
# Render every float shown by pandas with exactly two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

# Funções

In [61]:
def show_data(data):
    """Print the column labels of ``data``.

    Parameters
    ----------
    data : object exposing a ``columns`` attribute (e.g. a DataFrame).

    Returns
    -------
    None
    """
    column_labels = data.columns
    print(column_labels)

In [62]:
def show_dtypes(data):
    """Print the ``dtypes`` attribute of ``data``.

    Parameters
    ----------
    data : object exposing a ``dtypes`` attribute (e.g. a DataFrame).

    Returns
    -------
    None
    """
    column_types = data.dtypes
    print(column_types)

In [63]:
def show_dimensions(data):
    """Print the number of rows and columns of ``data``.

    Reads ``data.shape[0]`` (rows) and ``data.shape[1]`` (columns) and prints
    one labelled line for each.

    Returns
    -------
    None
    """
    labelled_sizes = (
        ('Número de linhas', data.shape[0]),
        ('Número de colunas', data.shape[1]),
    )
    for label, size in labelled_sizes:
        print('{}: {}'.format(label, size))

In [64]:
def collect_geodata(data, cols):
    """Reverse-geocode every row of ``data`` and store selected address fields.

    For each row, the ``lat`` and ``long`` values are sent to Nominatim's
    reverse lookup. Every name in ``cols`` is looked up in the ``address``
    mapping of the raw response and, when present, written to the column of
    the same name. Fields that are absent (or rows with no geocoding result)
    keep the placeholder ``'NA'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``lat`` and ``long`` columns. It is modified in place:
        one column per entry of ``cols`` is created (or overwritten).
    cols : sequence of str
        Address keys to extract, e.g. ``['house_number', 'road']``. Each key
        is also used as the destination column name.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the new columns filled in.

    Notes
    -----
    One network request is issued per row.
    NOTE(review): the public Nominatim service limits request rates - confirm
    whether throttling (e.g. ``geopy.extra.rate_limiter``) is needed here.
    """
    geolocator = Nominatim(user_agent='geopirequest')

    # Create the destination columns with a placeholder value.
    for col in cols:
        data.loc[:, col] = 'NA'

    # Positional access keeps the loop independent of the frame's index labels.
    col_positions = [data.columns.get_loc(col) for col in cols]
    total = len(data)

    for pos, (lat, lon) in enumerate(zip(data['lat'], data['long'])):
        print('Loop: {}/{}'.format(pos, total))

        response = geolocator.reverse(str(lat) + ',' + str(lon))
        # reverse() may yield no result; treat that as an empty address.
        address = response.raw.get('address', {}) if response is not None else {}

        for col, col_pos in zip(cols, col_positions):
            if col in address:
                # Write to the requested column, not a hard-coded name.
                data.iloc[pos, col_pos] = address[col]

    return data

In [65]:
def date_season(date):
    """Return the season label for ``date`` using fixed calendar boundaries.

    Boundaries (inclusive on both ends, compared by month and day only):

    * ``'spring'``: 21 March - 20 June
    * ``'summer'``: 21 June - 22 September
    * ``'fall'``:   23 September - 20 December
    * ``'winter'``: every other day of the year

    Parameters
    ----------
    date : datetime-like
        Any object exposing ``month`` and ``day`` (``pandas.Timestamp``,
        ``datetime.date``, ...). The time of day, if any, is ignored.

    Returns
    -------
    str or None
        The season label, or ``None`` when ``date`` is missing (``NaT``/``NaN``).
    """
    # A missing date has no season; without this guard it would fall through
    # every comparison below and be labelled 'winter'.
    if pd.isna(date):
        return None

    # Comparing (month, day) tuples avoids building date ranges on every call
    # and does not depend on how 'dd/mm/yyyy' strings get parsed.
    month_day = (date.month, date.day)

    if (3, 21) <= month_day <= (6, 20):
        return 'spring'
    if (6, 21) <= month_day <= (9, 22):
        return 'summer'
    if (9, 23) <= month_day <= (12, 20):
        return 'fall'
    return 'winter'

In [66]:
def data_collect(path):
    """Read the CSV file at ``path`` and print a short summary of it.

    The summary consists of the output of ``show_dimensions`` followed by the
    output of ``show_dtypes``.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame as loaded, without any modification.
    """
    frame = pd.read_csv(path)

    # Report size first, then column types.
    for report in (show_dimensions, show_dtypes):
        report(frame)

    return frame

In [67]:
def data_transform(data):
    """Derive purchase/sale features and keep only the homes worth buying.

    Steps, in order:

    1. Parse ``date`` (``%Y-%m-%d``) and ``yr_built`` (``%Y``) as datetimes.
    2. Compute a descriptive-statistics table of the numeric columns
       (currently neither returned nor displayed).
    3. ``dormitory_type``: ``'studio'`` for 1 bedroom, ``'apartment'`` for 2,
       ``'house'`` otherwise.
    4. ``median_price``: median ``price`` of the row's ``zipcode``.
    5. ``decision``: 1 when ``price <= median_price`` and ``condition >= 3``,
       else 0.
    6. ``selling_suggestion``: ``price * 1.25`` for rows with ``decision == 1``,
       else 0; ``expected_profit`` is the difference to ``price`` (0 when no
       suggestion).
    7. ``season`` via ``date_season`` and the per-zipcode median price of each
       season (``med_fall``, ``med_spring``, ``med_summer``, ``med_winter``).
    8. ``season_sell``: for rows with ``decision == 1``, the space-terminated
       names of the seasons whose zipcode median is at least the row's price.
    9. Drop rows with ``decision == 0``, sort by ``date`` and keep the last row
       per ``id``; print the resulting dimensions.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``id``, ``date``, ``price``, ``bedrooms``,
        ``condition``, ``yr_built`` and ``zipcode``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns, restricted to ``decision == 1``.
    """
    # Work on a copy so the caller's frame stays untouched and the function
    # can be re-run on the same input.
    data = data.copy()

    # Convert object/int columns to datetimes.
    data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')
    data['yr_built'] = pd.to_datetime(data['yr_built'], format='%Y')

    # Descriptive statistics of the numeric attributes.
    # NOTE(review): this table is not returned or displayed by this function.
    num_attributes = data.select_dtypes(include=['int64', 'float64'])
    df1 = pd.concat(
        [
            num_attributes.max(),
            num_attributes.min(),
            num_attributes.mean(),
            num_attributes.median(),
            num_attributes.std(),  # was np.median, so the column repeated the median
        ],
        axis=1,
    ).reset_index()
    df1.columns = ['atributos', 'maximo', 'minimo', 'media', 'mediana', 'desvio_padrao']

    # Dormitory type. The conditions are mutually exclusive, so one-bedroom
    # homes stay 'studio' instead of being overwritten with 'house'.
    bedrooms = data['bedrooms']
    data['dormitory_type'] = np.select(
        [bedrooms == 1, bedrooms == 2],
        ['studio', 'apartment'],
        default='house',
    )

    # Median price per zipcode.
    mz = (
        data[['price', 'zipcode']]
        .groupby('zipcode')
        .median()
        .reset_index()
        .rename(columns={'price': 'median_price'})
    )
    data = pd.merge(data, mz, on='zipcode', how='left')

    # Purchase decision: at or below the zipcode median and in fair condition.
    buy = (data['price'] <= data['median_price']) & (data['condition'] >= 3)
    data['decision'] = buy.astype(int)

    # Suggested selling price (25% markup) and the resulting expected profit.
    data['selling_suggestion'] = np.where(buy, data['price'] * 1.25, 0.0)
    data['expected_profit'] = np.where(
        data['selling_suggestion'] == 0,
        0.0,
        data['selling_suggestion'] - data['price'],
    )

    # Season of the sale date.
    data['season'] = data['date'].map(date_season)

    # Median price per zipcode and season, one column per season.
    season_columns = {
        'fall': 'med_fall',
        'spring': 'med_spring',
        'summer': 'med_summer',
        'winter': 'med_winter',
    }
    aux = (
        data[['price', 'zipcode', 'season']]
        .groupby(['zipcode', 'season'])
        .median()
        .reset_index()
    )
    aux1 = (
        aux.pivot(index='zipcode', columns='season', values='price')
        # Guarantee all four columns even if a season never occurs in the data.
        .reindex(columns=list(season_columns))
        .rename(columns=season_columns)
        .reset_index()
    )
    aux1.columns.name = None
    data = pd.merge(data, aux1, on='zipcode', how='left')

    # Best seasons to sell: those whose zipcode median is at least the price.
    # Comparisons against a missing median are False, so such seasons are skipped.
    buy = data['decision'] != 0
    season_sell = pd.Series('', index=data.index)
    season_labels = (
        ('autumn', 'med_fall'),
        ('spring', 'med_spring'),
        ('summer', 'med_summer'),
        ('winter', 'med_winter'),
    )
    for label, median_col in season_labels:
        favourable = buy & (data[median_col] >= data['price'])
        season_sell = season_sell.where(~favourable, season_sell + label + ' ')
    data['season_sell'] = season_sell

    # Keep only homes qualified for purchase, then the latest record per id.
    data = data[data['decision'] != 0].copy()
    data = data.sort_values('date', ascending=True)
    data = data.drop_duplicates(subset='id', keep='last').copy()
    show_dimensions(data)

    return data

In [68]:
def data_load(data):
    """Show the homes flagged for purchase on an interactive map.

    Rows with ``decision != 0`` are plotted at their ``lat``/``long``
    position; marker colour encodes ``expected_profit`` and marker size
    encodes ``price``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``decision``, ``id``, ``lat``, ``long``, ``price`` and
        ``expected_profit``.

    Returns
    -------
    None
    """
    map_columns = ['id', 'lat', 'long', 'price', 'expected_profit']
    selected = data.loc[data['decision'] != 0, map_columns].copy()

    figure = px.scatter_mapbox(
        selected,
        lat='lat',
        lon='long',
        color='expected_profit',
        size='price',
        color_continuous_scale=px.colors.cyclical.IceFire,
        size_max=15,
        zoom=10,
    )

    # OpenStreetMap tiles, fixed height and no outer margins.
    figure.update_layout(
        mapbox_style='open-street-map',
        height=600,
        margin=dict(r=0, l=0, b=0, t=0),
    )
    figure.show()

In [69]:
if __name__ == '__main__':
    # Pipeline: read the raw CSV, derive the purchase/sale features,
    # then render the qualified homes on a map.

    # 1) Collect
    data_raw = data_collect('../data/kc_house_data.csv')

    # 2) Transform
    data_processing = data_transform(data_raw)

    # 3) Load
    data_load(data_processing)

Número de linhas: 21613
Número de colunas: 21
id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object
Número de linhas: 10649
Número de colunas: 32
