# Projeto Analytics V1

### Importação de objetos

In [19]:
from pyspark.sql import SparkSession

# Reuse an existing session when one is available; otherwise start a new one
# under this application name.
spark = SparkSession.builder.appName('analise-imoveis-v1').getOrCreate()
spark  # last expression of the cell: displays the session object

### Configurações

In [2]:
# With eager evaluation on, a DataFrame left as the last expression of a cell
# is rendered as a table without an explicit .show() call.
spark.conf.set(
    'spark.sql.repl.eagerEval.enabled',
    True,
)

# Analisando os dados

### Carregando os dados | source json

In [3]:

# Load the source JSON file into a DataFrame. No explicit schema is passed,
# so the reader derives it from the data.
df_imoveis = spark.read.json('data/source-4-ds-train.json')

                                                                                

### Número de linhas e colunas

In [4]:
# Print the (rows, columns) pair for the loaded DataFrame.
print(
    (
        df_imoveis.count(),       # number of rows (runs a Spark job)
        len(df_imoveis.columns),  # number of top-level columns
    )
)



(133964, 19)


                                                                                

### Checando o schema

In [5]:
# Print the DataFrame schema as a tree, including the fields nested inside
# struct columns such as `address`.
df_imoveis.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- district: string (nullable = true)
 |    |-- geoLocation: struct (nullable = true)
 |    |    |-- location: struct (nullable = true)
 |    |    |    |-- lat: double (nullable = true)
 |    |    |    |-- lon: double (nullable = true)
 |    |    |-- precision: string (nullable = true)
 |    |-- locationId: string (nullable = true)
 |    |-- neighborhood: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- streetNumber: string (nullable = true)
 |    |-- unitNumber: string (nullable = true)
 |    |-- zipCode: string (nullable = true)
 |    |-- zone: string (nullable = true)
 |-- bathrooms: long (nullable = true)
 |-- bedrooms: long (nullable = true)
 |-- createdAt: string (nullable = true)
 |-- description: string (nullable = true)
 |-- id: string (nullable = true)
 |-- images: array (

### Selecionando todas as colunas | apenas 10 linhas

In [21]:
# Print the first 10 rows as a text table, with every column included.
df_imoveis.show(n=10)

+--------------------+---------+--------+--------------------+--------------------+----------+--------------------+-------------+-----+-------------+--------------------+---------------+-----------+------+--------------------+----------+--------------------+--------------------+-----------+
|             address|bathrooms|bedrooms|           createdAt|         description|        id|              images|listingStatus|owner|parkingSpaces|        pricingInfos|publicationType|publisherId|suites|               title|totalAreas|           unitTypes|           updatedAt|usableAreas|
+--------------------+---------+--------+--------------------+--------------------+----------+--------------------+-------------+-----+-------------+--------------------+---------------+-----------+------+--------------------+----------+--------------------+--------------------+-----------+
|{São Paulo, BR, ,...|        3|       4|2017-02-07T13:21:40Z|04 dorms sendo 01...|787c7bd19d|[https://s3-sa-ea...|       AC

In [7]:
# List the top-level column names; struct columns such as `address` appear
# as a single entry, their nested fields are not expanded.
df_imoveis.columns

['address',
 'bathrooms',
 'bedrooms',
 'createdAt',
 'description',
 'id',
 'images',
 'listingStatus',
 'owner',
 'parkingSpaces',
 'pricingInfos',
 'publicationType',
 'publisherId',
 'suites',
 'title',
 'totalAreas',
 'unitTypes',
 'updatedAt',
 'usableAreas']

### Selecionando colunas e analisando amostragem de 10 linhas

In [8]:
# Columns to inspect. Nested struct fields are addressed with dot notation
# and appear in the result under the leaf field name only.
cols = [
    # fields nested under `address`
    'address.country', 'address.city', 'address.neighborhood',
    'address.state', 'address.street', 'address.zone',
    # top-level property attributes
    'bathrooms', 'bedrooms', 'description', 'listingStatus', 'owner',
    'parkingSpaces',
    # fields nested under `pricingInfos`
    'pricingInfos.businessType', 'pricingInfos.monthlyCondoFee',
    'pricingInfos.period', 'pricingInfos.price',
    'pricingInfos.rentalTotalPrice', 'pricingInfos.yearlyIptu',
    # remaining top-level columns
    'publicationType', 'suites', 'title', 'totalAreas', 'unitTypes',
    'usableAreas',
]

# Project the chosen columns and keep a 10-row sample for display.
(
    df_imoveis
    .select(cols)
    .limit(10)
)

country,city,neighborhood,state,street,zone,bathrooms,bedrooms,description,listingStatus,owner,parkingSpaces,businessType,monthlyCondoFee,period,price,rentalTotalPrice,yearlyIptu,publicationType,suites,title,totalAreas,unitTypes,usableAreas
BR,São Paulo,Jardim da Saúde,São Paulo,Rua Juvenal Galeno,Zona Sul,3,4,04 dorms sendo 01...,ACTIVE,False,6,SALE,,,700000,,,STANDARD,1,PRÓXIMO A AVENIDA...,388.0,TWO_STORY_HOUSE,388
BR,São Paulo,Vila Santa Teresa...,São Paulo,Rua Juruaba,Zona Sul,2,3,03 dorms sendo 01...,ACTIVE,False,2,SALE,,,336000,,,STANDARD,1,PRÓXIMO A FACULDA...,129.0,HOME,129
,São Paulo,Bela Vista,São Paulo,Avenida Paulista,,4,0,"Andar com 395,70m...",ACTIVE,False,5,RENTAL,4900.0,MONTHLY,24929,29829.0,4040.0,STANDARD,0,Excelente Conjunt...,,COMMERCIAL_PROPERTY,396
,São Paulo,Vila Olímpia,São Paulo,Rua Alvorada,,2,3,Viva melhor e de ...,ACTIVE,False,2,SALE,686.0,,739643,,1610.0,STANDARD,1,Apartamento com 8...,80.0,APARTMENT,80
,São Paulo,Paraíso,São Paulo,Rua Curitiba,,5,4,Sua área é ocupad...,ACTIVE,False,5,SALE,6230.0,,7520099,,18900.0,STANDARD,4,Apartamento 332m²...,332.0,APARTMENT,3322
BR,São Paulo,Vila Uberabinha,São Paulo,Rua Pintassilgo,Zona Sul,0,1,O Moema Comfort R...,ACTIVE,False,1,RENTAL,560.0,MONTHLY,1889,2450.0,87.0,PREMIUM,1,Flat em Moema par...,45.0,FLAT,45
,São Paulo,Pinheiros,São Paulo,Rua Mateus Grou,,1,3,Apartamento em ót...,ACTIVE,False,2,SALE,1120.0,,630700,,489.0,STANDARD,0,APARTAMENTO EM PI...,0.0,APARTMENT,94
,São Paulo,Vila Santa Clara,São Paulo,Rua Dona Luísa de...,,1,3,"3 DORMITÓRIOS, (S...",ACTIVE,False,2,SALE,0.0,,385000,,0.0,STANDARD,1,Oportunidade,92.0,CONDOMINIUM,110
,São Paulo,Vila Formosa,São Paulo,Rua Oswaldo Arouca,,2,0,PREDIO COMERCIAL ...,ACTIVE,False,20,SALE,0.0,,1400000,,5600.0,STANDARD,0,OTIMO PREDIO COME...,0.0,SHED_DEPOSIT_WARE...,590
,São Paulo,Aclimação,São Paulo,Rua Almeida Torres,,4,4,O apartamento tem...,ACTIVE,False,2,SALE,0.0,,1106000,,0.0,PREMIUM,3,Apartamento para ...,141.0,APARTMENT,141


### Buscando dados distintos | método distinct()

In [9]:
# Distinct neighborhood values found in the nested `address` struct.
(
    df_imoveis
    .select('address.neighborhood')
    .distinct()
)

                                                                                

neighborhood
Jardim Coimbra
Jardim Jussara
Jardim Fonte do M...
Vila Mendes
Vila Beatriz
Jardim Satélite
Colonia - Zona Leste
Vila Antonieta
Vila Guilherme
Vila Palmeiras


### Agrupando e contando dados | métodos groupBy() e count()

In [10]:
# Number of rows per neighborhood; the result column is named `count`.
(
    df_imoveis
    .groupBy('address.neighborhood')
    .count()
)

                                                                                

neighborhood,count
Jardim Coimbra,2
Jardim Jussara,28
Jardim Fonte do M...,63
Vila Mendes,42
Vila Beatriz,29
Jardim Satélite,4
Colonia - Zona Leste,3
Vila Antonieta,168
Vila Guilherme,378
Vila Palmeiras,128


In [11]:
# Number of rows per unit type; the result column is named `count`.
(
    df_imoveis
    .groupBy('unitTypes')
    .count()
)

                                                                                

unitTypes,count
FARM,10
RESIDENTIAL_ALLOT...,1430
SHED_DEPOSIT_WARE...,2197
HOME,9030
PENTHOUSE,2772
TWO_STORY_HOUSE,18068
BUSINESS,663
STORE,694
RESIDENTIAL_BUILDING,142
FLAT,7661


### Contando dados agrupados | método groupBy() com agg(), informando o nome da coluna

In [12]:
from pyspark.sql import functions as F

# Count rows per neighborhood through agg(), so that alias() can name the
# result column "qty" instead of the default "count".
# The chain is continued across lines with trailing backslashes.
df_imoveis \
        .groupBy('address.neighborhood') \
        .agg(F.count("*").alias("qty"))

                                                                                

neighborhood,qty
Jardim Coimbra,2
Jardim Jussara,28
Jardim Fonte do M...,63
Vila Mendes,42
Vila Beatriz,29
Jardim Satélite,4
Colonia - Zona Leste,3
Vila Antonieta,168
Vila Guilherme,378
Vila Palmeiras,128


### Melhorando indentação do código

In [13]:
from pyspark.sql import functions as F #repetido apenas para fins didáticos

# Rows per neighborhood, written as a parenthesised chain: one call per
# line, with no backslash continuations.
(
    df_imoveis
    .groupBy(F.col("address.neighborhood"))
    .agg(F.count("*").alias("qty"))
)

                                                                                

neighborhood,qty
Jardim Coimbra,2
Jardim Jussara,28
Jardim Fonte do M...,63
Vila Mendes,42
Vila Beatriz,29
Jardim Satélite,4
Colonia - Zona Leste,3
Vila Antonieta,168
Vila Guilherme,378
Vila Palmeiras,128


### Pesquisando por uma coluna | método where()

In [14]:
from pyspark.sql import functions as F #repetido apenas para fins didáticos

# Keep only the rows whose nested neighborhood field equals
# "Jardim Coimbra", returning every column.
(
    df_imoveis
    .where(F.col("address.neighborhood") == "Jardim Coimbra")
    .select("*")
)

                                                                                

address,bathrooms,bedrooms,createdAt,description,id,images,listingStatus,owner,parkingSpaces,pricingInfos,publicationType,publisherId,suites,title,totalAreas,unitTypes,updatedAt,usableAreas
"{São Paulo, BR, ,...",1,1,2018-07-04T14:22:...,"1 dormitório, 1 b...",9ea003cf0a,[http://www.jalim...,ACTIVE,False,0,"{RENTAL, 0, MONTH...",STANDARD,58b77d191a,0,Locação Casa SÃO ...,0.0,HOME,2018-10-24T01:20:...,0
"{São Paulo, , , {...",3,3,2018-04-16T15:35:...,LINDO SOBRADO FRO...,026e79aa13,[http://www.aterr...,ACTIVE,False,9,"{SALE, 0, null, 4...",STANDARD,2bfafc55fa,1,Sobrado para Vend...,,HOME,2018-11-20T18:32:...,205


### Pesquisando por uma coluna | método where() | por contagem

In [15]:
from pyspark.sql import functions as F #repetido apenas para fins didáticos

# Row count per neighborhood, kept in a variable so it can be filtered.
df_imoveis_contagem = (
    df_imoveis
    .groupBy(F.col("address.neighborhood"))
    .agg(F.count("*").alias("qty"))
)

# Keep only the neighborhoods with more than 10 rows.
(
    df_imoveis_contagem
    .where(F.col("qty") > 10)
    .select("*")
)

                                                                                

neighborhood,qty
Jardim Jussara,28
Jardim Fonte do M...,63
Vila Mendes,42
Vila Beatriz,29
Vila Antonieta,168
Vila Guilherme,378
Vila Palmeiras,128
Água Rasa,493
Vila Almeida,12
Vila Miriam,41


### Contando dados agrupados com mais de uma coluna | método groupBy()

In [16]:
from pyspark.sql import functions as F #repetido apenas para fins didáticos

# Row count for every (neighborhood, unit type) combination.
(
    df_imoveis
    .groupBy(F.col("address.neighborhood"), F.col("unitTypes"))
    .agg(F.count("*").alias("qty"))
)

                                                                                

neighborhood,unitTypes,qty
Sacomã,APARTMENT,248
Vila Moinho Velho,RESIDENTIAL_ALLOT...,2
Vila Dom Pedro II,TWO_STORY_HOUSE,87
Vila Inah,HOME,8
Vila Bela,RESIDENTIAL_ALLOT...,7
Vila Frugoli,COUNTRY_HOUSE,1
Vila São Paulo,TWO_STORY_HOUSE,2
Vila Pereira Barreto,APARTMENT,69
Vila Medeiros,APARTMENT,115
Jardim Mirante,SHED_DEPOSIT_WARE...,1


### Contando dados agrupados com mais de uma coluna | método groupBy() com orderBy()

In [17]:
from pyspark.sql import functions as F #repetido apenas para fins didáticos

# Row count for every (neighborhood, unit type) combination, sorted by
# neighborhood.
(
    df_imoveis
    .groupBy(F.col("address.neighborhood"), F.col("unitTypes"))
    .agg(F.count("*").alias("qty"))
    .orderBy(F.col("address.neighborhood"))
)

                                                                                

neighborhood,unitTypes,qty
,OFFICE,1
,APARTMENT,13
,CONDOMINIUM,1
,TWO_STORY_HOUSE,1
,RESIDENTIAL_BUILDING,4
,FLAT,1
176,APARTMENT,3
204,APARTMENT,4
A. Pinheiros,APARTMENT,4
Aclimação,RESIDENTIAL_ALLOT...,5


### Consultando por casas e trazendo dados financeiros | criar dataframe de casas para locação

In [18]:
from pyspark.sql import functions as F #repetido apenas para fins didáticos

# Unit types treated as houses in this query.
tipos_imoveis = ['FARM', 'HOME', 'COUNTRY_HOUSE']

# Columns to return. Nested struct fields are addressed with dot notation
# and appear in the result under the leaf field name only.
cols = [
    # fields nested under `address`
    'address.country', 'address.city', 'address.neighborhood',
    'address.state', 'address.street', 'address.zone',
    # top-level property attributes
    'bathrooms', 'bedrooms', 'description', 'listingStatus', 'owner',
    'parkingSpaces',
    # fields nested under `pricingInfos`
    'pricingInfos.businessType', 'pricingInfos.monthlyCondoFee',
    'pricingInfos.period', 'pricingInfos.price',
    'pricingInfos.rentalTotalPrice', 'pricingInfos.yearlyIptu',
    # remaining top-level columns
    'publicationType', 'suites', 'title', 'totalAreas', 'unitTypes',
    'usableAreas',
]

# Filter on the source columns first, then project. After select(cols) the
# nested field is exposed as plain `businessType`, so a filter placed after
# the projection refers to `pricingInfos.businessType` by a name the
# projected frame no longer has and depends on Spark resolving it implicitly.
# Filtering before the projection returns the same rows and columns without
# relying on that behaviour.
(
    df_imoveis
    .where(
        F.col("unitTypes").isin(tipos_imoveis)
        & (F.col("pricingInfos.businessType") == "RENTAL")
    )
    .select(cols)
)

country,city,neighborhood,state,street,zone,bathrooms,bedrooms,description,listingStatus,owner,parkingSpaces,businessType,monthlyCondoFee,period,price,rentalTotalPrice,yearlyIptu,publicationType,suites,title,totalAreas,unitTypes,usableAreas
BR,São Paulo,Jardim Guedala,São Paulo,Rua Olegário Mariano,Zona Sul,6.0,0.0,"Excelente imóvel,...",ACTIVE,False,4.0,RENTAL,0.0,MONTHLY,7000,7000,0.0,STANDARD,3.0,Casa Clássica no ...,480.0,HOME,480.0
,São Paulo,Perdizes,São Paulo,Rua Tucuna,,3.0,3.0,Casa residencial/...,ACTIVE,False,2.0,RENTAL,0.0,MONTHLY,4130,4130,244.0,STANDARD,0.0,PERDIZES - CASA -...,230.0,HOME,170.0
BR,São Paulo,Sumaré,São Paulo,Rua Cayowaá,Zona Oeste,3.0,3.0,"03 Dormitórios, s...",ACTIVE,False,2.0,RENTAL,0.0,MONTHLY,2450,2450,230.0,STANDARD,1.0,Casa em São Paulo,0.0,HOME,0.0
BR,São Paulo,Vila Isolina Mazzei,São Paulo,,Zona Norte,1.0,3.0,Sobrado - 03 dorm...,ACTIVE,False,6.0,RENTAL,0.0,MONTHLY,2800,2800,161.0,STANDARD,2.0,SãO PAULO - Casa ...,0.0,HOME,0.0
BR,São Paulo,Vila Penteado,São Paulo,Rua Emílio Kemp,Zona Norte,1.0,1.0,Trata-se de uma c...,ACTIVE,False,,RENTAL,0.0,MONTHLY,454,454,17.0,STANDARD,0.0,casa - Vila Pente...,0.0,HOME,0.0
BR,São Paulo,Campo Belo,São Paulo,Rua Nicolau Zarvos,Zona Sul,0.0,3.0,CASA TERREA NO JA...,ACTIVE,False,5.0,RENTAL,0.0,MONTHLY,2660,2660,0.0,STANDARD,0.0,CASA RESIDENCIAL ...,250.0,HOME,250.0
BR,São Paulo,Vila Medeiros,São Paulo,Rua Joaquim de So...,Zona Norte,1.0,1.0,,ACTIVE,False,0.0,RENTAL,0.0,MONTHLY,630,630,37.0,STANDARD,0.0,CASA TÉRREA-VILA ...,1.0,HOME,1.0
BR,São Paulo,Vila Noca,São Paulo,Rua José Fava,Zona Sul,4.0,2.0,Sobrado bem local...,ACTIVE,False,2.0,RENTAL,,MONTHLY,2240,2240,280.0,STANDARD,2.0,SOBRADO 02 SUÍTES,275.0,HOME,155.0
,São Paulo,Parque Casa de Pedra,São Paulo,Rua do Buruí,Zona Norte,2.0,3.0,Cod- 717Térreo- 3...,ACTIVE,False,4.0,RENTAL,0.0,MONTHLY,1750,1750,60.0,STANDARD,1.0,Venda ou Locação ...,125.0,HOME,151.0
,São Paulo,Parque Taipas,São Paulo,,Zona Norte,,,CODIGO 0063 &lt;...,ACTIVE,False,,RENTAL,0.0,MONTHLY,525,525,0.0,STANDARD,,Casa a Locação em...,,HOME,
