### Pergunta a ser respondida
<p> Quanto vale o aluguel da sua casa?</p>

In [1]:
# Bibliotecas
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation / FutureWarning messages emitted by
# pandas and matplotlib — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# pandas display configuration: show up to 200 rows and 100 columns
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

# Matplotlib configuration
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases removed the old names, so use whichever name this
# installation provides.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)
# Set the figure size after applying the style so the explicit value always wins.
plt.rcParams['figure.figsize'] = (15, 6)

### Exploração dos dados

In [3]:
# Load the house data CSV (path is relative to the working directory)
base_dados = pd.read_csv('data/house_data.csv')

In [4]:
# Dimensions of the DataFrame as (rows, columns)
base_dados.shape

(10692, 13)

In [5]:
# Preview the first rows of the data (head() returns 5 rows by default)
base_dados.head()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$),total (R$)
0,São Paulo,70,2,1,1,7,acept,furnished,2065,3300,211,42,5618
1,São Paulo,320,4,4,0,20,acept,not furnished,1200,4960,1750,63,7973
2,Porto Alegre,80,1,1,1,6,acept,not furnished,1000,2800,0,41,3841
3,Porto Alegre,51,2,1,0,2,acept,not furnished,270,1112,22,17,1421
4,São Paulo,25,1,1,0,1,not acept,not furnished,0,800,25,11,836


In [6]:
# Drop the fire-insurance and total columns.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
base_dados = base_dados.drop(
    columns=['fire insurance (R$)', 'total (R$)'],
    errors='ignore',
)

In [7]:
# List the column labels currently present in the DataFrame
base_dados.columns

Index(['city', 'area', 'rooms', 'bathroom', 'parking spaces', 'floor',
       'animal', 'furniture', 'hoa (R$)', 'rent amount (R$)',
       'property tax (R$)'],
      dtype='object')

In [8]:
# Count missing values per column, largest count first
(
    base_dados
    .isna()
    .sum()
    .sort_values(ascending=False)
)

property tax (R$)    0
rent amount (R$)     0
hoa (R$)             0
furniture            0
animal               0
floor                0
parking spaces       0
bathroom             0
rooms                0
area                 0
city                 0
dtype: int64

In [9]:
# Number of distinct values in each column
base_dados.nunique()

city                    5
area                  517
rooms                  11
bathroom               10
parking spaces         11
floor                  35
animal                  2
furniture               2
hoa (R$)             1679
rent amount (R$)     1195
property tax (R$)    1243
dtype: int64

In [10]:
# Data type of each column
base_dados.dtypes

city                 object
area                  int64
rooms                 int64
bathroom              int64
parking spaces        int64
floor                object
animal               object
furniture            object
hoa (R$)              int64
rent amount (R$)      int64
property tax (R$)     int64
dtype: object

In [11]:
# How many columns there are of each dtype
base_dados.dtypes.value_counts()

int64     7
object    4
dtype: int64

### Exploração Analítica (EDA)

In [12]:
# Split the column labels by dtype: object columns are treated as categorical,
# every other column as numeric.
mascara_objeto = base_dados.dtypes == object
colunas_categoricas = base_dados.columns[mascara_objeto]
colunas_numericas = base_dados.columns[~mascara_objeto]

In [13]:
# Display both label groups as a (categorical, numeric) tuple
colunas_categoricas, colunas_numericas

(Index(['city', 'floor', 'animal', 'furniture'], dtype='object'),
 Index(['area', 'rooms', 'bathroom', 'parking spaces', 'hoa (R$)',
        'rent amount (R$)', 'property tax (R$)'],
       dtype='object'))

### Mudança do modelo no tempo
<p>Por exemplo:</p>
<ul>
<li>No primeiro semestre: 90% das casas aceitavam animais.</li>
<li>No segundo semestre: 50% das casas aceitavam animais.</li>
    <li><b>Resultado:</b> os padrões da base de dados mudaram, o que tende a <b>piorar o resultado do modelo</b>.</li>
</ul>

#### Fazer uma fotografia da base de dados no TEMPO
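<p>Caso a base de dados tenha uma coluna de data, ela ajudará a acompanhar essas mudanças.</p>
<p>Caso não tenha, faça uma <b>fotografia</b> (um registro das distribuições atuais) para comparar com dados futuros.</p>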

In [14]:
# Inspect the object-typed fields
# city: number of rows per city
base_dados['city'].value_counts()

São Paulo         5887
Rio de Janeiro    1501
Belo Horizonte    1258
Porto Alegre      1193
Campinas           853
Name: city, dtype: int64

In [15]:
# normalize=True turns the counts into fractions of the total; multiplying by
# 100 expresses them as percentages. The cell displays only its last
# expression, so the percentage is the single statement kept here.
base_dados['city'].value_counts(normalize=True) * 100

São Paulo         55.059858
Rio de Janeiro    14.038533
Belo Horizonte    11.765806
Porto Alegre      11.157875
Campinas           7.977927
Name: city, dtype: float64

In [16]:
# Print the percentage share of each value for every categorical column,
# with a blank line after each column's table
for coluna in colunas_categoricas:
    analise = base_dados[coluna].value_counts(normalize=True).mul(100)
    print(coluna, analise, '', sep='\n')

city
São Paulo         55.059858
Rio de Janeiro    14.038533
Belo Horizonte    11.765806
Porto Alegre      11.157875
Campinas           7.977927
Name: city, dtype: float64

floor
-      23.017209
1      10.110363
2       9.212495
3       8.707445
4       6.995885
5       5.611672
6       5.041152
7       4.648335
8       4.582866
9       3.451178
10      3.338945
11      2.833895
12      2.403666
13      1.870557
14      1.589974
15      1.374860
16      1.019454
17      0.897868
18      0.701459
19      0.495698
20      0.411523
21      0.392817
25      0.233820
23      0.233820
22      0.224467
26      0.187056
24      0.177703
27      0.074822
28      0.056117
29      0.046764
32      0.018706
301     0.009353
51      0.009353
46      0.009353
35      0.009353
Name: floor, dtype: float64

animal
acept        77.777778
not acept    22.222222
Name: animal, dtype: float64

furniture
not furnished    75.626637
furnished        24.373363
Name: furniture, dtype: float64



#### Tirar a fotografia daqui ^

In [17]:
### Fixing the floor data

# Show the row(s) whose floor is recorded as the string '301'
base_dados[base_dados['floor'].eq('301')]


Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$)
2562,Belo Horizonte,80,3,2,2,301,acept,not furnished,750,2600,164


In [18]:
# Read a single cell by position with .iloc: row position 2562, column position 5
# (position 5 is 'floor' in the column listing shown earlier in the notebook)
base_dados.iloc[ 2562, 5 ]

'301'

In [19]:
# Correct the floor recorded as '301' to 30.
# NOTE(review): treating 301 as a typo for 30 follows the original analysis —
# confirm against the data source.
# Selecting by value and column label instead of hard-coded positions
# (row 2562, column 5) keeps the fix correct if the row order or the column
# layout changes, and makes the cell a no-op when it is re-run.
base_dados.loc[base_dados['floor'] == '301', 'floor'] = 30

In [20]:
# Re-read the same cell to confirm the corrected value
base_dados.iloc[ 2562, 5 ]

30

In [21]:
# Replace the '-' placeholder in floor with 0.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column selection: under pandas Copy-on-Write that selection is a
# temporary object, so the in-place form would leave base_dados unchanged.
base_dados['floor'] = base_dados['floor'].replace('-', 0)

In [22]:
# Row count per floor value after the corrections
base_dados['floor'].value_counts()

0     2461
1     1081
2      985
3      931
4      748
5      600
6      539
7      497
8      490
9      369
10     357
11     303
12     257
13     200
14     170
15     147
16     109
17      96
18      75
19      53
20      44
21      42
25      25
23      25
22      24
26      20
24      19
27       8
28       6
29       5
32       2
35       1
51       1
30       1
46       1
Name: floor, dtype: int64

In [23]:
# Map the '-' placeholder to 0 and leave every other floor value untouched
base_dados['floor'] = base_dados['floor'].map(lambda andar: andar if andar != '-' else 0)

In [28]:
# Row count per floor value.
# NOTE(review): the output stored under this cell shows fractions (float64),
# which matches value_counts(normalize=True) rather than this call, and the
# execution counter is out of sequence — re-run the notebook top to bottom.
base_dados['floor'].value_counts()

0     0.230172
1     0.101104
2     0.092125
3     0.087074
4     0.069959
5     0.056117
6     0.050412
7     0.046483
8     0.045829
9     0.034512
10    0.033389
11    0.028339
12    0.024037
13    0.018706
14    0.015900
15    0.013749
16    0.010195
17    0.008979
18    0.007015
19    0.004957
20    0.004115
21    0.003928
25    0.002338
23    0.002338
22    0.002245
26    0.001871
24    0.001777
27    0.000748
28    0.000561
29    0.000468
32    0.000187
51    0.000094
30    0.000094
46    0.000094
35    0.000094
Name: floor, dtype: float64

In [25]:
# Convert floor to a numeric dtype; with its default settings pd.to_numeric
# raises if a value cannot be parsed as a number
base_dados['floor'] = pd.to_numeric(base_dados['floor'])

In [26]:
# Check the dtype of the floor column after the conversion
base_dados['floor'].dtype

dtype('int64')

In [27]:
# Preview the first rows again after the floor clean-up
base_dados.head()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$)
0,São Paulo,70,2,1,1,7,acept,furnished,2065,3300,211
1,São Paulo,320,4,4,0,20,acept,not furnished,1200,4960,1750
2,Porto Alegre,80,1,1,1,6,acept,not furnished,1000,2800,0
3,Porto Alegre,51,2,1,0,2,acept,not furnished,270,1112,22
4,São Paulo,25,1,1,0,1,not acept,not furnished,0,800,25


In [31]:
# Share of rows per floor value, as fractions of the total (normalize=True)
base_dados['floor'].value_counts(normalize=True)

0     0.230172
1     0.101104
2     0.092125
3     0.087074
4     0.069959
5     0.056117
6     0.050412
7     0.046483
8     0.045829
9     0.034512
10    0.033389
11    0.028339
12    0.024037
13    0.018706
14    0.015900
15    0.013749
16    0.010195
17    0.008979
18    0.007015
19    0.004957
20    0.004115
21    0.003928
25    0.002338
23    0.002338
22    0.002245
26    0.001871
24    0.001777
27    0.000748
28    0.000561
29    0.000468
32    0.000187
51    0.000094
30    0.000094
46    0.000094
35    0.000094
Name: floor, dtype: float64