In [2]:
#Libs necessárias
import pandas as pd
import numpy as np

#Libs gráficas
import matplotlib.pyplot as plt
import seaborn as sns

# Warnings: silence every warning category for the rest of the session
import warnings
warnings.simplefilter('ignore')

# Pandas display limits: show up to 200 rows and 100 columns
pd.options.display.max_rows = 200
pd.options.display.max_columns = 100

# Matplotlib configuration
# The style name 'seaborn-darkgrid' was deprecated in Matplotlib 3.6 and
# dropped in later releases, where the same style sheet is called
# 'seaborn-v0_8-darkgrid'. Use whichever name this installation provides and
# keep Matplotlib's current style if neither is available.
_darkgrid_candidates = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
_darkgrid_available = [s for s in _darkgrid_candidates if s in plt.style.available]
if _darkgrid_available:
    plt.style.use(_darkgrid_available[0])

# Set the default figure size after applying the style so that a style sheet
# can never override it.
plt.rcParams["figure.figsize"] = (15, 6)

### Exploração dos dados

In [3]:
# Load the dataset from the CSV file in the working directory
base_dados = pd.read_csv(filepath_or_buffer='house_data.csv')

# Dimensions as (rows, columns)
base_dados.shape

(10692, 13)

In [4]:
# Quick check: preview the first rows of the DataFrame
base_dados.head()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$),total (R$)
0,São Paulo,70,2,1,1,7,acept,furnished,2065,3300,211,42,5618
1,São Paulo,320,4,4,0,20,acept,not furnished,1200,4960,1750,63,7973
2,Porto Alegre,80,1,1,1,6,acept,not furnished,1000,2800,0,41,3841
3,Porto Alegre,51,2,1,0,2,acept,not furnished,270,1112,22,17,1421
4,São Paulo,25,1,1,0,1,not acept,not furnished,0,800,25,11,836


In [5]:
# Drop the 'fire insurance (R$)' and 'total (R$)' columns.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell re-runnable: executing it a second time no
# longer raises KeyError because the columns are already gone.
base_dados = base_dados.drop(
    columns=['fire insurance (R$)', 'total (R$)'],
    errors='ignore',
)

In [6]:
# Check the DataFrame dimensions as (rows, columns)
base_dados.shape

(10692, 11)

In [7]:
# Missing values per column, largest count first
(
    base_dados
    .isna()
    .sum()
    .sort_values(ascending=False)
)

city                 0
area                 0
rooms                0
bathroom             0
parking spaces       0
floor                0
animal               0
furniture            0
hoa (R$)             0
rent amount (R$)     0
property tax (R$)    0
dtype: int64

In [8]:
# Number of distinct values in each column
base_dados.nunique()

city                    5
area                  517
rooms                  11
bathroom               10
parking spaces         11
floor                  35
animal                  2
furniture               2
hoa (R$)             1679
rent amount (R$)     1195
property tax (R$)    1243
dtype: int64

In [9]:
# Column dtypes, non-null counts and memory usage
base_dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10692 entries, 0 to 10691
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   city               10692 non-null  object
 1   area               10692 non-null  int64 
 2   rooms              10692 non-null  int64 
 3   bathroom           10692 non-null  int64 
 4   parking spaces     10692 non-null  int64 
 5   floor              10692 non-null  object
 6   animal             10692 non-null  object
 7   furniture          10692 non-null  object
 8   hoa (R$)           10692 non-null  int64 
 9   rent amount (R$)   10692 non-null  int64 
 10  property tax (R$)  10692 non-null  int64 
dtypes: int64(7), object(4)
memory usage: 919.0+ KB


In [10]:
# Number of columns per dtype
base_dados.dtypes.value_counts()

int64     7
object    4
dtype: int64

### Análise Exploratória dos Dados (EDA)

In [11]:
# Split the column names into numeric and non-numeric (categorical) groups.
# Selecting on 'number' is explicit about what counts as numeric; comparing
# dtypes against `object` would label every non-object column (datetime, bool,
# ...) as numeric and would miss text columns when pandas stores them with a
# dedicated string dtype instead of object.
col_numericas = base_dados.select_dtypes(include='number').columns
col_categorias = base_dados.select_dtypes(exclude='number').columns

# Display both groups
col_categorias, col_numericas


(Index(['city', 'floor', 'animal', 'furniture'], dtype='object'),
 Index(['area', 'rooms', 'bathroom', 'parking spaces', 'hoa (R$)',
        'rent amount (R$)', 'property tax (R$)'],
       dtype='object'))

In [12]:
# Inspect an object column: share of rows per city, in percent
base_dados['city'].value_counts(normalize=True).mul(100)

São Paulo         55.059858
Rio de Janeiro    14.038533
Belo Horizonte    11.765806
Porto Alegre      11.157875
Campinas           7.977927
Name: city, dtype: float64

In [13]:

for coluna in col_categorias:
    # Relative frequency of each distinct value, expressed in percent
    analise = base_dados[coluna].value_counts(normalize=True).mul(100)

    # Print the column name followed by its distribution
    print(coluna, '\n', analise, '\n')
 

city 
 São Paulo         55.059858
Rio de Janeiro    14.038533
Belo Horizonte    11.765806
Porto Alegre      11.157875
Campinas           7.977927
Name: city, dtype: float64 

floor 
 -      23.017209
1      10.110363
2       9.212495
3       8.707445
4       6.995885
5       5.611672
6       5.041152
7       4.648335
8       4.582866
9       3.451178
10      3.338945
11      2.833895
12      2.403666
13      1.870557
14      1.589974
15      1.374860
16      1.019454
17      0.897868
18      0.701459
19      0.495698
20      0.411523
21      0.392817
25      0.233820
23      0.233820
22      0.224467
26      0.187056
24      0.177703
27      0.074822
28      0.056117
29      0.046764
32      0.018706
35      0.009353
46      0.009353
301     0.009353
51      0.009353
Name: floor, dtype: float64 

animal 
 acept        77.777778
not acept    22.222222
Name: animal, dtype: float64 

furniture 
 not furnished    75.626637
furnished        24.373363
Name: furniture, dtype: float64 



In [25]:
# Data corrections for the 'floor' column

# Replace the floor value '301' with 30.
# The row is selected by its value rather than by a hard-coded position
# (previously iloc[2562, 5]), so the fix does not depend on row or column
# order. The earlier standalone lookup was dropped because its result was
# discarded. On a re-run the column is already numeric, the mask is all False
# and nothing changes.
base_dados.loc[base_dados['floor'] == '301', 'floor'] = '30'

# Replace the '-' placeholder with 0 and convert the column to a numeric dtype.
# Series.replace with a scalar matches whole values only, exactly like the
# original `== '-'` comparison, but runs vectorised instead of row by row.
base_dados['floor'] = pd.to_numeric(base_dados['floor'].replace('-', '0'))

# Check the result
base_dados.head()


Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$)
0,São Paulo,70,2,1,1,7,acept,furnished,2065,3300,211
1,São Paulo,320,4,4,0,20,acept,not furnished,1200,4960,1750
2,Porto Alegre,80,1,1,1,6,acept,not furnished,1000,2800,0
3,Porto Alegre,51,2,1,0,2,acept,not furnished,270,1112,22
4,São Paulo,25,1,1,0,1,not acept,not furnished,0,800,25


In [23]:
# Show the 'floor' column converted to a numeric dtype.
# The result is not assigned, so this cell only displays the converted Series
# and does not modify base_dados.
pd.to_numeric(base_dados['floor'])

0         7
1        20
2         6
3         2
4         1
         ..
10687     5
10688    17
10689     8
10690     8
10691     0
Name: floor, Length: 10692, dtype: int64