# 0.0 Imports

In [13]:
import re
import os
import inflection

import pandas  as  pd
import numpy   as  np 
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## 0.1 Helper Functions

## 0.2 Loading data

In [3]:
path = "../data/"
df_raw = pd.read_csv(path + "/raw/house_data.csv", sep = ",")

In [5]:
df_raw.sample()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$),total (R$)
5895,Rio de Janeiro,78,2,2,1,2,acept,not furnished,180,1650,0,26,1856


In [9]:
# Will not use columns fire insurance (R$) and total (R$)
df_raw.drop(columns= ["fire insurance (R$)", "total (R$)"], inplace = True)
df_raw.head()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$)
0,São Paulo,70,2,1,1,7,acept,furnished,2065,3300,211
1,São Paulo,320,4,4,0,20,acept,not furnished,1200,4960,1750
2,Porto Alegre,80,1,1,1,6,acept,not furnished,1000,2800,0
3,Porto Alegre,51,2,1,0,2,acept,not furnished,270,1112,22
4,São Paulo,25,1,1,0,1,not acept,not furnished,0,800,25


# 1.0 Data Description

In [8]:
df1 = df_raw

## 1.1 Rename Columns

In [28]:
df1.columns

Index(['city', 'area', 'rooms', 'bathroom', 'parking_spaces', 'floor',
       'animal', 'furniture', 'hoa_(R$)', 'rent_amount_(R$)',
       'property_tax_(R$)', 'fire_insurance_(R$)', 'total_(R$)'],
      dtype='object')

In [29]:
cols_new = ['city',
'area',
'rooms',
'bathroom',
'parking_spaces',
'floor',
'animal',
'furniture',
'hoa_(R$)',
'rent_amount_(R$)',
'property_tax_(R$)',
'fire_insurance_(R$)',
'total_(R$)']

df1.columns = cols_new

## 1.2. Data Dimensions

In [30]:
print('Number of Rows: {}'.format(df1.shape[0]))
print('Number of Columns: {}'.format(df1.shape[1]))

Number of Rows: 10692
Number of Columns: 13


## 1.3. Data Types

In [31]:
df1.dtypes

city                   object
area                    int64
rooms                   int64
bathroom                int64
parking_spaces          int64
floor                  object
animal                 object
furniture              object
hoa_(R$)                int64
rent_amount_(R$)        int64
property_tax_(R$)       int64
fire_insurance_(R$)     int64
total_(R$)              int64
dtype: object

## 1.4. Check NA

In [32]:
df1.isnull().sum()

city                   0
area                   0
rooms                  0
bathroom               0
parking_spaces         0
floor                  0
animal                 0
furniture              0
hoa_(R$)               0
rent_amount_(R$)       0
property_tax_(R$)      0
fire_insurance_(R$)    0
total_(R$)             0
dtype: int64

## 1.5. Check Unique Values

In [33]:
df1.nunique()

city                      5
area                    517
rooms                    11
bathroom                 10
parking_spaces           11
floor                    35
animal                    2
furniture                 2
hoa_(R$)               1679
rent_amount_(R$)       1195
property_tax_(R$)      1243
fire_insurance_(R$)     216
total_(R$)             5751
dtype: int64

## 1.6. Descriptive Statistical

In [36]:
df1.dtypes

city                   object
area                    int64
rooms                   int64
bathroom                int64
parking_spaces          int64
floor                  object
animal                 object
furniture              object
hoa_(R$)                int64
rent_amount_(R$)        int64
property_tax_(R$)       int64
fire_insurance_(R$)     int64
total_(R$)              int64
dtype: object

In [39]:
num_attributes = df1.select_dtypes(include = ["int64"])
cat_attributes = df1.select_dtypes(exclude = ["int64"])

In [40]:
# Central Tendency - mean, median
ct1 = pd.DataFrame( num_attributes.apply(np.mean)).T
ct2 = pd.DataFrame( num_attributes.apply(np.median)).T

# Dispersion - std, min, max, range, skew, kurtosis
d1 = pd.DataFrame(num_attributes.apply(np.std)).T
d2 = pd.DataFrame(num_attributes.apply(min)).T
d3 = pd.DataFrame(num_attributes.apply(max)).T
d4 = pd.DataFrame(num_attributes.apply(lambda x: x.max() - x.min())).T
d5 = pd.DataFrame(num_attributes.apply(lambda x: x.skew())).T
d6 = pd.DataFrame(num_attributes.apply(lambda x: x.kurtosis())).T

# Concatenate
m = pd.concat( [ d2, d3, d4, ct1, ct2, d1, d5, d6]).T.reset_index()
m.columns = ["attributes","min","max","range","mean","median","std","skew","kurtosis"]

In [41]:
m

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,area,11.0,46335.0,46324.0,149.21792,90.0,536.991829,69.596804,5548.308334
1,rooms,1.0,13.0,12.0,2.506079,2.0,1.171211,0.702391,1.487659
2,bathroom,1.0,10.0,9.0,2.236813,2.0,1.407132,1.21381,1.134852
3,parking_spaces,0.0,12.0,12.0,1.609147,1.0,1.589446,1.487534,2.769075
4,hoa_(R$),0.0,1117000.0,1117000.0,1174.021698,560.0,15591.576074,69.039381,4912.249106
5,rent_amount_(R$),450.0,45000.0,44550.0,3896.247194,2661.0,3408.386117,1.838877,4.624228
6,property_tax_(R$),0.0,313700.0,313700.0,366.704358,125.0,3107.686983,96.013594,9667.782564
7,fire_insurance_(R$),3.0,677.0,674.0,53.300879,36.0,47.765797,1.9704,5.934963
8,total_(R$),499.0,1120000.0,1119501.0,5490.487,3581.5,16483.955004,58.960803,3926.019305


In [42]:
cat_attributes.apply(lambda x: x.unique().shape[0])

city          5
floor        35
animal        2
furniture     2
dtype: int64

# 2.0 Feature Engineering

# 3.0 Data Filtering

# 4.0 EDA

# 5.0 Data Preparation

# 6.0 Feature Selection

# 7.0 Machine Learning Modelling

# 8.0 Hyperparameter Fine Tuning

# 9.0 Translation and Interpretration Error