In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from dtype_diet import report_on_dataframe, optimize_dtypes

In [15]:
houses = pd.read_csv('melb_data.csv')
report = report_on_dataframe(houses)
houses = optimize_dtypes(houses, report)

In [16]:
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Suburb         13580 non-null  category
 1   Address        13580 non-null  object  
 2   Rooms          13580 non-null  int8    
 3   Type           13580 non-null  category
 4   Price          13580 non-null  float32 
 5   Method         13580 non-null  category
 6   SellerG        13580 non-null  category
 7   Date           13580 non-null  category
 8   Distance       13580 non-null  float64 
 9   Postcode       13580 non-null  float32 
 10  Bedroom2       13580 non-null  float16 
 11  Bathroom       13580 non-null  float16 
 12  Car            13518 non-null  float16 
 13  Landsize       13580 non-null  float32 
 14  BuildingArea   7130 non-null   float64 
 15  YearBuilt      8205 non-null   float16 
 16  CouncilArea    12211 non-null  category
 17  Lattitude      13580 non-null  

In [19]:
houses_features = houses.drop('Price', axis=1)
x = houses_features.select_dtypes(exclude=['object', 'category']) 
y = houses['Price']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2)

In [22]:
# definindo a qualidade do modelo

def dataset_score(x_train, x_test, y_train, y_test):
    model = RandomForestRegressor()
    model.fit(x_train, y_train)
    predict = model.predict(x_test)
    
    return mean_absolute_error(y_test, predict)

### DROP COLUMNS

In [31]:
names_with_missing_values = [col for col in x_train.columns if x_train[col].isnull().any()]

x_train_reduce = x_train.drop(names_with_missing_values, axis=1)
x_test_reduce = x_test.drop(names_with_missing_values, axis=1)

mae_drop_columns = dataset_score(x_train_reduce, x_test_reduce, y_train, y_test)

print(f"MAE WITH DROP COLUMNS IS: {mae_drop_columns}")

MAE WITH DROP COLUMNS IS: 180145.00965778658


### Imputer

In [34]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer()

x_train_imputed = pd.DataFrame(imputer.fit_transform(x_train))
x_test_imputed = pd.DataFrame(imputer.fit_transform(x_test))

In [36]:
mea_imputed_columns = dataset_score(x_train_imputed, x_test_imputed, y_train, y_test)
print(f"MAE WITH IMPUTED COLUMNS IS: {mea_imputed_columns}")


MAE WITH IMPUTED COLUMNS IS: 173572.4802872747
