In [34]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn. preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from dtype_diet import optimize_dtypes, report_on_dataframe

# Importando e limpando os dados

In [14]:
impute = SimpleImputer()

houses_df = pd.read_csv('melb_data.csv')
cols_with_null = [col for col in houses_df.columns if houses_df[col].isnull().any()]

houses_df.drop('CouncilArea', axis=1, inplace=True)
houses_df[cols_with_null] = pd.DataFrame(impute.fit_transform(houses_df[cols_with_null]))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13580 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   13580 non-null  float64
 15  YearBuilt      13580 non-null  float64
 16  Lattitude      13580 non-null  float64
 17  Longtitude     13580 non-null  float64
 18  Region

# DEIXAR O DATAFRAME MAIS OPTIMIZADO

In [30]:
houses_df = pd.read_csv('melb_data.csv')

report_df = report_on_dataframe(houses_df)
houses_df = optimize_dtypes(houses_df, report_df)  # reatribuindo com tipos mais performáticos

houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Suburb         13580 non-null  category
 1   Address        13580 non-null  object  
 2   Rooms          13580 non-null  int8    
 3   Type           13580 non-null  category
 4   Price          13580 non-null  float32 
 5   Method         13580 non-null  category
 6   SellerG        13580 non-null  category
 7   Date           13580 non-null  category
 8   Distance       13580 non-null  float64 
 9   Postcode       13580 non-null  float32 
 10  Bedroom2       13580 non-null  float16 
 11  Bathroom       13580 non-null  float16 
 12  Car            13518 non-null  float16 
 13  Landsize       13580 non-null  float32 
 14  BuildingArea   7130 non-null   float64 
 15  YearBuilt      8205 non-null   float16 
 16  CouncilArea    12211 non-null  category
 17  Lattitude      13580 non-null  

# VAMOS TRABALHAR COM PIPELINES

In [39]:
y = houses_df['Price']
X = houses_df.drop('Price', axis=1)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=0)

categorical_cols = [col for col in X_train_full.columns if X_train_full[col].dtype == 'category']
numerical_cols = [col for col in X_train_full.columns if X_train_full[col].dtype in ['int8', 'float16', 'float32', 'float64']]

my_cols = categorical_cols + numerical_cols

X_train = X_train_full.copy()
X_valid = X_valid_full.copy()
X_train.info()


<class 'pandas.core.frame.DataFrame'>
Index: 10864 entries, 12167 to 2732
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Suburb         10864 non-null  category
 1   Address        10864 non-null  object  
 2   Rooms          10864 non-null  int8    
 3   Type           10864 non-null  category
 4   Method         10864 non-null  category
 5   SellerG        10864 non-null  category
 6   Date           10864 non-null  category
 7   Distance       10864 non-null  float64 
 8   Postcode       10864 non-null  float32 
 9   Bedroom2       10864 non-null  float16 
 10  Bathroom       10864 non-null  float16 
 11  Car            10815 non-null  float16 
 12  Landsize       10864 non-null  float32 
 13  BuildingArea   5708 non-null   float64 
 14  YearBuilt      6557 non-null   float16 
 15  CouncilArea    9792 non-null   category
 16  Lattitude      10864 non-null  float64 
 17  Longtitude     10864 non-null  fl

In [37]:
numerical_transform = SimpleImputer(strategy='constant')  # pre processamento de variaveis numericas

categorical_transform = Pipeline(steps=[  # pre processamento de variaveis categoricas
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


# JUNÇÃO DE TODOS OS PRE PROCESSAMENTOS
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transform, numerical_cols),
        ('cat', categorical_transform, categorical_cols),
    ]
)

In [40]:
model = RandomForestRegressor(n_estimators=100, random_state=0)

my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

my_pipeline.fit(X_train, y_train)

pred = my_pipeline.predict(X_valid)
mea = mean_absolute_error(y_valid, pred)

print(mea)

160628.49889543446
