In [2]:
!pip install category_encoders
!pip install pandas
!pip install scikit-learn

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
     -------------------------------------- 81.9/81.9 kB 906.6 kB/s eta 0:00:00
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [3]:
import pandas as pd
import numpy as np

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_percentage_error,
    mean_absolute_error)

from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV


In [6]:
def load_data(train_path:str, test_path:str) -> (pd.DataFrame, pd.DataFrame):
    """
    Loads the train and test data into pandas DataFrames
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    return train, test

train, test = load_data('train.csv', 'test.csv')

In [7]:
train_cols = [
    col for col in train.columns if col not in ['id', 'target']
    ]

categorical_cols = ["type", "sector"]
target           = "price"

In [8]:
categorical_transformer = TargetEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('categorical',
          categorical_transformer,
          categorical_cols)
    ])

steps = [
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(**{
        "learning_rate":0.01,
        "n_estimators":300,
        "max_depth":5,
        "loss":"absolute_error"
    }))
]

pipeline = Pipeline(steps)


In [9]:
pipeline.fit(train[train_cols], train[target])

               type        sector  net_usable_area  net_area  n_rooms  \
0      departamento      vitacura            140.0     170.0      4.0   
1              casa      la reina            225.0     659.0      4.0   
2              casa    las condes            110.0     200.0      3.0   
3      departamento  lo barnechea            250.0     250.0      3.0   
4      departamento   providencia             70.0      79.0      2.0   
...             ...           ...              ...       ...      ...   
16207  departamento    las condes            140.0     176.0      4.0   
16208          casa    las condes            600.0    3683.0      5.0   
16209  departamento   providencia             68.0      75.0      3.0   
16210  departamento    las condes             74.0      92.0      2.0   
16211  departamento    las condes            140.0     154.0      4.0   

       n_bathroom  latitude  longitude  price  
0             4.0 -33.40123  -70.58056  11900  
1             3.0 -33.44340

In [14]:
print(test[train_cols])

              type       sector  net_usable_area  net_area  n_rooms  \
0             casa     vitacura            152.0     257.0      3.0   
1     departamento   las condes            140.0     165.0      4.0   
2     departamento     la reina            101.0     101.0      4.0   
3     departamento  providencia             80.0     112.0      1.0   
4     departamento     vitacura            200.0     200.0      3.0   
...            ...          ...              ...       ...      ...   
6944  departamento        nunoa             45.0      57.0      1.0   
6945  departamento  providencia             66.0      78.0      2.0   
6946  departamento   las condes             58.0      58.0      1.0   
6947  departamento   las condes            135.0     135.0      4.0   
6948  departamento  providencia            111.0     152.0      3.0   

      n_bathroom  latitude  longitude  price  
0            3.0 -33.37940  -70.54470  18500  
1            4.0 -33.41135  -70.56977  14500  
2     

In [16]:
test_predictions = pipeline.predict(test[train_cols])
test_target = test[target].values
print(test_predictions)

[21534.11150632 10450.68312672  7935.39629154 ... 10450.68312672
 10450.68312672  7424.73125166]


In [34]:
type(test_predictions), type(test_target)

(numpy.ndarray, numpy.ndarray)

In [35]:
def print_metrics(predictions, target):
    print("RMSE: ", np.sqrt(mean_squared_error(predictions, target)))
    print("MAPE: ", mean_absolute_percentage_error(predictions, target))
    print("MAE : ", mean_absolute_error(predictions, target))

In [36]:
print_metrics(test_predictions, test_target)

RMSE:  10254.155686652393
MAPE:  0.40042979298798137
MAE :  5859.374796053153
