In [7]:
import pickle
import pathlib
from sklearn.model_selection import train_test_split
import pandas as pd

def get_data(data_dir=None, test_size=0.25, random_state=42):
    """Load the cleaned Ames frame, encode it numerically and split it.

    Steps:
      1. Unpickle ``<data_dir>/processed/ames_clean.pkl``.
      2. Replace every *ordered* categorical column with the integer codes
         returned by ``pd.factorize(..., sort=True)`` (per the pandas docs,
         missing values receive the sentinel code ``-1``).
      3. One-hot encode the remaining non-numeric columns with
         ``pd.get_dummies(..., drop_first=True)``.
      4. Separate the ``SalePrice`` column as the target and split.

    Parameters
    ----------
    data_dir : path-like, optional
        Directory that contains ``processed/ames_clean.pkl``. Defaults to the
        ``data`` folder next to the current working directory.
    test_size : float, optional
        Forwarded to ``train_test_split``.
    random_state : int, optional
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(Xtrain, Xtest, ytrain, ytest)``.

    Raises
    ------
    FileNotFoundError
        If the pickle file does not exist.
    KeyError
        If the loaded frame has no ``SalePrice`` column.
    """
    if data_dir is None:
        DATA_DIR = pathlib.Path.cwd().parent / 'data'
    else:
        DATA_DIR = pathlib.Path(data_dir)
    print(DATA_DIR)

    clean_data_path = DATA_DIR / 'processed' / 'ames_clean.pkl'
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by this project's own preprocessing step.
    with open(clean_data_path, 'rb') as file:
        data = pickle.load(file)

    model_data = data.copy()

    # Only ordered categoricals get integer codes; unordered ones are left
    # for get_dummies below.
    ordinal_columns = [
        col
        for col in model_data.select_dtypes('category').columns
        if model_data[col].cat.ordered
    ]
    for col in ordinal_columns:
        codes, _ = pd.factorize(data[col], sort=True)
        model_data[col] = codes

    # drop_first=True removes one level per encoded column so the dummy
    # columns of a feature are not perfectly collinear.
    model_data = pd.get_dummies(model_data, drop_first=True)

    X = model_data.drop(columns=['SalePrice'])
    y = model_data['SalePrice'].copy()

    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )

    return Xtrain, Xtest, ytrain, ytest


In [8]:
# Build the train/test split once; the pipeline cells below all fit on
# Xtrain/ytrain and score on Xtest/ytest.
Xtrain, Xtest, ytrain, ytest = get_data()

/Users/marcelomarchetto/Desktop/projetinho ml/data


# Pipeline com scalers

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures    

# Baseline pipeline: plain linear regression on the features as returned by
# get_data(). The scaling and polynomial-expansion steps are commented out,
# so the regressor is currently the only step.
pipe = Pipeline([
    # ("scaler", StandardScaler()),
    # ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("model", LinearRegression()),
])


In [10]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Fit on the training split, then predict the held-out split.
pipe.fit(Xtrain, ytrain)
y_pred = pipe.predict(Xtest)

# Root mean squared error, expressed in the units of the target.
RMSE = np.sqrt(mean_squared_error(ytest, y_pred))
print(f"RMSE: {RMSE}")

# Converts the RMSE into a percentage via 10**RMSE - 1.
# NOTE(review): this presumes the target is a base-10 logarithm - confirm.
error_percent = (10**RMSE - 1) * 100
print(f'Average error is {error_percent:.2f}%')

RMSE: 0.061127758761017816
Average error is 15.11%


# Pipeline testando Ridge, Lasso e ElasticNet

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import  Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, PolynomialFeatures    

# Ridge regression with its default settings; the scaling and polynomial
# steps are commented out, so the regressor is the only step.
# NOTE(review): the L2 penalty acts on coefficient magnitudes, so fitting on
# unscaled features penalises them unevenly - consider enabling the scaler.
pipe = Pipeline([
    # ("scaler", StandardScaler()),
    # ("poly", PolynomialFeatures(degree=2)),
    ("model", Ridge()),
])

In [12]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Fit on the training split, then predict the held-out split.
pipe.fit(Xtrain, ytrain)
y_pred = pipe.predict(Xtest)

# Root mean squared error, expressed in the units of the target.
RMSE = np.sqrt(mean_squared_error(ytest, y_pred))
print(f"RMSE: {RMSE}")

# Converts the RMSE into a percentage via 10**RMSE - 1.
# NOTE(review): this presumes the target is a base-10 logarithm - confirm.
error_percent = (10**RMSE - 1) * 100
print(f'Average error is {error_percent:.2f}%')

RMSE: 0.061075854853779525
Average error is 15.10%


# Pipeline com Random Forest

In [13]:
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures    

# Random forest fitted on a cubic polynomial expansion of the features.
# NOTE(review): a degree-3 expansion yields on the order of n_features**3 / 6
# columns, which can be very costly in memory and time on a one-hot-encoded
# frame - confirm this is intended before re-running.
pipe = Pipeline([
    # ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=3, include_bias=False)),
    # Fixed seed (same value used for the train/test split) so the forest,
    # and therefore the reported RMSE, is reproducible across runs.
    ("model", RandomForestRegressor(random_state=42)),
])

In [14]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Fit on the training split, then predict the held-out split.
pipe.fit(Xtrain, ytrain)
y_pred = pipe.predict(Xtest)

# Root mean squared error, expressed in the units of the target.
RMSE = np.sqrt(mean_squared_error(ytest, y_pred))
print(f"RMSE: {RMSE}")

# Converts the RMSE into a percentage via 10**RMSE - 1.
# NOTE(review): this presumes the target is a base-10 logarithm - confirm.
error_percent = (10**RMSE - 1) * 100
print(f'Average error is {error_percent:.2f}%')

Otimização de hiperparâmetros
