# Housing Prices Advanced Regression Techniques
The housing prices dataset is a dataset used to demonstrate more advanced regression techniques. The dataset has numeric attributes, object (text) columns and missing values, and will require one-hot encoding, imputers and XGBoost to load, handle and model the data.

The goal of this machine learning project is to predict the sale price of each house based on the various features of the house.

In [None]:
import pandas as pd

def get_data(path='../house-prices-advanced-regression-techniques/train.csv',
             max_cardinality=10):
    """Load the housing training CSV and return one-hot encoded features and target.

    Rows whose ``SalePrice`` is missing are dropped. The ``Id`` and
    ``SalePrice`` columns are removed from the features. Of the remaining
    columns, only ``int64``/``float64`` columns and text columns with fewer
    than ``max_cardinality`` distinct values are kept; the text columns are
    then one-hot encoded with ``pd.get_dummies``. Missing feature values are
    not imputed here.

    Args:
        path: Location of the training CSV. Defaults to the path the notebook
            has always read from.
        max_cardinality: Exclusive upper bound on the number of distinct
            values a text column may have to be one-hot encoded.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the encoded feature DataFrame and
        ``y`` is the ``SalePrice`` Series.
    """
    train_data = pd.read_csv(path)

    # Rows without a target cannot be used for supervised training.
    train_data = train_data.dropna(axis=0, subset=['SalePrice'])

    y = train_data.SalePrice
    X = train_data.drop(['Id', 'SalePrice'], axis=1)

    # Checking the dtype with is_string_dtype accepts classic object columns
    # as well as pandas' dedicated string dtype, which a bare
    # `dtype == "object"` comparison would miss.
    low_cardinality_cols = [
        cname for cname in X.columns
        if pd.api.types.is_string_dtype(X[cname].dtype)
        and X[cname].nunique() < max_cardinality
    ]
    numeric_cols = [
        cname for cname in X.columns
        if X[cname].dtype in ['int64', 'float64']
    ]

    # Numeric columns first, then the encoded low-cardinality text columns.
    X_predictors = X[numeric_cols + low_cardinality_cols]
    return pd.get_dummies(X_predictors), y
    

# Random Forest and Cross Validation

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer

# Encoded features and target for the random-forest experiment.
X_rf, y_rf = get_data()

# Imputation is a pipeline step so that cross-validation fits it together
# with the model on each training split.
rf_steps = [
    ('preprocessor', SimpleImputer()),
    ('model', RandomForestRegressor(n_estimators=50, random_state=0)),
]
my_pipeline = Pipeline(steps=rf_steps)

# The 'neg_mean_absolute_error' scorer yields negated errors; flip the sign
# so that `scores` holds ordinary (positive) MAE values, one per fold.
neg_mae_per_fold = cross_val_score(
    my_pipeline,
    X_rf,
    y_rf,
    cv=5,
    scoring='neg_mean_absolute_error',
)
scores = -1 * neg_mae_per_fold

print("Mean Absolute Error CV:\n", scores.mean())

# XGBoost and Mean Absolute Error

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Encoded features and target for the XGBoost hold-out experiment.
X_xg, y_xg = get_data()

# Hold out 30% of the rows as a test set that is used only for the final score.
X_train, X_test, y_train, y_test = train_test_split(X_xg, y_xg, train_size=0.7, test_size=0.3, random_state=0)

# Early stopping needs its own validation data: monitoring the test set and
# then scoring on that same test set would make the reported error
# optimistically biased. Carve the validation rows out of the training split.
X_fit, X_valid, y_fit, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# Imputer: statistics are learned from the fitting rows only and then applied
# unchanged to the validation and test rows.
my_imputer = SimpleImputer()
X_fit = my_imputer.fit_transform(X_fit)
X_valid = my_imputer.transform(X_valid)
X_test = my_imputer.transform(X_test)

# Training the model for XGBoost. `early_stopping_rounds` is given to the
# estimator itself, which is where current XGBoost releases expect it.
my_model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05, early_stopping_rounds=5)
my_model.fit(X_fit, y_fit, eval_set=[(X_valid, y_valid)], verbose=False)

# Predicting on the untouched test set; arguments are in (y_true, y_pred)
# order. round(..., -1) reports the error to the nearest ten.
predictions = my_model.predict(X_test)
mae = round(mean_absolute_error(y_test, predictions), -1)

# Output
print("Mean Absolute Error: " + str(mae))

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Reuse the shared loader instead of repeating its column-selection and
# encoding logic here, so this experiment always sees the same features as
# the other cells that call get_data().
X, y = get_data()

# Imputation is a pipeline step so that cross-validation fits it together
# with the model on each training split.
my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()),
                              ('model', xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05))
                             ])

# The scorer returns negated MAE values; multiply by -1 to report positive errors.
scores = -1 * cross_val_score(my_pipeline, X, y,
                              cv=5,
                              scoring='neg_mean_absolute_error')

print("Mean Absolute Error CV:\n", scores.mean())