# Predicting House Prices (Regression)

## 1) Import, Load data, Splitting

Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

# Regression models
from sklearn.tree import DecisionTreeRegressor

# Metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error     #for mean_squared_error and root_mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score               # good overall measure for linear relationships



Load data

In [None]:
# Dataset for iteration 7, shared through a Google Drive link.
url = 'https://drive.google.com/file/d/1WN_JAZVHmpi_KMk4Zgyv7S3G3BekEn82/view?usp=share_link'

# The file id is the second-to-last path segment of the share link.
file_id = url.split('/')[-2]

# Build the "uc?export=download" URL for that id and read it as a CSV.
download_prefix = 'https://drive.google.com/uc?export=download&id='
path = download_prefix + file_id
data = pd.read_csv(path)

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
data.info()

Split X and y

In [None]:
# Target: the sale price column. Features: every other column.
# Both are independent copies, so `data` itself stays untouched.
y = data['SalePrice'].copy()
X = data.drop(columns='SalePrice')

In [None]:
# Display the target series to check that the split worked.
y

Please drop the ID column before the train-test split

In [None]:
# for example Menna dropped these columns:
#X = X.drop(['Alley','PoolQC','Fence','MiscFeature' ,'FireplaceQu'],axis=1)

In [None]:
# Missing data: percentage of missing values per column,
# showing the 10 columns with the highest share.
missing_counts = data.isna().sum()
per = missing_counts / len(data) * 100
per.nlargest(10)

Split into train and test data

In [None]:
# Keep 80% of the rows for training; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=8,
)

## 2) EDA and Dummy Model

In [None]:
# Let's find the columns that are most strongly correlated with the target.
# Correlation is only defined for numeric columns, and pandas >= 2.0 no longer
# drops non-numeric columns silently, so restrict the frame explicitly.
numeric_data = data.select_dtypes(include="number")
numeric_data.corrwith(data['SalePrice']).sort_values(ascending=False).head()

In [None]:
# Scatter plot of living area against sale price, on the training data only.
living_area = X_train['GrLivArea']
sns.relplot(x=living_area, y=y_train, height=6)


Derive a simple rule of thumb from this relationship

In [None]:
# GrLivArea * 100

In [None]:
# Calculate the endpoint for the red line: the price the rule
# "GrLivArea * 100" predicts at GrLivArea = 5000.
5000 * 100

In [None]:
# Same scatter plot as before, now overlaid with the dummy model's
# prediction line (price = 100 * GrLivArea) drawn in red.
sns.relplot(x=X_train['GrLivArea'], y=y_train, height=6)

line_x = [0, 5000]
line_y = [0, 500000]
plt.plot(line_x, line_y, 'r-')
plt.show()

Yay! The dummy model looks OK. Let's check how it performs.

In [None]:
# Dummy model: predicted price = 100 * living area, computed for the test set.
price_per_area_unit = 100
dummy_model_predictions = X_test['GrLivArea'].mul(price_per_area_unit)
dummy_model_predictions

Evaluate the dummy model with the four regression scores:

In [None]:
# Score the dummy model on the test set with the four regression metrics.

# Mean absolute error
dummy_mae = mean_absolute_error(y_true=y_test, y_pred=dummy_model_predictions)

# Root mean squared error.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
dummy_rmse = mean_squared_error(y_true=y_test, y_pred=dummy_model_predictions) ** 0.5

# Mean absolute percentage error (returned as a fraction, e.g. 0.23 for 23%)
dummy_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=dummy_model_predictions)

# R2
dummy_r2 = r2_score(y_true=y_test, y_pred=dummy_model_predictions)

# One-row table so the scores can be read side by side.
pd.DataFrame({"MAE": [dummy_mae],
              "RMSE": [dummy_rmse],
              "MAPE": [dummy_mape],
              "R2": [dummy_r2]},
             index=["dummy_model"])

- The mean absolute error (MAE) tells us that our estimates are off by about 45,000 EUR on average.

- The root mean squared error (RMSE) is higher than the MAE. Because the RMSE squares the errors before averaging, this gap tells us that a few large errors (outliers) contribute disproportionately to it.

- The mean absolute percentage error (MAPE) tells us that our predictions are off by 23% on average.

- The R2 value tells us that our predictions explain 26% of the variance in the data.


## 3) Preprocessing

In [None]:
# Separate the training features by dtype: numeric columns vs. all others.
# Only the column sets are used later, to route columns to the right pipeline.
X_num = X_train.select_dtypes(include="number").copy()
X_cat = X_train.select_dtypes(exclude="number").copy()

### Numeric pipe

In [None]:
# Numeric columns: fill missing values with the column mean
# (the SimpleImputer default, spelled out here for readability).
numerical_pipe = make_pipeline(SimpleImputer(strategy="mean"))

### Ordinal pipe (OPTIONAL)

In [None]:
# Order of values for each ordinal column, as a list per column.
LotShape_cats = ["Reg", "IR1", "IR2", "IR3"]
Utilities_cats = ["AllPub", "NoSewr", "NoSeWa", "ELO"]
LandSlope_cats = ["Gtl", "Mod", "Sev"]

# Column name -> ordered levels. The insertion order of this mapping fixes
# the column order for both the indices and the category lists below, so the
# two always stay aligned.
ordinal_levels = {
    "LotShape": LotShape_cats,
    "Utilities": Utilities_cats,
    "LandSlope": LandSlope_cats,
}

# Positional indices of the ordinal columns in X - we need these later
# when the pipes are assembled in the ColumnTransformer.
ordinal_columns = X.columns.get_indexer(list(ordinal_levels))

# List of lists, one per ordinal column, in the same order as ordinal_columns.
ordinal_cats = list(ordinal_levels.values())

# Initialize the OrdinalEncoder with the ordered categories of each column.
ordinal_encoder = OrdinalEncoder(categories=ordinal_cats)

In [None]:
# Create the ordinal pipeline: impute first, then encode.
# Missing values are filled with the column's most frequent level instead of
# an "N_A" placeholder. The encoder was given explicit category lists that do
# not contain "N_A", so a placeholder value would be rejected as an unknown
# category and make fit/transform raise.
ordinal_pipe = make_pipeline(SimpleImputer(strategy="most_frequent"),
                             ordinal_encoder)

### OneHotEncoding pipe

In [None]:
# The remaining categorical columns go to OneHotEncoding - we will need this later.
# `ordinal_columns` holds integer positions, so translate them back to column
# names before comparing; subtracting positions from names removes nothing and
# would send the ordinal columns through the one-hot pipe as well.
ordinal_column_names = set(X.columns[ordinal_columns])

# A list comprehension (instead of a set difference) keeps the original column
# order, so the encoded feature order is the same on every run.
list_of_categorical_column_names_for_OHE = [
    column for column in X_cat.columns if column not in ordinal_column_names
]
ohe_columns = X.columns.get_indexer(list_of_categorical_column_names_for_OHE)

In [None]:
# Initialize the OneHotEncoder. With handle_unknown="ignore", a category that
# was not seen during fit does not raise an error at transform time.
ohe_encoder = OneHotEncoder(handle_unknown="ignore")

In [None]:
# One-hot pipeline: fill missing values with the constant placeholder "N_A",
# then one-hot encode (the placeholder becomes a category of its own).
ohe_imputer = SimpleImputer(strategy="constant", fill_value="N_A")
ohe_pipe = make_pipeline(ohe_imputer, ohe_encoder)

### All pipes together

In [None]:
# Final preprocessor: each pipeline is applied to its own set of columns.
# Entries are (name, pipeline, columns).
column_pipelines = [
    ("num_pipe", numerical_pipe, X_num.columns),
    ("ordinal_pipe", ordinal_pipe, ordinal_columns),
    ("one_hot_pipe", ohe_pipe, ohe_columns),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

## 4) Model with pipeline and Search

### DecisionTreeRegressor

In [None]:
# Full model: the preprocessor followed by a decision tree.
# The tree is seeded (with the same seed as the train/test split) so that
# re-running the notebook fits the same tree and yields the same scores.
dt_pipeline = make_pipeline(preprocessor,
                            DecisionTreeRegressor(random_state=8))

dt_pipeline

In [None]:
# Fit the preprocessor and the tree on the training data only.
dt_pipeline.fit(X_train,y_train)

In [None]:
# We want to compare the two models, so we predict on X_test - the same
# data the dummy model's predictions were computed for.
dt_predictions = dt_pipeline.predict(X_test)

In [None]:
# Evaluate the decision tree on the test set with the same four metrics
# that were used for the dummy model.
dt_mae = mean_absolute_error(y_true=y_test, y_pred=dt_predictions)

# RMSE as the explicit square root of the MSE: the `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6; this form works on all versions.
dt_rmse = mean_squared_error(y_true=y_test, y_pred=dt_predictions) ** 0.5

dt_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=dt_predictions)
dt_r2 = r2_score(y_true=y_test, y_pred=dt_predictions)

In [None]:
# Collect the scores of both models in one table: one row per model,
# one column per metric.
metric_names = ["MAE", "RMSE", "MAPE", "R2"]
results = pd.DataFrame(
    [
        [dummy_mae, dummy_rmse, dummy_mape, dummy_r2],
        [dt_mae, dt_rmse, dt_mape, dt_r2],
    ],
    index=["dummy_model", "decision_tree"],
    columns=metric_names,
)
results

In [None]:
# One bar chart per metric, comparing the two models side by side.
# The trailing semicolon suppresses the textual output of the cell.
results.plot(kind="bar", subplots=True, figsize=(4, 8));

**To put everything into a searchCV**
- create a paramgrid
- create the search (RandomizedSearchCV or GridSearchCV)
    - pipeline that contains the preprocessor and Tree
    - paramgrid
    - some more parameters
    - change the scoring to your measure of regression
        - "neg..." if you use a score that is minimized (MAE...)
- search.fit(X_train, y_train)
- search.predict(X_train)
- search.predict(X_test)
- check evaluation / scores

You can explore the parameters of the search.


### Other model 2 - empty

### Other model 3 - empty