In [161]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from category_encoders import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
import seaborn as sns

# Prepare Data

## Data Wrangle

In [155]:
# Predictor columns fed to the models, and the column being predicted.
features = ['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
            'Finished_basement', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
            'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea',
            'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'Neighborhood', 'BsmtQual']
target = 'SalePrice'

# 'Neighborhood' is the only column that gets one-hot encoded; every other
# feature is treated as numeric (BsmtQual becomes numeric via the mapping below).
categorical_features = ['Neighborhood']
continuous_features = [col for col in features if col not in categorical_features]

df = pd.read_csv('IA_House_Price_Original_Data.csv')

# Replace every missing value with 0 (assignment instead of inplace=True so the
# statement does not depend on mutating a previously displayed frame).
df = df.fillna(0)

# compute Finished_basement as the sum of the two finished-basement areas
df['Finished_basement'] = df['BsmtFinSF1'] + df['BsmtFinSF2']

# transform BsmtQual into an ordinal numeric value; the key 0 covers the rows
# whose missing value was filled with 0 above.
bsmt_qual_mapping = {0: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
# Series.map turns any value that is not a key of the mapping into NaN, which
# would only surface later as an estimator error. Fail here with a clear message.
unmapped_bsmt_qual = set(df['BsmtQual'].unique()) - set(bsmt_qual_mapping)
if unmapped_bsmt_qual:
    raise ValueError(
        f"Unexpected BsmtQual categories: {sorted(unmapped_bsmt_qual, key=str)}"
    )
df['BsmtQual'] = df['BsmtQual'].map(bsmt_qual_mapping)

# Standardize (z-score) every numeric feature and the target. The list is derived
# from `features` so the two can never drift apart.
# NOTE(review): mean/std are computed on the full dataset before the
# train/validation/test split below, so held-out rows influence the scaling.
x_std_list = continuous_features + [target]
for col in x_std_list:
    df[col] = (df[col] - df[col].mean()) / df[col].std()

# Creating One hot vector for Neighborhood; numeric columns pass through unchanged.
one_hot_encoder = OneHotEncoder()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', continuous_features),   # Pass continuous features as they are
        ('cat', one_hot_encoder, categorical_features) # Apply OneHotEncoder to 'Neighborhood'
    ]
)
X = df[features]
y = df[target]
X_transformed = preprocessor.fit_transform(X)


## Split

In [156]:
# Fixed-size partition: 1800 training rows, 600 validation rows, and whatever
# remains of the dataset as the test set.
n_train_rows = 1800
n_val_rows = 600
n_test_rows = len(df) - n_train_rows - n_val_rows

X_train, X_temp, y_train, y_temp = train_test_split(
    X_transformed, y, train_size=n_train_rows, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=n_test_rows, random_state=42
)

# Column labels for the transformed matrix: the passthrough continuous columns
# first, followed by the one-hot columns produced by the fitted encoder.
fitted_encoder = preprocessor.named_transformers_['cat']
encoded_feature_names = fitted_encoder.get_feature_names_out(categorical_features)
all_feature_names = [*continuous_features, *encoded_feature_names]

# Wrap each split back into a labelled DataFrame.
X_train_df, X_val_df, X_test_df = (
    pd.DataFrame(part, columns=all_feature_names)
    for part in (X_train, X_val, X_test)
)

print(f"Training set size: {X_train_df.shape}, Validation set size: {X_val_df.shape}, Test set size: {X_test_df.shape}")


Training set size: (1800, 47), Validation set size: (600, 47), Test set size: (508, 47)


## Explore

### price

In [None]:
# Summary statistics and distribution of the target column.
# Note: the wrangling cell standardizes 'SalePrice' in `df`, so when run after
# it these are z-scores rather than raw prices.
price = df[target]
print(price.describe())
sns.displot(price)

### neighborhood

In [None]:
# intercept = model.named_steps['ridge'].intercept_
# coefficients = model.named_steps['ridge'].coef_

# feature_names = model.named_steps['onehotencoder'].get_feature_names()
# feat_imp = pd.Series(coefficients, index=feature_names)
# feat_imp.sort_values(key=abs).tail(15).plot(kind='barh') #obtain the top 15 neighborhoods that have greater absolute(coef_)
# plt.xlabel('Importance [USD]')
# plt.ylabel('Feature')
# plt.title('Feature Importance for Apt Price')

# Build Model

In [163]:
def MODEL(X, y, para, type):
    '''Fit a Ridge or Lasso regression and return the fitted model.

    Parameters
    ----------
    X : features passed to the estimator's ``fit``.
    y : target values passed to the estimator's ``fit``.
    para : regularization strength, forwarded as ``alpha``.
    type : str, either 'Ridge' or 'Lasso'.

    Returns
    -------
    The fitted ``Ridge`` or ``Lasso`` estimator.

    Raises
    ------
    ValueError
        If ``type`` is neither 'Ridge' nor 'Lasso'. (Previously this fell
        through to ``return model`` and failed with UnboundLocalError.)
    '''
    if type == 'Ridge':
        model = Ridge(alpha=para)
    elif type == 'Lasso':
        model = Lasso(alpha=para)
    else:
        raise ValueError(
            f"Unknown model type {type!r}; expected 'Ridge' or 'Lasso'"
        )
    model.fit(X, y)
    return model

def MAE(X, y, model):
    '''Return the mean absolute error between y and model.predict(X).'''
    return mean_absolute_error(y, model.predict(X))

def R2(X, y, model):
    '''Return the R-squared score of model.predict(X) measured against y.'''
    return r2_score(y, model.predict(X))

## Ridge Regression

In [166]:
Ridge_alpha = [0.10, 0.30, 0.60]

# Fresh containers: fitted models keyed by name, plus validation metrics
# stored in the same order as the models were added.
model_dict, mae_list, r2_list = {}, [], []

# Fit one Ridge model per regularization strength on the training split.
for alpha in Ridge_alpha:
    model_dict['Ridge_{}'.format(alpha)] = MODEL(X_train_df, y_train, alpha, type='Ridge')

print(model_dict)

# Score every fitted model on the validation split (dict insertion order).
for model, fitted in model_dict.items():
    mae = MAE(X_val_df, y_val, fitted)
    r2 = R2(X_val_df, y_val, fitted)
    mae_list.append(mae)
    r2_list.append(r2)

print(mae_list)
print(r2_list)

{'Ridge_0.1': Ridge(alpha=0.1), 'Ridge_0.3': Ridge(alpha=0.3), 'Ridge_0.6': Ridge(alpha=0.6)}
[0.23865776911304362, 0.238680839744581, 0.23875604250280294]
[0.876146328515824, 0.8761189592575097, 0.876063503237191]


## Lasso Regression

In [167]:
Lasso_alpha = [0.02, 0.06, 0.10]

# Fit one Lasso model per regularization strength and remember the keys added
# here, so evaluation below does not depend on how many models were already in
# model_dict (the previous hard-coded slice [3:] assumed exactly three).
lasso_names = []
for alpha in Lasso_alpha:
    name = 'Lasso_{}'.format(alpha)
    model_dict[name] = MODEL(X_train_df, y_train, alpha, type='Lasso')
    lasso_names.append(name)

print(model_dict)

# Append validation metrics for the Lasso models only, after the entries that
# are already in mae_list / r2_list.
# NOTE: the metric lists are appended to, so re-run the Ridge cell before
# re-running this one to avoid duplicate entries.
for name in lasso_names:
    mae_list.append(MAE(X_val_df, y_val, model_dict[name]))
    r2_list.append(R2(X_val_df, y_val, model_dict[name]))

print(mae_list)
print(r2_list)

{'Ridge_0.1': Ridge(alpha=0.1), 'Ridge_0.3': Ridge(alpha=0.3), 'Ridge_0.6': Ridge(alpha=0.6), 'Lasso_0.02': Lasso(alpha=0.02), 'Lasso_0.06': Lasso(alpha=0.06), 'Lasso_0.1': Lasso(alpha=0.1)}
[0.23865776911304362, 0.238680839744581, 0.23875604250280294, 0.27579459343707263, 0.28434590939251325, 0.29686077635091607]
[0.876146328515824, 0.8761189592575097, 0.876063503237191, 0.8411753430308408, 0.8227990017010182, 0.8026640083214707]
