In [13]:
""" Import Pacakges """
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import matplotlib.pyplot as plt
import itertools
from tqdm import tqdm

In [14]:
""" Define Functions """

def save_csv(data):
    """ Save data to csv file """
    data.to_csv('data.csv', index=False)


def data_split(train_df, selected_columns):
    train_label = train_df[selected_columns].values
    train_target = train_df["SalePrice"].values
    x_train, x_test, y_train, y_test = train_test_split(
        train_label, train_target, test_size=0.2)
    # train_df = train_df.sample(frac=0.8, random_state=101)
    # validate_df = train_df.drop(train_df.index)
    return x_train, x_test, y_train, y_test


def build_test_module(train_df):
    selected_columns = train_df.columns.tolist()
    x_train, x_test, y_train, y_test = data_split(train_df, selected_columns)

    # standardize data
    # print(x_train)
    
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)

    # SDG Linear Regression
    sgd_regressor = SGDRegressor()
    sgd_regressor.fit(x_train, y_train)
    sgd_validate = sgd_regressor.predict(x_test)

    sgd_mean_square = mean_squared_error(y_test, sgd_validate)

    # return sgd_mean_square, std, sgd_regressor
    return sgd_mean_square


def build_module(train_df):
    selected_columns = train_df.columns.tolist()
    selected_columns.remove("SalePrice")
    x_train = train_df[selected_columns].values
    y_train = train_df["SalePrice"].values
    
    # standardize data
    std = StandardScaler()

    x_train = std.fit_transform(x_train)
    # x_test = std.transform(test_df)

    # SDG Linear Regression
    sgd_regressor = SGDRegressor()
    sgd_regressor.fit(x_train, y_train)
    # sgd_validate = sgd_regressor.predict(x_test)

    # sgd_mean_square = mean_squared_error(y_test, sgd_validate)

    # return std, sgd_regressor
    return std, sgd_regressor


def predict(std, regressor, predict_df, discrete_tags_to_use,continuous_tags_to_use,final_tags):
    predict_df_columns = predict_df.columns.values.tolist()

    print("discrete_df_columns: ", discrete_tags_to_use)
    print("continuous_df_columns: ", continuous_tags_to_use)
    print("predict_df_columns: ", predict_df_columns)

    result_id = predict_df["Id"]
    conti_df = predict_df[continuous_tags_to_use]
    discrete_df = predict_df[discrete_tags_to_use]

    predict_df = pd.get_dummies(
        discrete_df, columns=discrete_tags_to_use, dtype=np.int64
    ).join(conti_df)

    predict_df = predict_df[final_tags]

    # for i in selected_columns:
    #     if i not in predict_df_columns:
    #         predict_df = predict_df.drop(columns=[i])

    for i in predict_df.columns.tolist():
        predict_df[i] = predict_df[i].fillna(predict_df[i].mean())

    # predict_label = predict_df.values
    predict_data = std.transform(predict_df)

    y_predict = regressor.predict(predict_data)
    return pd.DataFrame({
        "Id": result_id,
        "SalePrice": y_predict
    })

In [15]:
""" Import data from csv file """
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Import data from csv file
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# df_original = df.copy()
train_df_original = df.copy()

In [16]:
""" Preprocessing data """
# Dealing the Discrete data
discrete_tags = ["MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope",
                 "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl",
                 "Exterior1st", "Exterior2nd", "MasVnrType", "ExterQual", "ExterCond", "Foundation", "BsmtQual",
                 "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Heating", "HeatingQC", "CentralAir",
                 "Electrical", "KitchenQual", "Functional", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual",
                 "GarageCond", "PavedDrive", "PoolQC", "Fence", "MiscFeature", "SaleType", "SaleCondition"]

# divide data into discrete and continuous
train_df_continuous = df.drop(columns=discrete_tags)
train_df_discrete = df[discrete_tags]
# Removing columns with more than 50% missing values
rows_of_data = df.shape[0]
missing_values = df.isna().sum().to_dict()

for i in missing_values:
    if missing_values[i] > rows_of_data/2:
        if i in discrete_tags:
            discrete_tags.remove(i)
            train_df_discrete = train_df_discrete.drop(columns=i)
        else:
            train_df_continuous = train_df_continuous.drop(columns=i)

# remove outliner
train_df_continuous = train_df_continuous[train_df_continuous["TotalBsmtSF"] <= 2900]
train_df_continuous = train_df_continuous[train_df_continuous["1stFlrSF"] <= 2500]
train_df_continuous = train_df_continuous[train_df_continuous["GrLivArea"] <= 3600]
train_df_continuous.drop(columns=["GarageArea"])
print("good")

good


In [17]:
""" Processing data """
# getting correlation between each column
# selecting continuous data
train_df_corr_continuous = train_df_continuous.corr(numeric_only=True)
corr_result_continuous = train_df_corr_continuous['SalePrice']
continuous_df_column = train_df_continuous.columns.tolist()

continous_tags_to_use = []
for i in range(corr_result_continuous.size):
    if (corr_result_continuous[i] >= 0.6):
        if (continuous_df_column[i] != "SalePrice"):
            continous_tags_to_use.append(corr_result_continuous.index[i])

# selecting discrete data

# Translating discrete data into number
spread_discrete_df = pd.get_dummies(
    train_df_discrete, columns=train_df_discrete.columns.tolist(), dtype=np.int64
)

spread_discrete_df = spread_discrete_df.join(train_df_continuous["SalePrice"])

train_df_corr_discrete = spread_discrete_df.corr(numeric_only=True)

corr_result_discrete = train_df_corr_discrete["SalePrice"]
discrete_df_column = spread_discrete_df.columns.tolist()

discrete_tags_to_use = []

for i in range(corr_result_discrete.size):
    if (corr_result_discrete[i] >= 0.05):
        if (discrete_df_column[i] != "SalePrice"):
            discrete_tags_to_use.append(corr_result_discrete.index[i])

print("selected discrete tags", discrete_tags_to_use)

train_df_discrete = spread_discrete_df[discrete_tags_to_use]
train_df_continuous = train_df_continuous[continous_tags_to_use]

# combine continuous and discrete data

all_train_df = train_df_continuous.join(
    train_df_discrete).join(train_df_original["SalePrice"])

# print("all selected tags: ", train_df_continuous.columns.tolist() +
#       train_df_discrete.columns.tolist())

print("all selected tags: ", all_train_df.columns.tolist())

selected discrete tags ['MSZoning_FV', 'MSZoning_RL', 'LotShape_IR1', 'LotShape_IR2', 'LandContour_HLS', 'LandContour_Low', 'LotConfig_CulDSac', 'LandSlope_Mod', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Neighborhood_Veenker', 'Condition1_Norm', 'Condition1_PosN', 'Condition2_PosA', 'Condition2_PosN', 'BldgType_1Fam', 'HouseStyle_2Story', 'RoofStyle_Hip', 'RoofMatl_WdShngl', 'Exterior1st_CemntBd', 'Exterior1st_VinylSd', 'Exterior2nd_CmentBd', 'Exterior2nd_VinylSd', 'ExterQual_Ex', 'ExterQual_Gd', 'ExterCond_TA', 'Foundation_PConc', 'BsmtQual_Ex', 'BsmtQual_Gd', 'BsmtCond_Gd', 'BsmtCond_TA', 'BsmtExposure_Av', 'BsmtExposure_Gd', 'BsmtFinType1_GLQ', 'BsmtFinType2_Unf', 'Heating_GasA', 'HeatingQC_Ex', 'CentralAir_Y', 'Electrical_SBrkr', 'KitchenQual_Ex', 'KitchenQual_Gd', 'Functional_Typ', 'FireplaceQu_Ex', 'FireplaceQu_Gd', 'FireplaceQu_TA'

""" Evaluate the model """

sqd_mean_squares = []

for i in tqdm(range(1000), desc="evaluating", ncols=100):
    sgd_mean_square = build_test_module(all_train_df)
    sqd_mean_squares.append(sgd_mean_square)

print("max: {:,}".format(max(sqd_mean_squares)))
print("min: {:,}".format(min(sqd_mean_squares)))
print("average {:,}".format(np.mean(sqd_mean_squares)))
print("median: {:,}".format(np.median(sqd_mean_squares)))

In [18]:
""" Training data """
std, sgd = build_module(all_train_df)
data = predict(
    std, sgd, test_df, discrete_tags, continous_tags_to_use,all_train_df.drop(columns=["SalePrice"]).columns.tolist())
save_csv(data)

discrete_df_columns:  ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']
continuous_df_columns:  ['OverallQual', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'GarageCars', 'GarageArea']
predict_df_columns:  ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd'

