In [95]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

In [96]:
train_data_path = "./train.csv"
predict_data_path = "./test.csv"

train_df = pd.read_csv(train_data_path)
predict_df = pd.read_csv(predict_data_path)


In [97]:
#tell apart continuous data and dsicrete data
continuous_columns = ["Id", "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF" , "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal", "YearBuilt", "YearRemodAdd", "GarageYrBlt", "SalePrice"]
discrete_columns = []
Id_for_ans = predict_df["Id"].values.tolist()
for i in train_df.columns.values.tolist():
    if (i in continuous_columns)==False:
        discrete_columns.append(i)

In [98]:
#find out the columns that consist of more than half NANs
number_of_data = train_df.shape[0]
number_of_nan = train_df.isna().sum().to_dict()
drop_columns = []
for i in number_of_nan:
    if number_of_nan[i]>=number_of_data/2:
        drop_columns.append(i)

for i in drop_columns:
    if i in continuous_columns:
        continuous_columns.remove(i)
    else:
        discrete_columns.remove(i)

In [99]:
#the final columns that I want to use
select_columns = []

In [100]:
#計算連續型資料的相關係數
continuous_df = train_df[continuous_columns]
corr = continuous_df.corr().values.tolist()
for i, j in zip(continuous_df.columns.values.tolist(), corr[-1]):
    if j>=0.6 and i!="SalePrice":
        select_columns.append(i)

In [101]:
#移除離群值
train_df = train_df[train_df["TotalBsmtSF"]<=2900]
train_df = train_df[train_df["1stFlrSF"]<=2500]
train_df = train_df[train_df["GrLivArea"]<=3600]
train_df = train_df[train_df["GarageArea"]<=1200]
select_columns.remove("GarageCars")

In [102]:
#計算離散型資料 
discrete_df = train_df[discrete_columns]
for i in discrete_columns:
    one_hot = pd.get_dummies(discrete_df[i])
    rename_dic = {}
    for j in one_hot.columns.values.tolist():
        rename_dic[j] = str(i)+"_"+str(j)
    one_hot = one_hot.rename(columns=rename_dic)
    discrete_df = discrete_df.drop(i, axis=1)
    discrete_df = discrete_df.join(one_hot)

discrete_df.replace(True, 1, inplace=True)
discrete_df.replace(False, 0, inplace=True)

discrete_df = discrete_df.join(train_df["SalePrice"])

corr = discrete_df.corr().values.tolist()
for i, j in zip(discrete_df.columns.values.tolist(), corr[-1]):
    if j>=0.05 and i!="SalePrice":
        select_columns.append(i)
        #print(i+" : "+str(j))

In [103]:
for i in discrete_columns:
    one_hot = pd.get_dummies(train_df[i])
    rename_dic = {}
    for j in one_hot.columns.values.tolist():
        rename_dic[j] = str(i)+"_"+str(j)
    one_hot = one_hot.rename(columns=rename_dic)
    train_df = train_df.drop(i, axis=1)
    train_df = train_df.join(one_hot)


In [104]:
def write_ans(predict_target):
    #write the result into ans.csv
    ansDict = {
        "Id" : Id_for_ans,
        "SalePrice" : predict_target
    }
    ansDf = pd.DataFrame(ansDict)
    ansDf.to_csv("ans.csv", index=False)

In [105]:
def data_split(train_df, select_columns):
    #split 20% of the train data to test data
    train_label = train_df[select_columns].values
    train_target = train_df["SalePrice"].values
    x_train, x_test, y_train, y_test = train_test_split(train_label, train_target, test_size=0.2)
    return x_train, x_test, y_train, y_test

In [106]:
def build_test_module(train_df, select_columns):
    x_train, x_test, y_train, y_test = data_split(train_df, select_columns)

    #build a gradient descent module and test whether the columns we choose is good enough
    std = StandardScaler()

    x_train = std.fit_transform(x_train)
    x_test = std.fit_transform(x_test)

    sgd = SGDRegressor()

    sgd.fit(x_train, y_train)
    sgd_pre = sgd.predict(x_test)

    sgd_mean_square = mean_squared_error(y_test, sgd_pre)

    return sgd_mean_square

In [107]:
def build_module(train_df, select_columns):

    #x_train, x_test, y_train, y_test = data_split(train_df, select_columns)

    x_train = train_df[select_columns].values
    y_train = train_df["SalePrice"].values

    #build a gradient descent module and test whether the columns we choose is good enough
    std = StandardScaler()

    x_train = std.fit_transform(x_train)
    #x_test = std.fit_transform(x_test)

    sgd = SGDRegressor()

    sgd.fit(x_train, y_train)
    #sgd_pre = sgd.predict(x_test)

    #sgd_mean_square = mean_squared_error(y_test, sgd_pre)


    return std, sgd

    #0.4 : 612480768.8626099
    #0.2 : 530552347.7190159
    #0.1 : 454339807.24334556
    #0.05 : 433638264.14439946

In [108]:
def predict(std, sgd, predict_df):
    predict_df_columns = predict_df.columns.values.tolist()
    #assume that the module is good enough, predict the data in test.csv with the module
    for i in discrete_columns:
        one_hot = pd.get_dummies(predict_df[i])
        rename_dic = {}
        for j in one_hot.columns.values.tolist():
            rename_dic[j] = str(i)+"_"+str(j)
        one_hot = one_hot.rename(columns=rename_dic)
        predict_df = predict_df.drop(i, axis=1)
        predict_df = predict_df.join(one_hot)

    for i in select_columns:
        if (i in predict_df_columns)==False:
            predict_df[i] = [0 for j in range(predict_df.shape[0])]
    
    predict_df = predict_df[select_columns]
    predict_df.replace(True, 1, inplace=True)
    predict_df.replace(False, 0, inplace=True)

    for i in predict_df.columns.values.tolist():
        predict_df[i] = predict_df[i].fillna(predict_df[i].mean())


    predict_label = predict_df.values
    predict_label = std.fit_transform(predict_label)

    predict_target = sgd.predict(predict_label)
    return predict_target

In [109]:
all_sqd_mean_square = []
for i in tqdm(range(1000), desc="predicting ", ncols=100):
    sgd_mean_square = build_test_module(train_df, select_columns)
    all_sqd_mean_square.append(sgd_mean_square)

print("max : {0}".format(max(all_sqd_mean_square))) 
print("min : {0}".format(min(all_sqd_mean_square)))
print("avg : {0}".format(np.mean(all_sqd_mean_square)))
print("med : {0}".format(np.percentile(all_sqd_mean_square, 50)))

predicting :   0%|                                                         | 0/1000 [00:00<?, ?it/s]

predicting : 100%|██████████████████████████████████████████████| 1000/1000 [00:15<00:00, 63.35it/s]

max : 1253335556.3360887
min : 400162090.9629915
avg : 615815288.759271
med : 601567108.2629598





In [110]:
std, sgd = build_module(train_df, select_columns)
predict_target = predict(std, sgd, predict_df)
write_ans(predict_target)