In [11]:
""" Import data from csv file """

import pandas as pd
import numpy as np

# Import data from csv file
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')


# display data with plot
# df_with_nan = df[df.isna().any(axis=1)]
# df_with_nan.head(10)
# drop data with too many NaN

train_df = df.sample(frac=0.8, random_state=101)
validate_df = df.drop(train_df.index)

train_df_original = train_df.copy()
validate_df_original = validate_df.copy()
test_df_original = test_df.copy()

In [12]:
""" Process with data """
# getting correlation between each column

train_df_corr = train_df.corr(numeric_only=True)
corr_result = train_df_corr['SalePrice']

tags_to_use = []
tags_to_drop = []
size = corr_result.size

for i in range(size):
    if ((abs(corr_result[i]) >= 0.27)):
        if corr_result.index[i] != "SalePrice":
            tags_to_use.append(corr_result.index[i])

for i in train_df.columns:
    if i == "SalePrice":
        continue

    if i not in tags_to_use:
        tags_to_drop.append(i)
    else:
        # Replace NaN with mean value
        train_df[i] = train_df[i].fillna(train_df[i].mean())
        validate_df[i] = test_df[i].fillna(test_df[i].mean())
        test_df[i] = test_df[i].fillna(test_df[i].mean())


# train data
tags_to_drop_with_saleprice = tags_to_drop.copy() + ["SalePrice"]

print(tags_to_use)
print(tags_to_drop)
print(tags_to_drop_with_saleprice)

y_train_df = train_df["SalePrice"]
train_df = train_df.drop(columns=(tags_to_drop_with_saleprice))
y_validate_df = validate_df["SalePrice"]
validate_df = validate_df.drop(columns=(tags_to_drop_with_saleprice))

# test data
test_df = test_df.drop(columns=tags_to_drop)

# Standardize data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_df = scaler.fit_transform(train_df)
validate_df = scaler.transform(validate_df)
test_df = scaler.transform(test_df)


['LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF']
['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC

In [13]:
""" Train model """
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(train_df, y_train_df)
validate_df_predict = regressor.predict(validate_df)

""" Evaluate model """
# w_0 = regressor.intercept_
# w_1 = regressor.coef_
# print("w_0: ", w_0)
# print("w_1: ", w_1)

mean_square = mean_squared_error(y_validate_df, validate_df_predict)
print("root mean square: ", mean_square)

root mean square:  9650236891.34423


In [14]:
""" Predict """
test_df_predict = regressor.predict(test_df)
# save the result to csv file
dataframe_result = pd.DataFrame({'Id': test_df_original.Id, 'SalePrice': test_df_predict})
dataframe_result.to_csv('result.csv', index=False)

[108226.51739506 169481.32953604 180378.87422749 ... 187835.35896125
 107843.63969264 250987.11449111]
[[ 0.44334852 -0.792531   -0.3505393  ...  1.18632625  0.3731689
  -0.70683352]
 [ 0.48797748 -0.07717072 -0.44982713 ... -0.76122162  2.38750657
  -0.18019339]
 [ 0.17557479 -0.792531    0.84091463 ...  0.0308433   0.94641914
  -0.20945117]
 ...
 [ 4.01366494 -0.792531   -0.38363524 ...  0.46880861  3.0324131
  -0.70683352]
 [-0.35997267 -0.792531    0.67543491 ... -2.21489371 -0.10453964
  -0.23870896]
 [ 0.17557479  0.63818957  0.70853086 ...  0.81358981  0.77125935
  -0.00464668]]


In [15]:
""" Visualize """
import matplotlib.pyplot as plt
# visualize the training set results
plt.scatter(train_df["1stFlrSF"], y_train_df, color = 'red')
plt.plot(train_df["1stFlrSF"], regressor.predict(train_df), color = 'blue')
plt.title('Environment vs SalesPrice (trainning set)')
plt.xlabel("Environment")
plt.ylabel("Sales Price")
plt.show()

# visualize the test set results
plt.scatter(test_df["1stFlrSF"], y_test_df, color = 'red')
plt.plot(test_df["1stFlrSF"], regressor.predict(test_df), color = 'blue')
plt.title('Environment vs SalesPrice (test set)')
plt.xlabel("Environment")
plt.ylabel("Sales Price")
plt.show()

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices