# Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

## Initializing encoder object

In [2]:
le = preprocessing.LabelEncoder()

## Loading train and test data

In [3]:
train_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [4]:
train_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


## Invoke to see all columns' relations to SalePrice

In [5]:
def displayRelations():
    for column in train_data:
        train_data = train_data.sort_values(by=[column])
        fig = plt.scatter(le.fit_transform(train_data[column]), train_data['SalePrice'])
        plt.title(f'SalePrice relation to {column}')
        plt.xlabel(column)
        plt.ylabel('SalePrice')
        plt.show()

# Extracting valuable features

In [6]:
columns = ['MSZoning','LotFrontage', 'LotArea', 'Alley', 'LandContour', 'Utilities', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

## Label encoding str -> int

In [7]:
for i in range(len(columns)):
    train_data[columns[i]] = le.fit_transform(train_data[columns[i]])
    test_data[columns[i]] = le.fit_transform(test_data[columns[i]])

## Initializing the model and the necessary data

In [8]:
linear_model = LinearRegression()
tree_model = DecisionTreeRegressor(random_state=1)
rtree_model = RandomForestRegressor(random_state=1)

X = np.array(train_data[columns])
y = np.array(train_data['SalePrice'])
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

linear_model.fit(train_X, train_y)
tree_model.fit(train_X, train_y)
rtree_model.fit(train_X, train_y)

RandomForestRegressor(random_state=1)

## Model evaluation

In [9]:
linear_predict = linear_model.predict(val_X)
tree_predict = tree_model.predict(val_X)
rtree_predict = rtree_model.predict(val_X)

print(f"MAE (linear): {round(mean_absolute_error(val_y, linear_predict), 2)}$")
print(f"MAE (tree): {round(mean_absolute_error(val_y, tree_predict), 2)}$")
print(f"MAE (rtree): {round(mean_absolute_error(val_y, rtree_predict), 2)}$")

MAE (linear): 23055.56$
MAE (tree): 23929.37$
MAE (rtree): 16604.92$


## Taking a look at the test data

In [10]:
test_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,2,56,777,Pave,2,Reg,3,0,...,120,0,2,2,3,0,6,2010,8,4
1,1462,20,3,57,964,Pave,2,IR1,3,0,...,0,0,2,4,0,12500,6,2010,8,4
2,1463,60,3,50,947,Pave,2,IR1,3,0,...,0,0,2,2,3,0,3,2010,8,4
3,1464,60,3,54,587,Pave,2,IR1,3,0,...,0,0,2,4,3,0,6,2010,8,4
4,1465,120,3,19,118,Pave,2,IR1,1,0,...,144,0,2,4,3,0,1,2010,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,4,0,19,Pave,2,Reg,3,0,...,0,0,2,4,3,0,6,2006,8,4
1455,2916,160,4,0,17,Pave,2,Reg,3,0,...,0,0,2,4,3,0,4,2006,8,0
1456,2917,20,3,112,1069,Pave,2,Reg,3,0,...,0,0,2,4,3,0,9,2006,8,0
1457,2918,85,3,38,647,Pave,2,Reg,3,0,...,0,0,2,2,2,700,7,2006,8,4


## Predict the sale price for every test row and save output to submission.csv

In [11]:
submission = {'Id': [], 'SalePrice': []}
for i, r in test_data.iterrows():
    submission['Id'].append(r['Id'])
    submission['SalePrice'].append(rtree_model.predict(np.array(r[columns]).reshape(1, -1))[0])
df = pd.DataFrame(data=submission)
df.to_csv('/kaggle/working/submission.csv', index=False)

In [12]:
df

Unnamed: 0,Id,SalePrice
0,1461,120858.40
1,1462,151120.00
2,1463,187421.52
3,1464,187021.50
4,1465,205920.74
...,...,...
1454,2915,90388.00
1455,2916,95868.16
1456,2917,147220.74
1457,2918,116014.00


## Checking whether the file was saved correctly and is readable

In [13]:
df = pd.read_csv('/kaggle/working/submission.csv')
df

Unnamed: 0,Id,SalePrice
0,1461,120858.40
1,1462,151120.00
2,1463,187421.52
3,1464,187021.50
4,1465,205920.74
...,...,...
1454,2915,90388.00
1455,2916,95868.16
1456,2917,147220.74
1457,2918,116014.00
