# Example: housing price prediction using square footage, bedrooms, bathrooms, brick, and neighborhood, evaluated by comparing the test RMSE with the standard deviation of the prices

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)


/kaggle/input/house-prices/house-prices.csv


In [2]:
# Location of the house-prices dataset inside the Kaggle input directory.
file_path = '/kaggle/input/house-prices/house-prices.csv'
# Read the raw CSV into a DataFrame; the bare name on the last line displays it.
p = pd.read_csv(filepath_or_buffer=file_path)
p

Unnamed: 0,Home,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick,Neighborhood
0,1,114300,1790,2,2,2,No,East
1,2,114200,2030,4,2,3,No,East
2,3,114800,1740,3,2,1,No,East
3,4,94700,1980,3,2,3,No,East
4,5,119800,2130,3,3,3,No,East
...,...,...,...,...,...,...,...,...
123,124,119700,1900,3,3,3,Yes,East
124,125,147900,2160,4,3,3,Yes,East
125,126,113500,2070,2,2,2,No,North
126,127,149900,2020,3,3,1,No,West


In [3]:
# One-hot encode the two categorical columns; every category gets its own 0/1 column.
categorical = p[["Brick", "Neighborhood"]]
enc = OneHotEncoder()
tra_array = enc.fit_transform(categorical)
# The encoder's result is converted to a dense array and labelled with the
# generated feature names before being wrapped in a DataFrame.
transformed = pd.DataFrame(
    data=tra_array.toarray(),
    columns=enc.get_feature_names_out(),
)
transformed

Unnamed: 0,Brick_No,Brick_Yes,Neighborhood_East,Neighborhood_North,Neighborhood_West
0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
123,0.0,1.0,1.0,0.0,0.0
124,0.0,1.0,1.0,0.0,0.0
125,1.0,0.0,0.0,1.0,0.0
126,1.0,0.0,0.0,0.0,1.0


In [4]:
# Append the one-hot columns side by side with the original data (aligned on the row index).
hous_prices = pd.concat([p, transformed], axis="columns")
# The original text columns are now redundant with their encoded versions.
new_hous_prices = hous_prices.drop(columns=['Brick', 'Neighborhood'])
new_hous_prices

Unnamed: 0,Home,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick_No,Brick_Yes,Neighborhood_East,Neighborhood_North,Neighborhood_West
0,1,114300,1790,2,2,2,1.0,0.0,1.0,0.0,0.0
1,2,114200,2030,4,2,3,1.0,0.0,1.0,0.0,0.0
2,3,114800,1740,3,2,1,1.0,0.0,1.0,0.0,0.0
3,4,94700,1980,3,2,3,1.0,0.0,1.0,0.0,0.0
4,5,119800,2130,3,3,3,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
123,124,119700,1900,3,3,3,0.0,1.0,1.0,0.0,0.0
124,125,147900,2160,4,3,3,0.0,1.0,1.0,0.0,0.0
125,126,113500,2070,2,2,2,1.0,0.0,0.0,1.0,0.0
126,127,149900,2020,3,3,1,1.0,0.0,0.0,0.0,1.0


In [5]:
# Columns fed to the model: numeric attributes plus the one-hot encoded categories.
feature_columns = [
    "SqFt", "Bedrooms", "Bathrooms",
    "Brick_No", "Brick_Yes",
    "Neighborhood_East", "Neighborhood_North", "Neighborhood_West",
]
target = new_hous_prices["Price"]
feature = new_hous_prices[feature_columns]
# shuffle=False keeps the file order: the leading rows train the model and the
# trailing 33% of rows are held out for testing.
x_train, x_test, y_train, y_test = train_test_split(
    feature,
    target,
    test_size=0.33,
    shuffle=False,
)


In [6]:
# Ordinary least-squares regression fitted on the training portion only.
lin_reg = LinearRegression()
lin_reg.fit(X=x_train, y=y_train)


In [7]:
# Predict prices for the held-out rows.
y_predict = lin_reg.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred) in that order.
mse = mean_squared_error(y_test, y_predict)
# Despite its name, std_test is the root-mean-squared error (RMSE) of the test
# predictions, expressed in the same units as Price.
std_test = np.sqrt(mse)

In [8]:
# Summary statistics of the full Price column; only the 'std' entry is used.
desc = target.describe()
std_value = desc.loc['std']
# Show the spread of all prices next to the test-set error from the previous cell.
report = f"Standard Deviation total: {std_value} & Standard Deviation Test:{std_test} "
print(report)

Standard Deviation total: 26868.770370734055 & Standard Deviation Test:13160.442354864532 


In [9]:
# Pair each actual test price with the model's prediction.
submission = pd.DataFrame(
    data={
        'SalePrice real': y_test,
        'SalePrice predict': y_predict,
    }
)
# Write the comparison table (row index included) to the Kaggle working directory.
output_path = '/kaggle/working/submission.csv'
submission.to_csv(output_path)
print("The submission file has been created successfully!")

The submission file has been created successfully!


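# Result: the lower the test error is relative to the standard deviation of all prices, the better
"Standard Deviation Test" is the root-mean-squared error (RMSE) of the predictions on the test rows. An RMSE close to the standard deviation of all prices means the model does little better than always predicting the mean price; a much smaller RMSE means the features explain a large part of the price variation.
Standard Deviation total: 26868.770370734055 & Standard Deviation Test:21919.174288659113 (note: this test value differs from the 13160.442354864532 printed by cell [8] above; re-run the notebook to confirm which figure is current)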