In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

In [2]:
# Location of the house-prices CSV. The default is the path this notebook was
# originally written against; set the HOUSE_PRICES_CSV environment variable to
# point at the file on another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "HOUSE_PRICES_CSV",
    "/Users/nikkiruhil/Desktop/Price Prediction/house-prices.csv",
)
dataset = pd.read_csv(DATA_PATH)
dataset.head()

Unnamed: 0,Home,SqFt,Bedrooms,Bathrooms,Offers,Neighborhood,Price
0,1,1790,2,2,2,East,114300
1,2,2030,4,2,3,East,114200
2,3,1740,3,2,1,East,114800
3,4,1980,3,2,3,East,94700
4,5,2130,3,3,3,East,119800


In [3]:
# List the distinct labels found in the Neighborhood column.
print(dataset.loc[:, 'Neighborhood'].unique())

['East' 'North' 'West']


In [4]:
# Fit an encoder on the neighborhood labels and replace the column with the
# resulting integer codes. The fitted encoder stays in `label` for later use.
# This overwrites the column it reads, so running the cell a second time would
# fit the encoder on the integer codes rather than on the original labels.
label = LabelEncoder().fit(dataset['Neighborhood'])
dataset['Neighborhood'] = label.transform(dataset['Neighborhood'])

In [5]:
# Show the first seven rows of the frame.
dataset.head(n=7)

Unnamed: 0,Home,SqFt,Bedrooms,Bathrooms,Offers,Neighborhood,Price
0,1,1790,2,2,2,0,114300
1,2,2030,4,2,3,0,114200
2,3,1740,3,2,1,0,114800
3,4,1980,3,2,3,0,94700
4,5,2130,3,3,3,0,119800
5,6,1780,3,2,2,1,114600
6,7,1830,3,3,3,2,151600


In [6]:
# Features are every column except the last one; the target is the Price column.
# NOTE(review): the Home column looks like a sequential row identifier in the
# previews (1, 2, 3, ...) and is kept as a feature here - confirm that is intended.
x, y = dataset.iloc[:, :-1], dataset['Price']

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [8]:
# Shapes of the full feature frame, then of the training and test partitions.
print(*(frame.shape for frame in (x, x_train, x_test)))

(128, 6) (102, 6) (26, 6)


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [10]:
# One instance of every candidate regressor to compare.
# The tree and ensemble estimators below use randomness while fitting, so they
# are given a fixed seed; without it their scores change from run to run and
# the comparison is not reproducible. The value matches the random_state used
# for the train/test split.
RANDOM_STATE = 2

lr = LinearRegression()
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=1.0)
dtr = DecisionTreeRegressor(random_state=RANDOM_STATE)
rfr = RandomForestRegressor(random_state=RANDOM_STATE)
gbr = GradientBoostingRegressor(random_state=RANDOM_STATE)
adr = AdaBoostRegressor(random_state=RANDOM_STATE)
xgr = XGBRegressor()
knr = KNeighborsRegressor()
svr = SVR()

In [11]:
# Short display name -> estimator instance, in the order they will be evaluated.
clfs = dict(
    linear=lr,
    ri=ridge,
    la=lasso,
    dt=dtr,
    rf=rfr,
    gb=gbr,
    ad=adr,
    xg=xgr,
    kn=knr,
    sv=svr,
)

In [12]:
def trained_model(clfs,x_train,y_train,x_test,y_test):
    """Fit one estimator on the training split and score it on the test split.

    Parameters
    ----------
    clfs : estimator
        A single object exposing ``fit`` and ``predict``. Despite the plural
        name it is one estimator; inside this function the name hides the
        module-level ``clfs`` dictionary. It is fitted in place.
    x_train, y_train
        Features and target passed to ``fit``.
    x_test, y_test
        Features passed to ``predict`` and the true targets the predictions
        are compared against.

    Returns
    -------
    tuple
        ``(r2_score, mean_absolute_error)`` computed on the test split.
    """
    clfs.fit(x_train, y_train)
    # Predictions are made on the held-out test features, not the training data.
    test_pred = clfs.predict(x_test)
    return r2_score(y_test, test_pred), mean_absolute_error(y_test, test_pred)

In [13]:
# Spot-check: (R^2, mean absolute error) of plain linear regression on the test split.
trained_model(clfs=lr, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

(0.7474051467404754, 10506.64021445457)

In [14]:
# Spot-check: (R^2, mean absolute error) of ridge regression on the test split.
trained_model(clfs=ridge, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

(0.7485647557860133, 10531.518415333627)

In [15]:
# Fit and score every estimator in `clfs`, collecting the R^2 values and the
# mean absolute errors in two parallel lists (same order as the dictionary).
score_final, absolute_final = [], []

for name, clf in clfs.items():
    # Each estimator is refitted here, including any that were fitted earlier.
    current_score, current_absolute = trained_model(clf, x_train, y_train, x_test, y_test)
    score_final.append(current_score)
    absolute_final.append(current_absolute)

    print("Name:", name)
    print("Score:", current_score)
    print("Absolute:", current_absolute)

Name: linear
Score: 0.7474051467404754
Absolute: 10506.64021445457
Name: ri
Score: 0.7485647557860133
Absolute: 10531.518415333627
Name: la
Score: 0.747417269324351
Absolute: 10506.741878602592
Name: dt
Score: 0.43679989801281216
Absolute: 15023.076923076924
Name: rf
Score: 0.7185565442391566
Absolute: 11213.26923076923
Name: gb
Score: 0.7222391622799791
Absolute: 10935.178763426324
Name: ad
Score: 0.49543648545843844
Absolute: 14981.977049024486
Name: xg
Score: 0.7505940198898315
Absolute: 10505.685546875
Name: kn
Score: 0.5574864719525627
Absolute: 14263.846153846154
Name: sv
Score: -0.0008984185293297031
Absolute: 21630.034112896145


In [16]:
# Feature values for a single house, listed in the same order as `column`.
# The last value is the integer code for the neighborhood, not its name.
house_price = (18,1990,3,3,4,1)
house_price_array = np.asarray(house_price).reshape(1,-1)
column = ['Home', 'SqFt', 'Bedrooms', 'Bathrooms', 'Offers', 'Neighborhood']
# Attach the column names instead of passing a bare array: the model was fitted
# on a DataFrame, and a named frame lets the feature names be checked against
# the ones seen during fit rather than relying on position alone.
house_price_frame = pd.DataFrame(house_price_array, columns=column)
predicted = xgr.predict(house_price_frame)[0]
print(f"Price is : ${ predicted:.1f}")

Price is : $83604.1


In [17]:
import pickle

In [18]:
# Save the XGBoost model and the neighborhood encoder to disk.
# The `with` blocks close each file (flushing the pickle) even if dump raises;
# the bare open() calls used before left the handles open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(xgr, model_file)
with open('encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label, encoder_file)