In [2]:
# Load the housing dataset from the CSV file into a DataFrame
import pandas as pd

data = pd.read_csv(filepath_or_buffer='housing.csv')
# data.head()  # uncomment to preview the first 5 rows

In [9]:
# Preprocessing data
# Encode the categorical `ocean_proximity` labels as integer codes.
# NOTE(review): the codes are arbitrary, yet the models fitted in later cells
# receive them as an ordinary numeric feature; one-hot encoding would avoid
# implying an order between the categories.
data['ocean_proximity'] = data['ocean_proximity'].replace({
    'INLAND': 0,
    'ISLAND': 1,
    'NEAR BAY': 2,
    'NEAR OCEAN': 3,
    '<1H OCEAN': 4
})
# print(data['ocean_proximity'].unique())

# Impute missing bedroom counts with the column mean, truncated to an integer.
bedrooms = data['total_bedrooms'].mean()
bed = int(bedrooms)
# Fill ONLY `total_bedrooms`. A frame-wide `data.fillna(bed)` would also write
# the bedroom mean into any other column that happens to contain NaN
# (coordinates, income, the target, ...), silently corrupting it.
data['total_bedrooms'] = data['total_bedrooms'].fillna(bed)
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,2
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,2
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,2
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,2
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,2


In [13]:
# Separate the target column (y) from the predictor columns (X)
y = data['median_house_value']
X = data.drop(columns='median_house_value')

# Predictor column names, used later to build single-row frames for prediction
feature_names = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity',
]
# X.columns

In [11]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Show the shapes of the four resulting pieces
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((16512, 9), (16512,), (4128, 9), (4128,))

In [113]:
# Build the train/test split from a shuffled 10-fold KFold partition
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Only ONE fold is kept. The previous loop overwrote X_train/X_test/y_train/y_test
# on every iteration, so the model cells only ever saw the last fold; selecting it
# explicitly keeps exactly that split. This is a single 90/10 hold-out split, not
# cross-validation -- to score a model on all 10 folds, fit and evaluate it once
# per fold (e.g. with sklearn.model_selection.cross_val_score).
folds = list(kf.split(X))
train_index, test_index = folds[-1]

# kf.split() yields POSITIONAL indices, so both X and y must be indexed with
# .iloc. Plain y[train_index] performs a label lookup, which only gives the
# right rows while the index is the default 0..n-1 range.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

X_train.shape, y_train.shape, X_test.shape, y_test.shape


((18576, 9), (18576,), (2064, 9), (2064,))

In [17]:
# Linear regression: fit, evaluate on the held-out rows, persist, reload, predict
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Save the model. The context manager closes (and therefore flushes) the file
# before it is read back, instead of relying on garbage collection to do so.
import pickle
with open('linear_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back from disk (round-trip check of the saved file)
with open('linear_regression_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Data to predict: a single sample laid out in feature_names order.
# NOTE(review): these values match the first row shown by data.head(), so this
# is probably not an out-of-sample check -- confirm.
new_data = pd.DataFrame([[-122.23, 37.88, 41.0, 880.0, 129.0, 322.0, 126.0, 8.3252, 2]], columns=feature_names)

# Predict the output for the sample
print(model.predict(new_data))


Mean Squared Error: 4966764349.221943
[413061.49637018]


In [22]:
# Decision tree regressor: fit, evaluate on the held-out rows, persist, reload, predict
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Create and fit the decision tree regression model.
# random_state is fixed (same seed as the data split) so the fitted tree, its
# MSE and its predictions are reproducible from run to run.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Save the model. The context manager closes (and therefore flushes) the file
# before it is read back, instead of relying on garbage collection to do so.
import pickle
with open('decision_tree_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back from disk (round-trip check of the saved file)
with open('decision_tree_regression_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Data to predict: a single sample laid out in feature_names order.
# NOTE(review): these values match the second row shown by data.head()
# (median_house_value 358500.0); if that row is in the training fold, an
# unpruned tree will simply reproduce its label -- confirm before reading the
# prediction as evidence of accuracy.
new_data = pd.DataFrame([[-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014, 2]], columns=feature_names)

# Predict the output for the sample
print('predicted price: ',model.predict(new_data))


Mean Squared Error: 4878626268.260175
predicted price:  [358500.]


In [23]:
# Random forest regressor: fit, evaluate on the held-out rows, persist, reload, predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Create and fit the random forest regression model.
# random_state is fixed (same seed as the data split) so the bootstrap samples
# and feature choices -- and therefore the reported MSE -- are reproducible.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Save the model. The context manager closes (and therefore flushes) the file
# before it is read back, instead of relying on garbage collection to do so.
import pickle
with open('random_forest_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back from disk (round-trip check of the saved file)
with open('random_forest_regression_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Data to predict: a single sample laid out in feature_names order.
# NOTE(review): these values match the second row shown by data.head(), so this
# is probably not an out-of-sample check -- confirm.
new_data = pd.DataFrame([[-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014, 2]], columns=feature_names)

# Predict the output for the sample
print('predicted price: ',model.predict(new_data))


Mean Squared Error: 2374804222.1322384
predicted price:  [377727.]


In [34]:
# Support vector regressor: fit, evaluate on the held-out rows, persist, reload, predict
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Create and fit the support vector regression model.
# NOTE(review): the features are passed in unscaled and the target is in raw
# currency units; kernel SVR with default settings is sensitive to both, which
# may explain the comparatively large MSE -- consider scaling the inputs.
model = SVR()
model.fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Save the model. The context manager closes (and therefore flushes) the file
# before it is read back, instead of relying on garbage collection to do so.
import pickle
with open('support_vector_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back from disk (round-trip check of the saved file)
with open('support_vector_regression_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Data to predict: a single sample laid out in feature_names order.
# NOTE(review): these values match the second row shown by data.head(), so this
# is probably not an out-of-sample check -- confirm.
new_data = pd.DataFrame([[-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014, 2]], columns=feature_names)

# Predict the output for the sample
print('predicted price: ',model.predict(new_data))


Mean Squared Error: 13741698441.391218
predicted price:  [180562.38487727]


In [33]:
# K-nearest-neighbours regressor: fit, evaluate on the held-out rows, persist, reload, predict
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Create and fit the KNN regression model.
# NOTE(review): the features are passed in unscaled, so neighbour distances are
# dominated by the columns with the largest magnitudes (e.g. total_rooms,
# population) -- consider scaling the inputs.
model = KNeighborsRegressor()
model.fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Save the model. The context manager closes (and therefore flushes) the file
# before it is read back, instead of relying on garbage collection to do so.
import pickle
with open('knn_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back from disk (round-trip check of the saved file)
with open('knn_regression_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Data to predict: a single sample laid out in feature_names order.
# NOTE(review): these values match the second row shown by data.head(), so this
# is probably not an out-of-sample check -- confirm.
new_data = pd.DataFrame([[-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014, 2]], columns=feature_names)

# Predict the output for the sample
print('predicted price: ',model.predict(new_data))

Mean Squared Error: 10146836404.171522
predicted price:  [408180.4]


In [None]:
# Polynomial regression: fit, evaluate on the held-out rows, persist, reload, predict
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

# Create and fit the polynomial regression model.
# This cell previously fitted PoissonRegressor, which is a Poisson GLM and not
# polynomial regression. Polynomial regression is ordinary least squares on
# polynomially expanded features, so the expansion and the linear model are
# chained in one pipeline; that way the same expansion is applied automatically
# at predict time and is stored inside the pickled model.
model = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),  # the regressor fits its own intercept
    LinearRegression(),
)
model.fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Save the model. The context manager closes (and therefore flushes) the file
# before it is read back, instead of relying on garbage collection to do so.
import pickle
with open('polynomial_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back from disk (round-trip check of the saved file)
with open('polynomial_regression_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Data to predict: a single sample laid out in feature_names order.
# NOTE(review): these values match the second row shown by data.head(), so this
# is probably not an out-of-sample check -- confirm.
new_data = pd.DataFrame([[-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014, 2]], columns=feature_names)

# Predict the output for the sample
print('predicted price: ',model.predict(new_data))

In [27]:
# Ridge regressor: fit, evaluate on the held-out rows, persist, reload, predict
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Create and fit the Ridge regression model (default regularisation strength)
model = Ridge()
model.fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Save the model. The context manager closes (and therefore flushes) the file
# before it is read back, instead of relying on garbage collection to do so.
import pickle
with open('ridge_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back from disk (round-trip check of the saved file)
with open('ridge_regression_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Data to predict: a single sample laid out in feature_names order.
# NOTE(review): these values match the second row shown by data.head(), so this
# is probably not an out-of-sample check -- confirm.
new_data = pd.DataFrame([[-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014, 2]], columns=feature_names)

# Predict the output for the sample
print('predicted price: ',model.predict(new_data))


Mean Squared Error: 4966745753.713151
predicted price:  [413057.89460897]


In [26]:
# Lasso regressor: fit, evaluate on the held-out rows, persist, reload, predict
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Create and fit the Lasso regression model (default regularisation strength)
model = Lasso()
model.fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Save the model. The context manager closes (and therefore flushes) the file
# before it is read back, instead of relying on garbage collection to do so.
import pickle
with open('lasso_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back from disk (round-trip check of the saved file)
with open('lasso_regression_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Data to predict: a single sample laid out in feature_names order.
# NOTE(review): these values match the second row shown by data.head(), so this
# is probably not an out-of-sample check -- confirm.
new_data = pd.DataFrame([[-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014, 2]], columns=feature_names)

# Predict the output for the sample
print('predicted price: ',model.predict(new_data))


Mean Squared Error: 4966755679.213585
predicted price:  [413060.04292171]


In [25]:
# ElasticNet regressor: fit, evaluate on the held-out rows, persist, reload, predict
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

# Create and fit the ElasticNet regression model (default hyper-parameters)
model = ElasticNet()
model.fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Save the model. The context manager closes (and therefore flushes) the file
# before it is read back, instead of relying on garbage collection to do so.
import pickle
with open('elasticnet_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back from disk (round-trip check of the saved file)
with open('elasticnet_regression_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Data to predict: a single sample laid out in feature_names order.
# NOTE(review): these values match the second row shown by data.head(), so this
# is probably not an out-of-sample check -- confirm.
new_data = pd.DataFrame([[-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014, 2]], columns=feature_names)

# Predict the output for the sample
print('predicted price: ',model.predict(new_data))


Mean Squared Error: 5116110547.726501
predicted price:  [398434.85385292]


In [24]:
# Gradient boosting regressor: fit, evaluate on the held-out rows, persist, reload, predict
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Create and fit the Gradient Boosting regression model.
# random_state is fixed (same seed as the data split) so the fitted ensemble
# and the reported MSE are reproducible from run to run.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Save the model. The context manager closes (and therefore flushes) the file
# before it is read back, instead of relying on garbage collection to do so.
import pickle
with open('gradientboosting_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back from disk (round-trip check of the saved file)
with open('gradientboosting_regression_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Data to predict: a single sample laid out in feature_names order.
# NOTE(review): these values match the second row shown by data.head(), so this
# is probably not an out-of-sample check -- confirm.
new_data = pd.DataFrame([[-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014, 2]], columns=feature_names)

# Predict the output for the sample
print('predicted price: ',model.predict(new_data))

Mean Squared Error: 3244452964.9279847
predicted price:  [417067.47377456]
