# Property price prediction project


In [None]:
import IPython.display

# Banner image for the notebook, referenced by URL; the cell's last expression is rendered.
banner_url = 'https://na.rdcpix.com/307f0bb1b63fb77c661794a66a6b500aw-c525726921rd-w628_h354_r4_q80.jpg'
IPython.display.Image(banner_url)

# Problem Objective :

The project aims at building a model of housing prices to predict median house values in California using the provided dataset. This model should learn from the data and be able to predict the median housing price in any district, given all the other metrics.

Districts or block groups are the smallest geographical units for which the US Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people). There are 20,640 districts in the project dataset.

## step 1 : Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn  as sns 
import os 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,root_mean_squared_error,r2_score
# dataset
from sklearn.datasets import fetch_california_housing

import warnings
warnings.filterwarnings('ignore')

print('modules Loaded Successfully')

## Step 2: Load the data and create the DataFrame

In [None]:
# Load the California housing dataset through scikit-learn's fetch_california_housing.
data_dict = fetch_california_housing()

# List the fields available on the returned object.
data_dict.keys()

In [None]:
print(data_dict['feature_names'])

In [None]:
print(data_dict['target_names'])

In [None]:
# Assemble one DataFrame: a column per feature, with the target appended as the last column.
feature_frame = pd.DataFrame(data=data_dict['data'], columns=data_dict['feature_names'])
df = feature_frame.assign(MedHouseVal=data_dict['target'])

print('data Loaded Successfully')

## Step 3: Understanding Data using EDA

In [None]:
# shape 
df.shape

In [None]:
# info
df.info()

In [None]:
# checking Null values 
df.isna().sum()

In [None]:
print(data_dict['DESCR'])

In [None]:
# Units reminder (see the DESCR text printed above):
#   - house values are expressed in hundreds of thousands of dollars (1.0 == $100,000, i.e. 1 lakh dollars)
#   - occupancy is the average number of household members in a block
# Show one randomly chosen row.
df.sample()

In [None]:
# All columns must be numeric for modelling; this dataset already contains only numeric values,
# so we can proceed with the analysis.

In [None]:
# Pairwise scatter plots of every column (lower triangle only).
# NOTE: rendering this on the full DataFrame takes a while.
sns.pairplot(df, corner=True)
plt.show()

In [None]:
# Correlation between every pair of columns (features and target), rounded to 2 decimals.
corr_matrix = df.corr().round(2)

plt.title('Features vs target Corr')
sns.heatmap(corr_matrix, cmap='mako', annot=True)
plt.show()

In [None]:
# data describe 
df.describe()

In [None]:
# One histogram per column, placed on a 3x3 grid of subplots.
plt.figure(figsize=(15, 12))
for position, column in enumerate(df.columns, start=1):
    plt.subplot(3, 3, position)
    plt.hist(df[column], color='blue', alpha=0.3)
    plt.title(column + ' Analysis')
plt.show()


In [None]:
# Joint distribution of the target (MedHouseVal, x-axis) and median income (MedInc, y-axis).

sns.jointplot(x='MedHouseVal', y='MedInc', data=df)
plt.show()

In [None]:
# sns.regplot(data = df, x = 'MedHouseVal', y = 'MedInc')
# plt.show()

In [None]:
# sns.histplot(data = df, x = 'MedHouseVal', color = 'b')
# sns.kdeplot(data = df, x = 'MedHouseVal',color  = 'r')
# plt.show()

In [None]:
# Histogram of the target with a KDE curve overlaid.
target_ax = sns.histplot(df, x='MedHouseVal', kde=True, color='b')
target_ax.set_title('MedHouseVal Distribution')
plt.show()

In [None]:
# Summary statistics of the target column.
df['MedHouseVal'].describe()
# Reading the 75% row: three quarters of the districts have a median house value
# below about 2.64, i.e. roughly $264,000 (values are in units of $100,000).

## Step 4: Feature Engineering and Preprocessing

In [None]:
# Bring all features onto the same scale for better prediction.
# 1. Normalisation (min-max scaling): X_new = (X - X_min) / (X_max - X_min)
# 2. Standardisation (z-score):       X_new = (X - mean) / std


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Features are every column except the last one; the target is MedHouseVal.
y = df['MedHouseVal']
X = df.iloc[:, :-1]

# fit() learns each column's min and max; transform() then maps the values into [0, 1].
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics leak into the scaling. Consider fitting on the training rows only.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

print('Done')


In [None]:
X_scaled

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=50,
)

In [None]:
# Scratch cell: shows how a 4-element sequence unpacks into four names,
# mirroring the four values returned by train_test_split above.
a,b,c,d = [23,534,6,65]
print(d)

In [None]:
# Scratch cell: with a fixed seed the "random" draw is the same on every run,
# which is the idea behind passing random_state to train_test_split.
import random
random.seed(576)
random.randint(1,50)

In [None]:
# Report the size of each split. Labels now match the arrays being printed
# (the second line previously reported X_test.shape under the label "X_train").
print('shape of X_train',X_train.shape)
print('shape of X_test',X_test.shape)
print('shape of y_train',y_train.shape)
print('shape of y_test',y_test.shape)

In [None]:
df.shape

## Linear model using Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline model: linear regression fitted on the min-max scaled training split.
lr_model=LinearRegression()
lr_model.fit(X_train,y_train)

In [None]:
y_pred=lr_model.predict(X_test)

In [None]:
# Side-by-side table of the true and the predicted values for the test split.
lr_compare_df = pd.DataFrame(
    {
        'actual house price': y_test,
        'predicted house price': y_pred,
    }
)
print('done')

In [None]:
lr_compare_df

In [None]:
# Mean absolute error of the linear regression on the test split.
lr_mae=mean_absolute_error(y_test,y_pred)
print('Lr MAE',lr_mae)

In [None]:
# Mean squared error of the linear regression on the test split.
lr_mse=mean_squared_error(y_test,y_pred)
print('lr MSE',lr_mse)

In [None]:
# Root mean squared error of the linear regression on the test split.
lr_rmse=root_mean_squared_error(y_test,y_pred)
print('lr RMSE',lr_rmse)

In [None]:
# Score the fitted model on both splits so the two values can be compared.
lr_train_score = lr_model.score(X_train, y_train)
lr_test_score = lr_model.score(X_test, y_test)

print('training score', lr_train_score)
print('testing score', lr_test_score)

In [None]:
# Model score: approximately 60%

In [None]:
# Check the VIF (variance inflation factor) of each feature and drop a column if its VIF is greater than 10.


In [None]:
# Wrap the scaled training array in a DataFrame so the columns can be addressed by feature name.
X_train_df = pd.DataFrame(X_train,columns = data_dict['feature_names'])
X_train_df.sample()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Add a constant column of ones to act as the intercept term
# (the "C" in the line equation y = m*x + C).
X_train_df['intercept'] = 1

X_train_df.sample()
# Multicollinearity means that one feature can be (approximately) predicted
# from the other features, i.e. the x's depend on each other.

In [None]:
X_train_df.shape[1]

In [None]:
X_train_df.columns

In [None]:
X_train_df.values.shape

In [None]:
# Variance inflation factor for every column of the design matrix.
# The feature labels are read from X_train_df itself rather than from a separately
# maintained list, so each label is guaranteed to sit next to the VIF of the column
# it names (the old hard-coded list also spelt the constant column 'Intercept'
# while the actual column is 'intercept').
design_matrix = X_train_df.values
vif_value = [
    variance_inflation_factor(design_matrix, column_idx)
    for column_idx in range(design_matrix.shape[1])
]

vif_df = pd.DataFrame({
    'features': list(X_train_df.columns),
    'VIF factor': vif_value,
})

# Largest VIF first: those are the candidates for removal.
vif_df.sort_values(by = 'VIF factor',ascending=False)

In [None]:
# Build the reduced feature list: every original feature except 'Latitude'
# (presumably the column flagged by the VIF table above — confirm against its output),
# then preview the training frame restricted to those columns.
col=list(X.columns)
col.remove('Latitude')
X_train_vif=X_train_df[col]
X_train_vif.sample()


In [None]:
def built_model(ml_model, col, splits=None):
    """Fit ``ml_model`` on the selected feature columns and evaluate it.

    Parameters
    ----------
    ml_model : callable
        Estimator class (not an instance), e.g. ``LinearRegression`` or ``Ridge``.
        It is instantiated with no arguments.
    col : list of str
        Names of the feature columns to train on. They are looked up in a frame
        whose columns are ``data_dict['feature_names']``.
    splits : tuple, optional
        ``(X_train, X_test, y_train, y_test)`` to train and evaluate on. When
        omitted, the notebook-level variables of the same names are read at call
        time, which is the original behaviour. Passing the splits explicitly
        avoids depending on whichever cell last rebound those globals.

    Returns
    -------
    tuple
        ``(model_matrix, machine_model)``: a one-row DataFrame (index ``1``) holding
        the train/test scores and the MAE, MSE, RMSE and R2 of the test predictions,
        and the fitted estimator.
    """
    if splits is None:
        # Fall back to the notebook globals so existing calls keep working unchanged.
        splits = (X_train, X_test, y_train, y_test)
    train_features, test_features, train_target, test_target = splits

    # Re-attach the feature names so the requested columns can be selected by name.
    feature_names = data_dict['feature_names']
    final_X_train = pd.DataFrame(train_features, columns=feature_names)[col]
    final_X_test = pd.DataFrame(test_features, columns=feature_names)[col]

    machine_model = ml_model()
    machine_model.fit(final_X_train, train_target)

    model_y_pred = machine_model.predict(final_X_test)

    model_metrics = {'model training score': machine_model.score(final_X_train, train_target),
                     'model test score': machine_model.score(final_X_test, test_target),
                     'MAE Error': mean_absolute_error(test_target, model_y_pred),
                     'MSE Error': mean_squared_error(test_target, model_y_pred),
                     'RMSE Error': root_mean_squared_error(test_target, model_y_pred),
                     'R2 Score': r2_score(test_target, model_y_pred)}

    model_matrix = pd.DataFrame(model_metrics, index=[1])

    return model_matrix, machine_model

In [None]:
print(col)

In [None]:
# Refit linear regression on the reduced feature list and show its metrics table
# (element 0 of the tuple returned by built_model).
ml_model=LinearRegression
built_model(ml_model,col)[0]

In [None]:
from sklearn.linear_model import Lasso,Ridge

In [None]:
# Same evaluation with Ridge regression (instantiated with its default arguments by built_model).
ml_model = Ridge

built_model(ml_model,col)[0]

In [None]:
# Feature list used from here on; neither 'Latitude' nor 'Longitude' is included.
col =['MedInc','HouseAge','AveRooms','AveBedrms','Population','AveOccup']

In [None]:
all_model=[LinearRegression,Ridge]
from sklearn.preprocessing import StandardScaler

# Standardise every feature column. The scaler is fitted on all columns of X;
# built_model later selects the `col` subset by name.
# NOTE(review): the scaler is fitted on all rows before splitting, so test rows
# influence the scaling statistics.
sc=StandardScaler()
ss_X = sc.fit_transform(X)

# Try successive random_state values until Ridge reaches the target test score.
# NOTE(review): picking the split by its test score makes the reported score
# optimistic; treat it as a demonstration rather than an unbiased estimate.
TARGET_TEST_SCORE = .6
MAX_RANDOM_STATE = 1000   # upper bound so the search cannot run forever

best_random_state = None
for random_state in range(1, MAX_RANDOM_STATE + 1):
    X_train,X_test,y_train,y_test = train_test_split(ss_X,y,random_state=random_state,test_size=0.1)
    temp_df,final_model=built_model(Ridge,col)
    score = temp_df['model test score'].values[0]
    print('random_state',random_state,'test score',score)
    if score>=TARGET_TEST_SCORE:
        # Report the state that was actually used for this split (the previous
        # version incremented its counter first and so printed state + 1).
        best_random_state = random_state
        print('best random state',best_random_state)
        display(temp_df)
        break
    # Wipe the progress line before the next attempt.
    display(clear=True)
else:
    print('no random state up to',MAX_RANDOM_STATE,'reached a test score of',TARGET_TEST_SCORE)

In [None]:
# Rebuild the split with the chosen random_state and refit Ridge on it.
# The fitted estimator is bound to `final_model` here so that the model inspected and
# pickled in the following cells is exactly the one whose metrics are shown below.
# Previously this cell threw its model away and `final_model` silently kept whatever
# the search loop had produced last.
X_train,X_test,y_train,y_test = train_test_split(ss_X,y,random_state=178,test_size=0.1)
final_metrics,final_model = built_model(Ridge,col)
final_metrics

In [None]:
final_model

In [None]:
import pickle

# Persist the fitted model and the fitted scaler next to the notebook.
# NOTE(review): `sc` was fitted on every column of X, while `final_model` was trained
# on the `col` subset only. Whoever loads these files must scale the full feature
# vector first and then select the `col` columns.
artifacts = (
    ('house_priced _pred_ridge_model.pkl', final_model),
    ('sc_scalar.pkl', sc),
)
for filename, artifact in artifacts:
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)
print('ML Model saves successfully')

In [None]:
print('123')

# Step 8: Website building and localhost deployment

In [None]:
# !pip install streamlit

In [None]:
#st.title('california Housing price prediction')


In [None]:
import os
os.getcwd()

In [None]:
# import streamlit as st

In [None]:
col

In [None]:
df[col]

In [None]:
# Export the full DataFrame (features and target) to CSV in the current working directory.
# to_csv is called with its defaults, so the row index is written as the first column.
df.to_csv('california.csv')

In [None]:
# Print the observed minimum and maximum of each selected feature
# (presumably to pick input bounds for the app — confirm against house.py).
for feature in col:
    feature_values = df[feature]
    min_value = feature_values.min()
    max_value = feature_values.max()

    print('min', feature, min_value)
    print('max', feature, max_value)

In [None]:
# !streamlit run house.py

# Step 9: Live deployment using Streamlit

In [None]:
# import streamlit as st
# import pandas as pd
# import random
# from sklearn.preprocessing import StandardScaler
# import pickle

In [None]:
# GitHub: folder "housing price project"
# requirements.txt: list the modules that app.py needs in order to run

In [None]:
pd.__version__

In [None]:
# `st` is never imported in this notebook (every streamlit import is commented out),
# so `st.__version__` raises NameError on a fresh kernel. Read the installed version
# from the package metadata instead; this also works without importing streamlit.
from importlib.metadata import PackageNotFoundError, version

try:
    streamlit_version = version('streamlit')
except PackageNotFoundError:
    streamlit_version = None  # streamlit is not installed in this environment
streamlit_version