In this notebook we try to predict the median house value (`median_house_value`) from the other features in the data set, comparing a linear regression, a neural network and a decision tree.

### Imports

In [None]:
import numpy as np  
import pandas as pd 
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import  MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
%matplotlib inline

### Load Data

In [None]:
# Read the housing CSV (path is relative to the notebook's working directory).
housing_csv_path = "../input/hands-on-machine-learning-housing-dataset/housing.csv"
housing_data = pd.read_csv(housing_csv_path)


### Data overview

In [None]:
# Preview the first five rows (five is head()'s default).
housing_data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
summary_stats = housing_data.describe()
summary_stats

In [None]:
# Print each column's dtype and non-null count; a non-null count below the
# row count reveals columns with missing values.
housing_data.info()

In [None]:
# Frequency of each category in the only column inspected as categorical.
ocean_proximity_counts = housing_data['ocean_proximity'].value_counts()
ocean_proximity_counts

In [None]:
# Draw a 50-bin histogram for every column that hist() can plot, on one large figure.
hist_options = dict(bins=50, figsize=(20, 15))
housing_data.hist(**hist_options)

### Data Visualization

In [None]:
# Geographic scatter plot: one point per row at (longitude, latitude).
# Marker area is population / 100 and marker colour is median_house_value
# on the 'jet' colormap; alpha keeps dense regions readable.
marker_sizes = housing_data['population'] / 100
housing_data.plot.scatter(
    x='longitude',
    y='latitude',
    s=marker_sizes,
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    colorbar=True,
    alpha=0.4,
    label='population',
    figsize=(10, 7),
)

The size (area) of each circle represents the population, and its colour represents the median house value.

### Correlations

In [None]:
# Pairwise correlations between the numeric columns.
# `ocean_proximity` holds strings, and pandas >= 2.0 raises a ValueError when
# DataFrame.corr() is called on a frame with non-numeric columns. Selecting the
# numeric columns first works on every pandas version (older releases silently
# dropped non-numeric columns, so the resulting matrix is the same).
numeric_housing_data = housing_data.select_dtypes(include=[np.number])
cor_mat = numeric_housing_data.corr()
cor_mat

In [None]:
# Pairwise scatter plots (histograms on the diagonal) for three selected features.
scatter_columns = ['median_income', 'housing_median_age', 'total_rooms']
scatter_matrix(housing_data[scatter_columns], figsize=(12, 8))

## Data Preprocessing

## Using one-hot encoding to convert categorical values to numeric

In [None]:
# One-hot encode the categorical columns; with default arguments get_dummies
# expands object/category columns into indicator columns and passes the
# remaining columns through unchanged.
data = pd.get_dummies(housing_data)

# Preview the encoded frame (head() shows five rows by default).
data.head()

## Handling missing data using imputation

In [None]:
# Number of missing values in each column of the encoded frame.
missing_counts = data.isna().sum()
print(missing_counts)

In [None]:
# Fill missing values with the column mean (SimpleImputer's default strategy)
# and rebuild a DataFrame with the original column names.
# NOTE(review): the imputer is fitted on the full data set, i.e. before the
# train/test split, so test rows contribute to the imputed means.
imputer = SimpleImputer()
imputed_data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Confirm that no missing values remain.
print(imputed_data.isnull().sum())

### Scaling the data

In [None]:
# Rescale every column to the [0, 1] range and keep the column names.
# NOTE(review): the target column `median_house_value` is scaled as well, so
# error metrics computed from this frame are in scaled units, and the scaler
# is fitted on all rows (train and test together).
min_max_scaler = MinMaxScaler()
scaled_data = pd.DataFrame(
    min_max_scaler.fit_transform(imputed_data),
    columns=imputed_data.columns,
)

### Splitting data

In [None]:
# Separate the features from the target, then hold out 20% of the rows for
# testing; random_state makes the split reproducible.
target_column = 'median_house_value'
X = scaled_data.drop(columns=target_column)
y = scaled_data[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)


## Using Linear Regression

In [None]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, so linear_model and linear_reg name the same fitted object.
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
linear_model = linear_reg

# Predict the target for the held-out rows and display the result.
predictions = linear_model.predict(X_test)
predictions

## Model Evaluation

In [None]:
# Report the linear model's test-set error metrics, one per line.
linear_metrics = (
    ('MAE', mean_absolute_error),
    ('MSE', mean_squared_error),
    ('R2 score', r2_score),
)
for metric_label, metric_fn in linear_metrics:
    print(metric_label, metric_fn(y_test, predictions))

## Using Neural Network

In [None]:
# Multi-layer perceptron regressor with a fixed seed, capped at 300 training
# iterations; training may stop earlier after 10 iterations without improvement.
nn = MLPRegressor(
    max_iter=300,
    n_iter_no_change=10,
    random_state=1,
)

# fit() returns the estimator itself, so nn_model and nn are the same object.
nn.fit(X_train, y_train)
nn_model = nn

In [None]:
# Predict the target for the held-out test rows with the fitted network.
nn_predictions = nn_model.predict(X_test) 

In [None]:
# Evaluate the neural network with the same metrics as the other models.
# For a regressor, score() returns the R^2 coefficient rather than an accuracy,
# so the value is reported as 'R2 score' and computed from nn_predictions.
print('MAE', mean_absolute_error(y_test, nn_predictions))
print('MSE', mean_squared_error(y_test, nn_predictions))
print('R2 score', r2_score(y_test, nn_predictions))

## Using Decision Tree Regressor

In [None]:
# Fit a decision tree regressor on the training split.
# random_state is fixed because the tree permutes features at every split, so
# ties between equally good splits would otherwise be broken differently from
# run to run and the reported metrics would not be reproducible.
dec_tree_reg = DecisionTreeRegressor(random_state=0)

# fit() returns the estimator itself, so both names refer to the fitted tree.
dec_tree_model = dec_tree_reg.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows with the fitted tree.
dec_predict = dec_tree_model.predict(X_test)

In [None]:
# Report the decision tree's test-set error metrics, one per line.
tree_metrics = (
    ('MAE', mean_absolute_error),
    ('MSE', mean_squared_error),
    ('R2 score', r2_score),
)
for metric_label, metric_fn in tree_metrics:
    print(metric_label, metric_fn(y_test, dec_predict))

In [None]:
# Side-by-side R2 comparison of the three models on the held-out test set.
# (This cell previously held only the undefined name `nan`, which raises a
# NameError when the notebook is run top to bottom.)
model_comparison = pd.DataFrame(
    {
        'R2 score': [
            r2_score(y_test, predictions),
            r2_score(y_test, nn_predictions),
            r2_score(y_test, dec_predict),
        ]
    },
    index=['Linear Regression', 'Neural Network', 'Decision Tree Regressor'],
)
model_comparison