In [76]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np


In [77]:
# Data
#
# Load the Boston housing dataset and wrap its feature matrix in a DataFrame
# whose columns are the dataset's own feature names.
# NOTE(review): scikit-learn flags `load_boston` as deprecated (see the warning
# this cell emits) and removed it in version 1.2, so this cell only runs on
# older scikit-learn releases.
boston_dataset = load_boston()
data = pd.DataFrame(
    boston_dataset.data,
    columns=boston_dataset.feature_names,
)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [78]:
# Model inputs: every column of `data` except INDUS and AGE.
features = data.drop(columns=["INDUS", "AGE"])

# Model target: natural log of the dataset's target values, held in a
# one-column DataFrame named PRICES.
log_prices = np.log(boston_dataset.target)
target = pd.DataFrame(data=log_prices, columns=["PRICES"])

# Display the target's dimensions as the cell output.
target.shape

(506, 1)

In [79]:
# Preview the first rows of the reduced feature table to check its columns.
features.head()

Unnamed: 0,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,0.0,0.538,6.575,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,0.0,0.469,6.421,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,0.0,0.469,7.185,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,0.0,0.458,6.998,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,0.0,0.458,7.147,6.0622,3.0,222.0,18.7,396.9,5.33


In [80]:
# Positions of selected columns inside `features` (and therefore inside
# `property_stats`). They are looked up by name so they stay correct if the
# set or order of columns in `features` ever changes.
CRIM_idx = features.columns.get_loc("CRIM")
ZN_idx = features.columns.get_loc("ZN")
CHAS_idx = features.columns.get_loc("CHAS")
RM_idx = features.columns.get_loc("RM")
PTRATIO_idx = features.columns.get_loc("PTRATIO")

# Template input row for the model: one row holding the mean of every feature
# column, so a prediction can be made while overriding only a few values.
# `np.array(...)` makes an independent, writable copy; `-1` lets NumPy infer the
# column count instead of hard-coding it.
property_stats = np.array(features.mean()).reshape(1, -1)

In [81]:
# Fit a linear regression of the log-price target on the selected features.
regr = LinearRegression()
regr.fit(features, target)

# Predictions on the same rows the model was fitted on (in-sample).
fitted_vals = regr.predict(features)

# Mean squared error of those in-sample predictions, and its square root.
# Both are in the units of the log-transformed target.
MSE = mean_squared_error(target, fitted_vals)
RMSE = np.sqrt(MSE)

# Show the root mean squared error as the cell output.
RMSE

0.18751213519713034

In [82]:
def get_log_estimate(number_of_rooms,
                     students_per_classroom,
                     next_to_river=False,
                     high_confidence=True):
    """Predict the log price of a property together with a range around it.

    The prediction starts from the module-level ``property_stats`` row and
    overrides three of its entries (RM, PTRATIO, CHAS) with the arguments.

    Parameters
    ----------
    number_of_rooms : number
        Value written into the RM column of the model input.
    students_per_classroom : number
        Value written into the PTRATIO column of the model input.
    next_to_river : bool, optional
        When truthy the CHAS column is set to 1, otherwise to 0.
    high_confidence : bool, optional
        True gives a range of +/- 2 * RMSE labelled 95; False gives
        +/- 1 * RMSE labelled 68.

    Returns
    -------
    tuple
        ``(log_estimate, lower_bound, upper_bound, interval)``, all in the
        units of the log-transformed target except ``interval``.
    """
    # Work on a private copy: the shared template must keep its original
    # values no matter how often this function is called or the cell re-run.
    stats = property_stats.copy()
    stats[0, RM_idx] = number_of_rooms
    stats[0, PTRATIO_idx] = students_per_classroom
    stats[0, CHAS_idx] = 1 if next_to_river else 0

    # `predict` returns a 2-D array here; take its single scalar.
    log_estimate = regr.predict(stats)[0][0]

    if high_confidence:
        spread, interval = 2 * RMSE, 95
    else:
        spread, interval = RMSE, 68

    lower_bound = log_estimate - spread
    upper_bound = log_estimate + spread
    return log_estimate, lower_bound, upper_bound, interval

In [92]:
# Quick check of the helper: 4 rooms, 10 students per classroom, defaults for the rest.
print(get_log_estimate(number_of_rooms=4,students_per_classroom=10))

(3.136541553104004, 2.7615172827097436, 3.5115658234982647, 95)




In [87]:
# Median of the dataset's raw (not log-transformed) target values; this is the
# denominator of `scale_factor`.
old_median_price =np.median(boston_dataset.target)

In [106]:
# NOTE(review): 583.3 is an unsourced constant — presumably a current median
# price in the same units as the dataset target; confirm its source and date.
today_median_price = 583.3
# Multiplier that rescales values from the dataset's price level to today's.
scale_factor = today_median_price / old_median_price


In [136]:
def get_dollar_estimate(number_of_rooms,
                        students_per_classroom,
                        next_to_river=False,
                        high_confidence=True):
    """Print a price estimate, in today's dollars, for a Boston property.

    The log-price estimate and its bounds are obtained from
    ``get_log_estimate``. Each value is exponentiated, multiplied by 1000 and
    by the module-level ``scale_factor``, rounded to the nearest thousand and
    printed.

    Parameters
    ----------
    number_of_rooms : int
        Number of rooms in the property. Must be a whole number and at
        least 1.
    students_per_classroom : number
        Number of students per classroom in the area. Must be between 1 and
        100 inclusive.
    next_to_river : bool, optional
        True if the property is close to the river, else False.
    high_confidence : bool, optional
        Passed through to ``get_log_estimate``; True selects its wider
        range, False its narrower one.

    Returns
    -------
    None
        Output is printed. If the inputs fail validation, only an
        explanatory message is printed.
    """
    # Guard 1: reject room counts that are not whole numbers.
    whole_rooms = int(number_of_rooms)
    if number_of_rooms != whole_rooms:
        print("Room number can't be float")
        return None

    # Guard 2: reject values outside the accepted ranges.
    out_of_range = (
        number_of_rooms < 1
        or students_per_classroom < 1
        or students_per_classroom > 100
    )
    if out_of_range:
        print("Enter realistic values and try again !! ")
        return None

    def to_rounded_dollars(log_value):
        # Undo the log, scale by 1000 and by scale_factor, then round to
        # the nearest thousand.
        return round(np.e ** log_value * 1000 * scale_factor, -3)

    log_est, log_lower, log_upper, conf = get_log_estimate(
        number_of_rooms,
        students_per_classroom,
        next_to_river,
        high_confidence,
    )

    price = to_rounded_dollars(log_est)
    price_high = to_rounded_dollars(log_upper)
    price_low = to_rounded_dollars(log_lower)

    print(f"Estimated price is {price}")
    print(f'At {conf}% confidence lower end is {price_low} and higher end is {price_high}')


In [137]:
# A fractional room count fails the whole-number check, so only the
# validation message is printed.
get_dollar_estimate(4.5,15)

Room number can't be float
