# $Boston House Price Prediction$

In [76]:
import warnings
warnings.filterwarnings('ignore')

# Step1: Problem Scoping
# Want to predict house prices in Boston


# Step 2: Data Acquisition
# I supplied the data

import pandas as pd
df = pd.read_csv('boston.csv')
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


```CRIM: Per capita crime rate by town
ZN: Proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS: Proportion of non-retail business acres per town
CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
NOX: Nitric oxide concentration (parts per 10 million)
RM: Average number of rooms per dwelling
AGE: Proportion of owner-occupied units built prior to 1940
DIS: Weighted distances to five Boston employment centers
RAD: Index of accessibility to radial highways
TAX: Full-value property-tax rate per $10,000
PTRATIO: Pupil-teacher ratio by town
B: 1000(Bk - 0.63)^2, where Bk is the proportion of blacks by town
LSTAT: Percentage lower status of the population
PRICE: Median value of owner-occupied homes in $1000's
These features are used to predict housing prices in Boston based on various neighborhood characteristics.```

In [77]:
### Let's only keep things we would understand
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

df = pd.read_csv('boston.csv')

X = df.drop("MEDV", axis=1)
y = df["MEDV"]

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # MODELLING
model = GradientBoostingRegressor(random_state=42)

model.fit(X_train, y_train)

final_preds = model.predict(X_test)

# save the metric scores
final_metrics = {
    "R2": r2_score(y_test, final_preds),
    "RMSE": np.sqrt(mean_squared_error(y_test, final_preds)),
    "MAE": mean_absolute_error(y_test, final_preds)
}

final_metrics

{'R2': 0.9153342280466539,
 'RMSE': np.float64(2.491758688462436),
 'MAE': 1.9122427713612915}

---

In [78]:
import gradio as gr

In [81]:
X_test.head(1)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
173,0.09178,0.0,4.05,0,0.51,6.416,84.1,2.6463,5,296.0,16.6,395.5,9.04


In [98]:
inputs = [0.09178,	0.0	, 4.05,	0,	0.51,	6.416,	'AGE',	2.6463,	5,	'TAX',	16.6,	395.5,	9.04 ]
# inputs = ['CRIM',	'ZN',	'INDUS',	'CHAS',	'NOX',	'RM',	'AGE',	'DIS',	'RAD',	'TAX',	'PTRATIO',	396.90,	9.04]

def price_predict(age, tax):
    inputs = [0.09178,	0.0	, 4.05,	0,	0.51,	6.416,	age,	2.6463,	5,	tax,	16.6,	395.5,	9.04 ]
    predicted_price = model.predict([inputs])

    return f'The Predicted price is: ${predicted_price[0]*100000:.2f}'

In [99]:
price_predict(10, 10000)

'The Predicted price is: $2459948.87'

In [106]:
app = gr.Interface(
    fn = price_predict,
    
    inputs= [
        gr.Slider(1,20, label= "Enter Age of House"),
        gr.Slider(10,50000, label= "Enter Tax per year")
    ],
    outputs= 'text',
    title= "Boston House Price Predictor Application",
    description= "Please enter the inputs, and the model will predict what the value of a house would be, based on historical data of the the houses in boston",
    # api_open = False

)

app.launch(share= True)

* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://0de06d33685a89bb0c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# it sets the seed

np.random.seed(42)
np.random.randint(0,10)

# hitchhikers guide to the galaxy
# 

6

---

### Taking code directly from a Kaggle notebook
- reading the code, and implementing without understand the 100%

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# import joblib

# modify data source
# df = pd.read_csv("/kaggle/input/housing-prices/boston.csv")
df = pd.read_csv('boston.csv')

X = df.drop("MEDV", axis=1)
y = df["MEDV"]

# data transformation
skewed_features = ["CRIM", "ZN", "DIS", "LSTAT", "TAX"]
X[skewed_features] = np.log1p(X[skewed_features])


# train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# # MODELLING PIPELINE
models = {
    "LinearRegression": LinearRegression(),

    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42)
}

# creating an empty dictionary
# maybe we want to store something
results = {}

for name, model in models.items():
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model)
    ])
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    
    results[name] = {
        "R2": r2_score(y_test, preds),
        "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
        "MAE": mean_absolute_error(y_test, preds)
    }

# results is dictionary, let's store the values in a dataframe
results_df = pd.DataFrame(results).T.sort_values("R2", ascending=False)

# choosing best model
best_model = GradientBoostingRegressor(random_state=42)

# HYPERPARAMETER TUNING
# hyperparamter search --- search the best parameters that make this model work and give good outputs
param_grid = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 4]
}
grid = GridSearchCV(
    best_model,
    param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1
)

# using the best hyperparameters to do the job --- prediction
grid.fit(X_train, y_train)
final_model = grid.best_estimator_

# save final predictions
final_preds = final_model.predict(X_test)

# save the metric scores
final_metrics = {
    "R2": r2_score(y_test, final_preds),
    "RMSE": np.sqrt(mean_squared_error(y_test, final_preds)),
    "MAE": mean_absolute_error(y_test, final_preds)
}

# to save our model --- modelling
# we need joblib package
# joblib.dump(final_model, "boston_house_price_model.pkl")

In [24]:
results_df,final_metrics

(                        R2      RMSE       MAE
 GradientBoosting  0.915344  2.491622  1.912183
 RandomForest      0.891343  2.822803  2.053127
 LinearRegression  0.746005  4.315834  2.945026
 Ridge             0.745663  4.318738  2.935824
 Lasso             0.695350  4.726641  3.096040,
 {'R2': 0.9153342280466539,
  'RMSE': np.float64(2.491758688462436),
  'MAE': 1.9122427713612915})