<a href="https://colab.research.google.com/github/miller00315/ia_studies/blob/main/House_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importing libraries

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
print(tf.__version__)

2.17.0


#Loading dataset

In [None]:
from pandas.core import indexing
column_names = ['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','ocean_proximity','median_house_value']

raw_dataset = pd.read_csv('/content/housing.csv', names=column_names,
                      na_values = "?", comment='\t',
                      sep=",",index_col=False,skiprows=1)

dataset = raw_dataset.copy()
dataset.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,INLAND,78100
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,INLAND,77100
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7,INLAND,92300
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,INLAND,84700
20639,-121.24,39.37,16,2785,616.0,1387,530,2.3886,INLAND,89400


Check count of missing values for each column

In [None]:
dataset.isna().sum()

Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,207
population,0
households,0
median_income,0
ocean_proximity,0
median_house_value,0


drop rows where we have missing values

In [None]:
dataset = dataset.dropna()

dataset

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,INLAND,78100
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,INLAND,77100
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,INLAND,92300
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,INLAND,84700


In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

dataset['total_bedrooms'].fillna(dataset['total_bedrooms'].median(), inplace=True)

# Define features and target
X = dataset.drop('median_house_value', axis=1)
y = dataset['median_house_value']

# One-hot encode categorical features and scale numerical features
numeric_features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                   'total_bedrooms', 'population', 'households', 'median_income']

categorical_features = ['ocean_proximity']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

X_preprocessed = preprocessor.fit_transform(X)

X_preprocessed

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['total_bedrooms'].fillna(dataset['total_bedrooms'].median(), inplace=True)


array([[-1.32731375,  1.05171726,  0.98216331, ...,  0.        ,
         1.        ,  0.        ],
       [-1.32232256,  1.04235526, -0.60621017, ...,  0.        ,
         1.        ,  0.        ],
       [-1.33230494,  1.03767426,  1.85576873, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.82320322,  1.77727236, -0.92388486, ...,  0.        ,
         0.        ,  0.        ],
       [-0.87311515,  1.77727236, -0.84446619, ...,  0.        ,
         0.        ,  0.        ],
       [-0.83318561,  1.74918635, -1.00330353, ...,  0.        ,
         0.        ,  0.        ]])

Divide dataset into training and testing parts

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [None]:
train_stats = pd.DataFrame(X_train).describe()
train_stats = train_stats.transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,16346.0,-0.004904,1.001182,-2.385447,-1.117684,0.52941,0.77897,2.545852
1,16346.0,0.003439,0.999946,-1.447937,-0.797278,-0.642805,0.976821,2.956885
2,16346.0,-0.001436,0.998862,-2.194584,-0.844466,0.029139,0.664489,1.855769
3,16346.0,-0.004673,0.985398,-1.205604,-0.542969,-0.234075,0.226635,16.787125
4,16346.0,-0.004302,0.990735,-1.274093,-0.575783,-0.246504,0.256611,14.018707
5,16346.0,-0.003845,0.996643,-1.254828,-0.56297,-0.229396,0.260376,30.230881
6,16346.0,-0.004761,0.988772,-1.30381,-0.573998,-0.239173,0.270911,14.602967
7,16346.0,-0.000245,0.999252,-1.775054,-0.687964,-0.176129,0.46273,5.859665
8,16346.0,0.442371,0.496683,0.0,0.0,0.0,1.0,1.0
9,16346.0,0.315429,0.464701,0.0,0.0,0.0,1.0,1.0


Seprate output field from other fields

#Model Training

In [None]:
def norm(x):
  return (pd.DataFrame(x) - train_stats['mean']) / train_stats['std']
normed_train_data = norm(X_train)
normed_test_data = norm(X_test)

def build_model():
  model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=[len(pd.DataFrame(X_train).keys())]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
  ])

  optimizer = tf.keras.optimizers.RMSprop(0.001)

  model.compile(loss='mse',
                optimizer=optimizer,
                metrics=['mae', 'mse'])
  return model

In [None]:
model = build_model()
model.summary()

In [None]:
EPOCHS = 1000
history = model.fit(
  normed_train_data, y_train,
  epochs=EPOCHS)

Epoch 1/1000
[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 55816638464.0000 - mae: 206408.7188 - mse: 55816638464.0000
Epoch 2/1000
[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 55119372288.0000 - mae: 204623.1094 - mse: 55119372288.0000
Epoch 3/1000
[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 51735842816.0000 - mae: 197635.4375 - mse: 51735842816.0000
Epoch 4/1000
[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 42981765120.0000 - mae: 177902.4531 - mse: 42981765120.0000
Epoch 5/1000
[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 31600279552.0000 - mae: 147389.3750 - mse: 31600279552.0000
Epoch 6/1000
[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 18898597888.0000 - mae: 107308.0469 - mse: 18898597888.0000
Epoch 7/1000
[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

#Model Testing

In [39]:
loss, mae, mse = model.evaluate(normed_test_data, y_test, verbose=0)
loss,mae,mse

(3163462144.0, 37924.19921875, 3163462144.0)

In [41]:
pd.DataFrame(X_train).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,16346.0,16346.0,16346.0,16346.0,16346.0,16346.0,16346.0,16346.0,16346.0,16346.0,16346.0,16346.0,16346.0
mean,-0.004904,0.003439,-0.001436,-0.004673,-0.004302,-0.003845,-0.004761,-0.000245,0.442371,0.315429,0.000184,0.112688,0.129328
std,1.001182,0.999946,0.998862,0.985398,0.990735,0.996643,0.988772,0.999252,0.496683,0.464701,0.013547,0.316221,0.335573
min,-2.385447,-1.447937,-2.194584,-1.205604,-1.274093,-1.254828,-1.30381,-1.775054,0.0,0.0,0.0,0.0,0.0
25%,-1.117684,-0.797278,-0.844466,-0.542969,-0.575783,-0.56297,-0.573998,-0.687964,0.0,0.0,0.0,0.0,0.0
50%,0.52941,-0.642805,0.029139,-0.234075,-0.246504,-0.229396,-0.239173,-0.176129,0.0,0.0,0.0,0.0,0.0
75%,0.77897,0.976821,0.664489,0.226635,0.256611,0.260376,0.270911,0.46273,1.0,1.0,0.0,0.0,0.0
max,2.545852,2.956885,1.855769,16.787125,14.018707,30.230881,14.602967,5.859665,1.0,1.0,1.0,1.0,1.0


In [42]:
test_predictions = model.predict(normed_test_data).flatten()
test_predictions

[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


array([201159.36 , 144413.16 , 202440.61 , ..., 115257.45 , 113083.805,
       138323.06 ], dtype=float32)

In [43]:
y_test

Unnamed: 0,median_house_value
14416,245800
16383,137900
7731,218200
1410,220800
1335,170500
...,...
8291,500001
6274,157900
2997,100200
13440,127700


#Model Conversion

In [47]:
kearas_file = "house_prediction.h5"
tf.keras.models.save_model(model,kearas_file)
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tfmodel = converter.convert()
open("house_prediction.tflite","wb").write(tfmodel)



Saved artifact at '/tmp/tmpjxep8095'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 13), dtype=tf.float32, name='keras_tensor_16')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  138249324855360: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138249324866272: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138249118563520: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138249118553664: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138249118556304: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138249118555248: TensorSpec(shape=(), dtype=tf.resource, name=None)


22368