In [10]:
! pip install ydata-profiling dtale

Collecting ydata-profiling
  Using cached ydata_profiling-4.12.2-py2.py3-none-any.whl.metadata (20 kB)
Collecting dtale
  Using cached dtale-3.16.1-py2.py3-none-any.whl.metadata (16 kB)
Collecting visions<0.8.0,>=0.7.5 (from visions[type_image_path]<0.8.0,>=0.7.5->ydata-profiling)
  Using cached visions-0.7.6-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata-profiling)
  Using cached htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting phik<0.13,>=0.11.1 (from ydata-profiling)
  Downloading phik-0.12.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Using cached multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting statsmodels<1,>=0.13.2 (from ydata-profiling)
  Downloading statsmodels-0.14.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.2 kB)
Collecting typeguard<5,>=3 (from ydata-profiling)
  Using cache

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('../data/housing.csv')

data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
3996,-117.38,34.44,4.0,5083.0,867.0,2541.0,856.0,4.2414,121400.0,INLAND
14953,-118.33,33.98,38.0,3063.0,796.0,2153.0,721.0,1.8472,149100.0,<1H OCEAN
5254,-121.53,38.57,34.0,3395.0,592.0,1518.0,627.0,4.0833,118500.0,INLAND
16312,-117.87,34.09,36.0,1267.0,191.0,640.0,200.0,5.2405,220000.0,<1H OCEAN
10550,-117.12,33.49,4.0,21988.0,4055.0,8824.0,3252.0,3.9963,191100.0,<1H OCEAN


In [3]:
data.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [4]:
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes('object').columns

print(num_cols)
print(cat_cols)

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')
Index(['ocean_proximity'], dtype='object')


In [15]:
# from ydata_profiling import ProfileReport

# profile = ProfileReport(data, title="Housing data Report")
# profile.to_file("housing_data_report.html")


In [12]:
# import dtale

# d = dtale.show(data)
# d.open_browser()

In [16]:
# d.kill()

In [41]:
bins = [0, 1.5, 3, 4.5, 6, np.inf]
labels = ['A', 'B', 'C', 'D', 'E']

data['income_cat'] = pd.cut(data['median_income'], bins=bins, labels=labels)

In [42]:
data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'income_cat'],
      dtype='object')

In [56]:
# split the data into train and test
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, 
                               test_size=0.2, 
                               random_state=52,
                               stratify=data['income_cat']
                               )

train.shape, test.shape

((16512, 11), (4128, 11))

In [57]:
print(train['income_cat'].value_counts(normalize=True))
print(test['income_cat'].value_counts(normalize=True))

income_cat
C    0.350594
B    0.318859
D    0.176296
E    0.114462
A    0.039789
Name: proportion, dtype: float64
income_cat
C    0.350533
B    0.318798
D    0.176357
E    0.114341
A    0.039971
Name: proportion, dtype: float64


In [58]:
print(train.shape, test.shape)

train.columns

(16512, 11) (4128, 11)


Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'income_cat'],
      dtype='object')

In [59]:
train_set = train.drop(columns=['income_cat']).copy()
test_set = test.drop(columns=['income_cat']).copy()

train_set.shape, test_set.shape

((16512, 10), (4128, 10))

In [60]:
train_set.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [61]:
X = train_set.drop(columns='median_house_value', axis=1)
y = train_set['median_house_value']

X.shape, y.shape

((16512, 9), (16512,))

In [63]:
num_cols

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [65]:
X[X.columns]

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity'],
      dtype='object')

In [66]:
# import simple imputer
from sklearn.impute import SimpleImputer

num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

num_cols = X.select_dtypes(include=[np.number]).columns
cat_cols = X.select_dtypes('object').columns

# fit the imputer on the training set
X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

X.isnull().sum()


longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
dtype: int64

In [67]:
# scale the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X[num_cols] = scaler.fit_transform(X[num_cols])

# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder()

X[cat_cols] = encoder.fit_transform(X[cat_cols])

In [69]:
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20219,0.616762,-0.684957,1.303949,-0.598891,-0.590719,-0.621990,-0.549766,0.000757,0.0
11120,1.341665,-1.393465,-1.556549,0.007883,-0.344897,-0.060693,-0.358801,1.250928,0.0
13110,-1.118006,0.746136,-0.364675,-0.101648,-0.402176,-0.164997,-0.309098,1.342745,0.0
13436,0.871728,-0.867949,-0.285216,-0.258841,-0.058503,-0.305542,-0.102437,-0.494422,0.0
9585,0.666755,-0.769414,1.145033,-0.811995,-0.791195,-0.561883,-0.816594,-1.535830,0.0
...,...,...,...,...,...,...,...,...,...
8998,-1.417966,1.018278,-0.285216,-0.667634,0.017869,-0.735133,-0.152140,-0.105331,3.0
16976,1.166689,-1.355928,0.350450,-0.300087,0.182545,-0.324105,0.177471,-0.875854,4.0
13117,-1.762920,1.647020,0.270992,-0.683675,-0.786421,-0.857116,-0.813978,-0.021122,0.0
12847,1.266675,-0.760030,-0.364675,0.929502,1.612129,0.377738,1.613635,-1.275858,1.0


In [70]:
X.shape, y.shape

((16512, 9), (16512,))

# Model Building

In [71]:
# linear regression
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X, y)

In [75]:
# evaluate the model
from sklearn.metrics import mean_squared_error

# prepare the test data

X_test = test_set.drop(columns='median_house_value', axis=1)
y_test = test_set['median_house_value']

X_test[num_cols] = num_imputer.transform(X_test[num_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

X_test[num_cols] = scaler.transform(X_test[num_cols])
X_test[cat_cols] = encoder.transform(X_test[cat_cols])

X_test


# make predictions
preds = model.predict(X_test)

# calculate the rmse
rmse = mean_squared_error(y_test, preds)

print(f"RMSE: {rmse}")

RMSE: 4706913176.1258745


In [80]:
# save the model
import joblib

joblib.dump(model, 'housing_model.pkl')

# save the encoder
joblib.dump(encoder, 'encoder.pkl')

# save the imputer
joblib.dump(num_imputer, 'num_imputer.pkl')

# save the imputer
joblib.dump(cat_imputer, 'cat_imputer.pkl')

# save the scaler
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [78]:
# generate a data
new_data = data.sample(1)

In [79]:
new_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
10139,-117.95,33.78,26.0,4115.0,883.0,2184.0,825.0,3.9536,191000.0,<1H OCEAN,C


In [82]:
# load the model and other files

model = joblib.load('housing_model.pkl')
encoder = joblib.load('encoder.pkl')
num_imputer = joblib.load('num_imputer.pkl')
scaler = joblib.load('scaler.pkl')
cat_imputer = joblib.load('cat_imputer.pkl')

# prepare the data
new_data[num_cols] = num_imputer.transform(new_data[num_cols])
new_data[cat_cols] = cat_imputer.transform(new_data[cat_cols])

new_data[num_cols] = scaler.transform(new_data[num_cols])
new_data[cat_cols] = encoder.transform(new_data[cat_cols])

# delete the target column and income_cat
new_data = new_data.drop(columns=['median_house_value', 'income_cat'])

# predict
result = model.predict(new_data)

print(f"Predicted value: {result}")


ValueError: Found unknown categories [0.0] in column 0 during transform