In [14]:
# ! pip install pandas streamlit matplotlib numpy
# ! pip install dtale ydata-profiling
! pip install setuptools



In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
data = pd.read_csv('../data/housing.csv')
data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
18451,-118.65,34.27,23.0,1724.0,265.0,934.0,306.0,6.0783,229200.0,<1H OCEAN
12298,-119.76,36.83,20.0,3214.0,446.0,1360.0,463.0,5.2595,110900.0,INLAND
4022,-121.8,36.68,18.0,8581.0,1957.0,6071.0,1889.0,3.0,162200.0,<1H OCEAN
8256,-118.29,33.97,48.0,3139.0,587.0,1319.0,506.0,3.5208,134200.0,<1H OCEAN
19976,-120.22,36.49,14.0,1508.0,347.0,1679.0,345.0,2.4786,56000.0,INLAND


# Exploratory Data Analysis (EDA) 

01. Diagnosis
    1.  Loading Data
    2.  shape of data
    3.  Data types
    4.  Columns
2.  Univariant Analysis
    1.  Descriptive Statistics
    2.  boxplot
    3.  histogram
3.  Bivariant Analysis
    1.  Correlation
    2.  Pairplot
    3.  Scatterplot

In [8]:
from ydata_profiling import ProfileReport

profile = ProfileReport(data, title="Housing Data Report")
profile.to_file("housing_data_report.html")

  from .autonotebook import tqdm as notebook_tqdm
Summarize dataset: 100%|██████████| 100/100 [00:05<00:00, 17.61it/s, Completed]                                   
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.77s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 108.60it/s]


In [17]:
import dtale
dtale.show(data)
# d.open_browser()



# Split Data into Train and Test

In [18]:
data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [21]:
data['median_income']

0        8.3252
1        8.3014
2        7.2574
3        5.6431
4        3.8462
          ...  
20635    1.5603
20636    2.5568
20637    1.7000
20638    1.8672
20639    2.3886
Name: median_income, Length: 20640, dtype: float64

In [27]:
bins = [0, 1.5, 3.0, 4.5, 6, np.inf]
labels = ['A', 'B', 'C', 'D', 'E']
data['income_cat'] = pd.cut(data['median_income'], bins=bins, labels=labels)
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,E
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,E
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,E
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,D
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,C


In [30]:
data['income_cat'].value_counts() / len(data) * 100

income_cat
C    35.058140
B    31.884690
D    17.630814
E    11.443798
A     3.982558
Name: count, dtype: float64

In [34]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data, test_size=0.2, random_state=42, stratify=data['income_cat'])

test_set['income_cat'].value_counts() / len(test_set) * 100

income_cat
C    35.053295
B    31.879845
D    17.635659
E    11.434109
A     3.997093
Name: count, dtype: float64

In [35]:
train_set['income_cat'].value_counts() / len(train_set) * 100

income_cat
C    35.059351
B    31.885901
D    17.629603
E    11.446221
A     3.978924
Name: count, dtype: float64

In [37]:
import missingno as msno

msno.matrix(data)
plt.show()


FigureCanvasAgg is non-interactive, and thus cannot be shown



In [38]:
from sklearn.impute import SimpleImputer

In [48]:
train_set['total_bedrooms'].isnull().sum()

np.int64(168)

In [51]:
type(train_set[['total_bedrooms']])

pandas.core.frame.DataFrame

In [56]:
train_set[['longitude', 'latitude', 'total_bedrooms']].head()

Unnamed: 0,longitude,latitude,total_bedrooms
13096,-122.42,37.8,1115.0
14973,-118.38,34.14,354.0
3785,-121.98,38.36,217.0
14689,-117.11,33.75,851.0
20507,-118.15,33.77,1211.0


In [60]:
num_cols = train_set.select_dtypes(include=[np.number]).columns
cat_cols = train_set.select_dtypes('object').columns

print(num_cols)
print(cat_cols)

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')
Index(['ocean_proximity'], dtype='object')


In [62]:
train_set[cat_cols].head()

Unnamed: 0,ocean_proximity
13096,NEAR BAY
14973,<1H OCEAN
3785,INLAND
14689,INLAND
20507,NEAR OCEAN


In [68]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler


num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')
std_scaler = StandardScaler()

num_cols_imputed = num_imputer.fit_transform(train_set[num_cols])
num_train_data = std_scaler.fit_transform(num_cols_imputed)

num_train_data_df = pd.DataFrame(num_train_data, columns=num_cols)

num_train_data_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-1.423037,1.013606,1.861119,0.311912,1.366061,0.137460,1.394812,-0.936491,2.185112
1,0.596394,-0.702103,0.907630,-0.308620,-0.438593,-0.693771,-0.373485,1.171942,2.406254
2,-1.203098,1.276119,0.351428,-0.712240,-0.763478,-0.788768,-0.775727,-0.759789,-0.907406
3,1.231216,-0.884924,-0.919891,0.702262,0.740005,0.383175,0.731375,-0.850281,-0.955971
4,0.711362,-0.875549,0.589800,0.790125,1.593718,0.444376,1.755263,-0.180365,1.348242
...,...,...,...,...,...,...,...,...,...
16507,0.586397,-0.833359,0.987087,-0.184147,0.137663,-0.445315,0.060101,0.444041,2.546753
16508,0.131525,0.319822,-0.443146,0.139847,0.125806,-0.005950,0.083608,-0.685630,-1.019278
16509,1.256209,-1.428701,-1.237721,0.586026,0.559777,1.268299,0.679135,0.101049,-0.498944
16510,0.586397,-0.739605,0.669257,0.522417,0.792177,0.273563,0.882868,0.145396,2.546753


In [70]:
# import OrdinalEncoder
from sklearn.preprocessing import OrdinalEncoder

cat_cols_imputed = cat_imputer.fit_transform(train_set[cat_cols])
ordinal_encoder = OrdinalEncoder()
cat_train_data = ordinal_encoder.fit_transform(cat_cols_imputed)

cat_train_data_df = pd.DataFrame(cat_train_data, columns=cat_cols)

cat_train_data_df


Unnamed: 0,ocean_proximity
0,3.0
1,0.0
2,1.0
3,1.0
4,4.0
...,...
16507,0.0
16508,1.0
16509,4.0
16510,0.0


In [71]:
# merge the data
train_data = pd.concat([num_train_data_df, cat_train_data_df], axis=1)

In [73]:
X = train_data.drop('median_house_value', axis=1)
y = train_data['median_house_value']

In [76]:
# train the model
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X, y)

In [77]:
# evaluate the model
from sklearn.metrics import mean_squared_error

predictions = lin_reg.predict(X)
lin_mse = mean_squared_error(y, predictions)

print(lin_mse)

0.3599992333178259


In [78]:
# prepare the test data
num_cols_imputed = num_imputer.transform(test_set[num_cols])
num_test_data = std_scaler.transform(num_cols_imputed)

num_test_data_df = pd.DataFrame(num_test_data, columns=num_cols)

cat_cols_imputed = cat_imputer.transform(test_set[cat_cols])
cat_test_data = ordinal_encoder.transform(cat_cols_imputed)

cat_test_data_df = pd.DataFrame(cat_test_data, columns=cat_cols)

test_data = pd.concat([num_test_data_df, cat_test_data_df], axis=1)

test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-1.188103,0.690153,-0.602061,-0.115504,-0.431479,-0.468151,-0.415276,2.043245,1.659575,0.0
1,0.781342,-0.819296,0.589800,-0.480685,-0.649650,-0.567716,-0.595501,0.404345,-0.029776,0.0
2,0.696366,-0.889612,0.113056,1.498977,2.871916,0.648987,2.619822,0.119655,0.899020,4.0
3,-1.453028,0.985480,1.861119,-0.216638,-0.274965,-0.322914,-0.339529,0.137732,0.936310,4.0
4,-1.408041,1.004230,-1.873380,-0.564887,-0.056793,-0.639878,-0.282066,-0.140986,-0.163329,3.0
...,...,...,...,...,...,...,...,...,...,...
4123,-2.292791,2.415237,1.384374,0.184236,0.137663,-0.200513,0.107116,-0.928669,-1.003668,4.0
4124,0.281483,-0.116137,0.033598,-0.374975,-0.189593,-0.266281,-0.211543,-1.079101,-1.274242,1.0
4125,-1.717953,1.421438,0.351428,-0.789578,-0.929478,-0.942226,-0.961176,0.431884,0.345731,0.0
4126,-1.228091,0.919851,-0.363689,-0.257367,-0.369822,0.057077,-0.289902,0.370886,0.072556,3.0


In [79]:
test_data = test_data.drop('median_house_value', axis=1)

# evaluate the model
predictions = lin_reg.predict(test_data)
lin_mse = mean_squared_error(test_set['median_house_value'], predictions)

print(lin_mse)

57043827617.52539


In [80]:
import joblib

joblib.dump(lin_reg, 'lin_reg.pkl')

['lin_reg.pkl']

In [82]:
# load the model

lin_reg = joblib.load('lin_reg.pkl')

# create a sample data
sample_data = {
    'longitude': [-122.23],
    'latitude': [37.88],
    'housing_median_age': [41],
    'total_rooms': [880],
    'total_bedrooms': [129],
    'population': [322],
    'households': [126],
    'median_income': [8.3252],
    'ocean_proximity': ['NEAR BAY']
}

data = pd.DataFrame(sample_data)

data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41,880,129,322,126,8.3252,NEAR BAY


In [84]:
num_cols

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [85]:
# preprocess the data
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes('object').columns

num_cols_imputed = num_imputer.transform(data[num_cols])
num_data = std_scaler.transform(num_cols_imputed)

num_data_df = pd.DataFrame(num_data, columns=num_cols)

cat_cols_imputed = cat_imputer.transform(data[cat_cols])
cat_data = ordinal_encoder.transform(cat_cols_imputed)

cat_data_df = pd.DataFrame(cat_data, columns=cat_cols)

data = pd.concat([num_data_df, cat_data_df], axis=1)

result = lin_reg.predict(data)

print(result)



ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- median_house_value
