# Data Exploration

In [1]:
import pandas as pd

In [2]:
# Read the housing CSV (path is relative to the notebook's working directory).
HOUSING_CSV = '../dataset/housing.csv'
data = pd.read_csv(HOUSING_CSV)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
from ydata_profiling import ProfileReport

In [5]:
# Build a profiling report over the whole frame and write it to an HTML file.
report_title = "Housing Dataset Profiling Report"
report_path = "housing_dataset_profiling_report.html"

profile = ProfileReport(data, title=report_title)
profile.to_file(report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 75.80it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
import dtale

In [9]:
# Start a D-Tale session on the full frame and open it in the browser.
# The handle is kept in `d` so the session can be shut down afterwards.
d = dtale.show(data)
d.open_browser()

In [11]:
# Shut down the D-Tale instance held in `d` once the interactive inspection is done.
d.kill()

2025-06-21 20:11:08,174 - INFO     - Shutdown complete


---

In [12]:
# List the column names of the loaded frame.
data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [14]:
# Summary statistics, transposed so that each described column becomes one row.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,20640.0,-119.569704,2.003532,-124.35,-121.8,-118.49,-118.01,-114.31
latitude,20640.0,35.631861,2.135952,32.54,33.93,34.26,37.71,41.95
housing_median_age,20640.0,28.639486,12.585558,1.0,18.0,29.0,37.0,52.0
total_rooms,20640.0,2635.763081,2181.615252,2.0,1447.75,2127.0,3148.0,39320.0
total_bedrooms,20433.0,537.870553,421.38507,1.0,296.0,435.0,647.0,6445.0
population,20640.0,1425.476744,1132.462122,3.0,787.0,1166.0,1725.0,35682.0
households,20640.0,499.53968,382.329753,1.0,280.0,409.0,605.0,6082.0
median_income,20640.0,3.870671,1.899822,0.4999,2.5634,3.5348,4.74325,15.0001
median_house_value,20640.0,206855.816909,115395.615874,14999.0,119600.0,179700.0,264725.0,500001.0


In [15]:
from sklearn.model_selection import train_test_split

# Plain random 75/25 split; the fixed random_state keeps it reproducible.
train, test = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
)

# Report the size of each part.
for subset in (train, test):
    print(subset.shape)

(15480, 10)
(5160, 10)


In [20]:
import numpy as np

# Bucket median_income into five ordered categories (labels 1-5); the last bin is open-ended.
# The resulting column is what the stratified split below stratifies on.
income_bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_labels = [1, 2, 3, 4, 5]

data['income_cat'] = pd.cut(data['median_income'], bins=income_bins, labels=income_labels)

In [26]:
# Percentage of rows that fall into each income category.
data['income_cat'].value_counts().div(len(data)).mul(100)

income_cat
3    35.058140
2    31.884690
4    17.630814
5    11.443798
1     3.982558
Name: count, dtype: float64

In [28]:
# Same 75/25 split as before, but stratified on the income category.
train_strat, test_strat = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
    stratify=data['income_cat'],
)

# Report the size of each part.
for subset in (train_strat, test_strat):
    print(subset.shape)

(15480, 11)
(5160, 11)


In [29]:
# Income-category percentages within each stratified part (train first, then test).
for subset in (train_strat, test_strat):
    print(subset['income_cat'].value_counts() / len(subset) * 100)

income_cat
3    35.058140
2    31.886305
4    17.629199
5    11.447028
1     3.979328
Name: count, dtype: float64
income_cat
3    35.058140
2    31.879845
4    17.635659
5    11.434109
1     3.992248
Name: count, dtype: float64


In [31]:
# List the columns of the stratified training part (now includes 'income_cat').
train_strat.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'income_cat'],
      dtype='object')

In [34]:
# Separate predictors from the target. The helper column 'income_cat' was only needed
# for stratification, so it is removed together with the target.
non_feature_cols = ['income_cat', 'median_house_value']

housing = train_strat.drop(columns=non_feature_cols)
housing_labels = train_strat['median_house_value'].copy()

housing_test = test_strat.drop(columns=non_feature_cols)
housing_test_labels = test_strat['median_house_value'].copy()

In [36]:
# Shapes of the training features/labels, a blank separator, then the test features/labels.
print(housing.shape, housing_labels.shape, sep="\n")
print("\n")
print(housing_test.shape, housing_test_labels.shape, sep="\n")

(15480, 9)
(15480,)


(5160, 9)
(5160,)


---

# Data Cleaning, Preprocessing and Baseline Models

In [38]:
# Count the missing values in each training feature column.
housing.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        149
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

In [39]:
# Show the dtype of every training feature column.
housing.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
ocean_proximity        object
dtype: object

In [40]:
# Split the feature names by dtype so that each group can get its own preprocessing.
# 'number' matches every numeric dtype (integers as well as floats). Selecting only
# float64 would leave an integer-typed feature out of both lists, and it would then be
# silently dropped by the column-wise preprocessing built from these lists.
num_columns = housing.select_dtypes(include='number').columns
cat_columns = housing.select_dtypes(include=['object']).columns
print("Numerical columns:", num_columns)
print("Categorical columns:", cat_columns)

Numerical columns: Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')
Categorical columns: Index(['ocean_proximity'], dtype='object')


In [41]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [46]:
# Numeric preprocessing: fill missing values with the column mean, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
num_pipe = Pipeline(steps=numeric_steps)

num_pipe

In [47]:
# Categorical preprocessing: fill missing values with the most frequent category, then
# one-hot encode.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category that was
# not present when it was fitted, instead of raising at transform time. This matters when
# a rare category ends up only in the data being scored and not in the training split.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


cat_pipe

In [52]:
# Show both column groups as plain Python lists (the form passed to the ColumnTransformer).
for column_index in (num_columns, cat_columns):
    print(column_index.tolist())

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
['ocean_proximity']


In [54]:
from sklearn.compose import ColumnTransformer

# Route each column group through its own sub-pipeline: numeric columns through
# num_pipe, categorical columns through cat_pipe.
numeric_features = num_columns.tolist()
categorical_features = cat_columns.tolist()

preprocessing_pipe = ColumnTransformer(
    transformers=[
        ('num', num_pipe, numeric_features),
        ('cat', cat_pipe, categorical_features),
    ]
)

preprocessing_pipe

In [55]:
from sklearn.linear_model import LinearRegression

# Baseline model: the shared preprocessing followed by ordinary linear regression.
linear_steps = [
    ('preprocessing', preprocessing_pipe),
    ('regression', LinearRegression()),
]
model = Pipeline(steps=linear_steps)
model

In [56]:
# Fit on the training features, then print the score on the training data followed by
# the score on the held-out test data.
model.fit(housing, housing_labels)

train_score = model.score(housing, housing_labels)
test_score = model.score(housing_test, housing_test_labels)

print(train_score)
print(test_score)

0.6431768361846115
0.6522201662179922


In [None]:
from sklearn.base import clone
from sklearn.tree import DecisionTreeRegressor

# Decision-tree variant of the baseline pipeline.
# The preprocessing is cloned so this pipeline fits its own unfitted copy. Passing
# `preprocessing_pipe` directly would hand over the very same transformer object that
# `model` already holds, so fitting here would refit the transformer inside the linear
# pipeline as a side effect.
model_tree = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    ('regression', DecisionTreeRegressor(random_state=42))
])

# Fit, then print the score on the training data followed by the score on the test data.
model_tree.fit(housing, housing_labels)
print(model_tree.score(housing, housing_labels))
print(model_tree.score(housing_test, housing_test_labels))

1.0
0.6009988537141799
