In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the raw housing table; the path is relative to the notebook's working directory.
dataframe = pd.read_csv(filepath_or_buffer="../data/housing.csv")
# Show (rows, columns) as a quick sanity check on the load.
dataframe.shape


(20640, 10)

In [17]:
# Preview the first five rows of the raw table.
dataframe.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [18]:
# Features: every column except the target.
X = dataframe.drop(columns=["median_house_value"])
# Target: the column the model is meant to predict.
y = dataframe.loc[:, "median_house_value"]

(X.shape, y.shape)


((20640, 9), (20640,))

In [19]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Verify Split

In [20]:
# Report the shape of each piece of the split, one per line.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)


X_train shape: (16512, 9)
X_test shape: (4128, 9)
y_train shape: (16512,)
y_test shape: (4128,)


In [21]:
# Preview the first five training feature rows (the index shows the shuffled row labels).
X_train.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND


In [22]:
# Preview the first five training targets.
y_train.head(n=5)

14196    103000.0
8267     382100.0
17445    172600.0
14265     93400.0
2271      96500.0
Name: median_house_value, dtype: float64

Cleaning Data



In [23]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [15]:
# Declare the categorical features once and derive the numerical ones from that
# list, so the two groups cannot drift apart.
categorical_cols = ["ocean_proximity"]
numerical_cols = X_train.drop(columns=categorical_cols).columns

# Fail early with a clear message if a non-numeric column ended up in the
# numerical group; the median/mean/std steps that follow need numeric data.
non_numeric_cols = X_train[numerical_cols].select_dtypes(exclude="number").columns
assert non_numeric_cols.empty, (
    f"Non-numeric columns treated as numerical: {list(non_numeric_cols)}"
)

numerical_cols, categorical_cols

(Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
        'total_bedrooms', 'population', 'households', 'median_income'],
       dtype='object'),
 ['ocean_proximity'])

In [27]:
# Per-column medians of the numerical features, computed on the training split only.
num_medians = X_train.loc[:, numerical_cols].median()
num_medians

longitude             -118.5100
latitude                34.2600
housing_median_age      29.0000
total_rooms           2129.0000
total_bedrooms         437.0000
population            1167.0000
households             410.0000
median_income            3.5458
dtype: float64

In [28]:
# Fill missing numerical values in both splits with the training medians, so the
# test split is imputed with statistics learned from training data only.
X_train_num, X_test_num = (
    part[numerical_cols].fillna(value=num_medians) for part in (X_train, X_test)
)

In [29]:
# Standardisation statistics, estimated on the imputed training split only.
num_means = X_train_num.mean()
# Note: pandas' default std is the sample standard deviation (ddof=1).
# A column that is constant in the training split has std == 0, which would turn
# the scaling division into inf/NaN; use 1.0 there so the division is a no-op.
num_stds = X_train_num.std().replace(0, 1.0)

In [32]:
# Standardise both splits with the training mean and std (column-wise alignment).
X_train_num_scaled = X_train_num.sub(num_means).div(num_stds)
X_test_num_scaled = X_test_num.sub(num_means).div(num_stds)


In [33]:
# Preview the standardised training features.
X_train_num_scaled.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
14196,1.272548,-1.37277,0.34848,0.222563,0.211221,0.768253,0.322896,-0.326186
8267,0.709141,-0.876669,1.618069,0.340283,0.593076,-0.098898,0.672007,-0.035842
17445,-0.44759,-0.460133,-1.952651,-0.342587,-0.495211,-0.449804,-0.430448,0.144697
14265,1.232661,-1.38213,0.586528,-0.561473,-0.409293,-0.007434,-0.380575,-1.017834
2271,-0.108548,0.532068,1.141973,-0.119562,-0.256551,-0.485862,-0.314953,-0.171483


In [34]:
# Preview the standardised test features (scaled with the training statistics).
X_test_num_scaled.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
20046,0.285339,0.195094,-0.286315,-0.522846,-0.242232,-0.0303,-0.370076,-1.15505
3024,0.060973,-0.235483,0.110432,0.138411,-0.242232,0.121847,0.220525,-0.708638
15663,-1.424827,1.009447,1.856117,0.546293,-0.242232,-0.102416,1.21536,-0.210395
20484,0.42993,-0.63798,-0.92111,0.188074,-0.242232,0.244972,-0.01309,0.975084
9814,-1.170546,0.457185,0.427829,-0.133817,-0.242232,-0.319644,-0.188958,-0.081791


In [35]:
# Distinct ocean_proximity labels present in the training split.
train_categories = pd.unique(X_train["ocean_proximity"])
train_categories


array(['NEAR OCEAN', 'INLAND', '<1H OCEAN', 'NEAR BAY', 'ISLAND'],
      dtype=object)

In [39]:
# One-hot encode ocean_proximity separately for each split, keeping every level
# (drop_first=False), so each category gets its own indicator column.
X_train_cat, X_test_cat = (
    pd.get_dummies(part["ocean_proximity"], drop_first=False)
    for part in (X_train, X_test)
)

X_train_cat


Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
14196,False,False,False,False,True
8267,False,False,False,False,True
17445,False,False,False,False,True
14265,False,False,False,False,True
2271,False,True,False,False,False
...,...,...,...,...,...
11284,True,False,False,False,False
11964,False,True,False,False,False
5390,True,False,False,False,False
860,True,False,False,False,False


In [40]:
# Display the one-hot encoded ocean_proximity columns for the test split.
X_test_cat

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
20046,False,True,False,False,False
3024,False,True,False,False,False
15663,False,False,False,True,False
20484,True,False,False,False,False
9814,False,False,False,False,True
...,...,...,...,...,...
15362,True,False,False,False,False
16623,False,False,False,False,True
18086,True,False,False,False,False
2144,False,True,False,False,False


In [43]:
# Give the test dummies exactly the training dummy columns: categories missing
# from the test split are added as all-zero columns, and categories that appear
# only in the test split are dropped (join="left" keeps the training columns).
X_train_cat, X_test_cat = X_train_cat.align(
    X_test_cat,
    join="left",
    axis=1,
    fill_value=0
)
# A column created by the integer fill may not share the dtype of the training
# dummies; cast so both splits use the same dtype for every dummy column.
X_test_cat = X_test_cat.astype(X_train_cat.dtypes.to_dict())

# Final feature matrices: scaled numerical columns followed by the dummy columns.
X_train_clean = pd.concat([X_train_num_scaled, X_train_cat], axis=1)
X_test_clean = pd.concat([X_test_num_scaled, X_test_cat], axis=1)

# Both splits must expose the same features in the same order.
assert list(X_train_clean.columns) == list(X_test_clean.columns), (
    "train/test feature columns differ"
)


In [44]:
# Preview the final training feature matrix.
X_train_clean.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
14196,1.272548,-1.37277,0.34848,0.222563,0.211221,0.768253,0.322896,-0.326186,False,False,False,False,True
8267,0.709141,-0.876669,1.618069,0.340283,0.593076,-0.098898,0.672007,-0.035842,False,False,False,False,True
17445,-0.44759,-0.460133,-1.952651,-0.342587,-0.495211,-0.449804,-0.430448,0.144697,False,False,False,False,True
14265,1.232661,-1.38213,0.586528,-0.561473,-0.409293,-0.007434,-0.380575,-1.017834,False,False,False,False,True
2271,-0.108548,0.532068,1.141973,-0.119562,-0.256551,-0.485862,-0.314953,-0.171483,False,True,False,False,False


In [45]:
# Preview the final test feature matrix.
X_test_clean.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
20046,0.285339,0.195094,-0.286315,-0.522846,-0.242232,-0.0303,-0.370076,-1.15505,False,True,False,False,False
3024,0.060973,-0.235483,0.110432,0.138411,-0.242232,0.121847,0.220525,-0.708638,False,True,False,False,False
15663,-1.424827,1.009447,1.856117,0.546293,-0.242232,-0.102416,1.21536,-0.210395,False,False,False,True,False
20484,0.42993,-0.63798,-0.92111,0.188074,-0.242232,0.244972,-0.01309,0.975084,True,False,False,False,False
9814,-1.170546,0.457185,0.427829,-0.133817,-0.242232,-0.319644,-0.188958,-0.081791,False,False,False,False,True


In [46]:
# Both splits should report the same number of feature columns.
(X_train_clean.shape, X_test_clean.shape)


((16512, 13), (4128, 13))

In [47]:
import joblib

# Persist the cleaned feature matrices and the targets next to the raw data.
joblib.dump(value=X_train_clean, filename="../data/X_train_clean.joblib")
joblib.dump(value=X_test_clean, filename="../data/X_test_clean.joblib")
joblib.dump(value=y_train, filename="../data/y_train.joblib")
joblib.dump(value=y_test, filename="../data/y_test.joblib")


['../data/y_test.joblib']

In [49]:
import os

# Bundle what the manual preprocessing above produced (column lists, imputation
# medians, scaling statistics, category labels and the final column order).
preprocess_artifacts = dict(
    numerical_cols=[*numerical_cols],
    num_medians=num_medians,  # pandas Series
    num_means=num_means,  # pandas Series
    num_stds=num_stds,  # pandas Series
    train_categories=[*train_categories],
    final_columns=[*X_train_clean.columns],
)

# Create the output directory if needed, then write the bundle to disk.
os.makedirs("../models", exist_ok=True)
joblib.dump(preprocess_artifacts, "../models/preprocess_artifacts.joblib")


['../models/preprocess_artifacts.joblib']