In [3]:
# Initial Data Analysis (IDA)

## 1. Data Loading
import os
import tarfile
import urllib.request

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Remote source of the housing archive and the local directory it is cached in
DOWNLOAD_ROOT = "https://github.com/ageron/handson-ml2/raw/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"
HOUSING_PATH = os.path.join("datasets", "housing")

# Function to fetch and load data
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the gzipped tar archive to download.
        housing_path: Local directory that receives ``housing.tgz`` and the
            extracted members. It is created if it does not exist.

    The archive is downloaded on every call, replacing any earlier copy of
    ``housing.tgz`` in ``housing_path``.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    with tarfile.open(tgz_path) as housing_tgz:
        # The archive comes from the network, so refuse members that would
        # escape housing_path (absolute paths, "..", unsafe links). Calling
        # extractall() without a filter is also deprecated from Python 3.12.
        # Feature-detect so interpreters without extraction filters still work.
        if hasattr(tarfile, "data_filter"):
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame.

    Args:
        housing_path: Directory containing ``housing.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Download + load data
# Fetch the archive into the local dataset directory, then read the
# extracted CSV and preview its first rows.
fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH)
housing = load_housing_data(housing_path=HOUSING_PATH)
housing.head()

  housing_tgz.extractall(path=housing_path)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# Column dtypes and non-null counts; info() prints straight to stdout.
housing.info()
# Only the final expression of a cell is rendered automatically, so the
# summary statistics have to be displayed explicitly or they are discarded.
display(housing.describe())
# Frequency of each category in the ocean_proximity column.
housing["ocean_proximity"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [7]:
# Create income category attribute
# Bucket median_income into five ordered bands; the bands are used only as
# the stratification key for the split below.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5]
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1 produces exactly one (train, test) pair, so take it directly
# instead of looping.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields positional row indices, so select with .iloc. Using .loc
# would only be correct while the frame keeps its default RangeIndex.
# The helper column is dropped from both subsets once it has served its purpose.
strat_train_set = housing.iloc[train_index].drop("income_cat", axis=1)
strat_test_set = housing.iloc[test_index].drop("income_cat", axis=1)

strat_train_set.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,82700.0,INLAND
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN
20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN


In [17]:
## 4. Save Training and Testing Sets

def add_ratio_features(df):
    """Return a copy of ``df`` with three per-row ratio columns appended.

    Args:
        df: DataFrame containing ``total_rooms``, ``total_bedrooms``,
            ``population`` and ``households`` columns.

    Returns:
        A new DataFrame with ``rooms_per_household``, ``bedrooms_per_room``
        and ``population_per_household`` added after the existing columns.
        ``df`` itself is not modified.
    """
    return df.assign(
        rooms_per_household=df["total_rooms"] / df["households"],
        bedrooms_per_room=df["total_bedrooms"] / df["total_rooms"],
        population_per_household=df["population"] / df["households"],
    )


# Define output directories
TRAIN_PATH = os.path.join("data", "train")
TEST_PATH = os.path.join("data", "test")

os.makedirs(TRAIN_PATH, exist_ok=True)
os.makedirs(TEST_PATH, exist_ok=True)

# --- Save RAW training set (no engineered columns) ---
# index=False: the original row labels are not written to the CSV.
raw_train_file = os.path.join(TRAIN_PATH, "housing_train.csv")
strat_train_set.to_csv(raw_train_file, index=False)

# --- Save RAW testing set (no engineered columns) ---
raw_test_file = os.path.join(TEST_PATH, "housing_test.csv")
strat_test_set.to_csv(raw_test_file, index=False)

# --- Feature engineering for PROCESSED training set (raw columns + 3 ratios) ---
# Kept in a helper so the identical transform can be applied to other
# subsets later.
housing_train_processed = add_ratio_features(strat_train_set)

# Save processed training set
processed_train_file = os.path.join(TRAIN_PATH, "housing_train_processed.csv")
housing_train_processed.to_csv(processed_train_file, index=False)

# Report the actual column counts rather than hard-coding them in comments.
print("✅ Data saved:")
print(f"   - Raw training set: {raw_train_file} ({strat_train_set.shape[1]} columns)")
print(f"   - Processed training set: {processed_train_file} ({housing_train_processed.shape[1]} columns)")
print(f"   - Raw testing set: {raw_test_file} ({strat_test_set.shape[1]} columns)")

✅ Data saved:
   - Raw training set: data/train/housing_train.csv (10 columns)
   - Processed training set: data/train/housing_train_processed.csv (13 columns)
   - Raw testing set: data/test/housing_test.csv (10 columns)
