In [1]:
# Import the libraries needed for data manipulation and math
import pandas as pd
import numpy as np

# Function to normalize a vector to a 0–1 range
def normalize(vector):
    """Rescale ``vector`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    vector : array-like
        Values supporting elementwise arithmetic, e.g. a NumPy array or a
        pandas Series.

    Returns
    -------
    The result of ``(vector - min) / (max - min)``. When every value is the
    same (``max == min``) the result is all zeros instead of the NaNs that a
    0/0 division would produce.
    """
    lowest = np.min(vector)
    value_range = np.max(vector) - lowest
    # A constant vector has zero range; dividing by 1 keeps the already-zero
    # numerator at 0.0 rather than turning it into NaN. The ndim check leaves
    # any non-scalar reduction result untouched.
    if np.ndim(value_range) == 0 and value_range == 0:
        value_range = 1
    return (vector - lowest) / value_range

# Function to standardize a vector (convert to z-scores)
def standardize(vector):
    """Convert ``vector`` to z-scores using the sample standard deviation.

    Parameters
    ----------
    vector : array-like
        Values supporting elementwise arithmetic, e.g. a NumPy array or a
        pandas Series.

    Returns
    -------
    The result of ``(vector - mean) / std`` where ``std`` is computed with
    ``ddof=1``. When every value is the same (``std == 0``) the result is all
    zeros instead of the NaNs that a 0/0 division would produce.
    """
    centre = np.mean(vector)
    spread = np.std(vector, ddof=1)
    # A constant vector has zero spread; dividing by 1 keeps the already-zero
    # deviations at 0.0 rather than turning them into NaN. The ndim check
    # leaves any non-scalar reduction result untouched.
    if np.ndim(spread) == 0 and spread == 0:
        spread = 1
    return (vector - centre) / spread

# Read the housing CSV via a relative path, so the file has to be reachable
# from the notebook's working directory.
df = pd.read_csv("calif_housing_data.csv.csv")

# (a) The row count is simply the length of the frame.
num_rows = len(df)
print(f"(a) Number of rows: {num_rows}")

# (b) Keep a handle on the column we want to predict.
target_vector = df['median_house_value']
print("(b) Target variable: median_house_value")

# (c) Derive bedrooms-per-household and store it as a new column on df.
df['avg_bedrooms_per_household'] = df['total_bedrooms'].div(df['households'])
print("(c) Created new feature: avg_bedrooms_per_household")

# (d) Pull the three modelling features into their own DataFrame.
df_selected = df.loc[
    :,
    ['housing_median_age', 'median_income', 'avg_bedrooms_per_household'],
]
print("(d) Selected features added to new DataFrame.")

# (e) Apply the z-score helper to each column independently.
df_standardized = df_selected.apply(standardize, axis=0)
print("(e) Standardized the features.")

# Print a heading followed by the first five standardized rows.
print("\nPreview of standardized data:", df_standardized.head(5), sep="\n")


(a) Number of rows: 20640
(b) Target variable: median_house_value
(c) Created new feature: avg_bedrooms_per_household
(d) Selected features added to new DataFrame.
(e) Standardized the features.

Preview of standardized data:
   housing_median_age  median_income  avg_bedrooms_per_household
0            0.982119       2.344709                   -0.153859
1           -0.607004       2.332181                   -0.262930
2            1.856137       1.782656                   -0.049603
3            1.856137       0.932945                   -0.050416
4            1.856137      -0.012881                   -0.033567
