In [1]:
# splitting training data into 2 buckets (1 for inputs, 1 for price outputs) so they don't pollute each other during training
import os
import pandas #required to convert into a DataFrame

# Location of the stratified 80% training split, relative to the notebook's working directory.
PROJECT_FILE_PATH = os.path.join("datasets", "housing")
csv_path = os.path.join(PROJECT_FILE_PATH, "stratified_training_80.csv")

# Load the whole split once; the feature and label frames below are both derived from it.
full_stratified_training80_data = pandas.read_csv(csv_path)

# Labels: an independent copy of the target column, so later edits cannot write back into the loaded frame.
training80_prices_labels_df = full_stratified_training80_data.loc[:, "median_house_value"].copy()

# Features: every column except the target (drop returns a new frame; the loaded frame is untouched).
training80_inputs_features_df = full_stratified_training80_data.drop(columns=["median_house_value"])

In [2]:
'''
I wanted a pulse check to see which dimensions are missing values and how bad the relative damage is. If total_bedrooms had a lot of empty rows
and those empty rows were randomly distributed, it might make sense to remove the rows with empty data. However, if I had reason to believe the empties
showed up in a biased way (e.g. voters claiming to be undecided) AND the dimension wasn't a key feature for predicting prices, I might remove the entire
dimension. Given the percentages shown below (under 1% of rows affected) and the book's hint, I feel more comfortable now auto-filling blanks with the median bedroom count.
'''

# Per-column tally of blank cells, then the same tally expressed as a share of all rows.
missing_values_counts = training80_inputs_features_df.isnull().sum()
missing_values_percentages = (missing_values_counts / len(training80_inputs_features_df)) * 100

# Side-by-side report: one row per feature column, one column per statistic.
missing_values_report = pandas.DataFrame(
    {
        'Missing Values': missing_values_counts,
        'Percentage (%)': missing_values_percentages,
    }
)

print(missing_values_report)

                       Missing Values  Percentage (%)
longitude                           0         0.00000
latitude                            0         0.00000
housing_median_age                  0         0.00000
total_rooms                         0         0.00000
total_bedrooms                    158         0.95688
population                          0         0.00000
households                          0         0.00000
median_income                       0         0.00000
ocean_proximity                     0         0.00000
concatenated_position               0         0.00000
position_hash                       0         0.00000


In [3]:
from sklearn.impute import SimpleImputer # so I don't have to fill-in blanks one dimension at a time manually
# Median strategy: blanks in a column are replaced with the median learned from that column.
imputer_tool = SimpleImputer(strategy="median")

# Copy of the features minus "ocean_proximity", "concatenated_position" and "position_hash";
# what remains are the columns handed to the median imputer.
numerical_only_training80_df = training80_inputs_features_df.drop(
    columns=["ocean_proximity", "concatenated_position", "position_hash"]
)

# Learn the medians and fill the blanks in one call (fit_transform == fit, then transform, on the same data),
# then rewrap the result in a DataFrame that carries the original column names.
imputed_numerical_only_training80_df = pandas.DataFrame(
    imputer_tool.fit_transform(numerical_only_training80_df),
    columns=numerical_only_training80_df.columns,
)

no_missing_values_verification = imputed_numerical_only_training80_df.isnull().sum()
print(no_missing_values_verification)  # all zeros => blanks were filled; next up is encoding the non-numerical OCEAN_PROXIMITY

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
dtype: int64


In [4]:
from sklearn.preprocessing import OneHotEncoder

# Only "ocean_proximity" gets encoded: the other two non-numeric dimensions existed purely for ordering/selection and are discarded.
# The list selector keeps this a one-column DataFrame. No unique identifier is carried along because row ordering is preserved between DFs.
ocean_proximity_df = training80_inputs_features_df.loc[:, ["ocean_proximity"]]

encoder_tool_instance = OneHotEncoder()
ocean_proximity_encoded = encoder_tool_instance.fit_transform(ocean_proximity_df)

# Densify the encoder output and give each indicator column a descriptive "ocean_proximity_<category>" name.
ocean_proximity_encoded_df = pandas.DataFrame(
    ocean_proximity_encoded.toarray(),
    columns=encoder_tool_instance.get_feature_names_out(["ocean_proximity"]),
)

print(ocean_proximity_encoded_df.head())

   ocean_proximity_<1H OCEAN  ocean_proximity_INLAND  ocean_proximity_ISLAND  \
0                        0.0                     1.0                     0.0   
1                        0.0                     0.0                     0.0   
2                        0.0                     1.0                     0.0   
3                        0.0                     0.0                     0.0   
4                        1.0                     0.0                     0.0   

   ocean_proximity_NEAR BAY  ocean_proximity_NEAR OCEAN  
0                       0.0                         0.0  
1                       0.0                         1.0  
2                       0.0                         0.0  
3                       0.0                         1.0  
4                       0.0                         0.0  


In [5]:
# Stitch the imputed numeric columns and the one-hot ocean_proximity columns back together, side by side.
final_training80_df = pandas.concat(
    [imputed_numerical_only_training80_df, ocean_proximity_encoded_df],
    axis="columns",
)
final_training80_df.head()  # proof that 5 indicator columns replaced the single ocean proximity column

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,0.0,1.0,0.0,0.0,0.0
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,0.0,0.0,0.0,0.0,1.0
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,0.0,1.0,0.0,0.0,0.0
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,0.0,0.0,0.0,0.0,1.0
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,1.0,0.0,0.0,0.0,0.0


In [10]:
#An estimator leverages existing data to make predictions, and sci-kit has premade estimators
from sklearn.base import BaseEstimator # this is the template for making fresh estimators, which might include estimator type (regressors) or classes
from sklearn.base import TransformerMixin # a blueprint for making a new transformer that will learn from data and then modify info
import numpy

# Column positions read by CombinedAttributesAdder.transform. They follow the column order of
# final_training80_df: total_rooms=3, total_bedrooms=4, population=5, households=6.
rooms_index = 3 #yes, I'm aware I could have done this in one single line like the textbook
bedrooms_index = 4
population_index = 5
households_index = 6

# Define the custom transformer class
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends per-household and per-room ratio columns to a feature matrix.

    Columns are located by position through the module-level ``rooms_index``,
    ``bedrooms_index``, ``population_index`` and ``households_index`` values, so the
    input must keep those quantities at those positions.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a third column (bedrooms / rooms) is appended after the two
        per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room=True):  # Initialize with a hyperparameter
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` so calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` may be a 2-D array or anything ``numpy.asarray`` can turn into one
        (e.g. a DataFrame), so callers are not forced to pass ``.values`` themselves.
        ``y`` is accepted and ignored.

        Appended columns, in order: rooms per household, population per household,
        and -- only when ``add_bedrooms_per_room`` is true -- bedrooms per room.
        """
        # Positional ``X[:, i]`` slicing is not valid on a DataFrame, so coerce first.
        # For an ndarray input this is a no-op and the result matches the previous behavior.
        features = numpy.asarray(X)
        rooms = features[:, rooms_index]
        households = features[:, households_index]

        rooms_per_household = rooms / households
        population_per_household = features[:, population_index] / households
        if self.add_bedrooms_per_room:
            bedrooms_per_room = features[:, bedrooms_index] / rooms
            return numpy.c_[features, rooms_per_household, population_per_household, bedrooms_per_room]
        return numpy.c_[features, rooms_per_household, population_per_household]

#At the end of the EDA notebook, we discovered that bedrooms:total rooms ratio offered moderately predictive (inverse) value for finding house prices 
#For this reason, I am overriding the suggestion in the textbook to disable "add_bedrooms_per_room"
#attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)

# Run the transformer, with its defaults, over the raw array behind final_training80_df.
# No fit() call is needed first: this transformer's fit learns nothing from the data.
housing_with_extra_attribs = CombinedAttributesAdder().transform(final_training80_df.to_numpy())

# Wrap the widened array back into a DataFrame, naming the three appended ratio columns.
housing_extra_attribs_df = pandas.DataFrame(
    housing_with_extra_attribs,
    columns=[
        *final_training80_df.columns,
        "rooms_per_household",
        "population_per_household",
        "bedrooms_per_rooms",
    ],
)

print(housing_extra_attribs_df.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -121.46     38.52                29.0       3873.0           797.0   
1    -117.23     33.09                 7.0       5320.0           855.0   
2    -119.04     35.37                44.0       1618.0           310.0   
3    -117.13     32.75                24.0       1877.0           519.0   
4    -118.70     34.28                27.0       3536.0           646.0   

   population  households  median_income  ocean_proximity_<1H OCEAN  \
0      2237.0       706.0         2.1736                        0.0   
1      2015.0       768.0         6.3373                        0.0   
2       667.0       300.0         2.8750                        0.0   
3       898.0       483.0         2.2264                        0.0   
4      1837.0       580.0         4.4964                        1.0   

   ocean_proximity_INLAND  ocean_proximity_ISLAND  ocean_proximity_NEAR BAY  \
0                     1.0                  