In [1]:
# splitting training data into 2 buckets (1 for inputs, 1 for price outputs) so they don't pollute each other during training
import os
import pandas #required to convert into a DataFrame

# Build the CSV location relative to the working directory and load it into a DataFrame.
PROJECT_FILE_PATH = os.path.join("datasets", "housing")
csv_path = os.path.join(PROJECT_FILE_PATH, "stratified_training_80.csv")
full_stratified_training80_data = pandas.read_csv(csv_path)

# Labels: an independent copy of the price column, so edits to it never write back into the full frame.
training80_prices_labels_df = full_stratified_training80_data["median_house_value"].copy()
# Inputs: every column except the price. drop() returns a new frame, so the full frame keeps its price column.
training80_inputs_features_df = full_stratified_training80_data.drop(columns=["median_house_value"])

In [2]:
'''
I wanted a pulse check to see which dimensions have missing values and how bad the relative damage is. If total_bedrooms had a lot of empty rows
and those empty rows were randomly distributed, it might make sense to remove the rows with empty data. However, if I had reason to believe the empties
showed up in a biased way (e.g. voters claiming to be undecided) AND the dimension wasn't a key feature for predicting prices, I might remove the entire
dimension. Given the percentages shown below (under 1% of rows, all in total_bedrooms) and the book's hint, I feel more comfortable now auto-filling blanks with the median bedroom count.
'''

# Count the blanks in every dimension once, then express that same tally as a share of all rows.
missing_values_counts = training80_inputs_features_df.isnull().sum()
missing_values_percentages = missing_values_counts.div(len(training80_inputs_features_df)).mul(100)

# make a spreadsheet through pandas dataframe: one row per dimension, counts and percentages side by side
missing_values_report = pandas.concat(
    [missing_values_counts, missing_values_percentages],
    axis=1,
    keys=['Missing Values', 'Percentage (%)'],
)

print(missing_values_report)

                       Missing Values  Percentage (%)
longitude                           0         0.00000
latitude                            0         0.00000
housing_median_age                  0         0.00000
total_rooms                         0         0.00000
total_bedrooms                    158         0.95688
population                          0         0.00000
households                          0         0.00000
median_income                       0         0.00000
ocean_proximity                     0         0.00000
concatenated_position               0         0.00000
position_hash                       0         0.00000


In [21]:
from sklearn.impute import SimpleImputer # so I don't have to fill-in blanks one dimension at a time manually

# strategy="median" fills each blank with the median of the column it sits in.
imputer_tool = SimpleImputer(strategy="median")

# Cloning the training DataFrame, excluding the non-numerical dimensions "ocean_proximity" and "concatenated_position"
# (the median strategy only works on numeric columns).
numerical_only_training80_df = training80_inputs_features_df.drop(
    columns=["ocean_proximity", "concatenated_position"]
)

# fit_transform learns the per-column medians and fills the blanks in one call. It returns a bare
# NumPy array, so the column names AND the row index are restored explicitly; without index= the
# rows would be renumbered 0..n-1 and could silently misalign with the price labels.
imputed_numerical_only_training80_df = pandas.DataFrame(
    imputer_tool.fit_transform(numerical_only_training80_df),
    columns=numerical_only_training80_df.columns,
    index=numerical_only_training80_df.index,
)

# Every count below should be 0 once the blanks have been filled.
no_missing_values_verification = imputed_numerical_only_training80_df.isnull().sum()
print(no_missing_values_verification)

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
position_hash         0
dtype: int64
