In [1]:
'''
THIS NOTEBOOK'S QUESTION:
Why and how will we divide these 20K rows into TRAINING (80%) vs TESTING (20%)?

DATA LEAKAGE:
If we clean the data before splitting, the eventual TESTING data will inform/bias the TRAINING data, likely making the results too clean
'''

"\nTHIS NOTEBOOK'S QUESTION:\nWhy and how will we divide these 20K rows into TRAINING (80%) vs TESTING (20%)?\n\nDATA LEAKAGE:\nIf we clean the data before splitting, the eventual TESTING data will inform/bias the TRAINING data, likely making the results too clean\n"

In [None]:
import os
import pandas #required to convert into a DataFrame

# Folder that holds the housing data files, relative to the working directory.
PROJECT_FILE_PATH = os.path.join("datasets", "housing")

# Read the raw housing CSV into a DataFrame; the path stays in its own variable
# so the source location remains visible.
csv_path = os.path.join(PROJECT_FILE_PATH, "housing.csv")
housing_dataframe = pandas.read_csv(filepath_or_buffer=csv_path)

In [None]:
# To split the dataset in an unbiased way we want randomized-looking hash values,
# but they must come from a consistent source so they do not change between runs.
# Each row's longitude/latitude pair is used as that STABLE source.

longitude_text = housing_dataframe["longitude"].astype(str)
latitude_text = housing_dataframe["latitude"].astype(str)

# Join the two coordinates into a single "<longitude>_<latitude>" string per row.
housing_dataframe["concatenated_position"] = longitude_text + "_" + latitude_text

In [None]:
# Derive an integer hash for every row from its "concatenated_position" text.
from zlib import crc32


def _crc32_of_text(position_text):
    """Return the CRC-32 checksum of the encoded form of position_text."""
    return crc32(position_text.encode())


housing_dataframe["position_hash"] = housing_dataframe["concatenated_position"].apply(_crc32_of_text)

In [None]:
# Hold out 20% of the rows for testing and keep the remaining 80% for training.
# NOTE(review): this call is scikit-learn's seeded random split; it does not read
# the "position_hash" column, which simply travels along as a regular column.
from sklearn.model_selection import train_test_split

train_dataframe, test_dataframe = train_test_split(
    housing_dataframe,
    random_state=42,  # fixed seed so every run produces the same partition
    test_size=0.2,
)
train_dataframe.head()  # preview the first few training rows

In [None]:
# Persist the random 80/20 split as CSV files so other notebooks can load them.
train_csv_path = os.path.join(PROJECT_FILE_PATH, "training80.csv")
test_csv_path = os.path.join(PROJECT_FILE_PATH, "testing20.csv")

train_dataframe.to_csv(train_csv_path, index=False)
print(f"Training data saved to {train_csv_path}")

# The testing file must receive the held-out rows (test_dataframe). Writing
# train_dataframe here would turn testing20.csv into a copy of the training data.
test_dataframe.to_csv(test_csv_path, index=False)
print(f"Testing data saved to {test_csv_path}")

# Report the actual sizes so the 80% / 20% proportions can be checked by eye.
print(f"Training rows: {len(train_dataframe)}, testing rows: {len(test_dataframe)}")

# What to check in the output above:
# 1. the training set has 80% of the rows (16,512 per the original note)
# 2. the testing set has 20% of the rows (4,128 per the original note)
# 3. both sets were saved as CSVs in the correct folder
#
# The downside of a purely random split is that nothing guarantees the two parts
# carry the same proportions of an important attribute such as median income.

train_dataframe.describe()  # write down the mean longitude here, and compare it to what you see later in the stratified sample

In [None]:
# Stratified sampling, step 1: band the rows by median income.
import numpy
import pandas

# Band edges and the label given to each band; the last band is open-ended.
income_band_edges = [0., 1.5, 3.0, 4.5, 6., numpy.inf]
income_band_labels = [1, 2, 3, 4, 5]

# Store the band of every row in a new "income_cat" column.
housing_dataframe["income_cat"] = pandas.cut(
    housing_dataframe["median_income"],
    labels=income_band_labels,
    bins=income_band_edges,
)

# Histogram of how many rows fall into each band.
housing_dataframe["income_cat"].hist()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single shuffled split (n_splits=1) that holds out 20% of the rows
# (test_size=0.2 is an ordinary keyword argument) with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# With n_splits=1 the generator yields exactly one (train, test) pair.
train_index, test_index = next(
    split.split(housing_dataframe, housing_dataframe["income_cat"])
)

# split() yields positional row numbers, so select with .iloc; label-based .loc
# only gives the same rows while the index is the default 0..n-1 range.
# .copy() makes both sets independent frames that are safe to modify later.
strat_train_set = housing_dataframe.iloc[train_index].copy()
strat_test_set = housing_dataframe.iloc[test_index].copy()

# The mean longitude will differ from the non-stratified (random) sample simply
# because different rows were selected; the income_cat proportions compared in
# the following cells are the actual check that stratification worked.
strat_train_set.describe()

In [None]:
# Share of the training rows that falls into each income band.
strat_train_set["income_cat"].value_counts().div(len(strat_train_set))

In [None]:
# Same per-band shares for the test rows. Eyeballing the two results, similar
# distributions in training vs. test data show that stratification worked.
strat_test_set["income_cat"].value_counts().div(len(strat_test_set))

In [None]:
# Even though .describe() does not show it, both sets still carry the extra
# "income_cat" column that was only needed as the stratification reference.
print(strat_train_set.columns)
print(strat_test_set.columns)

for set_ in (strat_train_set, strat_test_set):
    # Checking for the column keeps this cell safe to re-run after the column
    # is already gone, without a bare `except` that would hide unrelated errors.
    if "income_cat" in set_.columns:
        # In-place on purpose: the loop variable refers to the same object as
        # strat_train_set / strat_test_set, so those names see the change.
        set_.drop(columns="income_cat", inplace=True)
        print("\n wiped dimension: income_cat \n")

print(strat_train_set.columns)
print(strat_test_set.columns)

In [None]:
# Write both stratified sets to CSV (without the index column) so that other
# notebooks can import them.

# Training portion.
strat_train_csv_path = os.path.join(PROJECT_FILE_PATH, "stratified_training_80.csv")
strat_train_set.to_csv(path_or_buf=strat_train_csv_path, index=False)
print(f"Stratified training data saved to {strat_train_csv_path}")

# Testing portion.
strat_test_csv_path = os.path.join(PROJECT_FILE_PATH, "stratified_testing_20.csv")
strat_test_set.to_csv(path_or_buf=strat_test_csv_path, index=False)
print(f"Stratified testing data saved to {strat_test_csv_path}")