In [3]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
PATH = '../preprocessing/md_encoded_categorical_subset20.csv'

In [7]:
md = pd.read_csv(PATH)
# Create a combined column for 'building type' and 'climate zone' to ensure even representation
md['building_climate_group'] = md['in.comstock_building_type_group'] + '_' + md['in.building_america_climate_zone']

group_counts = md['building_climate_group'].value_counts()
print(group_counts)
print(len(group_counts))

building_climate_group
Mercantile_Mixed-Humid               857
Mercantile_Cold                      780
Warehouse and Storage_Mixed-Humid    509
Office_Mixed-Humid                   504
Mercantile_Hot-Humid                 446
Warehouse and Storage_Cold           436
Office_Cold                          428
Warehouse and Storage_Hot-Humid      387
Office_Hot-Humid                     344
Mercantile_Hot-Dry                   237
Warehouse and Storage_Hot-Dry        221
Food Service_Cold                    168
Food Service_Mixed-Humid             141
Office_Hot-Dry                       136
Food Service_Hot-Humid               113
Education_Mixed-Humid                 91
Mercantile_Marine                     72
Education_Cold                        67
Warehouse and Storage_Marine          64
Food Service_Hot-Dry                  55
Office_Marine                         53
Lodging_Cold                          39
Education_Hot-Humid                   35
Lodging_Hot-Humid                 

In [8]:
def population_split_buildings(path):
    # Load metadata
    md = pd.read_csv(path)

    # Create a combined column for 'building type' and 'climate zone' to ensure even representation
    md['building_climate_group'] = md['in.comstock_building_type_group'] + '_' + md['in.building_america_climate_zone']

    # Count the number of buildings in each group
    group_counts = md['building_climate_group'].value_counts()
    target_total = int(0.2*md.shape[0])
    target_per_group = target_total // len(group_counts)

    # Separate small groups and large groups
    small_groups = group_counts[group_counts < target_per_group]
    large_groups = group_counts[group_counts >= target_per_group]

    # Include all buildings from small groups
    small_groups_sampled = md[md['building_climate_group'].isin(small_groups.index)]

    # Calculate remaining capacity
    remaining_capacity = target_total - small_groups_sampled.shape[0]

    # Dynamically sample from large groups
    large_groups_sampled = pd.DataFrame()

    while remaining_capacity > 0 and not large_groups.empty:
        # Number of groups remaining
        remaining_groups_count = len(large_groups)

        # Determine how many to sample per group
        sample_per_large_group = max(remaining_capacity // remaining_groups_count, 1)

        # Sample buildings
        new_samples = (
            md[md['building_climate_group'].isin(large_groups.index)]
            .groupby('building_climate_group', group_keys=False)
            .apply(lambda x: x.sample(n=min(sample_per_large_group, len(x)), random_state=42))
        )

        # Add sampled buildings to the final selection
        large_groups_sampled = pd.concat([large_groups_sampled, new_samples])

        # Update remaining capacity
        remaining_capacity -= new_samples.shape[0]

        # Update large_groups to exclude groups already fully sampled
        sampled_counts = new_samples['building_climate_group'].value_counts()
        fully_sampled_groups = sampled_counts[sampled_counts >= group_counts.loc[sampled_counts.index]]
        large_groups = large_groups.drop(fully_sampled_groups.index)

    # Combine the sampled DataFrames
    final_sampled = pd.concat([small_groups_sampled, large_groups_sampled])

    # Extract the building IDs and group by building type and climate zone
    train_ids = []
    test_ids = []

    # Group by 'building_climate_group' to ensure balanced distribution
    for _, group in final_sampled.groupby('building_climate_group'):
        bldg_ids = group['bldg_id']

        # Split 80% train, 20% test within each group
        if len(bldg_ids) > 1:
            train, test = train_test_split(bldg_ids, test_size=0.2, random_state=42)

        else:
            # If only one building in the group, add it to the train set
            train = bldg_ids
            test = []

        # Append '.csv' to each ID and add to the lists
        train_ids.extend([str(x) + '.csv' for x in train])
        test_ids.extend([str(x) + '.csv' for x in test])

        # # Append '.csv' to each ID and add to the lists
        # train_ids.extend(train.astype(str) + '.csv')
        # test_ids.extend(test.astype(str) + '.csv')

    # Convert to JSON format
    split_data = {
        "train_bldg_ids": train_ids,
        "test_bldg_ids": test_ids
    }

    # Save to JSON file
    with open('subset20_20_data.json', 'w') as json_file:
        json.dump(split_data, json_file, indent=4)

    print("Building IDs split with '.csv' appended and saved to subset20_20.json")


In [9]:
population_split_buildings(PATH)

Building IDs split with '.csv' appended and saved to subset20_20.json


  .apply(lambda x: x.sample(n=min(sample_per_large_group, len(x)), random_state=42))
  .apply(lambda x: x.sample(n=min(sample_per_large_group, len(x)), random_state=42))
  .apply(lambda x: x.sample(n=min(sample_per_large_group, len(x)), random_state=42))
