In [2]:
import re

# Handling Gross Living Area
def parse_gla(value: str):
    '''
        Extract the gross living area (GLA) from a free-form value as an integer.

        The first number found in the text is used. Thousands separators
        (",") are removed and any decimal part is truncated, so
        "1,500 SqFt" -> 1500 and "2,345.7 sq ft" -> 2345.

        Args:
            value: Raw GLA text. Non-string values are converted with str()
                before parsing.

        Returns:
            The GLA as an int, or None when the value is empty, is an
            "n/a" / "na" placeholder, or contains no number.
    '''
    if not value:
        return None

    text = str(value).strip()
    # `.lower()` must be *called*; comparing the bound method itself to the
    # placeholder list is always False.
    if text.lower() in ("n/a", "na", ""):
        return None

    # Match a whole number (digits with optional thousands separators and an
    # optional decimal part), not just a single character.
    match = re.search(r'\d[\d,]*(?:\.\d+)?|\.\d+', text)
    if match is None:
        return None

    # Go through float() so values with a decimal part do not break int().
    return int(float(match.group().replace(",", "")))

def parse_bath_count(value: str):
    '''
        Return the total bath count for a "full:half" value.

        Each half bath counts as 0.5, so "2:1" -> 2.5. A value without a
        ":" is treated as full baths only, so "3" -> 3.0.

        Args:
            value: Raw bath text in "full" or "full:half" form. Non-string
                values are converted with str() before parsing.

        Returns:
            The total as a float, or None when the value is empty, is an
            "n/a" / "na" placeholder, or its parts are not integers.
    '''
    if not value:
        return None

    text = str(value).strip()
    # `.lower()` must be *called*; comparing the bound method itself to the
    # placeholder list is always False.
    if text.lower() in ("n/a", "na", ""):
        return None

    # `split` is a method call; subscripting it (`split[":"]`) raises TypeError.
    parts = text.split(":")
    try:
        full = int(parts[0])
        half = int(parts[1]) if len(parts) > 1 else 0
    except ValueError:
        # A non-integer part means the value is not in the expected format.
        return None

    return full + (0.5 * half)
        
def normalize_string(value: str):
    '''
        Return a trimmed, lower-cased copy of the given text.

        Floats (which includes NaN) and falsy values such as None or ""
        yield None instead of a string.
    '''
    if isinstance(value, float) or not value:
        return None

    trimmed = value.strip()
    return trimmed.lower()



In [None]:
import pandas as pd

def appraisal_to_training_rows(appraisal):
    '''
        Build one training row per candidate property of an appraisal.

        Each row holds the subject's features (prefix: s_), the candidate's
        features (prefix: c_) and an `is_comp` label that is 1 when the
        candidate's normalized address matches a comp's address, else 0.

        Args:
            appraisal: Mapping with a required 'subject' mapping and optional
                'comps' and 'properties' lists of mappings. Every comp and
                every property must have an 'address' key.

        Returns:
            A pandas DataFrame with one row per entry in 'properties'
            (empty when there are no properties).

        Raises:
            KeyError: If 'subject' is missing, or a comp or property has no
                'address' key.
    '''
    subject = appraisal['subject']
    # `or []` also covers keys that are present but set to None.
    comps = appraisal.get('comps') or []
    properties = appraisal.get('properties') or []

    if not properties:
        return pd.DataFrame([])

    # Create a lookup to check if a property was used as a comp
    comp_addresses = {normalize_string(comp['address']) for comp in comps}
    # A comp with a missing address normalizes to None; drop it so that
    # candidates with a missing address are not labelled as comps.
    comp_addresses.discard(None)

    # Subject features (prefix: s_) are the same for every candidate, so
    # compute them once instead of once per row.
    subject_features = {
        's_gla': parse_gla(subject.get('gla')),
        's_bath_count': parse_bath_count(subject.get('num_baths')),
        's_bed_count': int(subject.get('num_beds') or 0),
        's_style': normalize_string(subject.get('style')),
        's_structure': normalize_string(subject.get('structure_type')),
        's_heating': normalize_string(subject.get('heating')),
        's_cooling': normalize_string(subject.get('cooling')),
        's_condition': normalize_string(subject.get('condition')),
    }

    rows = []
    for prop in properties:
        # Copy so each row gets its own dict; s_ columns stay first.
        row = dict(subject_features)

        # Candidate property features (prefix: c_)
        # NOTE(review): c_gla is stored as-is while s_gla goes through
        # parse_gla — presumably candidate GLA is already numeric; confirm.
        row['c_gla'] = prop.get('gla')
        # `or 0` treats both a missing key and an explicit None as zero.
        row['c_bath_count'] = (
            (prop.get('full_baths') or 0) +
            0.5 * (prop.get('half_baths') or 0)
        )
        row['c_bed_count'] = prop.get('bedrooms', 0)
        row['c_style'] = normalize_string(prop.get('style'))
        row['c_structure'] = normalize_string(prop.get('structure_type'))
        row['c_heating'] = normalize_string(prop.get('heating'))
        row['c_cooling'] = normalize_string(prop.get('cooling'))
        row['c_condition'] = normalize_string(prop.get('basement'))  # proxy if no direct condition

        # Match: is this candidate one of the comps?
        row['is_comp'] = 1 if normalize_string(prop['address']) in comp_addresses else 0

        rows.append(row)

    return pd.DataFrame(rows)
