# Visual 2: Living Area vs. Price Relationship

This scatter plot explores how property size and amenities influence housing prices, showing that larger homes with better features and proximity to strong school districts tend to command higher prices.

In [None]:
import pandas as pd
import altair as alt
import numpy as np
from scipy import stats

# Select Altair's default data transformer and renderer explicitly.
# Tooltips are not configured here; they come from the `tooltip` encodings
# on the individual charts.
# NOTE(review): the default data transformer enforces Altair's row limit
# (5,000 rows unless changed) — confirm data.csv fits, or relax the limit.
alt.data_transformers.enable('default')
alt.renderers.enable('default')

In [None]:
# Read the housing dataset from the CSV that sits next to the notebook
df = pd.read_csv('data.csv')

# Quick sanity check: row count and the available column names
record_count = len(df)
column_names = list(df.columns)
print(f"Total records: {record_count}")
print(f"Columns: {column_names}")

In [None]:
# Build the plotting frame: keep only rows that have a living area, a price
# and a school rating, and whose living area and price are strictly positive.
required_cols = ['livingAreaSqFt', 'latestPrice', 'avgSchoolRating']
has_required = df[required_cols].notna().all(axis=1)
is_positive = (df['livingAreaSqFt'] > 0) & (df['latestPrice'] > 0)
viz_data = df.loc[has_required & is_positive].copy()

# Bucket the numeric school rating into four labelled bands
rating_edges = [0, 3, 5, 7, 10]
rating_labels = ['Below Average (0-3)', 'Average (3-5)', 'Good (5-7)', 'Excellent (7-10)']
viz_data['school_quality'] = pd.cut(
    viz_data['avgSchoolRating'],
    bins=rating_edges,
    labels=rating_labels,
    include_lowest=True,
)

# Amenity score = number of amenity flags that are set for the home.
# Flags may arrive as booleans or as 'TRUE'/'FALSE' strings; any other value
# counts as 0, and amenity columns absent from the data are skipped.
amenity_cols = ['hasGarage', 'hasCooling', 'hasHeating', 'hasSpa', 'hasView']
flag_to_int = {'TRUE': 1, 'FALSE': 0, True: 1, False: 0}
viz_data['amenity_score'] = 0
for col in [name for name in amenity_cols if name in viz_data.columns]:
    flags = viz_data[col].map(flag_to_int).fillna(0)
    viz_data['amenity_score'] = viz_data['amenity_score'] + flags

quality_counts = viz_data['school_quality'].value_counts().sort_index()
print(f"Filtered records: {len(viz_data)}")
print(f"School quality distribution:\n{quality_counts}")

In [None]:
# Fit price as a linear function of living area and derive R²
x = viz_data['livingAreaSqFt'].values
y = viz_data['latestPrice'].values

# Ordinary least-squares fit; unpack the result by attribute name
fit = stats.linregress(x, y)
slope, intercept = fit.slope, fit.intercept
r_value, p_value, std_err = fit.rvalue, fit.pvalue, fit.stderr
r_squared = r_value ** 2

# Two endpoints are enough to draw the fitted straight line
x_min, x_max = x.min(), x.max()
line_endpoints = [x_min, x_max]
regression_df = pd.DataFrame({
    'livingAreaSqFt': line_endpoints,
    'predicted_price': [slope * area + intercept for area in line_endpoints],
})

print(f"R² value: {r_squared:.4f}")
print(f"Slope: ${slope:.2f} per sq ft")

In [None]:
# Interval brush that drives the scatter highlighting and the linked histogram
brush = alt.selection_interval(name='brush')

# One fixed colour per school-quality category
school_colors = {
    'Below Average (0-3)': '#e74c3c',  # Red
    'Average (3-5)': '#f39c12',         # Orange
    'Good (5-7)': '#3498db',            # Blue
    'Excellent (7-10)': '#2ecc71'       # Green
}

# Colour used for points inside the brush; points outside it are greyed out
quality_color = alt.Color(
    'school_quality:N',
    title='School Quality Rating',
    scale=alt.Scale(
        domain=list(school_colors),
        range=list(school_colors.values()),
    ),
    legend=alt.Legend(orient='top-right'),
)

# (field shorthand, tooltip title, optional number format)
tooltip_specs = [
    ('streetAddress:N', 'Address', None),
    ('city:N', 'City', None),
    ('livingAreaSqFt:Q', 'Living Area (sq ft)', ','),
    ('latestPrice:Q', 'Price', '$,.0f'),
    ('numOfBedrooms:Q', 'Bedrooms', None),
    ('numOfBathrooms:Q', 'Bathrooms', None),
    ('avgSchoolRating:Q', 'School Rating', '.2f'),
    ('amenity_score:Q', 'Amenity Score', None),
    ('yearBuilt:Q', 'Year Built', None),
]
scatter_tooltips = [
    alt.Tooltip(field, title=label) if fmt is None
    else alt.Tooltip(field, title=label, format=fmt)
    for field, label, fmt in tooltip_specs
]

# Axes: neither scale is forced to start at zero
area_axis = alt.X(
    'livingAreaSqFt:Q',
    title='Living Area (sq ft)',
    scale=alt.Scale(zero=False),
)
price_axis = alt.Y(
    'latestPrice:Q',
    title='Property Price ($)',
    axis=alt.Axis(format='$,.0f'),
    scale=alt.Scale(zero=False),
)

# Scatter plot of living area against price, coloured by school quality
scatter = (
    alt.Chart(viz_data)
    .mark_circle(size=60, opacity=0.6)
    .encode(
        x=area_axis,
        y=price_axis,
        color=alt.condition(brush, quality_color, alt.value('lightgray')),
        tooltip=scatter_tooltips,
    )
    .properties(
        width=700,
        height=450,
        title='Living Area vs. Price Relationship (Brush to select homes)',
    )
    .add_params(brush)
)

# Dashed black line through the two fitted endpoints
line_style = dict(color='black', strokeDash=[5, 5], size=2)
regression_line = (
    alt.Chart(regression_df)
    .mark_line(**line_style)
    .encode(
        x=alt.X('livingAreaSqFt:Q'),
        y=alt.Y('predicted_price:Q'),
    )
)

# Single-row frame that places the R² label at the 5th-percentile living
# area and the 95th-percentile price
r2_label_df = pd.DataFrame({
    'x': [viz_data['livingAreaSqFt'].quantile(0.05)],
    'y': [viz_data['latestPrice'].quantile(0.95)],
    'text': [f'R² = {r_squared:.4f}'],
})
label_style = dict(
    align='left',
    baseline='top',
    fontSize=14,
    fontWeight='bold',
    dx=5,
)
r2_text = (
    alt.Chart(r2_label_df)
    .mark_text(**label_style)
    .encode(
        x=alt.X('x:Q'),
        y=alt.Y('y:Q'),
        text=alt.Text('text:N'),
    )
)

# Layer the points, the fitted line and the R² label into one chart
scatter_chart = scatter + regression_line + r2_text

# Linked histogram: price distribution of the homes inside the scatter brush.
#
# transform_filter(brush) restricts the rows *before* they are binned, so the
# bars always represent exactly the brushed homes and are drawn in one fixed
# colour. A brush-conditional colour would instead be tested against the
# aggregated bin rows, which do not carry the brushed fields, so it was
# redundant at best and could grey out the very bars that are selected.
#
# The x axis and the tooltip share one bin definition so the "Price Range"
# shown on hover matches the bar under the cursor.
price_bins = alt.Bin(maxbins=30)

histogram = alt.Chart(viz_data).mark_bar().encode(
    x=alt.X('latestPrice:Q',
            bin=price_bins,
            title='Property Price ($)',
            axis=alt.Axis(format='$,.0f')),
    y=alt.Y('count():Q',
            title='Number of Homes'),
    color=alt.value('#3498db'),
    tooltip=[
        alt.Tooltip('count()', title='Count'),
        alt.Tooltip('latestPrice:Q', bin=price_bins, title='Price Range', format='$,.0f')
    ]
).properties(
    width=700,
    height=150,
    title='Price Distribution (filtered by brush selection above)'
).transform_filter(brush)

# Stack the annotated scatter plot above the linked histogram
stacked = alt.vconcat(scatter_chart, histogram)

# Apply shared typography for axes, titles and the legend, one step at a time
final_chart = stacked.configure_axis(labelFontSize=11, titleFontSize=13)
final_chart = final_chart.configure_title(fontSize=15, anchor='start')
final_chart = final_chart.configure_legend(titleFontSize=12, labelFontSize=11)

# Bare expression so the notebook renders the chart as the cell output
final_chart

## Key Insights

### Design & Interactivity
- **Scatter Plot**: Shows the relationship between living area (x-axis) and property price (y-axis)
- **Color Coding**: Properties are colored by school quality rating categories
- **Hover Tooltips**: Reveal detailed information including bedrooms, bathrooms, amenities, and more
- **Brush Selection**: Click and drag on the scatter plot to select a subset of homes
- **Linked Histogram**: Automatically updates to show price distribution of selected homes

### Visual Elements
- **Regression Line**: Black dashed line shows the least-squares linear trend of price against living area
- **R² Value**: Displayed in the upper left; it gives the share of price variance explained by living area alone
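- **Color Palette**: Uses distinct categorical colors (red, orange, blue, green) for school quality; note that the red and green categories can be hard to tell apart for viewers with red-green color vision deficiency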
- **Legend**: Positioned in the upper right corner for easy reference

### Observations
- Larger homes generally command higher prices (positive correlation)
- Homes near excellent schools (green dots) often have premium pricing
- The brush selection tool enables detailed exploration of specific market segments
- The histogram shows the price distribution of the brushed subset only, so brushing different regions in turn lets you compare price distributions between market segments