# Zillow Housing Analysis - Interactive Notebook

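This notebook demonstrates how to:
1. Fetch property listings and market trends from the Zillow API
2. Explore the raw data
3. Clean the data and engineer features for modeling
4. Visualize price distributions, correlations, and feature relationships
5. Train a machine learning model for property valuation
6. Evaluate the model and predict property values
7. Evaluate individual properties
8. Save the trained model and the analyzed results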

In [None]:
# Import required libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import our custom modules
from zillow_analysis import ZillowAPIClient, PropertyValuationModel
from zillow_analysis.utils import (
    clean_property_data,
    calculate_derived_features,
    plot_price_distribution,
    plot_correlation_matrix,
    plot_price_vs_features,
    plot_feature_importance,
    plot_prediction_analysis,
    create_summary_dashboard
)

# Display settings
# Show every column when a DataFrame is rendered (None removes the column limit)
pd.set_option('display.max_columns', None)
# Render at most 100 rows before pandas truncates the output
pd.set_option('display.max_rows', 100)
# IPython magic: draw matplotlib figures inline in the notebook's cell output
%matplotlib inline

## 1. Data Collection

Fetch for-sale property listings and market trends from the Zillow API.

In [None]:
# Target market for the search -- change this to your desired location
location = "New York, NY"

# Create the API client that every later request in this notebook goes through
client = ZillowAPIClient()

# Query for-sale listings in that market, capped at 100 results
search_params = dict(location=location, status="forSale", max_results=100)
properties = client.search_properties(**search_params)

print(f"Found {len(properties)} properties in {location}")

In [None]:
# Ask the client for market-level trends covering the same location as the search
market_data = client.get_market_trends(location)

# One indented "name: value" line per entry, printed beneath a heading
trend_lines = [f"  {name}: {stat}" for name, stat in market_data.items()]
print("Market Trends:", *trend_lines, sep="\n")

## 2. Data Exploration

Explore and understand the property data

In [None]:
# Load the raw search results into a DataFrame
df = pd.DataFrame(properties)
column_names = df.columns.tolist()

# Quick structural overview: dimensions first, then the available columns
print(f"Dataset shape: {df.shape}")
print("\nColumn names:", column_names, sep="\n")

# Preview the first five rows as the cell's rendered output
df.head(5)

In [None]:
# Summary statistics
# describe() is called with its defaults on the raw (not yet cleaned) frame; as the cell's
# last expression, the resulting table is rendered as the cell output
df.describe()

In [None]:
# Count the null entries in each column of the raw frame
missing_data = df.isna().sum()

# Show only the columns that actually have gaps, worst offenders first
has_gaps = missing_data > 0
missing_data.loc[has_gaps].sort_values(ascending=False)

## 3. Data Cleaning and Feature Engineering

In [None]:
# Clean the data (the helper is given the raw search results, not the exploratory `df`)
df_clean = clean_property_data(properties)

# Calculate derived features on top of the cleaned frame
df_clean = calculate_derived_features(df_clean)

print(f"Cleaned dataset shape: {df_clean.shape}")

# Columns present after cleaning/feature engineering that the raw frame did not have.
# Printed as a sorted list: a bare set of strings can print in a different order in
# every kernel session, which makes the notebook output needlessly unstable.
new_features = sorted(set(df_clean.columns) - set(df.columns), key=str)
print(f"New features added: {new_features}")

## 4. Data Visualization

In [None]:
# Price distribution
# Drawn from the cleaned, feature-engineered frame by a helper imported from
# zillow_analysis.utils (presumably it reads the 'price' column -- confirm in the helper)
plot_price_distribution(df_clean)

In [None]:
# Correlation matrix
# Drawn from the cleaned, feature-engineered frame by a helper imported from
# zillow_analysis.utils; which columns it includes is decided inside the helper
plot_correlation_matrix(df_clean)

In [None]:
# Columns to chart against price; each name is expected to exist in df_clean
features_to_plot = [
    'living_area',
    'bedrooms',
    'bathrooms',
    'year_built',
    'lot_size',
]

# Hand the cleaned frame and the chosen columns to the plotting helper
plot_price_vs_features(df_clean, features_to_plot)

In [None]:
# Summary dashboard
# Built from the cleaned, feature-engineered frame by a helper imported from
# zillow_analysis.utils; the dashboard's contents are defined inside the helper
create_summary_dashboard(df_clean)

## 5. Model Training

Train machine learning models for property valuation

In [None]:
# Initialize the model
model = PropertyValuationModel(model_type="xgboost")

# Prepare data for training.
# The evaluation section later adds prediction columns to df_clean in place. Strip them
# here (a no-op on a fresh run) so that re-running this cell never feeds the model's own
# earlier predictions back in as training features.
prediction_columns = ['predicted_price', 'price_difference', 'percent_difference']
properties_list = (
    df_clean
    .drop(columns=prediction_columns, errors='ignore')
    .to_dict('records')
)

# Train the model; validate=True is passed straight through to the model's train() method
metrics = model.train(properties_list, validate=True)

# Display metrics: names containing 'r2' are shown as 4-decimal numbers, names containing
# 'mae' or 'rmse' as whole-dollar amounts. Any other entry is left out of the report.
print("\nModel Performance Metrics:")
for metric, value in metrics.items():
    if 'r2' in metric:
        print(f"  {metric}: {value:.4f}")
    elif 'mae' in metric or 'rmse' in metric:
        print(f"  {metric}: ${value:,.0f}")

In [None]:
# Ask the trained model for its 15 highest-ranked features
importance = model.get_feature_importance(top_n=15)

# Text report first, then a chart of the same ranking
print("\nTop Features by Importance:", importance, sep="\n")
plot_feature_importance(importance)

## 6. Model Evaluation and Predictions

In [None]:
# Make predictions on all properties.
# Note: these are the same records the model was trained on, so the differences below
# describe in-sample fit rather than performance on unseen listings.
predictions = model.predict(properties_list)

# Add predictions to dataframe; price_difference is positive when the model's estimate
# is higher than the listed price
df_clean['predicted_price'] = predictions
df_clean['price_difference'] = df_clean['predicted_price'] - df_clean['price']

# A listed price of 0 would turn the percentage into +/-inf, which then passes the
# threshold filters in later cells. Mask zeros so those rows get NaN instead and are
# ignored by the comparisons.
nonzero_price = df_clean['price'].replace(0, np.nan)
df_clean['percent_difference'] = (df_clean['price_difference'] / nonzero_price) * 100

# Display results
df_clean[['address', 'price', 'predicted_price', 'price_difference', 'percent_difference']].head(10)

In [None]:
# Pull the listed and predicted prices out as plain arrays for the plotting helper
actual_prices = df_clean['price'].values
predicted_prices = df_clean['predicted_price'].values

# Prediction analysis visualization: actual values first, predictions second
plot_prediction_analysis(actual_prices, predicted_prices)

In [None]:
# Find potentially undervalued properties.
# percent_difference is (predicted_price - price) / price * 100, so a value above +10 means
# the model's estimate exceeds the listed price by more than 10% -- the listing looks cheap.
# Sorted so the largest gap in the buyer's favour comes first.
undervalued = df_clean[df_clean['percent_difference'] > 10].sort_values('percent_difference', ascending=False)

print(f"\nFound {len(undervalued)} potentially undervalued properties:")
print("\nTop 5 Undervalued Properties:")
print(undervalued[['address', 'city', 'price', 'predicted_price', 'percent_difference']].head())

In [None]:
# Find potentially overvalued properties.
# percent_difference is (predicted_price - price) / price * 100, so a value below -10 means
# the listed price exceeds the model's estimate by more than 10% -- the listing looks expensive.
# Sorted so the most overpriced listing (most negative percentage) comes first.
overvalued = df_clean[df_clean['percent_difference'] < -10].sort_values('percent_difference')

print(f"\nFound {len(overvalued)} potentially overvalued properties:")
print("\nTop 5 Overvalued Properties:")
print(overvalued[['address', 'city', 'price', 'predicted_price', 'percent_difference']].head())

## 7. Evaluate Specific Properties

In [None]:
# Evaluate a specific property (the first cleaned record is used as the example)
sample_property = properties_list[0]
evaluation = model.evaluate_property(sample_property)

print("Property Evaluation:")
print(f"Address: {sample_property.get('address', 'N/A')}")
print(f"City: {sample_property.get('city', 'N/A')}, {sample_property.get('state', 'N/A')}")

# NOTE(review): 'difference' is evidently optional, presumably because a record can lack
# an asking price. In that case formatting the price as a number would raise, so fall
# back to "N/A". Confirm against PropertyValuationModel.evaluate_property.
actual_price = evaluation['actual_price']
if actual_price is not None:
    print(f"\nAsking Price: ${actual_price:,.0f}")
else:
    print("\nAsking Price: N/A")
print(f"Model Predicted Value: ${evaluation['predicted_value']:,.0f}")

# Compare against None explicitly: a plain truthiness test would also hide a legitimate
# difference of exactly 0 (asking price equal to the model's prediction)
if evaluation['difference'] is not None:
    print(f"Difference: ${evaluation['difference']:,.0f}")
    print(f"Percent Difference: {evaluation['percent_difference']:.2f}%")
print(f"\nEvaluation: {evaluation['evaluation']}")

## 8. Save Results

In [None]:
# Output locations, relative to the directory the notebook is run from
model_path = 'zillow_analysis/data/valuation_model.pkl'
results_path = 'zillow_analysis/data/analyzed_properties.csv'

# Make sure the target directory exists first: DataFrame.to_csv does not create missing
# parent directories, so on a fresh checkout the export would otherwise fail
for target in (model_path, results_path):
    os.makedirs(os.path.dirname(target), exist_ok=True)

# Save the trained model
model.save_model(model_path)

# Export results (the cleaned frame, including any prediction columns added earlier)
df_clean.to_csv(results_path, index=False)

print("Model and results saved successfully!")

## 9. Next Steps

- Fine-tune model hyperparameters
- Add more features (neighborhood data, school ratings, etc.)
- Compare different model types
- Analyze specific neighborhoods or property types
- Track property prices over time