# House Price Prediction Model Development

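This notebook trains a house price prediction model using Linear Regression on a synthetically generated dataset, evaluates it on a held-out test split, and saves the fitted model and scaler with pickle.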

## Features Used:
1. OverallQual - Overall material and finish quality
2. GrLivArea - Above grade living area square feet
3. TotalBsmtSF - Total square feet of basement area
4. GarageCars - Size of garage in car capacity
5. GarageArea - Size of garage in square feet
6. YearBuilt - Original construction date

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Build a synthetic dataset for demonstration (in production, load from CSV).
# The seed is fixed so every run of this cell produces the same rows.
np.random.seed(42)

n_samples = 1000

# Each feature is drawn independently and uniformly. randint's upper bound
# is exclusive, so e.g. OverallQual covers 1-10 and YearBuilt 1950-2023.
data = {
    'OverallQual': np.random.randint(1, 11, n_samples),  # Quality rating 1-10
    'GrLivArea': np.random.randint(600, 5000, n_samples),  # Living area in sq ft
    'TotalBsmtSF': np.random.randint(0, 3000, n_samples),  # Basement area in sq ft
    'GarageCars': np.random.randint(0, 5, n_samples),  # Number of cars
    'GarageArea': np.random.randint(0, 1200, n_samples),  # Garage area in sq ft
    'YearBuilt': np.random.randint(1950, 2024, n_samples),  # Year built
}

df = pd.DataFrame(data)

# Target variable (SalePrice): a linear combination of the features plus
# Gaussian noise, so a linear model is able to recover the relationship.
df['SalePrice'] = (
    50000 +  # Base price
    df['OverallQual'] * 20000 +  # Quality impact
    df['GrLivArea'] * 80 +  # Living area impact
    df['TotalBsmtSF'] * 30 +  # Basement impact
    df['GarageCars'] * 10000 +  # Garage cars impact
    df['GarageArea'] * 50 +  # Garage area impact
    (df['YearBuilt'] - 1950) * 500 +  # Newer homes are worth more
    np.random.normal(0, 20000, n_samples)  # Random noise
)

# Floor every price at the 50,000 base price so the noise term can never
# produce an implausibly low (or negative) value.
df['SalePrice'] = df['SalePrice'].clip(lower=50000)

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nStatistical summary:")
print(df.describe())

In [None]:
# Report how many entries are missing in each column.
missing_per_column = df.isnull().sum()
print("Missing values:")
print(missing_per_column)

# The synthetic data has no gaps, so nothing is imputed here.
# Real data usually needs a step such as median imputation, e.g.:
# df.fillna(df.median(), inplace=True)

In [None]:
# Input columns for the model, in the order they are fed to it.
feature_columns = [
    'OverallQual',
    'GrLivArea',
    'TotalBsmtSF',
    'GarageCars',
    'GarageArea',
    'YearBuilt',
]

# Separate the target column from the feature matrix.
y = df['SalePrice']
X = df[feature_columns]

print("Features shape:", X.shape)
print("Target shape:", y.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so the test rows never influence the
# statistics used for scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully")

In [None]:
# Fit an ordinary least-squares model on the scaled training features.
model = LinearRegression().fit(X_train_scaled, y_train)

print("Model trained successfully")

# One coefficient per input column, listed in feature_columns order. The
# model was fitted on scaled inputs, so each value is per unit of the scaled
# feature rather than per raw unit.
print("\nModel coefficients:")
for name, weight in zip(feature_columns, model.coef_):
    print(f"{name}: {weight:.2f}")
print(f"\nIntercept: {model.intercept_:.2f}")

In [None]:
# Predict on both splits so training and test error can be compared later.
y_test_pred = model.predict(X_test_scaled)
y_train_pred = model.predict(X_train_scaled)

print("Predictions completed")

In [None]:
# Evaluate the model on the training and testing splits.
def _score_and_report(heading, y_true, y_pred):
    """Compute, print and return the regression metrics for one data split.

    Args:
        heading: Line printed before the metrics (identifies the split).
        y_true: Observed target values.
        y_pred: Model predictions for the same rows.

    Returns:
        Tuple ``(mae, mse, rmse, r2)``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # back on the same scale as the target
    r2 = r2_score(y_true, y_pred)

    print(heading)
    print(f"MAE (Mean Absolute Error): ${mae:,.2f}")
    print(f"MSE (Mean Squared Error): ${mse:,.2f}")
    print(f"RMSE (Root Mean Squared Error): ${rmse:,.2f}")
    print(f"R² Score: {r2:.4f}")
    return mae, mse, rmse, r2


banner = "=" * 50
print(banner)
print("MODEL EVALUATION METRICS")
print(banner)

# Training set metrics
train_mae, train_mse, train_rmse, train_r2 = _score_and_report(
    "\nTraining Set Metrics:", y_train, y_train_pred
)

# Testing set metrics
test_mae, test_mse, test_rmse, test_r2 = _score_and_report(
    "\nTesting Set Metrics:", y_test, y_test_pred
)
print(banner)

In [None]:
# Bundle everything needed to make predictions later: the fitted model, the
# scaler that prepares its inputs, and the feature column order.
model_data = {'model': model, 'scaler': scaler, 'feature_columns': feature_columns}

# Serialize the bundle to disk as a single pickle file.
with open('house_price_model.pkl', 'wb') as model_file:
    pickle.dump(model_data, model_file)

print("Model saved successfully as 'house_price_model.pkl'")

In [None]:
# Round-trip check: reload the saved bundle and run one prediction through it.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# model files that come from a trusted source.
with open('house_price_model.pkl', 'rb') as f:
    loaded_model_data = pickle.load(f)

loaded_model = loaded_model_data['model']
loaded_scaler = loaded_model_data['scaler']
loaded_features = loaded_model_data['feature_columns']

print("Model loaded successfully")
print(f"Features: {loaded_features}")

# Single source of truth for the sample house: both the model input and the
# printed description below are derived from this dict, so they cannot drift
# apart when the sample is edited.
sample_values = {
    'OverallQual': 7,
    'GrLivArea': 2000,
    'TotalBsmtSF': 1200,
    'GarageCars': 2,
    'GarageArea': 500,
    'YearBuilt': 2010,
}

# Select the columns through the saved feature list so the frame's column set
# and order match what was stored with the model; a feature missing from the
# sample raises KeyError here rather than reaching the scaler.
sample_house = pd.DataFrame([sample_values])[loaded_features]

sample_scaled = loaded_scaler.transform(sample_house)
sample_prediction = loaded_model.predict(sample_scaled)

print("\nSample prediction for a house with:")
print(f"  Overall Quality: {sample_values['OverallQual']}")
print(f"  Living Area: {sample_values['GrLivArea']} sq ft")
print(f"  Basement Area: {sample_values['TotalBsmtSF']} sq ft")
print(f"  Garage Cars: {sample_values['GarageCars']}")
print(f"  Garage Area: {sample_values['GarageArea']} sq ft")
print(f"  Year Built: {sample_values['YearBuilt']}")
print(f"\nPredicted Price: ${sample_prediction[0]:,.2f}")