In [None]:
#Step 1: Understand the Data
Look at your dataset and identify columns.
Identify target variable (what you want to predict) → price.
Identify features (variables that may influence price) → e.g., bedrooms, bathrooms, sqft_living, floors, yr_built, etc.

Step 2: Clean the Data
Check for missing values or errors and decide how to handle them (remove rows, fill with average, etc.).
Ensure all selected features are in numerical format.
Convert dates or categorical variables to numbers if needed.

Step 3: Select Features
Choose features that are likely to impact the house price.
Exclude irrelevant information like street, statezip, or country (unless you plan to encode them).

Step 4: Split Data
Divide the dataset into training data (to train the model) and testing data (to evaluate performance).
Usually, 70–80% for training and 20–30% for testing.

Step 5: Train Linear Regression Model
Fit a linear regression model using the training data.
The model finds the relationship between features and the price.

Step 6: Make Predictions
Use the trained model to predict prices on the testing data.
This helps you see how well the model generalizes to new data.

Step 7: Evaluate the Model
Check how accurate the predictions are:
Mean Squared Error (MSE) → average error size
R² score → how much variance in price is explained by your model
Higher R² and lower MSE indicate a better model.

Step 8: Interpret Results
Look at the coefficients for each feature.
Positive → increases price
Negative → decreases price
#This tells you which features matter most in determining house price.

In [23]:
import kagglehub
import os
import pandas as pd

# Download latest version
path = kagglehub.dataset_download("shree1992/housedata")

print("Path to dataset files:", path)
file_path = os.path.join(path,'data.csv')
df = pd.read_csv(file_path) 
df

Path to dataset files: C:\Users\USER PC\.cache\kagglehub\datasets\shree1992\housedata\versions\2


Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,3.130000e+05,3.0,1.50,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2.384000e+06,5.0,2.50,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,3.420000e+05,3.0,2.00,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,4.200000e+05,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,5.500000e+05,4.0,2.50,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,2014-07-09 00:00:00,3.081667e+05,3.0,1.75,1510,6360,1.0,0,0,4,1510,0,1954,1979,501 N 143rd St,Seattle,WA 98133,USA
4596,2014-07-09 00:00:00,5.343333e+05,3.0,2.50,1460,7573,2.0,0,0,3,1460,0,1983,2009,14855 SE 10th Pl,Bellevue,WA 98007,USA
4597,2014-07-09 00:00:00,4.169042e+05,3.0,2.50,3010,7014,2.0,0,0,3,3010,0,2009,0,759 Ilwaco Pl NE,Renton,WA 98059,USA
4598,2014-07-10 00:00:00,2.034000e+05,4.0,2.00,2090,6630,1.0,0,0,3,1070,1020,1974,0,5148 S Creston St,Seattle,WA 98178,USA


In [3]:
df.isnull().sum()

date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64

In [4]:
df.columns

Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city',
       'statezip', 'country'],
      dtype='object')

In [5]:
df.shape

(4600, 18)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [25]:
columns_to_drop = ['street', 'date', 'city', 'statezip','country']
df.drop(columns=columns_to_drop, axis=1, inplace=True)

print(f"Columns remaining: {df.shape[1]}")

Columns remaining: 13


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
0,3.130000e+05,3.0,1.50,1340,7912,1.5,0,0,3,1340,0,1955,2005
1,2.384000e+06,5.0,2.50,3650,9050,2.0,0,4,5,3370,280,1921,0
2,3.420000e+05,3.0,2.00,1930,11947,1.0,0,0,4,1930,0,1966,0
3,4.200000e+05,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0
4,5.500000e+05,4.0,2.50,1940,10500,1.0,0,0,4,1140,800,1976,1992
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,3.081667e+05,3.0,1.75,1510,6360,1.0,0,0,4,1510,0,1954,1979
4596,5.343333e+05,3.0,2.50,1460,7573,2.0,0,0,3,1460,0,1983,2009
4597,4.169042e+05,3.0,2.50,3010,7014,2.0,0,0,3,3010,0,2009,0
4598,2.034000e+05,4.0,2.00,2090,6630,1.0,0,0,3,1070,1020,1974,0


In [27]:
correlation_matrix = df.corr()
price_correlations = correlation_matrix['price'].sort_values(ascending=False)

print("--- Top 18 Features Correlated with Price ---")
print(price_correlations.head(13))

--- Top 18 Features Correlated with Price ---
price            1.000000
sqft_living      0.430410
sqft_above       0.367570
bathrooms        0.327110
view             0.228504
sqft_basement    0.210427
bedrooms         0.200336
floors           0.151461
waterfront       0.135648
sqft_lot         0.050451
condition        0.034915
yr_built         0.021857
yr_renovated    -0.028774
Name: price, dtype: float64


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

# --- 0. Define Features (Updated based on your correlation list) ---

# Features identified as numerical (to be scaled)
numerical_features = [
    'sqft_living', 'sqft_above', 'bathrooms', 'sqft_basement', 
    'bedrooms', 'floors', 'sqft_lot', 'yr_built', 'yr_renovated'
] 

# Features identified as categorical (to be One-Hot Encoded)
categorical_features = ['view', 'waterfront', 'condition']

target_variable = 'price'

# Assuming 'df' is your full, clean DataFrame:
X = df.drop(target_variable, axis=1)
y = df[target_variable] 

# --- 1. Define the Preprocessing Pipeline ---

# The ColumnTransformer handles all data prep steps (including dropping unused columns)
preprocessor = ColumnTransformer(
    transformers=[
        # 1. Numerical Features: Apply Standard Scaling
        ('num', StandardScaler(), numerical_features),
        # 2. Categorical Features: Apply One-Hot Encoding
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='drop' # Drops all columns not listed in the feature lists
)

# --- 2. Create the Full Machine Learning Pipeline (Step 5: Train Model) ---

full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# --- Step 4: Split Data ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training data size: {X_train.shape[0]} rows")
print(f"Testing data size: {X_test.shape[0]} rows")
print("-" * 30)

# --- Step 5 & 6: Train Model and Make Predictions ---
full_pipeline.fit(X_train, y_train) 
y_pred = full_pipeline.predict(X_test)

print("Model Training and Prediction Complete.")
print("-" * 30)

# --- Step 7: Evaluate the Model ---
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("### Model Evaluation ###")
print(f"Mean Squared Error (MSE): ${mse:,.2f}")
print(f"R-squared (R²) Score: {r2:.4f}")
print("A higher R² and a lower MSE indicate better model performance.")
print("-" * 30)

# --- Step 8: Interpret Results (Coefficients) ---

regressor_model = full_pipeline['regressor']
# Get feature names after preprocessing
feature_names = full_pipeline['preprocessor'].get_feature_names_out()

coefficients_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': regressor_model.coef_
}).sort_values(by='Coefficient', ascending=False)


print("### Coefficient Interpretation (Feature Importance) ###")
print(coefficients_df.head(10))

print("\nInterpretation Notes:")
print(f"* Features like 'num__sqft_living' and encoded categorical features often have the highest absolute coefficients, indicating their large impact on price.")

Training data size: 3680 rows
Testing data size: 920 rows
------------------------------
Model Training and Prediction Complete.
------------------------------
### Model Evaluation ###
Mean Squared Error (MSE): $987,755,453,358.42
R-squared (R²) Score: 0.0315
A higher R² and a lower MSE indicate better model performance.
------------------------------
### Coefficient Interpretation (Feature Importance) ###
               Feature    Coefficient
15   cat__waterfront_1  212750.964687
0     num__sqft_living  136588.163215
1      num__sqft_above  120034.021497
10         cat__view_1  102170.187051
20    cat__condition_5   74217.395072
3   num__sqft_basement   59907.825612
12         cat__view_3   46189.982691
5          num__floors   36472.607967
2       num__bathrooms   27325.707327
19    cat__condition_4   16735.344637

Interpretation Notes:
* Features like 'num__sqft_living' and encoded categorical features often have the highest absolute coefficients, indicating their large impact on pr