# Project: House Price Prediction Model
# Author: Anshubana
# Description: Data preprocessing, EDA, feature engineering, model training and evaluation for predicting house prices with experiment tracking and model registry integration


In [17]:
!pip install scikit-learn
!pip install pandas
!pip install mlflow




In [18]:
from sklearn.ensemble import RandomForestRegressor #ML Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error #how close your model’s predicted values
from sklearn.datasets import fetch_california_housing #California housing dataset

In [19]:
# Fetch the California housing dataset (built from the 1990 U.S. Census).
hosse_data = fetch_california_housing()

In [20]:
# Display the raw dataset object to inspect its fields
# (data, target, frame, target_names, feature_names, DESCR).
hosse_data

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]], shape=(20640, 8)),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,)),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': 

X: Features (Inputs):

    MedInc: Median income in block group

    HouseAge: Median house age in block group

    AveRooms: Average number of rooms

    AveBedrms: Average number of bedrooms

    Population: Block group population

    AveOccup: Average house occupancy

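    Latitude and Longitude: Geographic location of the block group

Target (Output):
    y: Median house value of the block group, in units of $100,000 (stored below as the "price" column)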

In [21]:
# Assemble one DataFrame: a column per feature (named after feature_names)
# plus the target values in an extra "price" column.
import pandas as pd

df = pd.DataFrame(
    data=hosse_data.data,
    columns=hosse_data.feature_names,
).assign(price=hosse_data.target)

df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [22]:

# Separate the dependent variable from the independent variables.
# y (output): the "price" column; X (input): every column except "price".
y = df["price"]
X = df.drop("price", axis="columns")


In [23]:
# Split the data into four pieces: train/test features and train/test targets.
# test_size=0.2 holds out 20% of the rows for testing; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X_train: input features used for training
# y_train: target values used for training



# X_test: input features used for testing
# y_test: target values used for testing

In [24]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
14196,3.2596,33.0,5.017657,1.006421,2300.0,3.691814,32.71,-117.03
8267,3.8125,49.0,4.473545,1.041005,1314.0,1.738095,33.77,-118.16
17445,4.1563,4.0,5.645833,0.985119,915.0,2.723214,34.66,-120.48
14265,1.9425,36.0,4.002817,1.033803,1418.0,3.994366,32.69,-117.11
2271,3.5542,43.0,6.268421,1.134211,874.0,2.300000,36.78,-119.80
...,...,...,...,...,...,...,...,...
11284,6.3700,35.0,6.129032,0.926267,658.0,3.032258,33.78,-117.96
11964,3.0500,33.0,6.868597,1.269488,1753.0,3.904232,34.02,-117.43
5390,2.9344,36.0,3.986717,1.079696,1756.0,3.332068,34.03,-118.38
860,5.7192,15.0,6.395349,1.067979,1777.0,3.178891,37.58,-121.96


In [25]:
# Inspect the training targets (the "price" column) produced by the split.
y_train

14196    1.030
8267     3.821
17445    1.726
14265    0.934
2271     0.965
         ...  
11284    2.292
11964    0.978
5390     2.221
860      2.835
15795    3.250
Name: price, Length: 16512, dtype: float64

In [30]:
# Define a helper that tunes Random Forest hyperparameters with a cross-validated grid search
from sklearn.model_selection import GridSearchCV


def hyperparameter_tuning(X_train, y_train, param_grid, random_state=42, cv=3,
                          scoring='neg_mean_squared_error'):
    """Grid-search Random Forest hyperparameters with cross-validation.

    Parameters
    ----------
    X_train : training features, passed unchanged to ``GridSearchCV.fit``.
    y_train : training targets, passed unchanged to ``GridSearchCV.fit``.
    param_grid : dict mapping a hyperparameter name to the list of values to try.
    random_state : seed given to the ``RandomForestRegressor`` so repeated
        searches build the same forests (default 42, matching the seed used
        for the train/test split in this notebook).
    cv : number of cross-validation folds (default 3).
    scoring : metric name handed to ``GridSearchCV`` (default negative mean
        squared error).

    Returns
    -------
    The ``GridSearchCV`` object after ``fit`` has been called on it.
    """
    # Seed the estimator: an unseeded forest makes the search non-reproducible.
    rf = RandomForestRegressor(random_state=random_state)
    gridSearch = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,   # run the fits in parallel
        verbose=2,   # detailed per-fit progress output
        scoring=scoring,
    )
    gridSearch.fit(X_train, y_train)
    return gridSearch

   


In [31]:
# Candidate values explored by the grid search (2 * 3 * 2 * 2 = 24 combinations).
param_grid = dict(
    n_estimators=[100, 200],     # number of trees in the forest
    max_depth=[5, 10, None],     # maximum depth of each tree
    min_samples_split=[2, 5],    # minimum samples required to split an internal node
    min_samples_leaf=[1, 2],     # minimum samples required at a leaf node
)


In [32]:
# Set the schema (shape / data types / structure) of the model's input and output.
#
# The signature is a way to define the model's expected input and output schema.
# It captures the types and shapes of inputs and outputs.
# This helps MLflow validate inputs at inference time (when a trained model is
# used to predict) to avoid errors.
from mlflow.models import infer_signature

# NOTE(review): the output schema is inferred from the training targets rather
# than from model predictions; confirm this matches what the logged model returns.
signature = infer_signature(X_train, y_train)

# Logging the model with this signature:
# MLflow treats the trained model as a file called an artifact.
# Logging saves the model files (e.g. pickle files, model binaries) into MLflow's
# artifact storage (local disk / S3).
# You can later download, load, or serve this model from MLflow.

In [39]:
# Tune, evaluate and log the model as a single MLflow run.
import mlflow
from urllib.parse import urlparse

# Select the tracking server and experiment BEFORE starting the run, so the run
# is created there instead of in whatever store/experiment was active earlier.
mlflow.set_tracking_uri("http://127.0.0.1:5000")  # URI of the MLflow tracking server
mlflow.set_experiment("House_Price_Prediction")   # experiment the run is recorded under

with mlflow.start_run():
    # Run the hyperparameter search and keep the best estimator it found.
    gridSearch = hyperparameter_tuning(X_train, y_train, param_grid)
    best_model = gridSearch.best_estimator_

    # Evaluate the best model on the held-out test set.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")

    # Log the winning hyperparameters under "best_<name>" keys, plus the test metric.
    for param_name in ("n_estimators", "max_depth", "min_samples_split", "min_samples_leaf"):
        mlflow.log_param(f"best_{param_name}", gridSearch.best_params_[param_name])
    mlflow.log_metric("mse", mse)

    # Register the model only when the tracking URI is not a plain file store;
    # otherwise just log it as a run artifact.
    tracking_scheme = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_scheme != "file":
        mlflow.sklearn.log_model(best_model, "model", registered_model_name="Best House Price Prediction Model", signature=signature)
    else:
        mlflow.sklearn.log_model(best_model, "model", signature=signature)

    # Printed only once params, metric and model have actually been logged.
    print("Model logged in MLflow")
                 
               


Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.8s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.9s
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.6s
[CV] END max_depth=5, min_samples_leaf=



Mean Squared Error: 0.2545952729927698
Model logged in MLflow


Registered model 'Best House Price Prediction Model' already exists. Creating a new version of this model...
2025/08/09 16:02:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best House Price Prediction Model, version 2


🏃 View run loud-sloth-95 at: http://127.0.0.1:5000/#/experiments/216075398956383281/runs/db69f1ee0e5f4b868e0d538eb9567301
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/216075398956383281


Created version '2' of model 'Best House Price Prediction Model'.
