# House Prices Modeling

In [1]:
import os

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### Load the dataset

In [2]:
# Locations of the two input CSV files.
# NOTE(review): these are absolute, machine-specific paths, so the notebook
# only runs where this exact directory layout exists.
TRAIN_CSV_PATH = 'E:/dsp_navaraja_mannepalli/data/train.csv'
TEST_CSV_PATH = 'E:/dsp_navaraja_mannepalli/data/test.csv'

# Read both files into DataFrames; train_df is later split into predictors and
# the SalePrice target, test_df is only used for inference.
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)


### Split the data

In [3]:
# Pull out the SalePrice target first, then keep every other column as a
# candidate predictor.
y = train_df['SalePrice']
X = train_df.drop(columns=['SalePrice'])

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Preprocessing

### Feature Selection

In [4]:
# Columns passed through the StandardScaler.
continuous_features = [
    'LotArea',
    'GrLivArea',
]

# Columns passed through the imputer and then the one-hot encoder.
categorical_features = [
    'MSZoning',
    'Neighborhood',
]


### Feature Processing (Persistent)

In [5]:
# Preprocessing objects are created once here and reused by every later cell:
# they are fitted on the training split and only applied (transform) elsewhere.

# handle_unknown='ignore' lets transform() accept categories that were not
# present when the encoder was fitted instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')

# Standardises the continuous columns.
scaler = StandardScaler()

# Fills missing values with each column's most frequent value; it is applied to
# the categorical columns only.
imputer = SimpleImputer(strategy='most_frequent')

## Model Training

In [6]:
# Fit every preprocessing step on the training split only, so the transformers
# never see the held-out rows.
X_train_cat = imputer.fit_transform(X_train[categorical_features])
X_train_cont = scaler.fit_transform(X_train[continuous_features])
# .toarray() densifies the encoder output so it can be concatenated with the
# dense scaled block below.
X_train_encoded = encoder.fit_transform(X_train_cat).toarray()

# Combine features: scaled continuous columns first, one-hot columns after.
# The same column order must be used whenever the model is applied.
X_train_processed = np.concatenate([X_train_cont, X_train_encoded], axis=1)

# Train model. LinearRegression is already imported in the notebook's import
# cell, so it is not re-imported here.
model = LinearRegression()
model.fit(X_train_processed, y_train)

## Model Evaluation

In [8]:
# Transform test data (NO fitting): reuse what the imputer, scaler and encoder
# learned from the training split.
X_test_cat = imputer.transform(X_test[categorical_features])
X_test_cont = scaler.transform(X_test[continuous_features])
X_test_encoded = encoder.transform(X_test_cat).toarray()

# Same column order as at training time: continuous block, then one-hot block.
X_test_processed = np.concatenate([X_test_cont, X_test_encoded], axis=1)

# Predict & evaluate. mean_squared_log_error is already imported in the
# notebook's import cell, so it is not re-imported here.
y_pred = model.predict(X_test_processed)
# RMSLE is the square root of the mean squared logarithmic error.
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
print(f"RMSLE: {rmsle:.4f}")

RMSLE: 0.2068


## Model Inference

In [11]:

# Work on a copy so test_df itself is left untouched by this cell.
inference_data = test_df.copy()

# Apply the already-fitted transformers (transform only, nothing is re-fitted).
inf_cont = scaler.transform(inference_data[continuous_features])
inf_cat = imputer.transform(inference_data[categorical_features])
inf_encoded = encoder.transform(inf_cat).toarray()

# Column order matches training: continuous block first, one-hot block second.
inf_processed = np.concatenate(
    (inf_cont, inf_encoded),
    axis=1,
)

predictions = model.predict(inf_processed)
# Show only the first ten predictions to keep the output short.
print(predictions[:10])

[ 88264.41248655 150444.44970655 195167.82402368 191164.7502481
 259253.12738902 194928.93060119 159365.67275828 180053.84525294
 171928.35325225 114264.92269307]


In [13]:
# Save all required objects: the fitted model plus every fitted transformer
# needed to reproduce the preprocessing at inference time.
# `os` and `dump` come from the notebook's import cell.
MODEL_DIR = '../models'

# Make sure the target folder exists before writing, so a fresh checkout
# without a models/ directory does not fail here.
os.makedirs(MODEL_DIR, exist_ok=True)

# File name -> fitted object; insertion order also drives the summary message.
artifacts = {
    'model.joblib': model,
    'imputer.joblib': imputer,
    'scaler.joblib': scaler,
    'encoder.joblib': encoder,
}
for artifact_filename, fitted_object in artifacts.items():
    dump(fitted_object, f'{MODEL_DIR}/{artifact_filename}')

print("Saved: " + ", ".join(artifacts))

Saved: model.joblib, imputer.joblib, scaler.joblib, encoder.joblib


## Modified Model Inference

In [17]:
def _load_artifact(path, missing_message):
    """Load one joblib artifact, stopping the cell if the file is absent.

    Args:
        path: Location of the ``.joblib`` file to read.
        missing_message: Text carried by the raised error when ``path`` does
            not exist.

    Returns:
        The object returned by ``joblib.load(path)``.

    Raises:
        FileNotFoundError: If ``path`` does not exist. The error is raised
            rather than printed so that inference cannot silently continue
            with same-named objects still held in the kernel from training.
    """
    try:
        # `load` is imported from joblib in the notebook's import cell.
        # NOTE: joblib files are pickle-based; only load artifacts you trust.
        return load(path)
    except FileNotFoundError as exc:
        raise FileNotFoundError(missing_message) from exc


# Reload every persisted object so the predictions below come from the files
# on disk and not from the in-memory objects created during training.
model = _load_artifact('../models/model.joblib', "Error: Model file not found. Train model first.")
imputer = _load_artifact('../models/imputer.joblib', "Error: Imputer file not found.")
scaler = _load_artifact('../models/scaler.joblib', "Error: Scaler file not found.")
encoder = _load_artifact('../models/encoder.joblib', "Error: Encoder file not found.")

# Apply the reloaded transformers (transform only, nothing is re-fitted).
inf_cat = imputer.transform(test_df[categorical_features])
inf_cont = scaler.transform(test_df[continuous_features])
inf_encoded = encoder.transform(inf_cat).toarray()

# Same column order as at training time: continuous block, then one-hot block.
inf_processed = np.concatenate([inf_cont, inf_encoded], axis=1)
predictions = model.predict(inf_processed)

print("Predictions sample:", predictions[:10])

Predictions sample: [ 88264.41248655 150444.44970655 195167.82402368 191164.7502481
 259253.12738902 194928.93060119 159365.67275828 180053.84525294
 171928.35325225 114264.92269307]
