# House Prices Modeling

In [19]:
import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### Load the dataset

In [22]:
# Read the train and test CSV files into DataFrames.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run where this exact directory exists.
train_df = pd.read_csv(
    filepath_or_buffer='E:/dsp_navaraja_mannepalli/data/train.csv',
)
test_df = pd.read_csv(
    filepath_or_buffer='E:/dsp_navaraja_mannepalli/data/test.csv',
)


### Split the data

In [23]:
# Separate the target column from the predictors, then hold out 20% of the
# rows for evaluation. The fixed random_state keeps the split reproducible.
y = train_df['SalePrice']
X = train_df.drop(labels=['SalePrice'], axis='columns')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Preprocessing

### Feature Selection

In [24]:
# Numeric columns that are standardised before modelling.
continuous_features = [
    'LotArea',
    'GrLivArea',
]
# Categorical columns that are imputed and one-hot encoded.
categorical_features = [
    'MSZoning',
    'Neighborhood',
]


### Feature Processing (Imputation, Scaling, and Encoding)

In [31]:
# Imputation: replace missing categorical values with the most frequent
# category seen in the training split.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X_train[categorical_features])  # Fit ONLY on train
X_train_cat_imputed = imputer.transform(X_train[categorical_features])
X_test_cat_imputed = imputer.transform(X_test[categorical_features])

# Scaling: standardise the continuous features using training statistics.
scaler = StandardScaler()
scaler.fit(X_train[continuous_features])  # Fit ONLY on train
X_train_cont_scaled = scaler.transform(X_train[continuous_features])
X_test_cont_scaled = scaler.transform(X_test[continuous_features])

# Wrap the imputed values back into DataFrames so the encoder is fitted with
# the original categorical column names.
X_train_cat_df = pd.DataFrame(
    X_train_cat_imputed, columns=categorical_features, index=X_train.index
)
X_test_cat_df = pd.DataFrame(
    X_test_cat_imputed, columns=categorical_features, index=X_test.index
)

# One-hot encoding of the imputed categorical features.
# NOTE(review): with the default handle_unknown setting, a category that only
# appears in the test split makes transform() raise; confirm that is intended.
encoder = OneHotEncoder()
encoder.fit(X_train_cat_df)  # Fit ONLY on train
encoded_columns = encoder.get_feature_names_out(categorical_features)
# transform() returns a sparse matrix by default; densify it for stacking.
X_train_cat_encoded = encoder.transform(X_train_cat_df).toarray()
X_test_cat_encoded = encoder.transform(X_test_cat_df).toarray()

# Build the final feature matrices: scaled continuous columns first, then the
# one-hot columns, which is the same order as `all_columns` below. These two
# arrays are what the model-training and evaluation cells consume.
X_train_processed = np.hstack([X_train_cont_scaled, X_train_cat_encoded])
X_test_processed = np.hstack([X_test_cont_scaled, X_test_cat_encoded])

# Combine with continuous feature names
all_columns = list(continuous_features) + list(encoded_columns)

# Labelled views of the processed arrays
X_train_processed_df = pd.DataFrame(X_train_processed, columns=all_columns)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=all_columns)

## Model Training

In [32]:
# Fit an ordinary least-squares regressor on the processed training features.
model = LinearRegression()
model.fit(X=X_train_processed, y=y_train)

## Model Evaluation

In [33]:
# Predict sale prices for the held-out split.
y_pred = model.predict(X=X_test_processed)

def compute_rmsle(y_test, y_pred, precision=2):
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.
        precision: Number of decimal places kept in the result.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    mean_sq_log_err = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(mean_sq_log_err), precision)

# Score the held-out predictions.
rmsle = compute_rmsle(y_test, y_pred)
print(f'RMSLE: {rmsle}')

# Show actual vs. predicted prices side by side for a handful of rows.
# The sample is seeded so the same rows are displayed on every run.
print("\nRandom sample of 20 rows from the test set predictions:")
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(predictions_df.sample(20, random_state=42))


RMSLE: 0.21

Random sample of 20 rows from the test set predictions:
      Actual      Predicted
1032  310000  329566.709459
237   194500  198260.296153
271   241500  191030.087418
1018  160000  181906.941975
1036  315500  219564.058209
56    172500  225746.023488
423   315000  334658.449072
451   280000  221320.174108
865   148500  123289.357872
765   264132  253047.526186
614    75500   62854.267281
846   213000  200464.414036
925   175000  146929.159437
772   107000  110658.288916
736    93500  125969.107591
365   147000  126261.947486
1226  214000  284466.014007
691   755000  465618.706973
297   239000  243938.928816
427   109008  116212.472807


In [52]:
# Persist the processed train and test feature frames as parquet files under
# ../models, without writing the row index.
train_frame_out = pd.DataFrame(X_train_processed_df)
train_frame_out.to_parquet(path='../models/X_train_processed_ref1.parquet', index=False)

test_frame_out = pd.DataFrame(X_test_processed_df)
test_frame_out.to_parquet(path='../models/X_test_processed_ref1.parquet', index=False)

In [38]:
# Load the reference frame in this cell so it runs on a fresh kernel instead
# of relying on a variable assigned by a cell further down the notebook.
expected_X_train = pd.read_parquet('../models/X_train_processed_ref.parquet')

# Compare the column labels of the stored reference with those pandas assigns
# to the freshly processed array.
print("Original columns (reference):", expected_X_train.columns)
print("Refactored columns (current):", pd.DataFrame(X_train_processed).columns)

Original columns (reference): Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29', '30', '31'],
      dtype='object')
Refactored columns (current): RangeIndex(start=0, stop=32, step=1)


In [46]:
def _frame_with_str_columns(values):
    """Wrap ``values`` in a DataFrame whose column labels are strings."""
    frame = pd.DataFrame(values)
    frame.columns = frame.columns.astype(str)
    return frame


# Ensure column names match reference (strings): the default integer labels
# are converted to '0', '1', ...
X_train_processed_df = _frame_with_str_columns(X_train_processed)
X_test_processed_df = _frame_with_str_columns(X_test_processed)

In [47]:
# Strict equality check of the freshly processed training array against the
# stored reference frame.
expected_X_train = pd.read_parquet('../models/X_train_processed_ref.parquet')
current_X_train = pd.DataFrame(X_train_processed)  # default RangeIndex column labels
# NOTE(review): the reference columns print as strings while this frame uses
# integer labels, which is presumably why the comparison yields False.
current_X_train.equals(expected_X_train)

False

In [50]:
# Load reference data (saved in Step 0.1)
expected_X_train = pd.read_parquet('../models/X_train_processed_ref.parquet')
expected_X_test = pd.read_parquet('../models/X_test_processed_ref.parquet')

# Compare train first, then test; the first mismatch is reported and the
# remaining comparison is skipped.
try:
    for actual_frame, reference_frame in (
        (X_train_processed_df, expected_X_train),
        (X_test_processed_df, expected_X_test),
    ):
        pd.testing.assert_frame_equal(actual_frame, reference_frame, check_dtype=True)
except AssertionError as e:
    print("❌ Assertion failed:", e)
else:
    print("✅ All assertions passed! DataFrames are identical.")

✅ All assertions passed! DataFrames are identical.


In [51]:
# Persist the fitted scaler and encoder under ../models.
# `joblib` is imported in the notebook's top import cell.
# NOTE(review): the fitted imputer is not saved here; confirm whether the
# inference code needs it alongside the scaler and encoder.
joblib.dump(scaler, '../models/scaler.joblib')
joblib.dump(encoder, '../models/encoder.joblib')

['../models/encoder.joblib']