<a href="https://colab.research.google.com/github/nipun12-cyber/nipun_musick/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import argparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

In [2]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [4]:
# xgboost is an optional dependency: remember whether the import worked so
# later code can decide whether XGBRegressor is available.
try:
    from xgboost import XGBRegressor
except Exception:
    HAS_XGBOOST = False
else:
    HAS_XGBOOST = True

In [5]:
# Load the housing dataset. The absolute '/content/...' path is hard-coded
# (presumably the Colab upload location - adjust it when running elsewhere).
df = pd.read_csv('/content/Housing.csv')

# Preview the first 5 rows (head()'s default) using the notebook's rich display
display(df.head())

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [7]:
def basic_eda(df, target):
    """Print a short exploratory summary of ``df``.

    Three sections are written to stdout, in this order: the frame's shape,
    ``describe()`` of the ``target`` column, and the 20 columns with the most
    missing values (null counts sorted in descending order).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    target : str
        Name of the column whose distribution is described.

    Returns
    -------
    None
    """
    # Each section is a heading plus a lazily evaluated value, so the value is
    # only computed after its heading has been printed.
    sections = (
        ('--- DATA SHAPE ---', lambda: df.shape),
        ('\n--- TARGET SUMMARY ---', lambda: df[target].describe()),
        ('\n--- NULLS PER COLUMN ---',
         lambda: df.isnull().sum().sort_values(ascending=False).head(20)),
    )
    for heading, compute in sections:
        print(heading)
        print(compute())

In [34]:
def build_preprocessor(df, target, numeric_override=None, categorical_override=None):
    """Create an unfitted ColumnTransformer for numeric and categorical features.

    Column selection
      * numeric: ``numeric_override`` when given, otherwise every int64/float64
        column of ``df``.
      * categorical: ``categorical_override`` when given, otherwise every
        object/category column, plus any int64 column that is not in the
        numeric list and has fewer than 20 distinct values. Without a numeric
        override every int64 column is already numeric, so that extra rule
        adds nothing in that case.
      * ``target`` is removed from both lists.

    Transformations
      * numeric: median imputation, then standard scaling.
      * categorical: most-frequent imputation, then one-hot encoding with
        ``handle_unknown='ignore'``.
      * columns in neither list are dropped (``remainder='drop'``).

    Returns
    -------
    tuple
        ``(preprocessor, numeric_cols, cat_cols)``.
    """
    if numeric_override is not None:
        numeric_candidates = numeric_override
    else:
        numeric_candidates = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    if categorical_override is not None:
        categorical_candidates = categorical_override
    else:
        categorical_candidates = df.select_dtypes(include=['object', 'category']).columns.tolist()
        # Low-cardinality integer columns that were not claimed as numeric.
        categorical_candidates.extend(
            name
            for name in df.select_dtypes(include=['int64']).columns
            if name not in numeric_candidates and df[name].nunique() < 20
        )

    # The target must never be fed to the model as a feature.
    excluded = (target,)
    numeric_cols = [name for name in numeric_candidates if name not in excluded]
    cat_cols = [name for name in categorical_candidates if name not in excluded]

    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    preprocessor = ColumnTransformer(
        [
            ('num', Pipeline(numeric_steps), numeric_cols),
            ('cat', Pipeline(categorical_steps), cat_cols),
        ],
        remainder='drop',
    )
    return preprocessor, numeric_cols, cat_cols

In [36]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``.
    X_test : features to predict on.
    y_test : true target values matching ``X_test``.

    Returns
    -------
    dict
        ``{'MAE': ..., 'RMSE': ..., 'R2': ...}``, every value a built-in
        ``float``.
    """
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    # np.sqrt returns a NumPy scalar, which prints as ``np.float64(...)`` next
    # to the other metrics; cast everything to a built-in float so the result
    # dict has one consistent value type.
    return {
        'MAE': float(mean_absolute_error(y_test, preds)),
        'RMSE': float(np.sqrt(mse)),
        'R2': float(r2_score(y_test, preds)),
    }

In [37]:
def main(args):
    """Train several regressors on a CSV dataset, keep the best one and save it.

    Steps: read the CSV, print a short EDA summary, make an 80/20 train/test
    split (``random_state=42``), fit every candidate model inside a
    preprocessing pipeline, run a randomized hyperparameter search for the
    random forest, and dump the pipeline with the lowest test RMSE via joblib.

    Parameters
    ----------
    args : object with the attributes
        ``data``   - path of the CSV file to read,
        ``target`` - name of the column to predict,
        ``output`` - where to save the model; a falsy value means
        ``'best_house_model.joblib'``.

    Raises
    ------
    ValueError
        If ``args.target`` is not a column of the loaded data.
    """
    target = args.target

    # Read the CSV directly: the notebook's load_data() helper is defined in a
    # cell below this one, so calling it here raises NameError on a fresh
    # Restart & Run All.
    df = pd.read_csv(args.data)
    if target not in df.columns:
        raise ValueError(f"Target column '{target}' not in dataset columns")

    # Basic EDA
    basic_eda(df, target)

    # Quick feature dropping (user can edit)
    drop_cols = []
    if drop_cols:
        df = df.drop(columns=drop_cols)

    # Split
    X = df.drop(columns=[target])
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Column detection only looks at the training rows.
    preprocessor, num_cols, cat_cols = build_preprocessor(pd.concat([X_train, y_train], axis=1), target)
    print('\nNumeric cols detected:', num_cols)
    print('Categorical cols detected:', cat_cols)

    models = {
        'linear': LinearRegression(),
        'rf': RandomForestRegressor(random_state=42, n_jobs=-1),
    }
    if HAS_XGBOOST:
        models['xgb'] = XGBRegressor(random_state=42, n_jobs=1)

    results = {}
    best_model = None
    best_score = float('inf')

    # NOTE(review): candidates are ranked on the test split, so the reported
    # RMSE of the winner is an optimistic estimate.
    for name, estimator in models.items():
        print(f"\n--- Training {name} ---")
        # Each pipeline gets its own copy of the preprocessor; otherwise every
        # pipeline would share (and refit) one and the same transformer object.
        pipe = Pipeline([
            ('pre', clone(preprocessor)),
            ('model', estimator),
        ])
        pipe.fit(X_train, y_train)
        res = evaluate_model(pipe, X_test, y_test)
        results[name] = res
        print(name, res)

        # choose by RMSE (lower is better)
        if res['RMSE'] < best_score:
            best_score = res['RMSE']
            best_model = (name, pipe)

    print('\n--- Summary of models ---')
    for model_name, metrics in results.items():
        print(model_name, metrics)

    print('\nBest model:', best_model[0], 'with RMSE=', best_score)

    if 'rf' in models:
        print('\n--- Hyperparameter tuning for RandomForest (RandomizedSearch) ---')
        rf_pipe = Pipeline([
            ('pre', clone(preprocessor)),
            ('model', RandomForestRegressor(random_state=42, n_jobs=-1)),
        ])
        param_dist = {
            'model__n_estimators': [100, 200, 400],
            'model__max_depth': [None, 8, 16, 32],
            'model__min_samples_split': [2, 5, 10],
        }
        rs = RandomizedSearchCV(
            rf_pipe,
            param_distributions=param_dist,
            n_iter=8,
            cv=3,
            scoring='neg_root_mean_squared_error',
            random_state=42,
            n_jobs=-1,
        )
        rs.fit(X_train, y_train)
        print('Best params:', rs.best_params_)
        tuned = rs.best_estimator_

        # Score the tuned pipeline once and reuse the metrics for both the
        # report and the comparison.
        tuned_metrics = evaluate_model(tuned, X_test, y_test)
        print('Tuned RMSE:', tuned_metrics)

        # consider tuned as candidate
        tuned_rmse = tuned_metrics['RMSE']
        if tuned_rmse < best_score:
            best_score = tuned_rmse
            best_model = ('rf_tuned', tuned)

    print('\nFinal best model:', best_model[0], 'RMSE=', best_score)

    output_path = args.output or 'best_house_model.joblib'
    joblib.dump(best_model[1], output_path)
    print('Saved best model to', output_path)

    # Only tree-based winners expose feature importances; the attribute access
    # stays inside the guard so other models are skipped cleanly instead of
    # raising an error that then has to be swallowed.
    final_estimator = best_model[1].named_steps['model']
    if hasattr(final_estimator, 'feature_importances_'):
        print('\n--- Feature importances (raw) ---')
        # Values are in the preprocessor's output order; they are not mapped
        # back to feature names here.
        print(final_estimator.feature_importances_[:20])

    print('\nDone.')
# Entry point: build the command-line options and run the training workflow.
if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  # Every option has a default, so the block also runs without any arguments.
  parser.add_argument('--data', type=str, default='/content/Housing.csv', help='Path to CSV data file')
  parser.add_argument('--target', type=str, default='price', help='Target column name')
  # None lets main() fall back to its own default file name.
  parser.add_argument('--output', type=str, default=None, help='Path to save trained model')
  # parse_known_args returns unrecognised argv entries instead of exiting on
  # them (presumably so the cell works inside a notebook kernel, whose argv
  # carries extra flags - confirm).
  args, unknown = parser.parse_known_args()
  main(args)

--- DATA SHAPE ---
(545, 13)

--- TARGET SUMMARY ---
count    5.450000e+02
mean     4.766729e+06
std      1.870440e+06
min      1.750000e+06
25%      3.430000e+06
50%      4.340000e+06
75%      5.740000e+06
max      1.330000e+07
Name: price, dtype: float64

--- NULLS PER COLUMN ---
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

Numeric cols detected: ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
Categorical cols detected: ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']

--- Training linear ---
linear {'MAE': 970043.4039201644, 'RMSE': np.float64(1324506.9600914402), 'R2': 0.6529242642153175}

--- Training rf ---
rf {'MAE': 1025961.1683486238, 'RMSE': np.float64(1403925

In [49]:
# Inference demo: reload the saved pipeline and price one hand-written house.
# (joblib and pandas are already imported at the top of the notebook; these
# are repeats.)
import joblib
import pandas as pd

# Load the trained model. The file name matches the default output path that
# main() uses when --output is not given.
# NOTE: joblib.load unpickles the file - only load model files you trust.
model = joblib.load('best_house_model.joblib')

# Example new house data: a single row holding the twelve feature columns of
# the training data (every dataset column except 'price').
new_data = pd.DataFrame([{
    'area': 2500,
    'bedrooms': 3,
    'bathrooms': 2,
    'stories': 2,
    'parking': 1,
    'mainroad': 'no',
    'guestroom': 'yes',
    'basement': 'yes',
    'hotwaterheating': 'yes',
    'airconditioning': 'yes',
    'prefarea': 'yes',
    'furnishingstatus': 'furnished'
}])

# Make prediction; predict() returns one value per input row, so take the first
predicted_price = model.predict(new_data)[0]

# Print with dollar sign and comma formatting
# NOTE(review): the notebook never states the currency of 'price' - confirm
# that "$" is the right symbol.
print(f"Predicted house price: ${predicted_price:,.2f}")


Predicted house price: $7,036,803.55


In [31]:
def load_data(data_path):
    """Read the CSV at ``data_path`` and return its contents as a DataFrame.

    ``data_path`` is handed to ``pandas.read_csv`` unchanged, with all other
    options left at their defaults.
    """
    frame = pd.read_csv(data_path)
    return frame