In [1]:
import os, json
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# 1) Load
# Infinite values are mapped to NaN so they are treated like any other missing value.
df = pd.read_csv("data.csv").replace([np.inf, -np.inf], np.nan)

# 2) Target (MEDV typical for Boston)
# Case-insensitive match on "medv"; when no such column exists the last column is used.
target = next((c for c in df.columns if c.strip().lower()=="medv"), df.columns[-1])

# Rows without a target value cannot be used for fitting or scoring: the imputers in
# the pipeline below only fill feature columns, and the regressor/metrics reject NaN
# targets. `df` itself is left untouched so the raw frame stays inspectable.
labelled = df.dropna(subset=[target])
n_dropped = len(df) - len(labelled)
if n_dropped:
    print(f"Dropped {n_dropped} row(s) with missing {target}")

X = labelled.drop(columns=[target])
y = labelled[target]

# 3) Split
# A fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4) Preprocessing
# Columns are routed by dtype: numeric columns on one side, everything else on the other.
num_feats = X.select_dtypes(include=[np.number]).columns.tolist()
numeric_lookup = set(num_feats)
cat_feats = [col for col in X.columns if col not in numeric_lookup]

# Numeric branch: fill gaps with the column median, then standardise.
numeric_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# Categories not seen during fit are ignored at transform time instead of raising.
categorical_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Apply each branch to its own column list.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_tf, num_feats),
    ("cat", categorical_tf, cat_feats),
])

# 5) Linear Regression pipeline
# Preprocessing and the estimator are chained and fitted on the training rows only,
# so imputation, scaling and encoding statistics never see the test split.
linreg_pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", LinearRegression()),
]).fit(X_train, y_train)

# 6) Evaluate
# Score on the held-out split: RMSE is in the target's own units, R² is unitless.
y_pred = linreg_pipe.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Test RMSE: {rmse:.4f}", f"Test R²:   {r2:.4f}", sep="\n")

# 7) Save pretrained model + meta
# The fitted pipeline (preprocessing + regressor) is persisted as a single object.
joblib.dump(linreg_pipe, "boston_linear_regression.pkl")

# Column metadata is stored next to the model so that inference code can rebuild an
# input frame with the same columns, in the same order, as the training data.
feature_meta = {
    "all_features": X.columns.tolist(),
    "numeric_features": num_feats,
    "categorical_features": cat_feats,
    "target": target,
}
with open("linear_feature_meta.json", "w") as meta_file:
    meta_file.write(json.dumps(feature_meta, indent=2))


Test RMSE: 4.9190
Test R²:   0.6701


In [2]:
# a) Predict on the test split (show a few rows)

In [3]:
# Side-by-side view of the held-out features, the true target and the model's prediction.
# `assign` returns a new frame, so X_test itself is not modified.
preview = X_test.assign(**{
    f"{target}_TRUE": y_test,
    f"{target}_PRED": y_pred,
})
preview.head(10)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV_TRUE,MEDV_PRED
173,0.09178,0.0,4.05,0,0.51,6.416,84.1,2.6463,5,296,16.6,395.5,9.04,23.6,29.024604
274,0.05644,40.0,6.41,1,0.447,6.758,32.9,4.0776,4,254,17.6,396.9,3.53,32.4,35.926679
491,0.10574,0.0,27.74,0,0.609,5.983,98.8,1.8681,4,711,20.1,390.11,18.07,13.6,14.766446
72,0.09164,0.0,10.81,0,0.413,6.065,7.8,5.2873,4,305,19.2,390.91,5.52,22.8,25.007034
452,5.09017,0.0,18.1,0,0.713,6.297,91.8,2.3682,24,666,20.2,385.09,17.27,16.1,18.755807
76,0.10153,0.0,12.83,0,0.437,6.279,74.5,4.0522,5,398,18.7,373.66,11.97,20.0,23.280058
316,0.31827,0.0,9.9,0,0.544,5.914,83.2,3.9986,4,304,18.4,390.7,18.33,17.8,17.623231
140,0.2909,0.0,21.89,0,0.624,6.174,93.6,1.6119,4,437,21.2,388.08,24.16,14.0,14.242842
471,4.03841,0.0,18.1,0,0.532,6.229,90.7,3.0993,24,666,20.2,395.33,12.87,19.6,23.104073
500,0.22438,0.0,9.69,0,0.585,6.027,79.7,2.4982,6,391,19.2,396.9,14.33,16.8,20.598712


In [4]:
# b) Predict from a custom input (e.g., from a UI or a dict)

In [5]:
import pandas as pd
import joblib
import json

# Load artifacts
# NOTE: joblib.load unpickles the file, which can execute arbitrary code. Only load
# model files you produced yourself or otherwise trust.
pipe = joblib.load("boston_linear_regression.pkl")
with open("linear_feature_meta.json", "r") as f:
    meta = json.load(f)

all_feats = meta["all_features"]
categorical_cols = meta.get("categorical_features", [])

# Example custom feature dict (fill with your own numbers/strings)
example = {
    # Classic Boston features example; adjust to your data columns if different
    "CRIM": 0.1, "ZN": 18.0, "INDUS": 2.3, "CHAS": 0, "NOX": 0.5,
    "RM": 6.2, "AGE": 45.0, "DIS": 4.5, "RAD": 1, "TAX": 300,
    "PTRATIO": 17.5, "B": 390.0, "LSTAT": 9.5
}

# Make input/training mismatches visible instead of silently papering over them.
missing = [c for c in all_feats if c not in example]
ignored = [k for k in example if k not in all_feats]
if missing:
    print(f"Note: no value given for {missing}; the pipeline's imputers will fill them.")
if ignored:
    print(f"Note: ignoring keys that are not training features: {ignored}")

# Build a one-row frame with exactly the training columns, in training order.
# Absent features are left as NaN so the fitted imputers inside the pipeline fill them
# (median for numeric, most frequent for categorical) rather than feeding the model an
# arbitrary 0.0, which is far outside the training range for features such as RM or TAX.
row = {c: example.get(c, float("nan")) for c in all_feats}
X_new = pd.DataFrame([row], columns=all_feats)

# A lone NaN would make a missing categorical column float64; keep it as object so it
# reaches the categorical branch with the same kind of dtype it was fitted on.
missing_categorical = [c for c in missing if c in categorical_cols]
if missing_categorical:
    X_new = X_new.astype({c: object for c in missing_categorical})

pred = pipe.predict(X_new)[0]
print(f"Predicted {meta['target']}: {pred:.2f}  (MEDV is $1000s)")


Predicted MEDV: 23.98  (MEDV is $1000s)
