In [25]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Load the dataset
df = pd.read_csv("/Users/malhar.inamdar/Desktop/cummins/train.csv")  

# Select key features including additional bathroom counts
selected_features = [
    "OverallQual", "GarageCars", "Neighborhood", "GrLivArea",
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
    "TotalBsmtSF", "TotRmsAbvGrd", "YearBuilt"
]

# Extract relevant features and target
X = df[selected_features]
y = df["SalePrice"]

# One-hot encode categorical features (Neighborhood in this case)
X = pd.get_dummies(X, columns=["Neighborhood"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the trained model and feature names
with open("house_price_mlmodel.pkl", "wb") as f:
    pickle.dump({"model": model, "features": X_train.columns.tolist()}, f)

print("Model trained and saved as 'house_price_mlmodel.pkl'")


Model trained and saved as 'house_price_mlmodel.pkl'


In [None]:
import pickle
import pandas as pd

# Load trained model and features
with open("house_price_model.pkl", "rb") as f:
    data = pickle.load(f)
model = data["model"]
model_features = data["features"]

# Example: user inputs for numerical features (ensure you include all that were used during training)
user_input = {
    #"OverallQual": 7,          # Example overall quality score
    "GarageCars": 3, 
    "GrLivArea": 1300, 
    "BsmtFullBath": 3,         # Additional bathroom features
    "BsmtHalfBath": 4,
    "FullBath": 2, 
    "HalfBath": 2,
    "TotalBsmtSF": 1000, 
    "TotRmsAbvGrd": 6, 
    #"YearBuilt": 1995
}

# User's preferred locations (can be multiple)
preferred_locations = ["Somerst","BrkSide","Sawyer","CollgCr", "Blmngtn", "NWAmes"]  # Example neighborhoods

# Convert input into DataFrame
input_df = pd.DataFrame([user_input])

# One-hot encode categorical features
# Since 'Neighborhood' is a categorical feature in our model, we need to add columns for each possible neighborhood.
# In the training phase, we encoded 'Neighborhood', so here we make sure that if the user provides a preferred neighborhood,
# it's added to the input DataFrame.
for loc in preferred_locations:
    col_name = f"Neighborhood_{loc}"
    input_df[col_name] = 1

# Some encoded columns might be missing if they weren't in user input; adjust to match the model features
input_df = pd.get_dummies(input_df)

# Ensure all feature columns match the model
input_df = input_df.reindex(columns=model_features, fill_value=0)

# Predict house price
predicted_price = model.predict(input_df)[0]
print(f"Predicted Price: ${predicted_price:,.2f}")

# Define budget range (±10%)
budget_min, budget_max = predicted_price * 0.9, predicted_price * 1.1

# Filter dataset to find houses within budget and preferred locations
filtered_houses = df[
    (df["SalePrice"] >= budget_min) & 
    (df["SalePrice"] <= budget_max) &
    (df["Neighborhood"].isin(preferred_locations))
]

if filtered_houses.empty:
    print("No houses found within the predicted budget and preferred locations.")
    
    # Try without location filter as a fallback
    budget_houses = df[
        (df["SalePrice"] >= budget_min) & 
        (df["SalePrice"] <= budget_max)
    ]
    
    if not budget_houses.empty:
        print(f"Found {len(budget_houses)} houses within your budget, but in different neighborhoods.")
        print(budget_houses[["Neighborhood", "SalePrice", "OverallQual", "GarageCars"]].head(10))
else:
    print(f"Found {len(filtered_houses)} houses within budget and preferred locations.")
    print("Top recommendations:")
    
    # Calculate similarity score based on the absolute difference in price from the predicted price
    filtered_houses["price_diff"] = abs(filtered_houses["SalePrice"] - predicted_price)
    
    # Sort by price difference and overall quality score
    sorted_houses = filtered_houses.sort_values(
        by=["price_diff", "OverallQual"], 
        ascending=[True, False]
    )
    
    # Display relevant information for the top results
    recommended = sorted_houses[
        ["Neighborhood", "SalePrice", "OverallQual", "GrLivArea", 
         "TotalBsmtSF", "YearBuilt", "GarageCars", "FullBath", 
         "BsmtFullBath", "BsmtHalfBath", "HalfBath"]
    ].head(10)
    
    # improving readability of the output
    recommended["SalePrice"] = recommended["SalePrice"].apply(lambda x: f"${x:,.2f}")
    print(recommended)
    
    # Provide summary statistics
    print("\nSummary Statistics for Recommended Houses:")
    print(f"Average Quality Score: {sorted_houses['OverallQual'].mean():.1f}/10")
    print(f"Average Size: {sorted_houses['GrLivArea'].mean():.0f} sq ft")
    print(f"Newest Property: {sorted_houses['YearBuilt'].max()}")
    print(f"Oldest Property: {sorted_houses['YearBuilt'].min()}")


Predicted Price: $116,594.00
Found 44 houses within budget and preferred locations.
Top recommendations:
     Neighborhood    SalePrice  OverallQual  GrLivArea  TotalBsmtSF  \
960       BrkSide  $116,500.00            5        858          858   
1202      BrkSide  $117,000.00            5       1348          884   
9         BrkSide  $118,000.00            5       1077          991   
149       BrkSide  $115,000.00            5       1344          896   
609        Sawyer  $118,500.00            4       1029         1029   
51        BrkSide  $114,500.00            6       1176          816   
437       BrkSide  $119,000.00            6        904          884   
1400      BrkSide  $120,000.00            6       1158          862   
555       BrkSide  $113,000.00            5       1048          993   
1025      CollgCr  $112,500.00            5        882          882   

      YearBuilt  GarageCars  FullBath  BsmtFullBath  BsmtHalfBath  HalfBath  
960        1958           0        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_houses["price_diff"] = abs(filtered_houses["SalePrice"] - predicted_price)
