In [None]:
# Colab-specific upload step. The value returned by `files.upload()` is kept in
# `uploaded`; the loading cell reads its keys as file names, and `files` is used
# again at the end of the notebook to download the saved artifacts.
from google.colab import files
uploaded = files.upload()

Saving properties_export_2026-01-29.csv to properties_export_2026-01-29 (1).csv


In [None]:
import pandas as pd

# Columns kept for modelling; every other column in the export is dropped.
features = [
    "Price",
    "City",
    "Type",
    "Capacity",
    "Vacancies",
    "Rating",
    "Views",
    "Gender Preference",
    "Sharing Type",
    "Amenities",
]

# The keys of `uploaded` are used as file names; only the first one is read.
first_upload = list(uploaded)[0]

# Take an independent copy of just the selected columns so later column
# assignments do not operate on a view of the full frame.
df = pd.read_csv(first_upload).loc[:, features].copy()
df.head()

Unnamed: 0,Price,City,Type,Capacity,Vacancies,Rating,Views,Gender Preference,Sharing Type,Amenities
0,4500,Hyderabad,pg,10,3,4.5,150.0,unisex,double,WiFi; Food; AC; Parking
1,3500,Hyderabad,hostel,25,5,4.8,280.0,female,shared,WiFi; Food; AC; Laundry; Security
2,2800,Hyderabad,pg,8,2,3.8,95.0,unisex,triple,WiFi; Power Backup; Security
3,6500,Hyderabad,hostel,12,4,4.9,320.0,unisex,single,WiFi; Food; AC; Parking; Laundry; Power Backup...
4,2200,Hyderabad,hostel,30,8,3.5,160.0,male,shared,WiFi; Power Backup; Security


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn the ";"-separated amenity string into a list of normalized tokens
# (trimmed, lower-cased). Blank tokens are discarded: a missing value becomes
# "" after fillna, and "".split(";") yields [""], which would otherwise make
# the binarizer emit a bogus feature column whose name is the empty string.
# The same filter also drops tokens produced by stray/trailing ";".
df["Amenities"] = df["Amenities"].fillna("").apply(
    lambda raw: [tok.strip().lower() for tok in raw.split(";") if tok.strip()]
)

# One indicator column per distinct amenity. `mlb` stays in scope so the same
# vocabulary can be applied to new rows later.
mlb = MultiLabelBinarizer()
amenities_encoded = pd.DataFrame(
    mlb.fit_transform(df["Amenities"]),
    columns=mlb.classes_,
    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows if df had been filtered or re-indexed.
    index=df.index,
)

# Swap the raw list column for its indicator columns.
df = pd.concat([df.drop(columns="Amenities"), amenities_encoded], axis=1)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical text columns that are replaced by integer codes.
CATEGORICAL_COLUMNS = ["City", "Type", "Gender Preference", "Sharing Type"]

# Keep every fitted encoder, keyed by column name. The integer codes are only
# meaningful together with the encoder that produced them, so discarding the
# encoders would make it impossible to encode new rows the same way (or to map
# codes back to labels via `inverse_transform`).
label_encoders = {}
for col in CATEGORICAL_COLUMNS:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder


In [None]:
# Binary target: a row counts as "liked" (1) only when its rating is at least
# 4.3 AND it has at least 3 vacancies; every other row is 0.
is_well_rated = df["Rating"].ge(4.3)
has_enough_vacancies = df["Vacancies"].ge(3)
df["liked"] = (is_well_rated & has_enough_vacancies).astype(int)

# Show the class balance of the new target.
df["liked"].value_counts()


Unnamed: 0_level_0,count
liked,Unnamed: 1_level_1
0,44
1,13


In [None]:
from sklearn.model_selection import train_test_split

# "liked" is computed directly from Rating and Vacancies. If those two columns
# stay in the feature matrix, any model can reconstruct the labelling rule
# exactly (target leakage) and the evaluation scores say nothing about how
# well the remaining features predict the target. Exclude them from X.
LEAKY_FEATURES = ["Rating", "Vacancies"]

X = df.drop(columns=["liked", *LEAKY_FEATURES])
y = df["liked"]

# 75/25 split with a fixed seed; stratifying on y keeps the class ratio the
# same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)


In [None]:
!pip install xgboost lightgbm




In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb

# Single seed shared by every stochastic estimator so that re-running the
# notebook reproduces the same comparison table.
MODEL_SEED = 42


def _with_median_imputer(estimator):
    """Return a Pipeline that median-imputes missing values, then fits `estimator`.

    The step names ("imputer", "model") are the same for every candidate so the
    fitted estimator can always be reached as `pipeline.named_steps["model"]`.
    """
    return Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("model", estimator),
    ])


# Candidate classifiers, keyed by display name.
models = {
    # The linear model additionally standardizes features after imputation.
    "LogisticRegression": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=2000)),
    ]),

    "RandomForest": _with_median_imputer(
        RandomForestClassifier(
            n_estimators=300,
            max_depth=8,
            random_state=MODEL_SEED,
        )
    ),

    "GradientBoosting": _with_median_imputer(
        GradientBoostingClassifier(random_state=MODEL_SEED)
    ),

    "XGBoost": _with_median_imputer(
        xgb.XGBClassifier(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            # Row/column subsampling is random; seed it for reproducibility.
            random_state=MODEL_SEED,
        )
    ),

    "LightGBM": _with_median_imputer(
        lgb.LGBMClassifier(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.05,
            # LightGBM's default of 20 samples per leaf is too restrictive for
            # a training set of only a few dozen rows; this is the likely cause
            # of the previously recorded F1 of 0.0 (no positive predictions).
            min_child_samples=5,
            random_state=MODEL_SEED,
            # Silence per-iteration "no further splits" warnings.
            verbose=-1,
        )
    ),
}

# Fit each candidate on the training split and score it on the held-out split.
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    results[name] = {
        "accuracy": accuracy_score(y_test, preds),
        # zero_division=0 reports 0.0 without emitting a warning when a model
        # predicts no positives at all.
        "f1": f1_score(y_test, preds, zero_division=0),
    }

# One row per model, best F1 first.
pd.DataFrame(results).T.sort_values("f1", ascending=False)


In [None]:
# Comparison table: one row per model name, columns "accuracy" and "f1",
# ordered from highest to lowest F1.
pd.DataFrame.from_dict(results, orient="index").sort_values("f1", ascending=False)


Unnamed: 0,accuracy,f1
RandomForest,1.0,1.0
XGBoost,1.0,1.0
GradientBoosting,1.0,1.0
LogisticRegression,0.933333,0.857143
LightGBM,0.733333,0.0


In [None]:
import joblib

# The XGBoost pipeline is the one that gets exported.
best_model = models["XGBoost"]

# File name -> object to serialize. The feature-name list records the column
# order of X so the same columns can be supplied at prediction time.
artifacts = {
    "best_model.pkl": best_model,
    "feature_names.pkl": list(X.columns),
}

# Write every artifact to disk first, then hand each file to the browser.
for filename, payload in artifacts.items():
    joblib.dump(payload, filename)

for filename in artifacts:
    files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>