In [1]:
import numpy as np
import pandas as pd
import wandb
import xgboost as xgb
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the housing CSV into a DataFrame and preview its first rows
csv_path = '/content/p1. tehranhouses.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000,61666.67
1,60,1,True,True,True,Shahran,1850000000,61666.67
2,79,2,True,True,True,Pardis,550000000,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000,233333.33


In [3]:
# Cast the True/False columns to integers (1/0) in one column-wise assignment
bool_cols = ['Parking', 'Warehouse', 'Elevator']
data[bool_cols] = data[bool_cols].astype(int)


In [4]:
# Feature selection and target
X = data[['Area', 'Room', 'Parking', 'Warehouse', 'Elevator']]
y = data['Price']


In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Start a Weights & Biases run and record the model hyperparameters with it.
# Authentication happens through wandb's own login prompt; credentials must
# never be written into the notebook source, not even inside a comment.
# NOTE(review): a token-like string was previously stored in a comment in this
# cell -- if it was a real API key, revoke it and generate a new one.
wandb.init(
    project="tehran-house-price-prediction",
    config={
        "model": "RandomForestRegressor",
        "n_estimators": 100,
        "max_depth": None,
        "random_state": 42,
    },
)

# Later cells read the hyperparameters back from this config object
config = wandb.config

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [7]:
# Clean the numeric columns and cast them to float.
# For columns read as text (e.g. values containing commas), every character
# except digits and the decimal point is removed. The earlier pattern '[^0-9]'
# also removed '.', which would silently turn a value like "61666.67" into
# 6166667. Columns that are already numeric are unaffected by the regex
# replacement and are only cast.
for col in ['Area', 'Room', 'Price']:
    data[col] = data[col].replace('[^0-9.]', '', regex=True).astype(float)

# Convert boolean columns to integers (a no-op if they are already integers)
bool_cols = ['Parking', 'Warehouse', 'Elevator']
for col in bool_cols:
    data[col] = data[col].astype(int)

# Rebuild features and target from the cleaned frame
X = data[['Area', 'Room', 'Parking', 'Warehouse', 'Elevator']]
y = data['Price']

# Split the cleaned data: 20% held out for testing, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest with the hyperparameters recorded in the W&B config
model = RandomForestRegressor(
    n_estimators=config.n_estimators,
    max_depth=config.max_depth,
    random_state=config.random_state,
)
model.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 2.391716010581051e+19


In [8]:
# Score the fitted model on the held-out rows: MSE first, then its square root (RMSE)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

In [9]:
# Send both error metrics to the active W&B run, then echo them in the notebook
metrics = {"mse": mse, "rmse": rmse}
wandb.log(metrics)

for metric_name, metric_value in metrics.items():
    print(f"{metric_name.upper()}: {metric_value}")


MSE: 2.391716010581051e+19
RMSE: 4890517365.8633


In [10]:
# Persist the fitted model to disk and attach the file to the W&B run.
# Nothing in the notebook had written 'random_forest_model.pkl' before
# wandb.save was called on it, so there was no file to upload; the model is
# now serialized first.
model_path = 'random_forest_model.pkl'
dump(model, model_path)
wandb.save(model_path)

# Mark the run as finished
wandb.finish()


0,1
mse,▁
rmse,▁

0,1
mse,2.391716010581051e+19
rmse,4890517365.8633


In [11]:
# Numeric columns are handed to the model unchanged; categorical columns are one-hot encoded
numeric_features = ['Area', 'Room']
categorical_features = ['Parking', 'Warehouse', 'Elevator', 'Address']

# Preprocessing for the categorical columns; categories not seen during fit
# are ignored at transform time instead of raising an error
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine the preprocessing steps. Every column the model uses is listed
# explicitly and anything else is dropped, so stray columns such as the
# 'Unnamed: 0' row index from the CSV can no longer leak in as features
# (they previously did through remainder='passthrough').
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', 'passthrough', numeric_features),
    ],
    remainder='drop'
)

# Fresh, untrained XGBoost regressor; it is fitted below as the last pipeline step.
# NOTE(review): this rebinds the name `model`, which earlier cells use for the
# Random Forest -- re-running those cells after this one will see the XGBoost model.
model = xgb.XGBRegressor()

# Build the pipeline: preprocessing followed by the regressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Fit the pipeline on the full dataset, using only the declared feature columns
X = data[numeric_features + categorical_features]
y = data['Price']
pipeline.fit(X, y)

# Save the fitted pipeline (preprocessing + model) to disk
dump(pipeline, 'xgbpipe.joblib')

print("Pipeline ذخیره شد!")


Pipeline ذخیره شد!
