# 🏠 Airbnb Price Prediction Project

### 📌 Objective
Build regression models that predict an Airbnb listing's price (the `log_price` column) from the property details available in the dataset.

In [None]:
# 📦 Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

In [None]:
# 📥 Load Dataset
# The CSV is read from a path relative to the current working directory.
csv_path = "Airbnb_data - airbnb_data.csv"
df = pd.read_csv(csv_path)
# Preview the first rows to sanity-check that the file parsed as expected.
df.head()

## 🔍 Data Exploration

In [None]:
# Column names, dtypes and non-null counts (printed by info()).
df.info()
# Descriptive statistics; kept as the last expression so the notebook displays it.
summary_stats = df.describe()
summary_stats

In [None]:
# Missing values
# Count NaNs per column, then show the ten columns with the most gaps.
missing_per_column = df.isnull().sum()
missing_per_column.sort_values(ascending=False).head(10)

## 🧹 Data Preprocessing

In [None]:
# Drop unneeded columns and handle missing data
# These four columns are not used as model features.
df = df.drop(columns=['id', 'name', 'thumbnail_url', 'zipcode'])
df = df.dropna(subset=['log_price'])  # target should not be missing

# Impute by assigning the filled column back to the frame.
# `df[col].fillna(..., inplace=True)` operates on a column selection: it is
# deprecated chained assignment in pandas 2.x and leaves `df` unchanged under
# Copy-on-Write, which would let the final dropna() silently discard the rows.
# NOTE(review): the median/mean are computed on the full dataset, i.e. before
# the train/test split further down, so test rows influence the fill values.
df['bedrooms'] = df['bedrooms'].fillna(df['bedrooms'].median())
df['review_scores_rating'] = df['review_scores_rating'].fillna(
    df['review_scores_rating'].mean()
)

# Any row that still has a missing value in any column is removed.
df = df.dropna()  # drop remaining for simplicity

In [None]:
# One-hot encoding for categorical variables
# Columns that are expanded into indicator columns.
categorical_cols = [
    'property_type',
    'room_type',
    'bed_type',
    'cancellation_policy',
    'neighbourhood',
]
# drop_first=True omits the first level of each column, so k categories
# become k-1 indicator columns.
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

## ✂️ Feature Selection & Splitting

In [None]:
# Define features and target
# Only the columns listed in `categorical_cols` were one-hot encoded; any other
# text/date column still present in `df` cannot be consumed by the estimators
# fitted below. Restrict the predictors to numeric and boolean columns so the
# fit does not fail on them (a no-op when every remaining column is numeric).
X = df.drop(columns=['log_price']).select_dtypes(include=['number', 'bool'])
y = df['log_price']

# Split the data
# 80/20 train/test split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## 🤖 Model Building

In [None]:
# Linear Regression
# Baseline model: fit on the training split (fit() returns the estimator),
# then predict the held-out split for evaluation.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [None]:
# XGBoost
# Gradient-boosted regressor with the library's default hyperparameters.
xgbr = xgb.XGBRegressor()
# fit() returns the fitted estimator, so training and prediction can be chained.
y_pred_xgb = xgbr.fit(X_train, y_train).predict(X_test)

## 📊 Evaluation

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    """Print RMSE, MAE and R² for one model's predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, compared against ``y_true``.
        model_name: Label used in the report header line.

    Returns:
        None. The metrics are written to standard output only.
    """
    print(f"📈 {model_name} Evaluation")
    # Each entry pairs the printed label with the callable that computes it;
    # metrics are computed one at a time, right before they are printed.
    report = (
        ("RMSE:", lambda actual, predicted: np.sqrt(mean_squared_error(actual, predicted))),
        ("MAE:", mean_absolute_error),
        ("R² Score:", r2_score),
    )
    for label, metric in report:
        print(label, metric(y_true, y_pred))

# Report both models against the same held-out targets, in a fixed order.
for model_label, predictions in (
    ("Linear Regression", y_pred_lr),
    ("XGBoost", y_pred_xgb),
):
    evaluate_model(y_test, predictions, model_label)

## 💡 Insights

In [None]:
# Feature Importance from XGBoost
# Label each importance score from the fitted XGBoost model with its feature name.
importances = pd.Series(data=xgbr.feature_importances_, index=X.columns)
# Keep the ten highest-scoring features and draw them as horizontal bars.
top_ten = importances.nlargest(10)
top_ten.plot(kind='barh')
plt.title("Top 10 Important Features")
plt.show()

### ✅ Conclusion
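- Built Linear Regression and XGBoost models to predict Airbnb listing prices (`log_price`).
- Identified the top 10 factors influencing prices using XGBoost feature importances.
- XGBoost performed better than Linear Regression based on RMSE and R².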