In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('airbnb_seattle.csv')

# Convert price to numeric after removing any dollar signs or commas.
# This has to happen BEFORE price is used in arithmetic and before object
# columns are turned into categories; values that still cannot be parsed
# become NaN and are handled by the fill step below.
df['price'] = pd.to_numeric(
    df['price'].astype(str).str.replace(r'[\$,]', '', regex=True),
    errors='coerce',
)

# Replace NAs with 0 in the numeric columns only. Filling the whole frame
# fails once text columns are categorical, because 0 is not one of their
# categories. Missing text values are therefore left as missing.
# NOTE(review): a 0 fill also applies to the review_scores_* columns, where 0
# is indistinguishable from a genuinely terrible score - confirm this is intended.
numeric_cols = df.select_dtypes(include='number').columns
df = df.fillna({col: 0 for col in numeric_cols})

# Convert object data types to category
object_cols = df.select_dtypes(include='object').columns
df = df.astype({col: 'category' for col in object_cols})

# Calculate Estimated Revenue
df['estimated_revenue'] = df['price'] * df['minimum_nights']

# Selecting the relevant columns after cleaning. 'price' is kept because the
# regression cells use it as their target. .copy() makes cleaned_df an
# independent frame, so later column assignments do not hit a view of df.
cleaned_df = df[['host_is_superhost', 'host_identity_verified', 'neighbourhood_group_cleansed',
                 'accommodates', 'number_of_reviews', 'review_scores_communication',
                 'review_scores_cleanliness', 'review_scores_value', 'review_scores_accuracy',
                 'review_scores_checkin', 'review_scores_location', 'price',
                 'estimated_revenue']].copy()

# Display cleaned data
print(cleaned_df.head())


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plotting categorical variables vs. Estimated Revenue.
# One (column, title) pair per boxplot, so adding a plot means adding a row
# rather than copy-pasting another three-line stanza.
categorical_plots = [
    ('host_is_superhost', 'Superhost Status vs Estimated Revenue'),
    ('host_identity_verified', 'Host Identity Verification vs Estimated Revenue'),
    ('neighbourhood_group_cleansed', 'Neighbourhood Group vs Estimated Revenue'),
]
for column, title in categorical_plots:
    sns.boxplot(x=column, y='estimated_revenue', data=cleaned_df)
    plt.title(title)
    plt.show()

# Plotting numerical variables vs. Estimated Revenue
sns.scatterplot(x='accommodates', y='estimated_revenue', data=cleaned_df)
plt.title('Accommodates vs Estimated Revenue')
plt.show()

# Correlation is only defined for numeric data. cleaned_df also holds
# categorical columns, which make DataFrame.corr() raise on recent pandas,
# so restrict the heatmap to the numeric columns explicitly.
numeric_df = cleaned_df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True)
plt.title('Correlation Heatmap')
plt.show()


In [1]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Univariate Regression: Price vs Accommodates
# Fit a one-feature linear model on a 70/30 train/test split and report its
# error on the held-out 30%.
X = cleaned_df[['accommodates']]
y = cleaned_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print('Univariate Regression: Accommodates vs Price')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'R^2: {r2_score(y_test, y_pred)}')

# Held-out points in black, fitted line in blue. The single feature column is
# passed as a 1-D Series so matplotlib gets matching x/y shapes.
plt.scatter(X_test['accommodates'], y_test, color='black')
plt.plot(X_test['accommodates'], y_pred, color='blue', linewidth=3)
plt.xlabel('Accommodates')
plt.ylabel('Price')
plt.title('Univariate Regression: Accommodates vs Price')
plt.show()


In [None]:
# Multivariate Regression: Using multiple features to predict price
feature_columns = ['accommodates', 'number_of_reviews', 'neighbourhood_group_cleansed']
y = cleaned_df['price']

# One-hot encode the categorical feature(s); drop_first removes one level per
# category so the remaining indicator columns are measured against a baseline.
X = pd.get_dummies(cleaned_df[feature_columns], drop_first=True)

# 70/30 split with a fixed seed so the reported metrics are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report held-out error, one line per metric.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(
    f'Multivariate Regression: Multiple Features vs Price',
    f'MSE: {mse}',
    f'R^2: {r2}',
    sep='\n',
)

# Predicted price against true price for the held-out rows.
plt.scatter(y_test, y_pred)
plt.title('Multivariate Regression: Multiple Features vs Price')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show()


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Prepare the data
X = cleaned_df[['accommodates', 'number_of_reviews', 'neighbourhood_group_cleansed',
                'host_is_superhost', 'host_identity_verified', 'review_scores_communication',
                'review_scores_cleanliness', 'review_scores_value', 'review_scores_accuracy',
                'review_scores_checkin', 'review_scores_location']]

y = cleaned_df['price']

# Convert categorical variables to dummy/indicator variables
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the XGBoost Regressor
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.05, max_depth=5, random_state=42)

# Fit the model
xgb_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_reg.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('XGBoost Regressor: Price Prediction')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Feature importance plot
xgb.plot_importance(xgb_reg)
plt.title('XGBoost Feature Importance')
plt.show()

# Plotting the predicted vs actual values.
# The red diagonal marks perfect predictions (predicted == actual). Its end
# points come from the vectorised Series.min()/max(), computed once each;
# unlike the Python builtins they do not iterate the Series element by
# element and they skip missing values instead of giving order-dependent results.
price_min = y_test.min()
price_max = y_test.max()
plt.scatter(y_test, y_pred, color='black')
plt.plot([price_min, price_max], [price_min, price_max], color='red', linewidth=2)
plt.title('Predicted vs Actual Prices')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.show()
