In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import datetime as dt
from textblob import TextBlob
import ast
from textblob.sentiments import NaiveBayesAnalyzer

## Sentiment analysis

In [None]:
reviews_sep = pd.read_csv('Data/reviews_sep.csv')
reviews_aug = pd.read_csv('Data/reviews_aug.csv')
reviews_oct = pd.read_csv('Data/reviews_oct.csv')
reviews_nov = pd.read_csv('Data/reviews_nov.csv')
reviews_raw= pd.concat([reviews_sep, reviews_aug, reviews_oct,reviews_nov], ignore_index=True)

In [None]:
# Drop columns that are not needed
reviews_raw = pd.DataFrame.drop(reviews_raw, columns=[
    'id',
    'date',
    'reviewer_id',
    'reviewer_name'
])

def calculate_sentiment(entry):
    #case where the entry is null
    if (type(entry) != str and math.isnan(entry)):
        return -55
    opinion = TextBlob(entry)
    #return the polarity of the text's sentiment
    return opinion.sentiment.polarity

# Apply the sentiment analysis to the comments
reviews_raw['comments'] = reviews_raw['comments'].apply(calculate_sentiment)
# Drop the null rows
reviews_raw = reviews_raw[reviews_raw['comments'] != -55]
# Get the average sentiment for each listing
reviews_cleaned = reviews_raw.groupby('listing_id')['comments'].mean()

reviews_cleaned.to_csv('Data/reviews_cleaned.csv')

## Data cleaning

In [None]:
data_sep = pd.read_csv('Data/listings_sep.csv')
data_aug = pd.read_csv('Data/listings_aug.csv')
data_oct = pd.read_csv('Data/listings_oct.csv')
data_nov = pd.read_csv('Data/listings_nov.csv')

In [None]:
data = pd.concat([data_sep, data_aug, data_oct,data_nov], ignore_index=True)

In [None]:
data.info()

In [None]:
data = data.drop(columns=['host_id','listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url',
       'host_url', 'host_name', 'host_location', 'host_about','host_response_time',
        'host_thumbnail_url', 'host_picture_url','host_verifications',
       'host_neighbourhood', 'host_listings_count',
        'neighbourhood',
       'bathrooms_text',
       'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_updated',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'calendar_last_scraped',
        'first_review',
       'last_review',
       'license',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms'],axis=1)

In [None]:
data.info()

In [None]:
data.head()

In [None]:
data = data.dropna(subset=['price'])
len(data)

In [None]:
print(data['price'].head())

In [None]:
data['price'] = data['price'].replace(r'[\$,]', '', regex=True).replace(',', '.', regex=True).astype(float)

In [None]:
data['price'].describe()

In [None]:
data['price'] = np.log1p(data['price'])

In [None]:
data['price'].describe()

In [None]:
data['price'].hist(bins=50)

In [None]:
data.plot(kind="scatter", x="longitude", y="latitude", grid=True,
s=data["price"],
c=data["price"], cmap="jet", colorbar=True,
legend=True, sharex=False, figsize=(20,15))
plt.show()

In [None]:
data['host_since'].head()

In [None]:
data['host_lifetime'] = pd.to_datetime(data['host_since']).apply(lambda x: (pd.to_datetime('today') - x).days)
data.drop(columns=['host_since'], inplace=True)

In [None]:
data['host_response_rate'].head()

In [None]:
data['host_response_rate'] = data['host_response_rate'].fillna('0%').str.rstrip('%').astype('float') / 100.0
data['host_acceptance_rate'] = data['host_acceptance_rate'].fillna('0%').str.rstrip('%').astype('float') / 100.0

In [None]:
def clean_booltype(entry):
    if (entry == 't'):
        return 1
    else:
        return 0

In [None]:
data['host_is_superhost'] = data['host_is_superhost'].apply(clean_booltype)
data['host_has_profile_pic'] = data['host_has_profile_pic'].apply(clean_booltype)
data['host_identity_verified'] = data['host_identity_verified'].apply(clean_booltype)
data['has_availability'] = data['has_availability'].apply(clean_booltype)
data['instant_bookable'] = data['instant_bookable'].apply(clean_booltype)

In [None]:
data.keys()

In [None]:
missing_data = data.isnull().sum()
missing_data_percentage = (missing_data / len(data)) * 100
missing_data_summary = pd.DataFrame({'Missing Values': missing_data, 'Percentage': missing_data_percentage})
print(missing_data_summary.sort_values(by='Missing Values', ascending=False))

In [None]:
for x in ['review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'reviews_per_month']:
   data[x]= data[x].fillna(0)
data.dropna(subset=['beds', 'bedrooms','bathrooms'], inplace=True)

In [None]:
reviews_cleaned = pd.read_csv('Data/reviews_cleaned.csv')

In [None]:
data = data.set_index('id').join(reviews_cleaned.set_index('listing_id'))
data['comments'] = data['comments'].fillna(0)

In [None]:
data['comments'].describe()

In [None]:
data.info()

In [None]:
categorical_columns = data.select_dtypes(include=['object', 'category']).columns
data[categorical_columns].nunique()

In [None]:
data = pd.get_dummies(data, columns=['property_type', 'room_type','neighbourhood_group_cleansed','neighbourhood_cleansed'])

In [None]:
from collections import Counter

data['amenities'] = data['amenities'].apply(ast.literal_eval)

amenities_list = [amenity for sublist in data['amenities'] for amenity in sublist]
amenities_count = Counter(amenities_list)

# Filter amenities with count > 100 (for demonstration, using a lower threshold)
filtered_amenities = {amenity for amenity, count in amenities_count.items() if count > 100}
# Create binary columns for each filtered amenity
amenities_df = pd.DataFrame({amenity: data['amenities'].apply(lambda x: 1 if amenity in x else 0) for amenity in filtered_amenities})
# Concatenate the new binary columns with the original DataFrame
data = pd.concat([data, amenities_df], axis=1)

data = data.drop(columns=['amenities'])

In [None]:
data.info()

In [None]:
data = pd.read_csv('Data/listings_cleaned.csv')

In [None]:
from sklearn.model_selection import train_test_split

y = data['price']
X = data.drop(columns=['price'])
# Split the data into training + validation and testing sets (90% train+val, 10% test)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Further split the training + validation set into training and validation sets (11.1% val of the 90% train+val data, which is 10% of the original data)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1111, random_state=42)

# Print the sizes of the resulting DataFrames
print(f"Training Data: X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Validation Data: X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"Testing Data: X_test: {X_test.shape}, y_test: {y_test.shape}")

In [None]:
y = data ['price']
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# Further split the training + validation set into training and validation sets (11.1% val of the 90% train+val data, which is 10% of the original data)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1111, random_state=42)
print(f"Training Data: X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Validation Data: X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"Testing Data: X_test: {X_test.shape}, y_test: {y_test.shape}")


In [None]:
X = pd.DataFrame(X)


In [None]:
from geopy.distance import geodesic
nyc_center = (40.7128, -74.0060)
X = data.apply(lambda row: geodesic((row['latitude'], row['longitude']), nyc_center).miles, axis=1)
X['DistanceFromCenter'] = pd.DataFrame(X)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_val_standardized = scaler.transform(X_val)
X_test_standardized = scaler.transform(X_test)

# Convert the standardized arrays back to DataFrames
X_train_standardized = pd.DataFrame(X_train_standardized)
X_val_standardized = pd.DataFrame(X_val_standardized)
X_test_standardized = pd.DataFrame(X_test_standardized)

# Optimize memory usage by converting data types
X_train_standardized = X_train_standardized.astype('float32')
X_val_standardized = X_val_standardized.astype('float32')
X_test_standardized = X_test_standardized.astype('float32')

In [None]:
X_train_standardized.to_csv('X_train.csv', index=False)
X_val_standardized.to_csv('X_val.csv', index=False)
X_test_standardized.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_val.to_csv('y_val.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

In [None]:
data.to_csv('Data/listings_cleaned.csv', index=False)

## Feature Selection

In [3]:
%cd /kaggle/input/airbnb

/kaggle/input/airbnb


In [4]:
X_train = pd.read_csv('X_train.csv')
X_val = pd.read_csv('X_val.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()
y_val = pd.read_csv('y_val.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def print_evaluation_metrics(trained_model, trained_model_name, X_test, y_test, type):
    print('--------- For Model: ', trained_model_name, f' -----{type}---\n',)
    predicted_values = trained_model.predict(X_test)
    print("Mean Absolute Error (MAE): ",
          mean_absolute_error(y_test, predicted_values))
    print("Mean Squared Error (MSE): ", mean_squared_error(
        y_test, predicted_values))
    print("R-squared (R²): ", r2_score(y_test, predicted_values))

In [6]:
from sklearn.linear_model import Lasso

best_alpha = 0.0004

reg = Lasso(alpha=best_alpha, max_iter=10000)
reg.fit(X_train, y_train)
print_evaluation_metrics(reg, 'Lasso', X_train, y_train, 'train')
print_evaluation_metrics(reg, 'Lasso', X_test, y_test, 'Test')

--------- For Model:  Lasso  -----train---

Mean Absolute Error (MAE):  0.3139065634485807
Mean Squared Error (MSE):  0.18666539288838122
R-squared (R²):  0.7058825490054479
--------- For Model:  Lasso  -----Test---

Mean Absolute Error (MAE):  0.31574907281858255
Mean Squared Error (MSE):  0.19664143494735853
R-squared (R²):  0.6973424036493155


In [7]:
#lasso
selected_features_lasso = X_train.columns[reg.coef_!=0]
print(f"Number of selected features: {len(selected_features_lasso)}")
# Train Linear Regression model using selected features
X_train_lasso = X_train[selected_features_lasso]
X_test_lasso = X_test[selected_features_lasso]

Number of selected features: 596


## Running Models

In [8]:
X_train = X_train_lasso
X_test = X_test_lasso

In [20]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': np.linspace(0.0001, 100, 100),
}
grid_search = GridSearchCV(Ridge(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_ridge = grid_search.best_estimator_

# Evaluate the best model on the test set
print_evaluation_metrics(best_ridge, 'Ridge Regression', X_train_lasso, y_train, 'train')
print_evaluation_metrics(best_ridge, 'Ridge Regression', X_test_lasso, y_test, 'test')

--------- For Model:  Ridge Regression  -----train---

Mean Absolute Error (MAE):  0.31359008373353164
Mean Squared Error (MSE):  0.18639248931579946
R-squared (R²):  0.7063125467778955
--------- For Model:  Ridge Regression  -----test---

Mean Absolute Error (MAE):  0.3160260117212674
Mean Squared Error (MSE):  0.1969445619746086
R-squared (R²):  0.6968758504150951


In [21]:
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'alpha': 25.2526}


In [10]:
import xgboost as xgb

model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
model.fit(X_train_lasso, y_train)

In [11]:
print_evaluation_metrics(model, "XGBoost", X_train_lasso, y_train,"Train")
print_evaluation_metrics(model, "XGBoost", X_test_lasso, y_test,"Test")

--------- For Model:  XGBoost  -----Train---

Mean Absolute Error (MAE):  0.1988542465099645
Mean Squared Error (MSE):  0.076007033292292
R-squared (R²):  0.8802402815879512
--------- For Model:  XGBoost  -----Test---

Mean Absolute Error (MAE):  0.21509833422524677
Mean Squared Error (MSE):  0.09858916203546093
R-squared (R²):  0.8482580295660034


In [None]:
from sklearn.svm import SVR
import joblib

svm_model = SVR(gamma = 0.05, verbose = True) #best gamma 0.05, c=0.5
svm_model.fit(X_train_lasso, y_train)

In [47]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

space = {
    'max_depth': hp.quniform("max_depth", 3, 18, 1),
    'gamma': hp.uniform('gamma', 0, 9),
    'reg_alpha': hp.quniform('reg_alpha', 0, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': 180,
    'seed': 42
}

In [48]:
# Define the objective function
def objective(space):
    clf = xgb.XGBRegressor(
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        reg_lambda=space['reg_lambda'],
        min_child_weight=int(space['min_child_weight']),
        colsample_bytree=space['colsample_bytree'],
        seed=int(space['seed']), early_stopping_rounds=10, eval_metric="rmse"
    )
    
    evaluation = [(X_train, y_train), (X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation,
             verbose=False)
    
    pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    print("MSE:", mse)
    print("R-squared (R²):", r2)
    return {'loss': mse, 'status': STATUS_OK}

In [49]:
# Run the hyperparameter optimization
trials = Trials()

best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=150,
                        trials=trials)

print("The best hyperparameters are: ", best_hyperparams)

MSE:                                                   
0.13342025124925952                                    
R-squared (R²):                                        
0.7946483020813222                                     
MSE:                                                                              
0.17351837379072363                                                               
R-squared (R²):                                                                   
0.7329319024332849                                                                
MSE:                                                                              
0.17608024156037472                                                               
R-squared (R²):                                                                   
0.7289888436291296                                                                
MSE:                                                                              
0.14068017409790207          

In [50]:
best_model = xgb.XGBRegressor(
    n_estimators=180,
    max_depth=int(best_hyperparams['max_depth']),
    gamma=best_hyperparams['gamma'],
    reg_alpha=int(best_hyperparams['reg_alpha']),
    reg_lambda=best_hyperparams['reg_lambda'],
    min_child_weight=int(best_hyperparams['min_child_weight']),
    colsample_bytree=best_hyperparams['colsample_bytree'],
    seed=42
)

best_model.fit(X_train, y_train)

# Predict on the test set
y_test_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)
print("Best Model MSE:", mse)
print("Best Model R-squared (R²):", r2)

Best Model MSE: 0.038545925751105194
Best Model R-squared (R²): 0.9406726398225047


In [52]:
print_evaluation_metrics(best_model, "XGBoost", X_train, y_train,"Train")
print_evaluation_metrics(best_model, "XGBoost", X_test, y_test,"Test")

--------- For Model:  XGBoost  -----Train---

Mean Absolute Error (MAE):  0.049262234561550586
Mean Squared Error (MSE):  0.007100107115199131
R-squared (R²):  0.9888127875542557
--------- For Model:  XGBoost  -----Test---

Mean Absolute Error (MAE):  0.10517485014305009
Mean Squared Error (MSE):  0.038545925751105194
R-squared (R²):  0.9406726398225047


In [12]:
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(hidden_layer_sizes=(128,64, 32), activation='relu', solver='adam', max_iter=500, random_state=42,
                validation_fraction=0.1, early_stopping = True,alpha = 0.001)
mlp.fit(X_train_lasso, y_train)
print_evaluation_metrics(mlp, "MLP", X_train_lasso, y_train,"Train")
print_evaluation_metrics(mlp, "MLP", X_test_lasso, y_test,"Test")

--------- For Model:  MLP  -----Train---

Mean Absolute Error (MAE):  0.0900523387425658
Mean Squared Error (MSE):  0.020718133414216307
R-squared (R²):  0.967355681227971
--------- For Model:  MLP  -----Test---

Mean Absolute Error (MAE):  0.13049473391888264
Mean Squared Error (MSE):  0.05192736287994809
R-squared (R²):  0.9200768096597641
