In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV

%matplotlib inline

In [2]:
# --- Listings: tag each city's rows with its state, then stack them into one frame ---
boston_listings = pd.read_csv('../project1_supplemental_files/airbnb_data/boston_listings.csv').assign(state='MA')
seattle_listings = pd.read_csv('../project1_supplemental_files/airbnb_data/seattle_listings.csv').assign(state='WA')
combined_listings = pd.concat([boston_listings, seattle_listings], ignore_index=True)

# --- Reviews: stacked the same way (no state column is added here) ---
boston_reviews = pd.read_csv('../project1_supplemental_files/airbnb_data/boston_reviews.csv')
seattle_reviews = pd.read_csv('../project1_supplemental_files/airbnb_data/seattle_reviews.csv')
combined_reviews = pd.concat([boston_reviews, seattle_reviews], ignore_index=True)

# --- Calendars: loaded per city; this cell does not combine them ---
boston_calendar = pd.read_csv('../project1_supplemental_files/airbnb_data/boston_calendar.csv')
seattle_calendar = pd.read_csv('../project1_supplemental_files/airbnb_data/seattle_calendar.csv')

In [3]:
# df = listings[ [ 'id', 'state', 'host_response_rate', 'host_is_superhost', 'property_type', 'room_type', 'accommodates', 'review_scores_rating', 'price' ] ]
# df = listings[ [ 'id', 'state', 'host_response_rate', 'host_is_superhost', 'room_type', 'accommodates', 'review_scores_rating', 'price' ] ]
# df = listings[ [ 'id', 'state', 'neighbourhood_cleansed', 'host_is_superhost', 'property_type', 'room_type', 'accommodates', 'review_scores_rating', 'price' ] ]

# df

In [4]:
# def model_and_score(df):
# 	# Prepare the data
# 	sample_df = df.dropna()
# 	X = sample_df.drop(columns=['price'])
# 	y = sample_df['price']

# 	# Convert categorical variables to dummy variables
# 	X = pd.get_dummies(X, drop_first=True)

# 	# Handle missing values by filling them with the mean of the column
# 	# X = X.fillna(X.mean())

# 	# Split the data into training and testing sets
# 	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 	# Create and train the linear regression model
# 	model = LinearRegression()
# 	model.fit(X_train, y_train)

# 	# Make predictions
# 	y_pred = model.predict(X_test)

# 	# Evaluate the model
# 	r2 = r2_score(y_test, y_pred)
# 	mse = mean_squared_error(y_test, y_pred)

# 	print(f'R^2 Score: {r2}')
# 	print(f'Mean Squared Error: {mse}')

# 	# Plot the results
# 	# plt.scatter(y_test, y_pred)
# 	# plt.show()

In [5]:
# def scale_model_cross_validate(df):
# 	# Prepare the data
# 	sample_df = df.dropna()
# 	X = sample_df.drop(columns=['price'])
# 	y = sample_df['price']

# 	# Convert categorical variables to dummy variables
# 	X = pd.get_dummies(X, drop_first=True)

# 	# Feature scaling
# 	scaler = StandardScaler()
# 	X_scaled = scaler.fit_transform(X)

# 	# Polynomial features
# 	poly = PolynomialFeatures(degree=2)
# 	X_poly = poly.fit_transform(X_scaled)

# 	# Ridge regression with cross-validation and hyperparameter tuning
# 	ridge = Ridge()
# 	param_grid = {'alpha': [0.1, 1.0, 10.0]}
# 	grid_search = GridSearchCV(ridge, param_grid, cv=5)
# 	grid_search.fit(X_poly, y)

# 	# Best model
# 	best_model = grid_search.best_estimator_
# 	print(f'Best model: {best_model}')

# 	# Evaluate model
# 	scores = cross_val_score(best_model, X_poly, y, cv=5)
# 	print(f'Cross-validated R² score: {scores.mean()}')

In [6]:
# df = listings[ [ 'state', 'host_response_rate', 'host_is_superhost', 'property_type', 'room_type', 'accommodates', 'review_scores_rating', 'price' ] ]
# model_and_score(df) # 0.5196
# scale_model_cross_validate(df) # 0.
# print()

# df = listings[ [ 'state', 'host_response_rate', 'host_is_superhost', 'room_type', 'accommodates', 'review_scores_rating', 'price' ] ]
# model_and_score(df) # 0.5026
# scale_model_cross_validate(df) # 0.
# print()

# df = listings[ [ 'state', 'neighbourhood_cleansed', 'host_is_superhost', 'property_type', 'room_type', 'accommodates', 'review_scores_rating', 'price' ] ]
# model_and_score(df) # 0.5517
# scale_model_cross_validate(df) # 0.
# print()

# df = listings[ [ 'state', 'neighbourhood_cleansed', 'host_is_superhost', 'host_response_rate', 'host_acceptance_rate', 'property_type', 'room_type', 'accommodates', 'review_scores_rating', 'price' ] ]
# model_and_score(df) # 0.5955
# scale_model_cross_validate(df) # 0.
# print()

# df = listings[ [ 'state', 'neighbourhood_cleansed', 'host_is_superhost', 'host_response_rate', 'host_acceptance_rate', 'property_type', 'room_type', 'accommodates', 'review_scores_rating', 'price' ] ]
# model_and_score(df) # 0.5955
# scale_model_cross_validate(df) # 0.
# print()

In [7]:
# Columns whose text values carry a '$' sign and/or ',' separators that must be
# stripped before the values can be cast to float.
price_cols_to_clean = ['price', 'security_deposit', 'cleaning_fee', 'extra_people']

for col_nm in price_cols_to_clean:
	combined_listings[col_nm] = combined_listings[col_nm].replace({r'\$': '', ',': ''}, regex=True).astype(float)

# Columns whose text values carry a trailing '%'.
percentage_cols_to_clean = ['host_response_rate', 'host_acceptance_rate']

for col_nm in percentage_cols_to_clean:
	# Series.replace is used here (matching the price loop above) instead of the
	# .str accessor: .str raises AttributeError once the column is already float,
	# which made this cell fail when it was run a second time. A regex replace on
	# a numeric column leaves it unchanged, so the cell is now safe to re-run.
	combined_listings[col_nm] = combined_listings[col_nm].replace({'%': ''}, regex=True).astype(float)

In [8]:
def model_and_score(df):
	"""Fit an ordinary linear regression on ``df`` and score it on a hold-out split.

	Every column except ``price`` is used as a feature; ``pd.get_dummies`` expands
	the categorical ones (dropping the first level of each). ``price`` is the
	target. 20% of the rows are held out using a fixed ``random_state`` of 42.

	Args:
		df: DataFrame containing a ``price`` column plus the feature columns.

	Returns:
		tuple: ``(r2, mse)`` computed on the held-out rows.
	"""
	target = df['price']
	features = pd.get_dummies(df.drop(columns=['price']), drop_first=True)

	X_train, X_test, y_train, y_test = train_test_split(
		features, target, test_size=0.2, random_state=42
	)

	fitted = LinearRegression().fit(X_train, y_train)
	predicted = fitted.predict(X_test)

	return r2_score(y_test, predicted), mean_squared_error(y_test, predicted)

In [9]:
# Subset of the combined listings whose host_verifications value is missing.
nulls_df = combined_listings.loc[combined_listings['host_verifications'].isna()]

In [10]:
# How missing values are treated per column by the test_col_combos* helpers.

# Missing values are replaced with the string 'Unknown'.
na_cols_to_fill_str = [
	'host_response_time',
	'host_neighbourhood',
	'property_type',
	'host_is_superhost',
	'host_verifications',
	'host_identity_verified',
]

# Missing values are replaced with 0.
na_cols_to_fill_num = [
	'host_response_rate',
	'host_acceptance_rate',
	'bathrooms',
	'bedrooms',
	'beds',
	'security_deposit',
]

# Missing values are replaced with 1.
na_cols_to_fill_1 = [
	'host_listings_count',
	'host_total_listings_count',
]

# Rows with a missing value in any of these columns are dropped.
na_cols_to_drop_nas = [
	'review_scores_rating',
	'review_scores_accuracy',
	'review_scores_cleanliness',
	'review_scores_checkin',
	'review_scores_communication',
	'review_scores_location',
	'reviews_per_month',
]

In [11]:
# all_possible_cols = [ 
# 	'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 
# 	'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications',
# 	'host_identity_verified', 'neighbourhood_cleansed', 'is_location_exact', 'property_type', 'room_type',
# 	'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'security_deposit',
# 	'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'number_of_reviews',
# 	'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
# 	'review_scores_communication', 'review_scores_location', 'instant_bookable',
# 	'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification',
# 	'calculated_host_listings_count', 'reviews_per_month'
# ]

# best_r2 = best_mse = 0
# best_col_nm = ""

# results = []

# for col_nm in all_possible_cols:
# 	# print("Looking at column:", col_nm)
# 	df = combined_listings[ [ col_nm, 'price' ] ].copy()

# 	if len(combined_listings[col_nm].unique()) == 1:
# 		print(f"Column '{col_nm}' has only one unique value. Skipping it.")
# 		continue

# 	if col_nm in na_cols_to_fill_str:
# 		df[col_nm] = df[col_nm].fillna('Unknown')
# 	elif col_nm in na_cols_to_fill_num:
# 		df[col_nm] = df[col_nm].fillna(0)
# 	elif col_nm in na_cols_to_fill_1:
# 		df[col_nm] = df[col_nm].fillna(1)
# 	elif col_nm in na_cols_to_drop_nas:
# 		df = df.dropna()
# 	elif len( df[ df[col_nm].isnull() ] ) > 0:
# 		print(f"Column '{col_nm}' has null values. Data type: {combined_listings[col_nm].dtype}. Number of Nulls: {len( df[ df[col_nm].isnull() ] )}")

# 	r2, mse = model_and_score(df)
	
# 	results.append({
# 		"column": col_nm,
# 		"r2": r2,
# 		"mse": mse
# 	})

# 	if r2 > best_r2:
# 		best_r2 = r2
# 		best_mse = mse
# 		best_col_nm = col_nm

# print()
# print(f"Best Column: {col_nm} with R^2 score: {best_r2} (MSE: {best_mse})")

In [12]:
# results_df = pd.DataFrame(results)

In [13]:
# new_testing_cols = [ 'room_type', 'accommodates', 'bedrooms', 'beds', 'neighbourhood_cleansed', 'host_neighbourhood', 'bathrooms' ]

# df = listings[ [ *new_testing_cols, 'price' ] ].copy()
# for col_nm in df.columns:
# 	if col_nm in na_cols_to_fill_str:
# 		df[col_nm] = df[col_nm].fillna('Unknown')
# 	elif col_nm in na_cols_to_fill_num:
# 		df[col_nm] = df[col_nm].fillna(0)
# 	elif col_nm in na_cols_to_drop_nas:
# 		df = df.dropna(subset=[col_nm])

# r2, mse = model_and_score(df)
# print(f"{r2:.4f} (MSE: {mse:.4f})")

In [14]:
# orig_best_col_comb = [ 'neighbourhood_cleansed', 'host_is_superhost', 'host_response_rate', 'host_acceptance_rate', 'property_type', 'room_type', 'accommodates', 'review_scores_rating' ]
# new_testing_cols = [ 'room_type', 'accommodates', 'bedrooms', 'beds', 'neighbourhood_cleansed', 'host_neighbourhood', 'bathrooms' ]

# for col_nm in new_testing_cols:
# 	if col_nm not in orig_best_col_comb:
# 		cols_to_run = [  *orig_best_col_comb, col_nm ]

# 		df = combined_listings[ [ *cols_to_run, 'price' ] ].copy()
# 		for df_col_nm in df.columns:
# 			if df_col_nm in na_cols_to_fill_str:
# 				df[df_col_nm] = df[df_col_nm].fillna('Unknown')
# 			elif df_col_nm in na_cols_to_fill_num:
# 				df[df_col_nm] = df[df_col_nm].fillna(0)
# 			elif df_col_nm in na_cols_to_drop_nas:
# 				df = df.dropna(subset=[df_col_nm])

# 		r2, mse = model_and_score(df)
# 		print(f"{col_nm}\n\t{r2:.4f} (MSE: {mse:.4f})")

In [15]:
# new_testing_cols = [ 'neighbourhood_cleansed', 'host_is_superhost', 'host_response_rate', 'host_acceptance_rate', 'property_type', 'room_type', 'accommodates', 'review_scores_rating', 'room_type', 'accommodates', 'bedrooms', 'beds', 'neighbourhood_cleansed', 'host_neighbourhood', 'bathrooms' ]

# df = combined_listings[ [ *new_testing_cols, 'price' ] ].copy()
# for col_nm in df.columns:
# 	if col_nm in na_cols_to_fill_str:
# 		df[col_nm] = df[col_nm].fillna('Unknown')
# 	elif col_nm in na_cols_to_fill_num:
# 		df[col_nm] = df[col_nm].fillna(0)
# 	elif col_nm in na_cols_to_drop_nas:
# 		df = df.dropna(subset=[col_nm])

# r2, mse = model_and_score(df)
# print(f"{r2:.4f} (MSE: {mse:.4f})")

In [16]:
# all_possible_cols = [ 
# 	'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 
# 	'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications',
# 	'host_identity_verified', 'neighbourhood_cleansed', 'is_location_exact', 'property_type', 'room_type',
# 	'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'security_deposit',
# 	'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'number_of_reviews',
# 	'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
# 	'review_scores_communication', 'review_scores_location', 'instant_bookable',
# 	'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification',
# 	'calculated_host_listings_count', 'reviews_per_month'
# ]

# new_testing_cols = []

# best_r2 = best_mse = 0
# best_col_nm = ""

# results = []

# for col_nm in all_possible_cols:
# 	if col_nm not in new_testing_cols:
# 		df = combined_listings[ [ *new_testing_cols, 'price' ] ].copy()
# 		new_df = combined_listings[ [ *new_testing_cols, col_nm, 'price' ] ].copy()

# 		for df_col_nm in df.columns:
# 			if df_col_nm in na_cols_to_fill_str:
# 				df[df_col_nm] = df[df_col_nm].fillna('Unknown')
# 			elif df_col_nm in na_cols_to_fill_num:
# 				df[df_col_nm] = df[df_col_nm].fillna(0)
# 			elif df_col_nm in na_cols_to_drop_nas:
# 				df = df.dropna(subset=[df_col_nm])

# 		for df_col_nm in new_df.columns:
# 			if df_col_nm in na_cols_to_fill_str:
# 				new_df[df_col_nm] = new_df[df_col_nm].fillna('Unknown')
# 			elif df_col_nm in na_cols_to_fill_num:
# 				new_df[df_col_nm] = new_df[df_col_nm].fillna(0)
# 			elif df_col_nm in na_cols_to_drop_nas:
# 				new_df = new_df.dropna(subset=[df_col_nm])

# 		r2, mse = model_and_score(df)
# 		new_r2, new_mse = model_and_score(new_df)
	
# 		results.append({
# 			"column": col_nm,
# 			"r2": r2,
# 			"mse": mse
# 		})

# 		if new_r2 > best_r2:
# 			best_r2 = new_r2
# 			best_mse = new_mse
# 			best_col_nm = col_nm

# print(f"Best Column: {best_col_nm} with R^2 score: {best_r2} (MSE: {best_mse})")
# print(f"Improvement in score: {best_r2 - r2} (MSE: {mse - best_mse})")




In [17]:
def test_col_combos(df, col_combo):
	"""Score a linear regression that predicts ``price`` from ``col_combo``.

	Missing values are handled per column according to the module-level lists:
	columns in ``na_cols_to_fill_str`` are filled with ``'Unknown'``, those in
	``na_cols_to_fill_num`` with 0, those in ``na_cols_to_fill_1`` with 1, and
	rows missing a value in a ``na_cols_to_drop_nas`` column are dropped.

	Args:
		df: Source DataFrame; must contain ``price`` and every column in
			``col_combo``. It is not modified.
		col_combo: List of feature column names to evaluate together.

	Returns:
		dict: ``{"columns": col_combo, "r2": ..., "mse": ...}`` with the scores
		returned by ``model_and_score``.
	"""
	# Slice the frame that was passed in. Previously the `df` argument was ignored
	# and the global `combined_listings` was always used instead.
	df = df[ [ *col_combo, 'price' ] ].copy()
	for col_nm in df.columns:
		if col_nm in na_cols_to_fill_str:
			df[col_nm] = df[col_nm].fillna('Unknown')
		elif col_nm in na_cols_to_fill_num:
			df[col_nm] = df[col_nm].fillna(0)
		elif col_nm in na_cols_to_fill_1:
			df[col_nm] = df[col_nm].fillna(1)
		elif col_nm in na_cols_to_drop_nas:
			df = df.dropna(subset=[col_nm])

	r2, mse = model_and_score(df)
	return {
		"columns": col_combo,
		"r2": r2,
		"mse": mse
	}

# Using a simple linear model and R² score

In [18]:
# Candidate feature columns, in the order used to build contiguous windows below.
all_possible_cols = [
    # host attributes
    'host_response_time', 'host_response_rate', 'host_acceptance_rate',
    'host_is_superhost', 'host_neighbourhood', 'host_listings_count',
    'host_total_listings_count', 'host_verifications', 'host_identity_verified',
    # location and property
    'neighbourhood_cleansed', 'is_location_exact', 'property_type', 'room_type',
    # capacity
    'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type',
    # fees and stay limits
    'security_deposit', 'guests_included', 'extra_people',
    'minimum_nights', 'maximum_nights',
    # reviews
    'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    # booking policy
    'instant_bookable', 'cancellation_policy',
    'require_guest_profile_picture', 'require_guest_phone_verification',
    # remaining columns
    'calculated_host_listings_count', 'reviews_per_month',
]

results = []

# Evaluate every contiguous run of at least two columns: for each start index,
# grow the window one column at a time until it reaches the end of the list.
n_candidate_cols = len(all_possible_cols)
for idx in range(n_candidate_cols):
    for stop in range(idx + 2, n_candidate_cols + 1):
        testing_cols = all_possible_cols[idx:stop]
        results.append(test_col_combos(combined_listings, testing_cols))

In [19]:
# One row per tested column combination (each entry of `results` is a dict).
results_df = pd.DataFrame(data=results)

In [20]:
# Rank the tested combinations by R^2 (highest first) and keep the column list
# of the top-ranked one.
ranked_results = results_df.sort_values(by='r2', ascending=False)
best_col_group = ranked_results['columns'].iloc[0]
best_col_group

['host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_identity_verified',
 'neighbourhood_cleansed',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'security_deposit',
 'guests_included',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness']

In [21]:
# List the candidate columns that did not make it into the best-scoring group.
for col_nm in all_possible_cols:
    if col_nm in best_col_group:
        continue
    print(col_nm)

review_scores_checkin
review_scores_communication
review_scores_location
instant_bookable
cancellation_policy
require_guest_profile_picture
require_guest_phone_verification
calculated_host_listings_count
reviews_per_month


# Using scaled features with ridge regression

In [22]:
def scale_model_cross_validate(df):
	"""Tune a ridge regression on scaled, degree-2 polynomial features of ``df``.

	Rows containing any missing value are dropped. Every column except ``price``
	is used as a feature (categoricals expanded with ``pd.get_dummies``, first
	level dropped), standardised, and expanded to degree-2 polynomial terms.
	``alpha`` is chosen from ``[0.1, 1.0, 10.0]`` by 5-fold grid search.

	Args:
		df: DataFrame containing a ``price`` column plus the feature columns.

	Returns:
		tuple: ``(best_model, mean_score)`` where ``best_model`` is the Ridge
		estimator refit with the selected alpha and ``mean_score`` is its mean
		5-fold cross-validated score (the estimator's default scorer).
	"""
	# Prepare the data
	sample_df = df.dropna()
	X = pd.get_dummies(sample_df.drop(columns=['price']), drop_first=True)
	y = sample_df['price']

	# NOTE(review): the scaler and polynomial expansion are fit on all rows before
	# cross-validation, so each validation fold influences the preprocessing.
	# Wrapping these steps in a Pipeline would avoid that, but would change the
	# type of the returned model, so it is left as-is here.
	X_scaled = StandardScaler().fit_transform(X)
	X_poly = PolynomialFeatures(degree=2).fit_transform(X_scaled)

	# Ridge regression with cross-validation and hyperparameter tuning
	grid_search = GridSearchCV(Ridge(), {'alpha': [0.1, 1.0, 10.0]}, cv=5)
	grid_search.fit(X_poly, y)

	# best_score_ is the mean 5-fold score of the selected alpha on this same
	# data. The previous cross_val_score(best_model, X_poly, y, cv=5) call
	# repeated those five fits to obtain the same figure, so it was dropped.
	return grid_search.best_estimator_, grid_search.best_score_

def test_col_combos_scaled(df, col_combo):
	"""Score the scaled/polynomial ridge model that predicts ``price`` from ``col_combo``.

	Missing values are handled per column according to the module-level lists:
	columns in ``na_cols_to_fill_str`` are filled with ``'Unknown'``, those in
	``na_cols_to_fill_num`` with 0, those in ``na_cols_to_fill_1`` with 1, and
	rows missing a value in a ``na_cols_to_drop_nas`` column are dropped.

	Args:
		df: Source DataFrame; must contain ``price`` and every column in
			``col_combo``. It is not modified.
		col_combo: List of feature column names to evaluate together.

	Returns:
		dict: ``{"columns": col_combo, "best_model": ..., "scores_avg": ...}``
		with the values returned by ``scale_model_cross_validate``.
	"""
	# Slice the frame that was passed in. Previously the `df` argument was ignored
	# and the global `combined_listings` was always used instead.
	df = df[ [ *col_combo, 'price' ] ].copy()
	for col_nm in df.columns:
		if col_nm in na_cols_to_fill_str:
			df[col_nm] = df[col_nm].fillna('Unknown')
		elif col_nm in na_cols_to_fill_num:
			df[col_nm] = df[col_nm].fillna(0)
		elif col_nm in na_cols_to_fill_1:
			df[col_nm] = df[col_nm].fillna(1)
		elif col_nm in na_cols_to_drop_nas:
			df = df.dropna(subset=[col_nm])

	best_model, scores_avg = scale_model_cross_validate(df)
	return {
		"columns": col_combo,
		"best_model": best_model,
		"scores_avg": scores_avg
	}

In [23]:
# NOTE(review): this repeats the candidate list from the linear-model search cell;
# it is kept here so the cell runs on its own.
all_possible_cols = [
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_is_superhost',
    'host_neighbourhood',
    'host_listings_count',
    'host_total_listings_count',
    'host_verifications',
    'host_identity_verified',
    'neighbourhood_cleansed',
    'is_location_exact',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
    'security_deposit',
    'guests_included',
    'extra_people',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'instant_bookable',
    'cancellation_policy',
    'require_guest_profile_picture',
    'require_guest_phone_verification',
    'calculated_host_listings_count',
    'reviews_per_month',
]

# Note: this rebinds `results`, replacing the list built by the linear-model search.
results = []

# Same contiguous-window search as before, scored with the scaled ridge model.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so it
# is presumably very slow across all windows — confirm before re-running in full.
window_limit = len(all_possible_cols)
for idx in range(window_limit):
    for stop in range(idx + 2, window_limit + 1):
        testing_cols = all_possible_cols[idx:stop]
        results.append(test_col_combos_scaled(combined_listings, testing_cols))

KeyboardInterrupt: 

In [53]:
# Tabulate whatever the scaled search collected. If that cell was stopped early,
# `results` holds only the combinations finished before it stopped.
results_df = pd.DataFrame(data=results)