In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
#import the data
df = pd.read_csv("../1_numeric_outliers_removed_transformed.csv")
df.head()

Unnamed: 0,id,num__accommodates,num__bedrooms,num__beds,num__price,num__availability_30,num__availability_60,num__availability_90,num__availability_365,num__number_of_reviews,...,bool__host_identity_verified_t,bool__has_availability_t,bool__instant_bookable_t,bool__prop_type_reduced_Entire condo,bool__prop_type_reduced_Entire guest suite,bool__prop_type_reduced_Entire guesthouse,bool__prop_type_reduced_Entire home,bool__prop_type_reduced_Entire rental unit,bool__prop_type_reduced_Entire townhouse,bool__prop_type_reduced_Other
0,360,-0.562715,0.152657,-0.192967,-0.93181,-0.551344,0.120505,0.449145,-0.210263,1.146599,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,364,-0.562715,-0.885936,-0.935868,0.207151,1.504547,1.454513,1.306215,1.467521,0.229,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,31503,-0.997213,-0.885936,-0.935868,-0.765445,0.314295,0.890125,0.943609,1.380054,0.947121,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,39405,-0.997213,-0.885936,-0.935868,-0.343133,-0.334934,-0.905654,-0.37496,-0.472665,6.013866,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,154999,-0.997213,-0.885936,-0.935868,-0.010403,-0.984163,-1.26481,-1.429815,-0.273876,-0.529017,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#confirm no missing values
missing_count = df.isnull().sum()
print(missing_count)



id                                            0
num__accommodates                             0
num__bedrooms                                 0
num__beds                                     0
num__price                                    0
num__availability_30                          0
num__availability_60                          0
num__availability_90                          0
num__availability_365                         0
num__number_of_reviews                        0
num__number_of_reviews_ltm                    0
num__number_of_reviews_l30d                   0
num__review_scores_rating                     0
num__review_scores_accuracy                   0
num__review_scores_cleanliness                0
num__review_scores_checkin                    0
num__review_scores_communication              0
num__review_scores_location                   0
num__review_scores_value                      0
num__reviews_per_month                        0
num__bath_number                        

In [11]:
#make X variable with all columns except ratings
X_full = df.drop(columns=["num__review_scores_rating"])
X_full.columns

Index(['id', 'num__accommodates', 'num__bedrooms', 'num__beds', 'num__price',
       'num__availability_30', 'num__availability_60', 'num__availability_90',
       'num__availability_365', 'num__number_of_reviews',
       'num__number_of_reviews_ltm', 'num__number_of_reviews_l30d',
       'num__review_scores_accuracy', 'num__review_scores_cleanliness',
       'num__review_scores_checkin', 'num__review_scores_communication',
       'num__review_scores_location', 'num__review_scores_value',
       'num__reviews_per_month', 'num__bath_number',
       'bool__host_has_profile_pic_t', 'bool__host_identity_verified_t',
       'bool__has_availability_t', 'bool__instant_bookable_t',
       'bool__prop_type_reduced_Entire condo',
       'bool__prop_type_reduced_Entire guest suite',
       'bool__prop_type_reduced_Entire guesthouse',
       'bool__prop_type_reduced_Entire home',
       'bool__prop_type_reduced_Entire rental unit',
       'bool__prop_type_reduced_Entire townhouse',
       'bool__p

In [14]:
#Select specific features 
select_features =  ["id", "num__accommodates", "num__availability_30", "num__review_scores_value", "num__review_scores_cleanliness"]
#create varable X_sel with features identified
X_sel = df[select_features]
X_sel.head()

Unnamed: 0,id,num__accommodates,num__availability_30,num__review_scores_value,num__review_scores_cleanliness
0,360,-0.562715,-0.551344,0.429767,0.411728
1,364,-0.562715,1.504547,-0.122886,-0.104331
2,31503,-0.997213,0.314295,0.346869,0.549344
3,39405,-0.997213,-0.334934,0.263971,0.308516
4,154999,-0.997213,-0.984163,-0.426845,-0.482775


In [18]:
print(df.columns)


Index(['id', 'num__accommodates', 'num__bedrooms', 'num__beds', 'num__price',
       'num__availability_30', 'num__availability_60', 'num__availability_90',
       'num__availability_365', 'num__number_of_reviews',
       'num__number_of_reviews_ltm', 'num__number_of_reviews_l30d',
       'num__review_scores_rating', 'num__review_scores_accuracy',
       'num__review_scores_cleanliness', 'num__review_scores_checkin',
       'num__review_scores_communication', 'num__review_scores_location',
       'num__review_scores_value', 'num__reviews_per_month',
       'num__bath_number', 'bool__host_has_profile_pic_t',
       'bool__host_identity_verified_t', 'bool__has_availability_t',
       'bool__instant_bookable_t', 'bool__prop_type_reduced_Entire condo',
       'bool__prop_type_reduced_Entire guest suite',
       'bool__prop_type_reduced_Entire guesthouse',
       'bool__prop_type_reduced_Entire home',
       'bool__prop_type_reduced_Entire rental unit',
       'bool__prop_type_reduced_Entir

In [19]:
#set the target variable
y = df['num__review_scores_rating'].values.reshape(-1,1)

In [20]:
#split data into training and testing sets
X_full_train, X_full_test, X_sel_train, X_sel_test, y_train, y_test = train_test_split(X_full, X_sel, y, random_state=42)

Train Models

In [21]:
#create the models
lr1 = LinearRegression()
lr2 = LinearRegression()

#fit the first model to the full training data
lr1.fit(X_full_train, y_train)

#fit second model
lr2.fit(X_sel_train, y_train)

Evaluate the model

In [22]:
# Calculate the mean_squared_error and the r-squared value
# for the testing data

# Use our models to make predictions
predicted1 = lr1.predict(X_full_test)
predicted2 = lr2.predict(X_sel_test)

# Score the predictions with mse and r2
mse1 = mean_squared_error(y_test, predicted1)
r21 = r2_score(y_test, predicted1)
mse2 = mean_squared_error(y_test, predicted2)
r22 = r2_score(y_test, predicted2)

print(f"All Features:")
print(f"mean squared error (MSE): {mse1}")
print(f"R-squared (R2): {r21}")
print("---------------------")
print(f"Select Features:")
print(f"mean squared error (MSE): {mse2}")
print(f"R-squared (R2): {r22}")

All Features:
mean squared error (MSE): 1.0930589402394113
R-squared (R2): -0.0010309621921011924
---------------------
Select Features:
mean squared error (MSE): 1.0930589402394113
R-squared (R2): -0.0010309621921011924


In [None]:
#results confirm that linear regression isnt a good model for our dataset
#try an ensemble method instead