# Import package

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [100]:
#Import linear algebra and data manipulation
import numpy as np
import pandas as pd

#Import plotting packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors

#Import common utilities
import math
from collections import Counter

# statsmodels
import pylab
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels as statm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols

#Import machine learning
# Predictive Model
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import Imputer
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import Imputer
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingRegressor  
from sklearn.ensemble import RandomForestRegressor

# Accuracy Evaluation
from sklearn.model_selection import train_test_split #split
from sklearn.metrics import r2_score, mean_squared_error #metrics
import sklearn.metrics as metrics
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.model_selection import cross_val_score

# NLP
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
import string
import nltk
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Configuration

In [6]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 505)

# Load data

In [7]:
listings_df = pd.read_csv('D:\\MyProjects\\01_Airbnb\\Data\\seattle_listings.csv', sep=',')
listings_df.shape

(3818, 92)

# Data Preparation

### Check null values
   First, we start we a check for null values. This is because we can not use columns that has so many null values

In [9]:
def Check_Missing_Data(df):    
    # count all missing values of each column
    total = df.isnull().sum().sort_values(ascending=False)
    # calculate percentage of null values for each column
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data

missing_values_df = Check_Missing_Data(listings_df)
missing_values_df

Unnamed: 0,Total,Percent
license,3818,1.0
square_feet,3721,0.974594
monthly_price,2301,0.602672
security_deposit,1952,0.511262
weekly_price,1809,0.473808
notes,1606,0.420639
neighborhood_overview,1032,0.270299
cleaning_fee,1030,0.269775
transit,934,0.244631
host_about,859,0.224987


   => Most of the columns have between 0-30% of null ratio. Therefore, I decided to exclude from the analysis the columns with 30% or more of null ratio

In [15]:
missing_values_df[(missing_values_df.Percent < 0.3)].shape[0] 

86

   => This reduces the number of columns from 92 to 86

In [19]:
selected_cols = missing_values_df[(missing_values_df.Percent < 0.3)].index.values
selected_cols

array(['neighborhood_overview', 'cleaning_fee', 'transit', 'host_about',
       'host_acceptance_rate', 'review_scores_accuracy',
       'review_scores_checkin', 'review_scores_value',
       'review_scores_location', 'review_scores_cleanliness',
       'review_scores_communication', 'review_scores_rating',
       'reviews_per_month', 'first_review', 'last_review', 'space',
       'host_response_time', 'host_response_rate', 'neighbourhood',
       'xl_picture_url', 'thumbnail_url', 'medium_url',
       'host_neighbourhood', 'summary', 'bathrooms', 'host_location',
       'zipcode', 'bedrooms', 'host_identity_verified',
       'host_has_profile_pic', 'host_picture_url', 'host_since',
       'host_total_listings_count', 'host_listings_count',
       'host_thumbnail_url', 'host_name', 'host_is_superhost', 'beds',
       'property_type', 'host_verifications', 'host_url', 'host_id',
       'picture_url', 'experiences_offered', 'description', 'name',
       'last_scraped', 'scrape_id', 'list

### Remove redundant columns 
Next, I will delete columns that seems not impact price (not contribute to predicting price) or columns that have only one single value

In [41]:
listings_df.apply(lambda x:x.unique().size,axis=0).reset_index()

Unnamed: 0,index,0
0,id,3818
1,listing_url,3818
2,scrape_id,1
3,last_scraped,1
4,name,3792
5,summary,3479
6,space,3120
7,description,3742
8,experiences_offered,1
9,neighborhood_overview,2507


In [57]:
removed_cols = [ 'transit', 'host_about','host_acceptance_rate', 'first_review', 'last_review',
        'xl_picture_url', 'thumbnail_url', 'medium_url','host_location','host_has_profile_pic', 
        'host_picture_url', 'host_thumbnail_url', 'host_name', 'host_url', 'host_id','picture_url', 
        'experiences_offered', 'name','last_scraped', 'scrape_id', 'listing_url', 'host_total_listings_count',
        'calendar_last_scraped','calendar_updated', 'has_availability','requires_license',
       'jurisdiction_names', 'instant_bookable', 'require_guest_profile_picture','host_neighbourhood'
       'require_guest_phone_verification','city', 'state', 'market','host_response_time','host_response_rate',
       'smart_location', 'country_code', 'country','is_location_exact', 'id','host_listings_count',]

In [58]:
selected_cols = [e for e in selected_cols if e not in removed_cols]
selected_cols

['neighborhood_overview',
 'cleaning_fee',
 'review_scores_accuracy',
 'review_scores_checkin',
 'review_scores_value',
 'review_scores_location',
 'review_scores_cleanliness',
 'review_scores_communication',
 'review_scores_rating',
 'reviews_per_month',
 'space',
 'neighbourhood',
 'host_neighbourhood',
 'summary',
 'bathrooms',
 'zipcode',
 'bedrooms',
 'host_identity_verified',
 'host_since',
 'host_is_superhost',
 'beds',
 'property_type',
 'host_verifications',
 'description',
 'street',
 'latitude',
 'neighbourhood_cleansed',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'number_of_reviews',
 'minimum_nights',
 'cancellation_policy',
 'maximum_nights',
 'extra_people',
 'neighbourhood_group_cleansed',
 'calculated_host_listings_count',
 'longitude',
 'guests_included',
 'room_type',
 'accommodates',
 'bed_type',
 'amenities',
 'price']

In [59]:
len(selected_cols)

45

   => This reduces the number of columns to 47

In [66]:
data_df = listings_df.copy(deep=True)
data_df = data_df[selected_cols]
data_df.shape

(3818, 45)

### Convert columns from string to numeric values

In [67]:
data_df['price'] = data_df['price'].str.replace("[$, ]", "").astype("float")
data_df['cleaning_fee'] = data_df['cleaning_fee'].str.replace("[$, ]", "").astype("float")
data_df['extra_people'] = data_df['extra_people'].str.replace("[$, ]", "").astype("float")

### Select all numerical features

In [None]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64','uint8']
numericdata = data_df.select_dtypes(include=numerics)
numericdata.shape       

   => There are 26 numerical features

In [69]:
Check_Missing_Data(numericdata)

Unnamed: 0,Total,Percent
cleaning_fee,1030,0.269775
review_scores_accuracy,658,0.172342
review_scores_checkin,658,0.172342
review_scores_value,656,0.171818
review_scores_location,655,0.171556
review_scores_cleanliness,653,0.171032
review_scores_communication,651,0.170508
review_scores_rating,647,0.16946
reviews_per_month,627,0.164222
bathrooms,16,0.004191


### Filling NA values for numeric features

In [70]:
#fill missing values for review scores columns
review_columns = ['review_scores_accuracy', 'review_scores_checkin', 'review_scores_value',
                  'review_scores_location', 'review_scores_cleanliness','review_scores_communication',
                  'review_scores_rating','reviews_per_month', 'cleaning_fee']
for column in review_columns:
    data_df[column].fillna(data_df[column].mean(), inplace = True)

numericdata = data_df.select_dtypes(include=numerics)
numericdata.shape 
Check_Missing_Data(numericdata)

Unnamed: 0,Total,Percent
bathrooms,16,0.004191
bedrooms,6,0.001572
beds,1,0.000262
price,0,0.0
accommodates,0,0.0
review_scores_accuracy,0,0.0
review_scores_checkin,0,0.0
review_scores_value,0,0.0
review_scores_location,0,0.0
review_scores_cleanliness,0,0.0


In [71]:
#fill in missing values for bathrooms, bedrooms and beds with mode
data_df['bathrooms'] = data_df['bathrooms'].fillna(data_df['bathrooms'].mode()[0])
data_df['bedrooms'] = data_df['bedrooms'].fillna(data_df['bedrooms'].mode()[0])
data_df['beds'] = data_df['beds'].fillna(data_df['beds'].mode()[0])

numericdata = data_df.select_dtypes(include=numerics)
numericdata.shape 
Check_Missing_Data(numericdata)

Unnamed: 0,Total,Percent
price,0,0.0
accommodates,0,0.0
review_scores_accuracy,0,0.0
review_scores_checkin,0,0.0
review_scores_value,0,0.0
review_scores_location,0,0.0
review_scores_cleanliness,0,0.0
review_scores_communication,0,0.0
review_scores_rating,0,0.0
reviews_per_month,0,0.0


### Add log_price

In [72]:
data_df['log_price'] = data_df['price'].apply(lambda x: math.log(x))

### Seperate features and label 

In [75]:
col_list = list(numericdata.columns.values)
col_list = [e for e in col_list if e not in ['price','log_price']]
X_data = data_df.loc[:,col_list]       
Y_data = data_df.loc[numericdata.index.values,'log_price']
X_data.shape 

(3818, 25)

### Normalization

In [76]:
def Normalize_Data(X_data):    
    for col in X_data.columns.values:
        X_data[col] = X_data[col].astype(float)
        mean = np.mean(X_data[col])
        std = np.std(X_data[col])
        X_data[col] = X_data[col].apply(lambda x: (x - mean) / std)
    return X_data

X_data = Normalize_Data(X_data)
X_data.head()

Unnamed: 0,cleaning_fee,review_scores_accuracy,review_scores_checkin,review_scores_value,review_scores_location,review_scores_cleanliness,review_scores_communication,review_scores_rating,reviews_per_month,bathrooms,bedrooms,beds,latitude,availability_30,availability_60,availability_90,availability_365,number_of_reviews,minimum_nights,maximum_nights,extra_people,calculated_host_listings_count,longitude,guests_included,accommodates
0,-3.031586e-14,0.5726658,0.393763,0.8023812,-1.06367,0.6112031,0.3679779,0.07654173,1.195309,-0.438461,-0.348068,-0.645342,0.17024,-0.228908,0.179356,0.379264,0.798601,4.897864,-0.083987,-0.246795,-0.322324,-0.160602,-1.194754,0.249756,0.329029
1,-0.5204004,0.5726658,0.393763,0.8023812,0.6831564,0.6112031,0.3679779,0.2426703,-0.35955,-0.438461,-0.348068,-0.645342,0.236078,-0.311063,-1.020585,-1.235563,0.364696,0.550724,-0.022651,-0.410158,-0.60668,0.518255,-1.025923,-0.513098,0.329029
2,5.71168,0.5726658,0.393763,0.8023812,0.6831564,0.6112031,0.3679779,0.4087988,-0.5576595,5.500872,4.183643,4.621311,0.017732,-1.296929,-1.32057,-1.206202,-0.195436,-0.058936,0.10002,-0.445801,0.815098,-0.160602,-1.146171,6.352583,3.869138
3,-3.031586e-14,1.119073e-14,-1.049403e-13,1.769432e-13,5.275077e-14,6.363495e-14,-1.304567e-13,-1.487322e-13,-1.439645e-14,-0.438461,-1.480996,0.232433,0.22098,-1.379085,-1.5777,-1.70533,-0.802903,-0.589075,-0.083987,0.20468,-0.60668,-0.330316,-1.139743,-0.513098,-0.176701
4,1.517011,-1.00229,0.393763,-0.6624741,-1.06367,-0.7666159,0.3679779,-0.4218439,-0.7137458,1.258492,1.917787,1.110209,0.091937,1.08558,0.993601,0.937113,0.948495,0.418189,-0.083987,0.20468,0.246387,-0.330316,-1.240293,3.301169,1.340489


### Split data to training set and test set

In [77]:
X_train, X_test, Y_train, Y_test=train_test_split(X_data,Y_data, test_size=0.3,train_size = 0.7,random_state=100)

# Numerical feature selection 

### Multiple Linear Regression

#### Check p-value in Multiple linear regression

In [78]:
def Model_LinearRegression1(X_data, Y_data):
    #add constant to data
    X = sm.add_constant(X_data)
    # Fit the linear model
    model = sm.OLS(Y_data, X)
    results = model.fit()
    return results.summary()

Model_LinearRegression1(X_train,Y_train) 

0,1,2,3
Dep. Variable:,log_price,R-squared:,0.548
Model:,OLS,Adj. R-squared:,0.543
Method:,Least Squares,F-statistic:,128.1
Date:,"Sat, 13 Apr 2019",Prob (F-statistic):,0.0
Time:,17:08:59,Log-Likelihood:,-1184.1
No. Observations:,2672,AIC:,2420.0
Df Residuals:,2646,BIC:,2573.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.6810,0.007,634.772,0.000,4.667,4.695
cleaning_fee,0.1248,0.010,12.538,0.000,0.105,0.144
review_scores_accuracy,0.0417,0.010,4.096,0.000,0.022,0.062
review_scores_checkin,-0.0263,0.011,-2.456,0.014,-0.047,-0.005
review_scores_value,-0.0824,0.011,-7.664,0.000,-0.103,-0.061
review_scores_location,0.0629,0.008,7.639,0.000,0.047,0.079
review_scores_cleanliness,0.0238,0.010,2.289,0.022,0.003,0.044
review_scores_communication,0.0109,0.011,1.022,0.307,-0.010,0.032
review_scores_rating,0.0192,0.013,1.505,0.132,-0.006,0.044

0,1,2,3
Omnibus:,36.813,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,66.086
Skew:,-0.018,Prob(JB):,4.46e-15
Kurtosis:,3.77,Cond. No.,16.9


#### Split data into training set and test set

In [82]:
def Model_LinearRegression3(X_train, Y_train, X_test, Y_test, flg = 0):
    # Choose a class of model
    from sklearn.linear_model import LinearRegression
    # Choose model hyperparameters
    model = LinearRegression(fit_intercept=True)
    # Fit the model to the train data set
    model.fit(X_train, Y_train)    
    # prediction
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)        
    # Evaluating Model Accuracy 
    Regression_Metrics(model,X_train, Y_train,Y_train_pred,X_test,Y_test,Y_test_pred)
    
    if(flg == 1):    
        return model, Y_train_pred, Y_test_pred

def Regression_Metrics(model, x_train, y_train, y_train_pred, x_test, y_test, y_test_pred):  
    
    print('Train R-squared: %.4f' % model.score(x_train, y_train))
    print('Test R-squared: %.4f' % model.score(x_test, y_test))
    print()  
    print('Train RMSE:', np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
    print('Test RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

    
Model_LinearRegression3(X_train,Y_train,X_test,Y_test)  

Train R-squared: 0.5475
Test R-squared: 0.4691

Train RMSE: 0.376897809707449
Test RMSE: 0.42063765903502404


#### Cross validation

In [92]:
def CV_Model(model,X, Y):
    cv_scores = cross_val_score(model, X_data, Y_data, scoring = 'r2', cv = 10)
    rmse = np.sqrt(-cross_val_score(model, X_data, Y_data, scoring="neg_mean_squared_error", cv = 10))
    print('r2 score: ',cv_scores)
    print( "Average r2 score: ", np.round( np.mean( cv_scores ), 2 ) )
    print( "Average rmse score: ", np.round( np.mean( rmse ), 2 ) )

CV_Model(linear_model.LinearRegression(),X_data,Y_data)

r2 score:  [0.66971734 0.59155222 0.55265229 0.2211334  0.35065171 0.30956138
 0.45976912 0.49070999 0.45499872 0.50208961]
Average r2 score:  0.46
Average rmse score:  0.4


### Random Forest 

#### Split data into training set and test set

In [94]:
def Model_RandomForest(X_train, Y_train, X_test, Y_test, flg = 0):   
    # train model
    model = RandomForestRegressor(n_estimators = 100, random_state=42)
    # predicting label
    model.fit(X_train, Y_train)
    #Default parameters our model used
    model.get_params()    
    # predicting
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)            
    # Evaluating Model Accuracy 
    Regression_Metrics(model,X_train, Y_train,Y_train_pred,X_test,Y_test,Y_test_pred)  
    
    if(flg == 1):    
        return model, Y_train_pred, Y_test_pred    

Model_RandomForest(X_train, Y_train,X_test,Y_test)

Train R-squared: 0.9522
Test R-squared: 0.6727

Train RMSE: 0.12249422845167057
Test RMSE: 0.3302818191233967


#### Cross validation

In [93]:
model = RandomForestRegressor(n_estimators=100, max_depth=5)
CV_Model(model,X_data,Y_data)

r2 score:  [0.69521233 0.65270675 0.55341088 0.33289155 0.46042886 0.4599182
 0.60193299 0.54002148 0.56277493 0.54352511]
Average r2 score:  0.54
Average rmse score:  0.37


### Boosting Tree

#### Split data into training set and test set

In [98]:
def Model_GBT(X_train, Y_train, X_test, Y_test,flg=0): 
    # train model
    gb = GradientBoostingRegressor(n_estimators = 100, random_state=42)
    gb.fit(X_train, Y_train)
    #Default parameters our model used
    gb.get_params()    
    # Predicting label
    Y_train_pred = gb.predict(X_train)
    Y_test_pred = gb.predict(X_test)    
    # Evaluating Model Accuracy 
    Regression_Metrics(gb,X_train, Y_train,Y_train_pred,X_test,Y_test,Y_test_pred)
    
    if(flg == 1):    
        return model, Y_train_pred, Y_test_pred    

Model_GBT(X_train, Y_train,X_test,Y_test)

Train R-squared: 0.7591
Test R-squared: 0.6759

Train RMSE: 0.27498558980796134
Test RMSE: 0.3286226659982657


#### Cross validation 

In [101]:
model = GradientBoostingRegressor(n_estimators=100, max_depth=5)
CV_Model(model,X_data,Y_data)

r2 score:  [0.7525245  0.71284556 0.65121108 0.50436743 0.55679618 0.65454444
 0.71600779 0.60747167 0.62463169 0.61053949]
Average r2 score:  0.64
Average rmse score:  0.33


#### remove insignificant features

In [103]:
remove_list = ['review_scores_communication','review_scores_rating','availability_30','availability_60',
               'availability_90','availability_365','number_of_reviews','minimum_nights','maximum_nights']
col_list = [e for e in col_list if e not in remove_list]

16

In [132]:
X_data = X_data[col_list]
X_data.shape[1]

16

In [106]:
CV_Model(linear_model.LinearRegression(),X_data,Y_data)

r2 score:  [0.67224733 0.59224029 0.55287049 0.25144101 0.34811501 0.30796155
 0.52745552 0.48995038 0.45238845 0.50312568]
Average r2 score:  0.47
Average rmse score:  0.39


In [107]:
model = RandomForestRegressor(n_estimators=100, max_depth=5)
CV_Model(model,X_data,Y_data)

r2 score:  [0.69228363 0.65547835 0.55850872 0.3353348  0.47319481 0.44335859
 0.60798095 0.53764929 0.56697981 0.55214266]
Average r2 score:  0.54
Average rmse score:  0.37


In [109]:
model = GradientBoostingRegressor(n_estimators=100, max_depth=5)
CV_Model(model,X_data,Y_data)

r2 score:  [0.73845047 0.69031362 0.65085171 0.49515048 0.57572708 0.61704802
 0.68508462 0.57659327 0.59897938 0.59391496]
Average r2 score:  0.62
Average rmse score:  0.33


### XGBoost

In [None]:
#train XGBoost model
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)
xgb.fit(X_train,y_train)

#calculate and print scores for the model for top 15 features
y_train_preds = xgb.predict(X_train)
y_test_preds = xgb.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_preds),
        mean_squared_error(y_test, y_test_preds)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_preds),
        r2_score(y_test, y_test_preds)))


In [None]:

#get feature importances from the model
headers = ["name", "score"]
values = sorted(zip(X_train.columns, xgb.feature_importances_), key=lambda x: x[1] * -1)
xgb_feature_importances = pd.DataFrame(values, columns = headers)

#plot feature importances for top 15 features
features = xgb_feature_importances['name'][:15]
y_pos = np.arange(len(features))
scores = xgb_feature_importances['score'][:15]
 
plt.figure(figsize=(10,5))
plt.bar(y_pos, scores, align='center', alpha=0.5)
plt.xticks(y_pos, features, rotation='vertical')
plt.ylabel('Score')
plt.xlabel('Features')
plt.title('Feature importances (XGBoost)')

plt.savefig('feature importances XGB.png')
 
plt.show()

# Add categorical features

### room_type

In [123]:
# Approach 1:create dummy variables
room_type = data_df.loc[data_df.room_type.isnull(),'room_type']
room_dummies = pd.get_dummies(data_df[['room_type']], prefix='room_type', prefix_sep='_')
room_dummies.head()

Unnamed: 0,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


In [124]:
X_data = pd.concat((X_data,room_dummies.astype(int)),axis=1)

In [125]:
X_data.shape 

(3818, 19)

In [119]:
CV_Model(linear_model.LinearRegression(),X_data,Y_data)

r2 score:  [0.74685023 0.68312327 0.68079637 0.43245461 0.51816128 0.48534038
 0.62689914 0.60969908 0.58323363 0.61795687]
Average r2 score:  0.6
Average rmse score:  0.34


In [126]:
model = RandomForestRegressor(n_estimators=100, max_depth=5)
CV_Model(model,X_data,Y_data)

r2 score:  [0.73986307 0.69896169 0.66879593 0.44291029 0.54895538 0.51275221
 0.66324362 0.591826   0.59220069 0.61336738]
Average r2 score:  0.61
Average rmse score:  0.34


In [127]:
model = GradientBoostingRegressor(n_estimators=100, max_depth=5)
CV_Model(model,X_data,Y_data)

r2 score:  [0.76180719 0.70790086 0.69079761 0.50820367 0.61649517 0.62700948
 0.7143673  0.64176469 0.64123472 0.64730457]
Average r2 score:  0.66
Average rmse score:  0.32


In [129]:
# Approach 1:create dummy variables
room_type = data_df.loc[data_df.room_type.isnull(),'room_type']
room_dummies = pd.get_dummies(data_df[['room_type']], prefix='room_type', prefix_sep='_',drop_first=True)
room_dummies.head()

Unnamed: 0,room_type_Private room,room_type_Shared room
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [133]:
X_data = pd.concat((X_data,room_dummies.astype(int)),axis=1)
X_data.shape   # (77063, 18)

(3818, 18)

In [134]:
CV_Model(linear_model.LinearRegression(),X_data,Y_data)

r2 score:  [0.74685023 0.68312327 0.68079637 0.43245461 0.51816128 0.48534038
 0.62689914 0.60969908 0.58323363 0.61795687]
Average r2 score:  0.6
Average rmse score:  0.34


In [136]:
model = RandomForestRegressor(n_estimators=100, max_depth=5)
CV_Model(model,X_data,Y_data)

r2 score:  [0.73693982 0.69167531 0.65847384 0.42103268 0.54163324 0.52145649
 0.66459344 0.59566204 0.59029648 0.61225989]
Average r2 score:  0.6
Average rmse score:  0.34


In [137]:
model = GradientBoostingRegressor(n_estimators=100, max_depth=5)
CV_Model(model,X_data,Y_data)

r2 score:  [0.75270404 0.72057852 0.68898561 0.48362991 0.60284318 0.63623565
 0.71035211 0.62338202 0.64242846 0.64051163]
Average r2 score:  0.65
Average rmse score:  0.32
