In [1]:
# load package
import numpy as np
import pandas as pd

# set path
import os
default_path = "/Users/mayritaspring/Desktop/Github/Data-Science/Example_LasVegasTrip/"
os.chdir(default_path)

# skip interation warning
import warnings; 
warnings.filterwarnings('ignore')

In [2]:
# Data Prepararion
# read data
review_data = pd.read_csv("LasVegasTripAdvisorReviews.csv",index_col="ID")
print(review_data.head())

#data description
review_data.head
review_data.values
review_data.shape
review_data.columns
review_data.index
review_data.info()
review_data.dtypes
review_data.describe()

#one hot encoding
category_var = ['User country', 'Period of stay', 'Pool', 'Gym', 'Tennis court','Spa','Casino','Free internet', 'Traveler type','Hotel name','User continent','Review month','Review weekday']
for col in category_var: 
    review_data[col] = review_data[col].astype('category')
    dummies = pd.get_dummies(review_data.loc[:, col], prefix=col ) 
    review_data = pd.concat( [review_data, dummies], axis = 1)

# drop original variable
fields_to_drop =  category_var
review_data = review_data.drop(fields_to_drop, axis = 1 )

# replace missing value with zero
review_data = review_data.fillna(review_data.mean())

   User country  Nr. reviews  Nr. hotel reviews  Helpful votes  Score  \
ID                                                                      
1           USA           11                  4             13      5   
2           USA          119                 21             75      3   
3           USA           36                  9             25      5   
4            UK           14                  7             14      4   
5        Canada            5                  5              2      4   

   Period of stay Traveler type Pool  Gym Tennis court Spa Casino  \
ID                                                                  
1         Dec-Feb       Friends   NO  YES           NO  NO    YES   
2         Dec-Feb      Business   NO  YES           NO  NO    YES   
3         Mar-May      Families   NO  YES           NO  NO    YES   
4         Mar-May       Friends   NO  YES           NO  NO    YES   
5         Mar-May          Solo   NO  YES           NO  NO    YES   

   F

In [8]:
# Split to Training and Testing
from sklearn import cross_validation
seed = 7
test_size = 0.3
X = review_data.loc[:, review_data.columns != 'Score']
y = review_data[['Score']]
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=seed)

In [4]:
# Linear Regression (OLS)
# load package
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)
print(regr.score(X_test, y_test, sample_weight=None))

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
regr_mse = mean_squared_error(y_test, y_pred)

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))
regr_score = r2_score(y_test, y_pred)

-0.23665633470138747
Coefficients: 
 [[ 2.64652589e-04  1.60420539e-06  1.79533303e-04  2.33841550e-01
  -1.46983707e-04 -1.09650791e-03  2.81607029e-01 -8.63709753e-01
   1.57877061e+00  1.12318057e-01 -2.12923536e+00  2.46707331e-01
  -3.15563730e-01  1.86795024e-14  2.56351761e-01 -2.82779079e-01
  -3.21675960e-01  5.53904393e-01  4.11722530e-01 -9.54401478e-01
   2.31491902e-03 -1.99285033e-14  4.31824295e-01 -7.16769637e-02
  -1.19388197e-01 -1.19087449e+00  4.31574689e-01 -4.75324254e-01
   2.17650017e-02 -7.60994741e-01 -5.32907052e-15  5.03329834e-01
  -6.79120237e-01  3.23356609e-02 -1.77276555e-01  1.15837561e+00
   1.97374821e-01  1.19983169e-02  1.00348586e+00  5.55111512e-16
   1.99798318e+00 -4.55191440e-15  1.84197616e+00 -4.17365208e-01
  -4.85462680e-01 -1.05901280e+00 -1.11022302e-15  2.07311472e-01
  -9.70840496e-01  5.49315236e-01 -1.42752185e-02  1.61149710e-01
   1.83712419e-01 -8.88231695e-01  2.70931593e-01  3.53932504e-01
  -2.71031338e-01 -3.53832759e-01 -2.42

In [5]:
# Ridge regression
# set parameter
alphas = np.logspace(-4, -0.5, 30) # default base = 10
tuned_parameters = [{'alpha': alphas}]
n_folds = 3

# load package
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# model
model = Ridge()
ridge = GridSearchCV(estimator=model, param_grid = tuned_parameters, cv=n_folds, refit=False)
ridge.fit(X_train, y_train)

# Final Model
ridge_final = Ridge(alpha = ridge.best_params_['alpha'])
ridge_final.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = ridge_final.predict(X_test)
# ridge_score = ridge_final.score(X_test, y_test, sample_weight=None)
print(ridge_final.score(X_test, y_test, sample_weight=None))

# The coefficients
print('Coefficients: \n', ridge_final.coef_)

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
ridge_mse = mean_squared_error(y_test, y_pred)

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))
ridge_score = r2_score(y_test, y_pred)

-0.19494904157746062
Coefficients: 
 [[ 3.46648074e-04 -3.84793279e-05  1.17344868e-04  2.29855402e-01
  -1.48308844e-04 -1.05373817e-03  1.21029410e-01 -7.39672594e-01
   7.49275914e-01 -7.31868270e-02 -1.27933791e+00  6.37647695e-02
  -2.98099729e-01  0.00000000e+00  1.22180559e-01 -3.07604184e-01
  -3.49432087e-01  3.39290690e-01  2.50284757e-01 -7.64783032e-01
  -1.73873286e-01  0.00000000e+00  2.41475236e-01  2.63406954e-01
   2.30621399e-01 -5.32336932e-01  3.04082746e-01 -5.78339473e-02
  -1.11175606e-01 -6.94709945e-01  0.00000000e+00  3.42412779e-01
  -1.95334668e-01 -1.19901730e-01  1.76739421e-01  8.44773854e-01
   6.94990289e-02 -1.24841418e-01  7.40209979e-01  0.00000000e+00
   1.31213137e+00  0.00000000e+00  1.32058026e+00 -5.06979195e-03
  -3.84360306e-01 -1.01433370e+00  0.00000000e+00  5.65735345e-02
  -4.13106430e-01  3.46068648e-01  8.90574533e-02  4.23251183e-02
   1.08090354e-02 -3.97598791e-01  2.76881716e-01  2.79982536e-01
  -2.31493654e-01 -3.25370598e-01 -2.44

In [6]:
# LASSO
# load packages
from sklearn.linear_model import Lasso

# model
lasso = Lasso(random_state=0, normalize = True)
clf = GridSearchCV(lasso,  param_grid= tuned_parameters, cv=n_folds, refit=False)
clf.fit(X_train, y_train)

# Final Model
clf_final = Lasso(alpha = clf.best_params_['alpha'])
clf_final.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = clf_final.predict(X_test)
print(clf_final.score(X_test, y_test, sample_weight=None))

# The coefficients
print('Coefficients: \n', clf_final.coef_)

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
clf_mse = mean_squared_error(y_test, y_pred)

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))
clf_score = r2_score(y_test, y_pred)

-0.07871713952384263
Coefficients: 
 [ 6.70515176e-04  1.98742761e-04 -6.40541926e-04  2.39246916e-01
 -1.49171723e-04 -8.26865436e-04  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -1.10789484e-01 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00  1.93437320e-02
  0.00000000e+00 -0.00000000e+00  1.24343096e-01 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00  4.29185868e-01
  0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -2.20126890e-01  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00  1.05088523e-01  0.00000000e+00
 -0.00000000e+00 -1.32160431e-01 -4.87286996e-01  3.8

In [7]:
# compare
print('------------------------------------------------')
print('Compare Score: \n', round(regr_score,4), round(ridge_score,4), round(clf_score,4))
print('Compare MSE: \n', round(regr_mse,4), round(ridge_mse,4), round(clf_mse,4))

------------------------------------------------
Compare Score: 
 -0.2367 -0.1949 -0.0787
Compare MSE: 
 1.2315 1.19 1.0742
