In [99]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score ,precision_score, recall_score, f1_score
import tensorflow as tf
from sklearn.naive_bayes import GaussianNB


In [7]:
df = pd.read_csv("LasVegasTripAdvisorReviews-Dataset.csv", sep=";")
df.head(5)

Unnamed: 0,User country,Nr. reviews,Nr. hotel reviews,Helpful votes,Score,Period of stay,Traveler type,Pool,Gym,Tennis court,Spa,Casino,Free internet,Hotel name,Hotel stars,Nr. rooms,User continent,Member years,Review month,Review weekday
0,USA,11,4,13,5,Dec-Feb,Friends,NO,YES,NO,NO,YES,YES,Circus Circus Hotel & Casino Las Vegas,3,3773,North America,9,January,Thursday
1,USA,119,21,75,3,Dec-Feb,Business,NO,YES,NO,NO,YES,YES,Circus Circus Hotel & Casino Las Vegas,3,3773,North America,3,January,Friday
2,USA,36,9,25,5,Mar-May,Families,NO,YES,NO,NO,YES,YES,Circus Circus Hotel & Casino Las Vegas,3,3773,North America,2,February,Saturday
3,UK,14,7,14,4,Mar-May,Friends,NO,YES,NO,NO,YES,YES,Circus Circus Hotel & Casino Las Vegas,3,3773,Europe,6,February,Friday
4,Canada,5,5,2,4,Mar-May,Solo,NO,YES,NO,NO,YES,YES,Circus Circus Hotel & Casino Las Vegas,3,3773,North America,7,March,Tuesday


In [17]:
df.loc[df[df['Member years']<0].index, 'Member years']= np.NaN
df['Member years'].fillna(df['Member years'].median(), inplace=True)

In [47]:
df.replace(to_replace={'3':3,'4':4,'5':5,'3,5':3.5,'4,5':4.5},inplace=True)

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Nr. reviews        504 non-null    int64  
 1   Nr. hotel reviews  504 non-null    int64  
 2   Helpful votes      504 non-null    int64  
 3   Score              504 non-null    int64  
 4   Period of stay     504 non-null    int32  
 5   Traveler type      504 non-null    int64  
 6   Pool               504 non-null    int64  
 7   Gym                504 non-null    int64  
 8   Tennis court       504 non-null    int64  
 9   Spa                504 non-null    int64  
 10  Casino             504 non-null    int64  
 11  Free internet      504 non-null    int64  
 12  Hotel stars        504 non-null    float64
 13  Nr. rooms          504 non-null    int64  
 14  User continent     504 non-null    object 
 15  Member years       504 non-null    float64
 16  Review month       504 non

In [49]:
df['Pool'] = LabelEncoder().fit_transform(df['Pool'])
df['Gym'] = LabelEncoder().fit_transform(df['Gym'])
df['Tennis court'] = LabelEncoder().fit_transform(df['Tennis court'])
df['Spa'] = LabelEncoder().fit_transform(df['Spa'])
df['Casino'] = LabelEncoder().fit_transform(df['Casino'])
df['Free internet'] = LabelEncoder().fit_transform(df['Free internet'])
df['Traveler type'] = LabelEncoder().fit_transform(df['Traveler type'])
df['Period of stay'] = LabelEncoder().fit_transform(df['Period of stay'])
df['User continent'] = LabelEncoder().fit_transform(df['User continent'])

In [55]:
df.drop(['Review month', 'Review weekday'], axis = 1, inplace = True)

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Nr. reviews        504 non-null    int64  
 1   Nr. hotel reviews  504 non-null    int64  
 2   Helpful votes      504 non-null    int64  
 3   Score              504 non-null    int64  
 4   Period of stay     504 non-null    int64  
 5   Traveler type      504 non-null    int64  
 6   Pool               504 non-null    int64  
 7   Gym                504 non-null    int64  
 8   Tennis court       504 non-null    int64  
 9   Spa                504 non-null    int64  
 10  Casino             504 non-null    int64  
 11  Free internet      504 non-null    int64  
 12  Hotel stars        504 non-null    float64
 13  Nr. rooms          504 non-null    int64  
 14  User continent     504 non-null    int32  
 15  Member years       504 non-null    float64
dtypes: float64(2), int32(1), i

In [57]:
X = df.drop(columns = 'Score')
y = df['Score']

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# MODEL TESTING

## Linear Regression

In [122]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
percentage = (1-mse) * 100
print(f"The performance measurement is: {percentage}")

The performance measurement is: 10.3062666250188


## Decision Tree Classifier

In [119]:
model_dtc = DecisionTreeClassifier()
model_dtc.fit(X_train, y_train)
y_pred_dtc = model_dtc.predict(X_test)
mse_dtc = mean_squared_error(y_test, y_pred_dtc)
percentage_dtc = abs((1-mse_dtc)*100)
print(f"The performance measurement is: {percentage_dtc}")

The performance measurement is: 82.89473684210526


## Neural Network

In [123]:
model_tf = model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1)
])
model_tf.compile(optimizer = tf.optimizers.Adam(), loss = 'mean_squared_error')
model_tf.fit(X_train, y_train, epochs=100, batch_size=32)

# Test your neural network
loss = model_tf.evaluate(X_test, y_test)

print(f"The performance measurement is: {abs((1-loss)*100)}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Gaussian Naive Bayes

In [113]:
model_NB = GaussianNB()
model_NB.fit(X_train, y_train)
y_pred_NB = model_NB.predict(X_test)
mse_NB = mean_squared_error(y_test, y_pred_NB)
accuracy = accuracy_score(y_test, y_pred_NB)
precision = precision_score(y_test, y_pred_NB, average='weighted')
recall = recall_score(y_test, y_pred_NB, average='weighted')
f1 = f1_score(y_test, y_pred_NB, average='weighted')
weight_acc = 0.4
weight_prec = 0.3
weight_rec = 0.2
weight_f1 = 0.1
perf_measure = (weight_acc * accuracy) + (weight_prec * precision) + (weight_rec * recall) + (weight_f1 * f1)
perf_measure_percent = perf_measure * 100
print(f"The performance measurement is: {perf_measure_percent}")

The performance measurement is: 29.51665671081726
