# **Phase 4 Part B - Classification (Supervised Learning)**
Émilie Brazeau, Nicholas Gin, Gordon Tang

## Sampling Version

In [None]:
import pandas as pd 
import time     
import numpy as np  
import pickle
from sklearn.model_selection import train_test_split  
from sklearn.tree import DecisionTreeClassifier       
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

Load pre-processed from Phase 4 Part A:

In [None]:
# Loading the data
url='https://drive.google.com/file/d/1mqxv8AFExcVBryjJqk-m8aX_5mXzOi7O/view?usp=sharing'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
df = pd.read_csv(url, encoding='UTF-8')

df

Unnamed: 0,review_text,rating,branch_Disneyland California,branch_Disneyland Hong Kong,branch_Disneyland Paris,branch_Universal Studios Florida,branch_Universal Studios Japan,branch_Universal Studios Singapore,month,quarter,...,attendance_millions_2012,attendance_millions_2013,attendance_millions_2014,attendance_millions_2015,attendance_millions_2016,attendance_millions_2017,attendance_millions_2018,attendance_millions_2019,attendance_millions_2020,attendance_millions_2021
0,daughter spent sunny hot day disneyland hong k...,4,0,1,0,0,0,0,7,3,...,-0.442627,-0.376997,-0.468011,-0.788311,-0.876592,-0.924523,-0.887582,-1.078403,-1.131385,-1.041753
1,location infront mtr station time required cov...,5,0,1,0,0,0,0,11,4,...,-0.442627,-0.376997,-0.468011,-0.788311,-0.876592,-0.924523,-0.887582,-1.078403,-1.131385,-1.041753
2,spent day disneyland great time family friendl...,5,0,1,0,0,0,0,4,2,...,-0.442627,-0.376997,-0.468011,-0.788311,-0.876592,-0.924523,-0.887582,-1.078403,-1.131385,-1.041753
3,recent tour hong kong limited time choose diff...,5,0,1,0,0,0,0,5,2,...,-0.442627,-0.376997,-0.468011,-0.788311,-0.876592,-0.924523,-0.887582,-1.078403,-1.131385,-1.041753
4,hong kong young kid place go fun obviously pie...,5,0,1,0,0,0,0,4,2,...,-0.442627,-0.376997,-0.468011,-0.788311,-0.876592,-0.924523,-0.887582,-1.078403,-1.131385,-1.041753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90350,visit universal orlando brilliant experience p...,5,0,0,0,1,0,0,8,3,...,-0.559182,-0.458312,-0.308994,-0.185408,-0.038519,-0.074019,-0.033271,0.015606,0.834246,0.948710
90351,universal studio shall begin staff lovely ride...,5,0,0,0,0,0,1,9,3,...,-1.185813,-1.279161,-1.369828,-1.351164,-1.306594,-1.345734,-1.377830,-1.328516,-1.625254,-1.478053
90352,split day two park 10 hour great fun staff ama...,5,0,0,0,1,0,0,10,4,...,-0.559182,-0.458312,-0.308994,-0.185408,-0.038519,-0.074019,-0.033271,0.015606,0.834246,0.948710
90353,definitely great ride experience jurassic park...,4,0,0,0,1,0,0,7,3,...,-0.559182,-0.458312,-0.308994,-0.185408,-0.038519,-0.074019,-0.033271,0.015606,0.834246,0.948710


In [None]:
# Each model has to take review_texts as numerical values (they can not take them as strings), so convert them into TF-IDF (term frequency-inverse document frequency)
# tf-idf = term frequency of word * (number of documents in the corpus / document frequency of word)
vectorizer = TfidfVectorizer()
review_text_tf_idf = vectorizer.fit_transform(df['review_text'].values)

In [None]:
# Defining our features and our target (what we are predicting).
other_features = df[['branch_Disneyland California', 'branch_Disneyland Hong Kong', 'branch_Disneyland Paris', 'branch_Universal Studios Florida', 'branch_Universal Studios Japan', 'branch_Universal Studios Singapore', 'month', 'quarter', 'year', 'attendance_millions_2010', 'attendance_millions_2011', 'attendance_millions_2012', 'attendance_millions_2013', 'attendance_millions_2014', 'attendance_millions_2015', 'attendance_millions_2016', 'attendance_millions_2017', 'attendance_millions_2018', 'attendance_millions_2019', 'attendance_millions_2020', 'attendance_millions_2021']].values
target = df['rating'].values

In [None]:
# Concatenate the other feature vectors to the review_text matrix.
features = np.concatenate((review_text_tf_idf.toarray(), other_features), axis=1)

print(features)

[[ 0.          0.          0.         ... -1.07899831 -1.13246491
  -1.04282685]
 [ 0.          0.          0.         ... -1.07899831 -1.13246491
  -1.04282685]
 [ 0.          0.          0.         ... -1.07899831 -1.13246491
  -1.04282685]
 ...
 [ 0.          0.          0.         ...  0.01559747  0.83348769
   0.947744  ]
 [ 0.          0.          0.         ...  0.01559747  0.83348769
   0.947744  ]
 [ 0.          0.          0.         ...  0.01559747  0.83348769
   0.947744  ]]


In [None]:
# Split the dataset into training and testing sets using a 75:25 split ratio 
# (this is actually the default train/test split).

# Setting random_state to 100, so we can duplicate results later.
# Split the dataset into training and testing sets using a 75:25 split ratio
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.25, random_state = 100)

In [None]:
# Undersample the majority class using RandomUnderSampler.
rus = RandomUnderSampler(sampling_strategy = 'majority', random_state = 100)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

# Oversample the minority classes using RandomOverSampler.
ros = RandomOverSampler(sampling_strategy = 'minority', random_state = 100)
X_train_res, y_train_res = ros.fit_resample(X_train_res, y_train_res)

## Decision Tree Classifier

In [None]:
# Start measuring the time taken to construct the model.
start_time = time.time()

# Create the model object.
dt_model = DecisionTreeClassifier(random_state = 100) 

# Train the model on the training data.
dt_model.fit(X_train_res, y_train_res) 

# Stop measuring the time taken to construct the model.
end_time = time.time()

# Make predictions using the trained model on the test data.
dt_pred = dt_model.predict(X_test)

In [None]:
# Calculate the accuracy, precision and recall of the predictions.
dt_accuracy = accuracy_score(y_test, dt_pred)
dt_precision = precision_score(y_test, dt_pred, average = 'weighted')
dt_recall = recall_score(y_test, dt_pred, average = 'weighted') 

# Stop measuring the time taken to construct the model.
dt_time = end_time - start_time

print('Decision Tree:')
print('Accuracy: {:.4f}'.format(dt_accuracy))
print('Precision: {:.4f}'.format(dt_precision))
print('Recall: {:.4f}'.format(dt_recall))
print('Time taken to construct model: {:.4f} mins'.format(dt_time / 60))

import pickle
with open('dt_model_v2.pkl', 'wb') as f:
    pickle.dump(dt_model, f)

"""
# If using Google Colab:
files.download('dt_model_v2.pkl')
"""

Decision Tree:
Accuracy: 0.2732
Precision: 0.4947
Recall: 0.2732
Time taken to construct model: 4.3507 mins


"\n# If using Google Colab:\nfiles.download('dt_model_v2.pkl')\n"

## Random Forest Classifier

In [None]:
# Start measuring the time taken to construct the model.
start_time = time.time()

# Create the model object.
rf_model = RandomForestClassifier(random_state = 100)  

# Train the model on the training data.
rf_model.fit(X_train_res, y_train_res) 

# Stop measuring the time taken to construct the model.
end_time = time.time()

# Make predictions using the trained model on the test data.
rf_pred = rf_model.predict(X_test)

In [None]:
# Calculate the accuracy, precision and recall of the predictions.
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred, average = 'weighted', zero_division=1) 
rf_recall = recall_score(y_test, rf_pred, average = 'weighted') 

# Stop measuring the time taken to construct the model.
rf_time = end_time - start_time

print('Random Forest:')
print('Accuracy: {:.4f}'.format(rf_accuracy))
print('Precision: {:.4f}'.format(rf_precision))
print('Recall: {:.4f}'.format(rf_recall))
print('Time taken to construct model: {:.4f} mins'.format(rf_time / 60))

with open('rf_model_v2.pkl', 'wb') as f:
    pickle.dump(rf_model, f)
    
"""
# If using Google Colab:
files.download('rf_model_v2.pkl')
"""

Random Forest:
Accuracy: 0.2744
Precision: 0.7309
Recall: 0.2744
Time taken to construct model: 4.3279 mins


"\n# If using Google Colab:\nfiles.download('rf_model_v2.pkl')\n"

## Gradient Boosting Classifier

In [None]:
# Start measuring the time taken to construct the model.
start_time = time.time()

# Create the model object.
gb_model = GradientBoostingClassifier(random_state = 100)  

# Train the model on the training data.
gb_model.fit(X_train_res, y_train_res) 

# Stop measuring the time taken to construct the model.
end_time = time.time()

# Make predictions using the trained model on the test data.
gb_pred = gb_model.predict(X_test)

In [None]:
# Calculate the accuracy, precision and recall of the predictions.
gb_accuracy = accuracy_score(y_test, gb_pred)
gb_precision = precision_score(y_test, gb_pred, average = 'weighted') 
gb_recall = recall_score(y_test, gb_pred, average = 'weighted') 

# Stop measuring the time taken to construct the model.
gb_time = end_time - start_time
print('Gradient Boosting:')
print('Accuracy: {:.4f}'.format(gb_accuracy))
print('Precision: {:.4f}'.format(gb_precision))
print('Recall: {:.4f}'.format(gb_recall))
print('Time taken to construct model: {:.4f} hours'.format(gb_time / 3600))

with open('gb_model_v2.pkl', 'wb') as f:
    pickle.dump(gb_model, f)

"""
# If using Google Colab:
files.download('gb_model_v2.pkl')
"""

Gradient Boosting:
Accuracy: 0.2875
Precision: 0.6381
Recall: 0.2875
Time taken to construct model: 17.7269 hours


"\n# If using Google Colab:\nfiles.download('gb_model_v2.pkl')\n"