# Project 5: Disneyland Park and Rating Classifier
## Part I: Cleaning and Preprocessing Data

### 1. Imports

In [1]:
import os
import sys

# essentials:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 


# plotly:
import plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# scikit-learn:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, f1_score, balanced_accuracy_score, accuracy_score, RocCurveDisplay, roc_auc_score, recall_score, precision_score, confusion_matrix
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import precision_score, recall_score, accuracy_score, ConfusionMatrixDisplay, roc_auc_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.ensemble import VotingRegressor, BaggingClassifier, BaggingRegressor, RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor, GradientBoostingClassifier, HistGradientBoostingClassifier, HistGradientBoostingRegressor 
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.inspection import plot_partial_dependence
from sklearn.feature_extraction import text 
from sklearn.svm import SVC

# NN & NLP scikit-learn:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from sklearn.datasets import make_blobs
from sklearn.datasets import fetch_openml, make_classification, make_regression
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin

# unsupervised learning scikit-learn:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

# imblearn:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, SMOTENC

# API & Webscraping:
import time
import requests
import datetime
from bs4 import BeautifulSoup

# nltk:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.chunk.regexp import RegexpParser
from nltk.chunk import tree2conlltags
from nltk import word_tokenize
from nltk.corpus import PlaintextCorpusReader

# other nlp libraries:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import pipeline
from spacytextblob.spacytextblob import SpacyTextBlob

# tensorflow/keras:
import tensorflow as tf
import tensorboard
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.layers import SimpleRNN, LSTM, GRU, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import Bidirectional
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping

# time series/sktime:
import sktime
import statsmodels
import pmdarima as pmd
import pandas_datareader as pdr
from keras.preprocessing.sequence import TimeseriesGenerator
from sktime.forecasting.compose import EnsembleForecaster
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.arima import AutoARIMA
from sktime.forecasting.ets import AutoETS
from sktime.utils.plotting import plot_series


# spacy: 
import spacy
from spacy.matcher import Matcher
from spacy import displacy

# statsmodels:
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf

# emojis:
import emoji
import demoji

# others:
import scipy.stats as stats
import missingno as msno
from itertools import groupby
from IPython.display import display

### 2. Read in & Inspect Data

In [2]:
disney = pd.read_csv('../data/DisneylandReviews.csv', encoding='latin-1')

In [3]:
disney.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [4]:
disney.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42656 entries, 0 to 42655
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Review_ID          42656 non-null  int64 
 1   Rating             42656 non-null  int64 
 2   Year_Month         42656 non-null  object
 3   Reviewer_Location  42656 non-null  object
 4   Review_Text        42656 non-null  object
 5   Branch             42656 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.0+ MB


In [5]:
disney.describe()

Unnamed: 0,Review_ID,Rating
count,42656.0,42656.0
mean,318855300.0,4.217695
std,165709200.0,1.063371
min,1398724.0,1.0
25%,174327400.0,4.0
50%,290758300.0,5.0
75%,448957900.0,5.0
max,670801400.0,5.0


In [6]:
disney.isna().sum()

Review_ID            0
Rating               0
Year_Month           0
Reviewer_Location    0
Review_Text          0
Branch               0
dtype: int64

In [7]:
disney['Rating'].nunique()

5

In [8]:
disney['Rating'].value_counts()

5    23146
4    10775
3     5109
2     2127
1     1499
Name: Rating, dtype: int64

In [9]:
disney['Branch'].unique()

array(['Disneyland_HongKong', 'Disneyland_California', 'Disneyland_Paris'],
      dtype=object)

In [10]:
disney['Branch'].value_counts()

Disneyland_California    19406
Disneyland_Paris         13630
Disneyland_HongKong       9620
Name: Branch, dtype: int64

In [11]:
disney['Reviewer_Location'].value_counts()

United States     14551
United Kingdom     9751
Australia          4679
Canada             2235
India              1511
                  ...  
Namibia               1
Armenia               1
Timor-Leste           1
South Sudan           1
Åland Islands         1
Name: Reviewer_Location, Length: 162, dtype: int64

In [12]:
# average rating for hong kong park
disney['Rating'][disney['Branch'] == 'Disneyland_HongKong'].mean()

4.204158004158004

In [13]:
# average rating for california park
disney['Rating'][disney['Branch'] == 'Disneyland_California'].mean()

4.405338555086056

In [14]:
# average rating for paris park
disney['Rating'][disney['Branch'] == 'Disneyland_Paris'].mean()

3.96008804108584

In [15]:
# drop unneeded column
disney.drop(columns = ['Review_ID'], inplace = True)

### 3. Feature Engineering

In [16]:
# text word count column
disney['text_word_count'] = disney['Review_Text'].str.split().str.len()

In [17]:
# get months function
def get_months(date):
    return (date[5:])
    

In [18]:
# month column
disney['month'] = disney['Year_Month'].map(get_months)

In [19]:
# get years function
def get_years(date):
    return (date[0:4])

In [20]:
# year column
disney['year'] = disney['Year_Month'].map(get_years)

### 4. Remove Unnecessary Characters with RegEx

In [21]:
# citation: Breakfast Hour - NLP Practice I

def regex_cleaner(words):
    
    # set token
    my_tokenizer = RegexpTokenizer("[\w']+|\$[\d\.]+")
    
    # tokenize words
    clean_words = my_tokenizer.tokenize(words.lower())
    
    # return words
    return ' '.join(clean_words)

In [22]:
# test function
regex_cleaner(disney["Review_Text"].loc[0])

"if you've ever been to disneyland anywhere you'll find disneyland hong kong very similar in the layout when you walk into main street it has a very familiar feel one of the rides its a small world is absolutely fabulous and worth doing the day we visited was fairly hot and relatively busy but the queues moved fairly well"

In [23]:
disney['clean_text'] = disney['Review_Text'].map(regex_cleaner)

### 5. Add Subjectivity with Textblob Library

Info: Subjectivity lies between [0,1]. Subjectivity quantifies the amount of personal opinion and factual information contained in the text

In [24]:
# get subjectivity scores of reviews
disney['subjectivity'] = disney['clean_text'].apply(lambda review: TextBlob(review).sentiment.subjectivity)

### 6. Add Polarity with Textblob, Vader Sentiment, and Hugging Face Libraries

**Textblob Library**

Info: Polarity lies between [-1,1], -1 defines a negative sentiment and 1 defines a positive sentiment.

In [25]:
# add polarity scores of reviews using text blob library
disney['tb_polarity'] = disney['clean_text'].apply(lambda review: TextBlob(review).sentiment.polarity)

**Vader Sentiment Library**

In [26]:
# function to retrieve polarity scores of reviews using vader sentiment library
def get_vs_polarity(review):
 
    # create a SentimentIntensityAnalyzer object
    sid_obj = SentimentIntensityAnalyzer()
    
    # get polarity score
    sentiment_dict = sid_obj.polarity_scores(review)
 
    return sentiment_dict['compound']

In [28]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/olivialara/nltk_data...


True

In [29]:
# add polarity scores of reviews using vader sentiment library
disney['vs_polarity'] = disney['clean_text'].apply(get_vs_polarity)

**Hugging Face Library**

In [30]:
# pipeline for sentiment analysis from hugging face library
clf = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
2022-01-02 21:31:41.972433: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-02 21:31:41.986979: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained

In [31]:
# example of positive review 
clf(disney["clean_text"].loc[0])[0]

{'label': 'POSITIVE', 'score': 0.9998108744621277}

In [32]:
# find index of review with low rating
disney[disney['Rating'] == 1].head()

Unnamed: 0,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,text_word_count,month,year,clean_text,subjectivity,tb_polarity,vs_polarity
101,1,2019-3,India,The main attractions were shut while we still ...,Disneyland_HongKong,23,3,2019,the main attractions were shut while we still ...,0.441667,0.258333,0.4215
144,1,2019-3,United Arab Emirates,"Visited Disneyland Hong Kong today March 15, 2...",Disneyland_HongKong,92,3,2019,visited disneyland hong kong today march 15 20...,0.420556,-0.05125,-0.4402
236,1,2019-2,Macau,"Every year I come to Disney with my children, ...",Disneyland_HongKong,57,2,2019,every year i come to disney with my children e...,0.69623,-0.190377,-0.4215
255,1,2018-10,Canada,I'm writing an honest and truthful opinion of ...,Disneyland_HongKong,128,10,2018,i'm writing an honest and truthful opinion of ...,0.550667,0.247667,0.7866
264,1,2019-1,Australia,"Visited from Australia. Enjoyed the rides, par...",Disneyland_HongKong,314,1,2019,visited from australia enjoyed the rides parad...,0.510433,-0.082237,-0.4636


In [33]:
# example of negative review 
clf(disney["clean_text"].loc[236])[0]

{'label': 'NEGATIVE', 'score': 0.9992465972900391}

In [34]:
# function to retrieve polarity scores of reviews using hugging face library
def get_hf_polarity(review):
    try: 
        # get inside first element of list
        score_dict = clf(review)[0]
    
        # retrieve score value
        score_num = score_dict['score']
    
        # add negative sign if negative
        if score_dict['label'] == "NEGATIVE":
            score_num *=  -1
    
        # return score rounded to nearest 6 decimal places
        return(np.round(score_num, 6))
    
    except:
        return 0

In [35]:
# test function for postive review
get_hf_polarity(disney["clean_text"].loc[0])

0.999811

In [36]:
# test function for negative review
get_hf_polarity(disney["clean_text"].loc[236])

-0.999247

In [38]:
gen = pipeline("text-generation")

No model was supplied, defaulted to gpt2 (https://huggingface.co/gpt2)
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [39]:
gen(disney["Review_Text"].loc[0], max_length = 100)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


[{'generated_text': "If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides  its a Small World  is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well.  It's a good place for a short stay to see an attraction (you can do there as long as you don't eat and drink.  Some of the"}]

In [40]:
disney["Review_Text"].loc[0]

"If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides  its a Small World  is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well. "

**Spacy Library**

In [None]:
# citation: https://pypi.org/project/spacytextblob/
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("spacytextblob")

In [None]:
doc = nlp(disney["Review_Text"].loc[2])
print('Assessments:', doc._.assessments)

### 5. Replacing Park Names with Numbers

In [41]:
park_map = {"Disneyland_California": 1, "Disneyland_Paris": 2, "Disneyland_HongKong":3}

In [42]:
# create target column
disney["park"] = disney["Branch"].replace(park_map)

### 6. Save Cleaned Dataset

In [43]:
clean_disney = disney[["Branch", "park", "Review_Text", "clean_text", "Reviewer_Location", "Year_Month", "year", "month", "text_word_count", "subjectivity", "tb_polarity", "vs_polarity", "Rating"]]

In [44]:
clean_disney

Unnamed: 0,Branch,park,Review_Text,clean_text,Reviewer_Location,Year_Month,year,month,text_word_count,subjectivity,tb_polarity,vs_polarity,Rating
0,Disneyland_HongKong,3,If you've ever been to Disneyland anywhere you...,if you've ever been to disneyland anywhere you...,Australia,2019-4,2019,4,59,0.561481,0.239352,0.6786,4
1,Disneyland_HongKong,3,Its been a while since d last time we visit HK...,its been a while since d last time we visit hk...,Philippines,2019-5,2019,5,171,0.459783,0.205797,0.9879,4
2,Disneyland_HongKong,3,Thanks God it wasn t too hot or too humid wh...,thanks god it wasn t too hot or too humid when...,United Arab Emirates,2019-4,2019,4,169,0.434857,0.119238,0.9945,4
3,Disneyland_HongKong,3,HK Disneyland is a great compact park. Unfortu...,hk disneyland is a great compact park unfortun...,Australia,2019-4,2019,4,91,0.512143,0.189286,0.8489,4
4,Disneyland_HongKong,3,"the location is not in the city, took around 1...",the location is not in the city took around 1 ...,United Kingdom,2019-4,2019,4,31,0.437500,0.266667,0.2846,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
42651,Disneyland_Paris,2,i went to disneyland paris in july 03 and thou...,i went to disneyland paris in july 03 and thou...,United Kingdom,missing,miss,ng,173,0.570682,0.225000,0.9884,5
42652,Disneyland_Paris,2,2 adults and 1 child of 11 visited Disneyland ...,2 adults and 1 child of 11 visited disneyland ...,Canada,missing,miss,ng,191,0.637045,0.194773,0.9915,5
42653,Disneyland_Paris,2,My eleven year old daughter and myself went to...,my eleven year old daughter and myself went to...,South Africa,missing,miss,ng,109,0.479670,0.231319,0.8979,5
42654,Disneyland_Paris,2,"This hotel, part of the Disneyland Paris compl...",this hotel part of the disneyland paris comple...,United States,missing,miss,ng,112,0.559821,0.255952,0.9517,4


In [45]:
clean_disney.to_csv('../data/Clean_DisneylandReviews.csv', index=False)