# Project 5: Disneyland Park and Rating Classifier
## Part III: Park Classifier Model

### 1. Imports

In [1]:
import os
import sys

# essentials:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 


# plotly:
import plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# scikit-learn:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, f1_score, balanced_accuracy_score, accuracy_score, RocCurveDisplay, roc_auc_score, recall_score, precision_score, confusion_matrix
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import precision_score, recall_score, accuracy_score, ConfusionMatrixDisplay, roc_auc_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.ensemble import VotingRegressor, BaggingClassifier, BaggingRegressor, RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor, GradientBoostingClassifier, HistGradientBoostingClassifier, HistGradientBoostingRegressor 
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.inspection import plot_partial_dependence
from sklearn.feature_extraction import text 
from sklearn.svm import SVC

# NN & NLP scikit-learn:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from sklearn.datasets import make_blobs
from sklearn.datasets import fetch_openml, make_classification, make_regression
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin

# unsupervised learning scikit-learn:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

# imblearn:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, SMOTENC

# API & Webscraping:
import time
import requests
import datetime
from bs4 import BeautifulSoup

# nltk:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.chunk.regexp import RegexpParser
from nltk.chunk import tree2conlltags
from nltk import word_tokenize
from nltk.corpus import PlaintextCorpusReader

# tensorflow/keras:
import tensorflow as tf
import tensorboard
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.layers import SimpleRNN, LSTM, GRU, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import Bidirectional
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping

# time series/sktime:
import sktime
import statsmodels
import pmdarima as pmd
import pandas_datareader as pdr
from keras.preprocessing.sequence import TimeseriesGenerator
from sktime.forecasting.compose import EnsembleForecaster
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.arima import AutoARIMA
from sktime.forecasting.ets import AutoETS
from sktime.utils.plotting import plot_series


# spacy: 
import spacy
from spacy.matcher import Matcher
from spacy import displacy

# statsmodels:
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf

# emojis:
import emoji
import demoji

# others:
import scipy.stats as stats
import missingno as msno
from itertools import groupby
from IPython.display import display

### 2. Read in & Inspect Data

In [2]:
# Load the reviews CSV into `disney`. The path is relative, so it resolves
# against the directory the notebook kernel is running in.
disney = pd.read_csv('../data/Clean_DisneylandReviews.csv')

In [3]:
# Preview the first rows to confirm the columns loaded as expected.
disney.head()

Unnamed: 0,Branch,park,Review_Text,clean_text,Reviewer_Location,Year_Month,year,month,text_word_count,subjectivity,tb_polarity,vs_polarity,Rating
0,Disneyland_HongKong,3,If you've ever been to Disneyland anywhere you...,if you've ever been to disneyland anywhere you...,Australia,2019-4,2019,4,59,0.561481,0.239352,0.6786,4
1,Disneyland_HongKong,3,Its been a while since d last time we visit HK...,its been a while since d last time we visit hk...,Philippines,2019-5,2019,5,171,0.459783,0.205797,0.9879,4
2,Disneyland_HongKong,3,Thanks God it wasn t too hot or too humid wh...,thanks god it wasn t too hot or too humid when...,United Arab Emirates,2019-4,2019,4,169,0.434857,0.119238,0.9945,4
3,Disneyland_HongKong,3,HK Disneyland is a great compact park. Unfortu...,hk disneyland is a great compact park unfortun...,Australia,2019-4,2019,4,91,0.512143,0.189286,0.8489,4
4,Disneyland_HongKong,3,"the location is not in the city, took around 1...",the location is not in the city took around 1 ...,United Kingdom,2019-4,2019,4,31,0.4375,0.266667,0.2846,4


In [4]:
# Column dtypes and non-null counts.
disney.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42656 entries, 0 to 42655
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Branch             42656 non-null  object 
 1   park               42656 non-null  int64  
 2   Review_Text        42656 non-null  object 
 3   clean_text         42656 non-null  object 
 4   Reviewer_Location  42656 non-null  object 
 5   Year_Month         42656 non-null  object 
 6   year               42656 non-null  object 
 7   month              42656 non-null  object 
 8   text_word_count    42656 non-null  int64  
 9   subjectivity       42656 non-null  float64
 10  tb_polarity        42656 non-null  float64
 11  vs_polarity        42656 non-null  float64
 12  Rating             42656 non-null  int64  
dtypes: float64(3), int64(3), object(7)
memory usage: 4.2+ MB


In [5]:
# Summary statistics for the numeric columns.
disney.describe()

Unnamed: 0,park,text_word_count,subjectivity,tb_polarity,vs_polarity,Rating
count,42656.0,42656.0,42656.0,42656.0,42656.0,42656.0
mean,1.770583,129.703817,0.524574,0.212199,0.68087,4.217695
std,0.79237,154.713032,0.124134,0.175264,0.477683,1.063371
min,1.0,3.0,0.0,-1.0,-0.9977,1.0
25%,1.0,45.0,0.450786,0.105354,0.640925,4.0
50%,2.0,81.0,0.519638,0.203333,0.8957,5.0
75%,2.0,156.0,0.594202,0.31159,0.9661,5.0
max,3.0,3963.0,1.0,1.0,0.9999,5.0


### 3. Baseline Model

In [6]:
# Feature: the `clean_text` column; target: the `park` column.
X = disney['clean_text']
y = disney['park']

In [7]:
# Stratified split with a fixed seed, so each split keeps the class proportions of `y`.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 4, stratify = y)

In [8]:
# Class proportions in the training split; the largest share is the accuracy
# of always predicting the most frequent park.
y_train.value_counts(normalize=True)

1    0.454926
2    0.319549
3    0.225525
Name: park, dtype: float64

The most frequent class (park 1) makes up about 45% of the training set, so the goal is to beat a baseline accuracy of 0.45.

### 4. Functions for Efficiency

In [9]:
# function that lemmatizes text

def split_into_lemmas(text):
    """Lowercase a document and lemmatize each whitespace-separated word.

    Parameters
    ----------
    text : str
        The document to process.

    Returns
    -------
    str
        The lemmatized words joined back together with single spaces.
    """
    lowered = text.lower()
    wordnet = WordNetLemmatizer()

    lemmas = []
    for token in lowered.split():
        # lemmatize() is called with its default arguments (no part-of-speech hint)
        lemmas.append(wordnet.lemmatize(token))

    return ' '.join(lemmas)

In [10]:
# function that stems each word in a sentence (reduces words to their root form)

def stem_sentence(sentence): 
    """Apply the Porter stemmer to every word of a sentence.

    Words are separated on any run of whitespace (spaces, tabs, newlines),
    matching how `split_into_lemmas` tokenizes. Splitting on a literal ' '
    would pass tab/newline-joined words to the stemmer as one token and
    would produce empty tokens for repeated spaces.

    Parameters
    ----------
    sentence : str
        The text to stem.

    Returns
    -------
    str
        The stemmed words joined with single spaces.
    """
    p_stemmer = PorterStemmer()
    return ' '.join(p_stemmer.stem(word) for word in sentence.split())

In [25]:
# function that does train test split, creates a pipeline, and scores the given model with transformer
# this function assumes X has one feature: clean_text

def model_score(transformer, classifier, X,y):
    """Fit `transformer` + `classifier` as a pipeline and report its test score.

    Parameters
    ----------
    transformer : estimator
        First pipeline step (e.g. a text vectorizer); it receives `X` directly.
    classifier : estimator
        Final pipeline step.
    X : array-like
        Input passed straight to the pipeline (a single text feature).
    y : array-like
        Target labels; also used to stratify the split.

    Returns
    -------
    str
        ' test score: <value>', where the value is the pipeline's `score`
        on the held-out split, rounded to 3 decimals. A line describing the
        transformer/classifier pair is printed as a side effect.
    """
    # same seed and stratification on every call, so every model sees the same split
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        X, y, random_state = 4, stratify = y
    )

    pipeline = make_pipeline(transformer, classifier)
    pipeline.fit(features_fit, labels_fit)

    raw_score = pipeline.score(features_eval, labels_eval)
    test_score = np.round(raw_score, 3)

    print (f' model: {transformer, classifier}')
    return (f' test score: {test_score}')

In [26]:
# function that does train test split, creates a pipeline, and scores the given model with transformer
# this function assumes X has more than one feature

def model_score_more_feats(transformer, classifier, X,y):
    """Score a model on a frame holding `clean_text` plus additional columns.

    `transformer` is applied only to the 'clean_text' column; every other
    column of `X` is passed through unchanged. The combined matrix is then
    scaled and fed to `classifier`.

    Parameters
    ----------
    transformer : estimator
        Transformer applied to the 'clean_text' column.
    classifier : estimator
        Final pipeline step.
    X : DataFrame
        Must contain a 'clean_text' column.
    y : array-like
        Target labels; also used to stratify the split.

    Returns
    -------
    str
        ' test score: <value>', where the value is the pipeline's `score`
        on the held-out split, rounded to 3 decimals. A line describing the
        transformer/classifier pair is printed as a side effect.
    """
    # same seed and stratification on every call, so every model sees the same split
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        X, y, random_state = 4, stratify = y
    )

    # vectorize the text column only; the remaining columns go through untouched
    column_step = make_column_transformer(
        (transformer, 'clean_text'),
        remainder='passthrough',
    )

    # with_mean=False: scale without centering
    # (presumably because the vectorized text is sparse — confirm)
    scaler = StandardScaler(with_mean=False)

    pipeline = make_pipeline(column_step, scaler, classifier)
    pipeline.fit(features_fit, labels_fit)

    raw_score = pipeline.score(features_eval, labels_eval)
    test_score = np.round(raw_score, 3)

    print (f' model: {transformer, classifier}')
    return (f' test score: {test_score}')

In [13]:
# function that helps gauge which hyperparameters to tune in order to avoid running too many parameters in Grid Search

def count_vec_options(classifier, X, y):
    """Print `model_score` results for six CountVectorizer configurations.

    Configurations tried, in order: defaults; English stop words; English
    stop words capped at 1,000 features; lemmatizing preprocessor; stemming
    preprocessor; unigrams + bigrams.

    Parameters
    ----------
    classifier : estimator
        Classifier paired with each vectorizer.
    X, y
        Forwarded unchanged to `model_score`.
    """
    vectorizers = (
        CountVectorizer(),
        CountVectorizer(stop_words = 'english'),
        CountVectorizer(stop_words = 'english', max_features = 1_000),
        CountVectorizer(preprocessor=split_into_lemmas),
        CountVectorizer(preprocessor=stem_sentence),
        CountVectorizer(ngram_range=(1,2)),
    )
    for vectorizer in vectorizers:
        print(model_score(vectorizer, classifier, X, y))

In [14]:
# function that helps gauge which hyperparameters to tune in order to avoid running too many parameters in Grid Search

def tfidf_vec_options(classifier, X, y):
    """Print `model_score` results for six TfidfVectorizer configurations.

    Configurations tried, in order: defaults; English stop words; English
    stop words capped at 1,000 features; lemmatizing preprocessor; stemming
    preprocessor; unigrams + bigrams.

    Parameters
    ----------
    classifier : estimator
        Classifier paired with each vectorizer.
    X, y
        Forwarded unchanged to `model_score`.
    """
    vectorizers = (
        TfidfVectorizer(),
        TfidfVectorizer(stop_words = 'english'),
        TfidfVectorizer(stop_words = 'english', max_features = 1_000),
        TfidfVectorizer(preprocessor=split_into_lemmas),
        TfidfVectorizer(preprocessor=stem_sentence),
        TfidfVectorizer(ngram_range=(1,2)),
    )
    for vectorizer in vectorizers:
        print(model_score(vectorizer, classifier, X, y))

### 5. Testing Models with Different X's

In [28]:
# Candidate classifiers. random_state=4 is set on every estimator below
# except KNeighborsClassifier, which is created with no arguments.
# NOTE(review): max_iter=10_000 on the logistic regression is presumably there
# to avoid convergence warnings on the text features — confirm.
lr = LogisticRegression(random_state=4, max_iter = 10_000)
knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=4)
bag = BaggingClassifier(random_state=4)
rfc = RandomForestClassifier(random_state=4)
ada = AdaBoostClassifier(random_state=4)
gb = GradientBoostingClassifier(random_state = 4)

In [29]:
# Classifiers compared in the scoring loops.
# NOTE(review): `gb` is excluded from the active list (the full list is kept
# commented out) — presumably for runtime; confirm.
# models = [lr, knn, dtc, bag, rfc, ada, gb]
models = [lr, knn, dtc, bag, rfc, ada]

In [35]:
# Text-only setup: a single feature column, as `model_score` expects.
X = disney['clean_text']
y = disney['park']

In [31]:
# Score every candidate classifier on a default CountVectorizer of X.
for model in models:
    print(model_score(CountVectorizer(), model, X, y))

 model: (CountVectorizer(), LogisticRegression(max_iter=10000, random_state=4))
 test score: 0.858
 model: (CountVectorizer(), KNeighborsClassifier())
 test score: 0.538
 model: (CountVectorizer(), DecisionTreeClassifier(random_state=4))
 test score: 0.719
 model: (CountVectorizer(), BaggingClassifier(random_state=4))
 test score: 0.778
 model: (CountVectorizer(), RandomForestClassifier(random_state=4))
 test score: 0.781
 model: (CountVectorizer(), AdaBoostClassifier(random_state=4))
 test score: 0.777


In [32]:
# Multi-feature setup: the text column plus four numeric columns,
# as `model_score_more_feats` expects.
X = disney[['clean_text', 'text_word_count', 'subjectivity', 'tb_polarity', 'vs_polarity']]
y = disney['park']

In [33]:
# Score every candidate classifier on the vectorized text plus the extra columns.
for model in models:
    print(model_score_more_feats(CountVectorizer(), model, X, y))

 model: (CountVectorizer(), LogisticRegression(max_iter=10000, random_state=4))
 test score: 0.761
 model: (CountVectorizer(), KNeighborsClassifier())
 test score: 0.532
 model: (CountVectorizer(), DecisionTreeClassifier(random_state=4))
 test score: 0.716
 model: (CountVectorizer(), BaggingClassifier(random_state=4))
 test score: 0.774
 model: (CountVectorizer(), RandomForestClassifier(random_state=4))
 test score: 0.778
 model: (CountVectorizer(), AdaBoostClassifier(random_state=4))
 test score: 0.779


### 6. Logistic Regression

In [36]:
# Compare CountVectorizer settings for logistic regression on the review text only.
# The text column is passed explicitly rather than via `X`: when the notebook is
# run top to bottom, the most recent `X` is the multi-column DataFrame, which the
# bare vectorizer inside `model_score` cannot consume.
count_vec_options(lr, disney['clean_text'], y)

 model: (CountVectorizer(), LogisticRegression(max_iter=10000, random_state=4))
 test score: 0.858
 model: (CountVectorizer(stop_words='english'), LogisticRegression(max_iter=10000, random_state=4))
 test score: 0.857
 model: (CountVectorizer(max_features=1000, stop_words='english'), LogisticRegression(max_iter=10000, random_state=4))
 test score: 0.841
 model: (CountVectorizer(preprocessor=<function split_into_lemmas at 0x1ab33d820>), LogisticRegression(max_iter=10000, random_state=4))
 test score: 0.861
 model: (CountVectorizer(preprocessor=<function stem_sentence at 0x110712a60>), LogisticRegression(max_iter=10000, random_state=4))
 test score: 0.859
 model: (CountVectorizer(ngram_range=(1, 2)), LogisticRegression(max_iter=10000, random_state=4))
 test score: 0.863
