In [1]:
import re
from tqdm import tqdm
import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.stem import PorterStemmer as stemmer
from nltk.corpus import stopwords

import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Flatten, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Dropout
from tensorflow.keras.metrics import Recall, Precision, MeanSquaredError

from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, mean_squared_error

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pande\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pande\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Cleaning

In [2]:
reviews_labeled = "data/reviews_labeled.csv"
reviews_scored = "data/reviews_scored.csv"

In [3]:
dataset1 = pd.read_csv(reviews_labeled)
dataset2 = pd.read_csv(reviews_scored)

In [4]:
dataset1.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [5]:
dataset2.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


Cleaning Dataset 1

In [6]:
# rename columns
dataset1 = dataset1.rename({"Description": "text", "Is_Response": "sentiment"}, axis=1)[["text", "sentiment"]]

# replace not happy to 0 and happy to 1
dataset1.replace("not happy", 0, inplace=True)
dataset1.replace("happy", 1, inplace=True)

Cleaning Dataset 2

In [7]:
# rename columns
dataset2 = dataset2.rename({"Review": "text", "Rating": "sentiment"}, axis=1)

# remove data with sentiment values == 3 since we're only looking at values 1-2 for not happy and 4-5 for happy
dataset2 = dataset2[dataset2["sentiment"] != 3]

# converting sentiment score to boolean
sentiments = [0 if x < 3 else 1 for x in dataset2["sentiment"]]
dataset2["sentiment"] = sentiments

Generating a combined Dataset

In [8]:
# concatenating datasets
dataset = pd.concat([dataset1, dataset2])

# removing repeated indexes by resetting the index of the dataframe
dataset.reset_index(drop=True, inplace=True)

Preprocessing

In [9]:
# function to remove stopwords + numbers + special characters and convert text to lowercase
def preProcess(text):
    sents = nltk.sent_tokenize(text)

    for i in range(len(sents)):
        words = nltk.word_tokenize(sents[i])
        for j in range(len(words)):
            if words[j] not in set(stopwords.words('english')): 
                words[j] = re.sub('[^A-Za-z]+','', words[j]) # maybe add numbers
                words[j] = words[j].lower()
                words[j] = stemmer().stem(words[j])
        sents[i] = ' '.join([w for w in words if w != ""])
    
    sents = ' '.join(sents)
    return sents

In [10]:
texts = list(dataset["text"])
for i in tqdm(range(len(texts  ))):
    dataset.loc[i, ["text"]] = preProcess(texts[i])

100%|██████████| 57239/57239 [47:12<00:00, 20.21it/s]  


# Support Vector Machines (SVM)

TF-IDF Vectorization

In [11]:
vectorizer = TfidfVectorizer(strip_accents=None, 
                        lowercase=None, 
                        preprocessor=None, 
                        tokenizer=None, 
                        use_idf=True, 
                        norm='l2', 
                        smooth_idf=True)

y = dataset.sentiment.values
X = vectorizer.fit_transform(dataset.text)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

Cross-validation

In [12]:
folds = 5
kfold = KFold(n_splits=folds, shuffle=True)

scores = []
for train, test in kfold.split(X_train, y_train):
    model = SVC(kernel='rbf')
    model.fit(X_train[train], y_train[train])
    
    y_pred = model.predict(X_train[test])
    s = [recall_score(y_train[test], y_pred), precision_score(y_train[test], y_pred), f1_score(y_train[test], y_pred), accuracy_score(y_train[test], y_pred), mean_squared_error(y_train[test], y_pred)]
    scores.append(s)

In [32]:
print("CV Recall:", np.mean(scores[0]))
print("CV Precision:", np.mean(scores[1]))
print("CV F-measure:", np.mean(scores[2]))
print("CV Accuracy:", np.mean(scores[3]))
print("CV MSE:", np.mean(scores[4]))

CV Recall: 0.9574466509461803
CV Precision: 0.9240982853537277
CV F-measure: 0.940472225019046
CV Accuracy: 0.9119259560913869
CV MSE: 0.08807404390861302


Fitting the model on X_train & y_train

In [13]:
model = SVC(kernel='rbf')
model.fit(X_train, y_train)

SVC()

Testing the Model

In [14]:
y_pred = model.predict(X_test)

print("Recall", recall_score(y_test, y_pred))
print("Precision", precision_score(y_test, y_pred))
print("F-measure", f1_score(y_test, y_pred))
print("Accuracy", accuracy_score(y_test, y_pred))
print("MSE", mean_squared_error(y_test, y_pred))

Recall 0.9565947242206235
Precision 0.9250927643784786
F-measure 0.940580051874558
Accuracy 0.9119496855345912


# Convolutional Neural Networks (CNN)

In [16]:
y = dataset.sentiment.values
X = dataset.text.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

Preprocessing

In [17]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# converting text to sequences
sequences_length = 50
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_train = pad_sequences(sequences_train, maxlen=sequences_length)

sequences_test = tokenizer.texts_to_sequences(X_test)
sequences_test = pad_sequences(sequences_test, maxlen=sequences_length)

# +1 for OOV words
vocabulary_size = len(tokenizer.word_index) + 1

In [18]:
def CNN_Model(seq_len):
    embedding_dim = 16
    units = 32
    
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dim, input_length=seq_len))
    model.add(Conv1D(filters=units, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Recall(), Precision(), MeanSquaredError()])
    
    return model

Cross-validation

In [19]:
folds = 5
kfold = KFold(n_splits=folds, shuffle=True)

scores = []
for train, test in kfold.split(sequences_train, y_train):
    model = CNN_Model(sequences_length)
    model.summary()
    
    model.fit(sequences_train[train], y_train[train], epochs = 3, batch_size=10, verbose = 1)
    
    s = model.evaluate(sequences_train[test], y_train[test], verbose=1)
    scores.append(s)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 16)            1106944   
                                                                 
 conv1d (Conv1D)             (None, 43, 32)            4128      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 21, 32)           0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 672)               0         
                                                                 
 dense (Dense)               (None, 10)                6730      
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                        

CV Metrics

In [20]:
for j, name in enumerate(model.metrics_names):
    print(f"{name}:", np.mean([score[model.metrics_names.index(name)] for score in scores]))

loss: 0.4007066547870636
accuracy: 0.8485509991645813
recall_4: 0.9286358594894409
precision_4: 0.871467399597168
mean_squared_error: 0.11377326995134354


Test Metrics

In [21]:
model = CNN_Model(sequences_length)
model.fit(sequences_train, y_train, epochs=3, batch_size=10, verbose=1)
metrics = model.evaluate(sequences_test, y_test, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Long Short Term Memory (LSTM)

In [22]:
def LSTM_Model():
    embedding_dim = 16
    lstm_units = 32
    
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dim))
    model.add(LSTM(lstm_units))
    model.add(Dropout(.5))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall(), MeanSquaredError()])
    
    return model

Cross-validation

In [23]:
folds = 5
kfold = KFold(n_splits=folds, shuffle=True)

scores = []
for train, test in kfold.split(sequences_train, y_train):
    model = LSTM_Model()
    model.summary()
    
    model.fit(sequences_train[train], y_train[train], epochs = 3, batch_size=10, verbose = 1)
    
    s = model.evaluate(sequences_train[test], y_train[test], verbose=1)
    
    scores.append(s)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 16)          1106944   
                                                                 
 lstm (LSTM)                 (None, 32)                6272      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_12 (Dense)            (None, 1)                 33        
                                                                 
Total params: 1,113,249
Trainable params: 1,113,249
Non-trainable params: 0
_________________________________________________________________
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Par

CV Metrics

In [24]:
for j, name in enumerate(model.metrics_names):
    print(f"{name}:", np.mean([score[model.metrics_names.index(name)] for score in scores]))

loss: 0.34831780195236206
accuracy: 0.8570898532867431
precision_10: 0.8885136604309082
recall_10: 0.9188119173049927
mean_squared_error: 0.10452398061752319


Test Metrics

In [25]:
model = LSTM_Model()
model.fit(sequences_train, y_train, epochs=3, batch_size=10, verbose=1)
metrics = model.evaluate(sequences_test, y_test, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3
