## Deep Learning

In [46]:
import pandas as pd

from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.callbacks import EarlyStopping

## Import Data

In [4]:
# Load the review dataset; the preview below shows the review text, a
# restaurant label, per-aspect scores, and the `sentiment` column.
data = pd.read_csv("review_sentiment.csv")

In [5]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,review_text,restaurant_label,food_score,service_score,ambiance_score,authentic_score,overall_score,sentiment
0,0,come entre nous periodically past year seem se...,0,0.462757,0.427331,0.331545,0.492026,0.428415,1
1,1,absolutely delicious menu lovely offer excepti...,0,0.448536,0.318372,0.282956,0.371091,0.355239,1
2,2,thank much choose entre nous creperie recently...,0,0.430254,0.381057,0.28979,0.40679,0.376973,1
3,3,lovely little french restaurant really authent...,0,0.416918,0.309187,0.258666,0.506409,0.372795,1
4,4,dear rebecca thank much dining entre nous crep...,0,0.414757,0.294917,0.352158,0.438276,0.375027,1


In [6]:
# "Unnamed: 0" looks like a saved row index (its values mirror the row number
# in the preview) and is not used as a feature.
# Rebinding `data` instead of mutating it in place, together with
# errors="ignore", keeps this cell safe to re-run: a second in-place drop of
# the same column would raise KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [7]:
# Check column dtypes and non-null counts before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40946 entries, 0 to 40945
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   review_text       40946 non-null  object 
 1   restaurant_label  40946 non-null  int64  
 2   food_score        40946 non-null  float64
 3   service_score     40946 non-null  float64
 4   ambiance_score    40946 non-null  float64
 5   authentic_score   40946 non-null  float64
 6   overall_score     40946 non-null  float64
 7   sentiment         40946 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 2.5+ MB


## Train/test split and modelling

In [9]:
# Inputs are the review texts; the label is the `sentiment` column.
X, y = data['review_text'], data['sentiment']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Long Short-Term Memory

In [10]:
# --- Text -> fixed-length integer sequences ---
# The vocabulary is fitted on the training reviews only (capped via
# num_words=5000); words outside it map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
# Every review becomes exactly 100 tokens; longer ones are cut at the end.
X_train_padded = pad_sequences(X_train_sequences, maxlen=100, truncating="post")

# --- Baseline: embedding -> single LSTM -> sigmoid output ---
model = Sequential([
    Embedding(input_dim=5000, output_dim=16, input_length=100),
    LSTM(100),
    Dense(1, activation="sigmoid"),
])

# Binary target, so binary cross-entropy loss with accuracy as the metric.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# 20% of the training data is held out for validation during fitting.
model.fit(x=X_train_padded, y=y_train, validation_split=0.2, epochs=5)

# --- Held-out evaluation ---
# The test reviews go through the same fitted tokenizer and padding settings.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
# evaluate() returns [loss, accuracy]; index 1 is the accuracy.
accuracy = model.evaluate(x=X_test_padded, y=y_test)[1]
print(f"Test Accuracy: {accuracy}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.8389775156974792


In [None]:
## With dropout

In [12]:
# --- Text -> fixed-length integer sequences ---
# The tokenizer is rebuilt and refitted on the training reviews with the same
# settings (num_words=5000, "<OOV>" for unknown words).
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=100, truncating="post")

# --- Two stacked LSTM(100) layers, each followed by 20% dropout ---
model = Sequential([
    Embedding(input_dim=5000, output_dim=16, input_length=100),
    # The first LSTM must return the full sequence so the second LSTM can consume it.
    LSTM(100, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dropout(0.2),
    Dense(1, activation="sigmoid"),
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# 20% of the training data is held out for validation during fitting.
model.fit(x=X_train_padded, y=y_train, validation_split=0.2, epochs=5)

# --- Held-out evaluation ---
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
# evaluate() returns [loss, accuracy]; index 1 is the accuracy.
accuracy = model.evaluate(x=X_test_padded, y=y_test)[1]
print(f"Test Accuracy: {accuracy}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.8334418535232544


In [13]:
# --- Wider stacked LSTM: 200 units per layer, dropout 0.3 then 0.2 ---
model = Sequential([
    Embedding(input_dim=5000, output_dim=16, input_length=100),
    # The first LSTM must return the full sequence so the second LSTM can consume it.
    LSTM(200, return_sequences=True),
    Dropout(0.3),
    LSTM(200),
    Dropout(0.2),
    Dense(1, activation="sigmoid"),
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Reuses the padded training sequences prepared in an earlier cell.
model.fit(x=X_train_padded, y=y_train, validation_split=0.2, epochs=5)

# --- Held-out evaluation ---
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
# evaluate() returns [loss, accuracy]; index 1 is the accuracy.
accuracy = model.evaluate(x=X_test_padded, y=y_test)[1]
print(f"Test Accuracy: {accuracy}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.8346629738807678


In [14]:
# --- Stacked LSTM(200) x2 with heavier dropout: 0.5 after the first layer, 0.2 after the second ---
model = Sequential([
    Embedding(input_dim=5000, output_dim=16, input_length=100),
    # The first LSTM must return the full sequence so the second LSTM can consume it.
    LSTM(200, return_sequences=True),
    Dropout(0.5),  # 50% dropout rate
    LSTM(200),
    Dropout(0.2),  # 20% dropout rate
    Dense(1, activation="sigmoid"),
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Reuses the padded training sequences prepared in an earlier cell.
model.fit(x=X_train_padded, y=y_train, validation_split=0.2, epochs=5)

# --- Held-out evaluation ---
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
# evaluate() returns [loss, accuracy]; index 1 is the accuracy.
accuracy = model.evaluate(x=X_test_padded, y=y_test)[1]
print(f"Test Accuracy: {accuracy}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.8371865749359131


In [18]:
# Build the model: three stacked LSTM layers (128 -> 128 -> 64 units) over a
# 32-dimensional embedding, with dropout after each recurrent layer.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=100))
model.add(LSTM(128, return_sequences=True))  # Return the full sequence for the next LSTM
model.add(Dropout(0.5))  # 50% dropout rate
model.add(LSTM(128, return_sequences=True))  # Still feeding another LSTM
model.add(Dropout(0.5))  # 50% dropout rate
model.add(LSTM(64))  # Last recurrent layer: no return_sequences
model.add(Dropout(0.2))  # 20% dropout rate
model.add(Dense(1, activation="sigmoid"))

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model (20% of the training data is held out for validation)
model.fit(X_train_padded, y_train, epochs=10, validation_split=0.2)

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.8192771077156067


## Convolutional Neural Network

In [20]:
# Build the model: embedding -> one Conv1D (64 filters, kernel size 3) ->
# global max pooling over the sequence -> sigmoid output.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=100))
model.add(Conv1D(64, 3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation="sigmoid"))

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model (20% of the training data is held out for validation)
model.fit(X_train_padded, y_train, epochs=5, validation_split=0.2)

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.8257896304130554


In [32]:
# Build the model: same single-Conv1D network, now with L2 regularisation
# (0.01) on the convolution kernel and 50% dropout before pooling.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=100))
model.add(Conv1D(64, 3, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))  # 50% dropout rate, applied before pooling
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation="sigmoid"))

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model (20% of the training data is held out for validation)
model.fit(X_train_padded, y_train, epochs=10, validation_split=0.2)

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.8012862205505371


In [35]:
# Build the model: Conv1D (128 filters, kernel size 5) -> global max pooling
# -> L2-regularised Dense(64) head, with dropout around the dense layer.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=100))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))  # 50% dropout rate
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.2))  # 20% dropout rate
model.add(Dense(1, activation="sigmoid"))


# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model: allow up to 100 epochs, but stop once val_loss has not
# improved for 3 epochs and restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(X_train_padded, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Test Accuracy: 0.827092170715332


In [36]:
# Build the model: Conv1D (128 filters, kernel size 5) -> global max pooling
# -> Dense(64) head without L2, using 50% / 40% dropout.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=100))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))  # 50% dropout rate
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))  # 40% dropout rate
model.add(Dense(1, activation="sigmoid"))

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model: allow up to 100 epochs, but stop once val_loss has not
# improved for 3 epochs and restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(X_train_padded, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Test Accuracy: 0.8279876112937927


In [37]:
# Build the model: Conv1D (128 filters, kernel size 5) -> global max pooling
# -> L2-regularised Dense(64) head, using 50% / 40% dropout.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=100))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))  # 50% dropout rate
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.4))  # 40% dropout rate
model.add(Dense(1, activation="sigmoid"))

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model: allow up to 100 epochs, but stop once val_loss has not
# improved for 3 epochs and restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(X_train_padded, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Test Accuracy: 0.8297785520553589


In [39]:
# Build the model: L2 regularisation (0.01) moves to the Conv1D kernel, the
# Dense(64) head is unregularised, and dropout is lighter (30% / 10%).
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=100))
model.add(Conv1D(128, 5, activation='relu', kernel_regularizer=l2(0.01)))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.3))  # 30% dropout rate
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.1))  # 10% dropout rate
model.add(Dense(1, activation="sigmoid"))

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model: allow up to 100 epochs, but stop once val_loss has not
# improved for 3 epochs and restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(X_train_padded, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Test Accuracy: 0.8298599720001221


In [41]:
# Build the model: smaller network (Conv1D with 64 filters, Dense(32) head)
# with 30% / 10% dropout and no L2 regularisation.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=100))
model.add(Conv1D(64, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.3))  # 30% dropout rate
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))  # 10% dropout rate
model.add(Dense(1, activation="sigmoid"))

# Compile the model with an explicit Adam optimizer so the learning rate is visible
model.compile(optimizer=Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=["accuracy"])

# Train the model: allow up to 100 epochs, but stop once val_loss has not
# improved for 3 epochs and restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(X_train_padded, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Test Accuracy: 0.8317323327064514


In [41]:
# NOTE(review): this cell repeats the preceding cell's architecture and
# hyperparameters unchanged (and carries the same execution count) — consider
# removing one of the two.
# Build the model: smaller network (Conv1D with 64 filters, Dense(32) head)
# with 30% / 10% dropout and no L2 regularisation.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=100))
model.add(Conv1D(64, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.3))  # 30% dropout rate
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))  # 10% dropout rate
model.add(Dense(1, activation="sigmoid"))

# Compile the model with an explicit Adam optimizer so the learning rate is visible
model.compile(optimizer=Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=["accuracy"])

# Train the model: allow up to 100 epochs, but stop once val_loss has not
# improved for 3 epochs and restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(X_train_padded, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Test Accuracy: 0.8317323327064514


In [44]:
# Ablation without the Conv1D layer: the 32-dimensional embeddings are
# max-pooled over the sequence and passed to a small dense head.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=100))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))  # 50% dropout rate
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))  # 20% dropout rate
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer=Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=["accuracy"])

# Fixed 10 epochs; 20% of the training data is held out for validation.
model.fit(X_train_padded, y_train, epochs=10, validation_split=0.2)

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.8220449090003967


In [49]:
# Ablation without the Conv1D layer: 64-dimensional embeddings, max-pooled
# over the sequence, with heavier dropout (70% / 30%) around a Dense(32) head.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=100))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.7))  # 70% dropout rate
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))  # 30% dropout rate
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer=Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=["accuracy"])

# Fixed 10 epochs; 20% of the training data is held out for validation.
model.fit(X_train_padded, y_train, epochs=10, validation_split=0.2)

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.8250569701194763


In [50]:
# Ablation without the Conv1D layer: wider variant with 128-dimensional
# embeddings and a Dense(64) head, keeping 70% / 30% dropout.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.7))  # 70% dropout rate
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))  # 30% dropout rate
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer=Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=["accuracy"])

# Fixed 10 epochs; 20% of the training data is held out for validation.
model.fit(X_train_padded, y_train, epochs=10, validation_split=0.2)

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.8372679948806763


In [56]:
# Ablation without the Conv1D layer: 64-dimensional embeddings, max-pooled
# over the sequence, with 70% / 30% dropout around a Dense(32) head.
# NOTE(review): same architecture and settings as the earlier 64-dimensional
# pooled-embedding cell — presumably a repeat run; confirm it is still needed.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=100))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.7))  # 70% dropout rate
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))  # 30% dropout rate
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer=Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=["accuracy"])

# Fixed 10 epochs; 20% of the training data is held out for validation.
model.fit(X_train_padded, y_train, epochs=10, validation_split=0.2)

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.823184609413147


In [57]:
# Ablation without the Conv1D layer: 64-dimensional embeddings, max-pooled
# over the sequence, with 70% / 30% dropout around a Dense(32) head —
# this time trained with early stopping instead of a fixed epoch count.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=100))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.7))  # 70% dropout rate
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))  # 30% dropout rate
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer=Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=["accuracy"])

# Allow up to 100 epochs, but stop once val_loss has not improved for 3 epochs
# and restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(X_train_padded, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping])

# Evaluate on the test set.
# Pad the tokenized test reviews in this cell so the evaluation does not depend
# on an `X_test_padded` left over from an earlier cell. The model is fed the
# 2-D padded integer matrix directly, so no reshape is needed.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, truncating="post")
accuracy = model.evaluate(X_test_padded, y_test)[1]  # [loss, accuracy] -> accuracy
print(f"Test Accuracy: {accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Test Accuracy: 0.830511212348938
