# TripAdvisor Hotel Reviews Sentiment Analysis Project
## DistilBERT
This Jupyter notebook performs sentiment analysis (negative, neutral or positive) on a TripAdvisor Hotel Reviews dataset using DistilBERT.

### Install needed libraries

In [None]:
!pip install huggingface_hub



In [None]:
!pip install keras



In [None]:
!pip install tensorflow



In [None]:
!pip install gensim



### Import needed modules and libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import Dataset
import os
from transformers import Trainer, TrainingArguments
import torch
from google.colab import drive
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from huggingface_hub import hf_hub_download
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

### Load data

In [None]:

#Load the TripAdvisor dataset from Hugging Face.
#hf_hub_download stores the parquet file in the local Hugging Face cache, so
#re-running this cell reads it from disk instead of streaming it again.
dataset_path = hf_hub_download(
    repo_id="jniimi/tripadvisor-review-rating",
    filename="data/train-00000-of-00001.parquet",
    repo_type="dataset",
)
df = pd.read_parquet(dataset_path)


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 508038e7-ac6e-48c9-8e28-699eacc1085f)')' thrown while requesting GET https://huggingface.co/datasets/jniimi/tripadvisor-review-rating/resolve/main/data/train-00000-of-00001.parquet
Retrying in 1s [Retry 1/5].


In [None]:
#Add another column called "sentiment" which will contain the negative (0),
#neutral (1) or positive (2) sentiment depending on the "overall" rating.
RATING_TO_SENTIMENT = {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}
sentiment = df['overall'].map(RATING_TO_SENTIMENT)

#Fail loudly on missing or unexpected ratings instead of silently labelling
#them as positive, which is what a catch-all "else 2" branch would do.
if sentiment.isna().any():
  unexpected_ratings = df.loc[sentiment.isna(), 'overall'].unique().tolist()
  raise ValueError(f"Unexpected 'overall' ratings: {unexpected_ratings}")

df['sentiment'] = sentiment.astype("int64")

In [None]:
#Keep only the columns used downstream: the label, the text and the raw rating
columns_of_interest = ["sentiment", "review", "overall"]
df2 = df[columns_of_interest]
df2

Unnamed: 0,sentiment,review,overall
0,2,Really excellent Hilton\nStayed here on busine...,5.0
1,2,Exceptional service and comfort\nSpent two nig...,5.0
2,2,Nice room and five star service\nGreat place f...,5.0
3,2,"BRILLIANT hotel, my #1 Chicago pick for busine...",5.0
4,2,Convenient and comfortable\nBEST. BREAKFAST. E...,5.0
...,...,...,...
201290,2,Great find in cool neighborhood\nAlthough a na...,4.0
201291,2,Exceptional Service and great room\nI have sta...,5.0
201292,0,Beware of the Rip Off!\nI received a call for ...,2.0
201293,2,It Deserves the Rating\nI really like Kimptons...,5.0


### DistilBERT Model

In [None]:
#Work on an independent (deep) copy so later changes never touch df2
df_distilBERT = df2.copy(deep=True)

In [None]:
#The following class is used to prepare the data for the DistilBERT model
class distilBERTDataset(Dataset):
  """Map-style dataset that tokenizes one review per item.

  Args:
    reviews: Iterable of review texts (list, pandas Series, ...).
    labels: Iterable of integer class labels, aligned with ``reviews``.
    tokenizer: Callable invoked with ``text``, ``add_special_tokens``,
      ``max_length``, ``padding``, ``truncation`` and ``return_tensors="pt"``;
      it must return a mapping containing ``input_ids`` and
      ``attention_mask`` tensors with a leading batch dimension of 1.
    max_len: Length every review is padded or truncated to.

  Raises:
    ValueError: If ``reviews`` and ``labels`` differ in length.
  """

  def __init__(self, reviews, labels, tokenizer, max_len=400):
    #Materialize as lists so __getitem__ always indexes by position, even
    #when a pandas Series with a non-default index is passed in.
    self.reviews = list(reviews)
    self.labels = list(labels)
    if len(self.reviews) != len(self.labels):
      raise ValueError(
          "reviews and labels must have the same length, got "
          + str(len(self.reviews)) + " and " + str(len(self.labels))
      )
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of reviews."""
    return len(self.reviews)

  def __getitem__(self, index):
    """Return ``input_ids``, ``attention_mask`` and ``labels`` for one review."""
    review = str(self.reviews[index])
    label = self.labels[index]

    #tokenize the review
    encoding = self.tokenizer(
        text=review,
        add_special_tokens=True,
        max_length=self.max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    #squeeze(0) drops only the batch dimension; a bare squeeze() would also
    #drop the sequence dimension when max_len is 1.
    return {
        "input_ids": encoding["input_ids"].squeeze(0),
        "attention_mask": encoding["attention_mask"].squeeze(0),
        "labels": torch.tensor(int(label), dtype=torch.long),
    }

In [None]:

#Tokenizer and classifier are both loaded from the same pre-trained checkpoint
checkpoint_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint_name)

#Prefer the GPU and fall back to the CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Three target classes: negative (0), neutral (1) and positive (2)
num_labels = 3

#Load pre-trained DistilBERT with a sequence-classification head and move it
#to the selected device
model = DistilBertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=num_labels)
model.to(device)





Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
#Hold out 30% of the data (stratified by sentiment), then halve the hold-out
#into a validation part (test_*) and a final test part (final_test_*)
train_reviews, temp_reviews, train_labels, temp_labels = train_test_split(
    df_distilBERT["review"],
    df_distilBERT["sentiment"],
    test_size=0.3,
    stratify=df_distilBERT["sentiment"],
    random_state=42,
)
test_reviews, final_test_reviews, test_labels, final_test_labels = train_test_split(
    temp_reviews,
    temp_labels,
    test_size=0.50,
    stratify=temp_labels,
    random_state=42,
)

#Renumber every split from 0 so its rows can be addressed as 0..len-1
train_reviews = train_reviews.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
test_reviews = test_reviews.reset_index(drop=True)
test_labels = test_labels.reset_index(drop=True)
final_test_reviews = final_test_reviews.reset_index(drop=True)
final_test_labels = final_test_labels.reset_index(drop=True)

#Wrap each split in the dataset class defined earlier
train_data = distilBERTDataset(train_reviews, train_labels, tokenizer)
test_data = distilBERTDataset(test_reviews, test_labels, tokenizer)
final_test_dataset = distilBERTDataset(final_test_reviews, final_test_labels, tokenizer)

In [None]:
#Initialize Training arguments
num_train_epochs = 3
learning_rate = 2e-5
per_device_train_batch_size=32
per_device_eval_batch_size=32

#Load the training arguments
training_arguments = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    num_train_epochs = num_train_epochs,
    #Evaluate and checkpoint once per epoch; both strategies must match for
    #load_best_model_at_end to work
    eval_strategy ='epoch',
    save_strategy="epoch",
    learning_rate=learning_rate,
    weight_decay=0.01,
    #Key returned by compute_evaluation_metrics; a higher Macro F1 is better
    metric_for_best_model = "Macro F1",
    greater_is_better=True,
    #Gradients are accumulated over 4 batches before each optimizer step
    gradient_accumulation_steps = 4,
    load_best_model_at_end=True,
    save_total_limit=2,
    #Mixed precision needs a CUDA device; use full precision on a CPU-only
    #runtime instead of failing when the arguments are built
    fp16=torch.cuda.is_available(),
    report_to="none"
)

In [None]:
#This function is used to compute the evaluation metrics
def compute_evaluation_metrics(eval_prediction):
  """Compute metrics for one evaluation pass and save its reports to disk.

  A call counter is kept on the function object as ``current_epoch``. While
  the counter is <= ``num_train_epochs`` the files are named
  ``*_at_epoch_<n>``; afterwards they are named ``*_evaluation``. The counter
  defaults to 1 when it has not been initialised by the caller.

  Args:
    eval_prediction: Object exposing ``predictions`` (per-class scores, one
      row per sample) and ``label_ids`` (true labels).

  Returns:
    dict with the keys "accuracy", "F1 per class", "Macro F1" and
    "Confusion matrix" (the matrix flattened row by row).

  Side effects:
    Writes ``confusion_matrix_<suffix>.png`` and
    ``classification_report_<suffix>.txt`` to the working directory and
    increments ``compute_evaluation_metrics.current_epoch``.
  """
  true_labels = eval_prediction.label_ids
  predictions = np.argmax(eval_prediction.predictions, axis=1)

  #Decide once which file-name suffix this call uses
  current_epoch = getattr(compute_evaluation_metrics, "current_epoch", 1)
  if current_epoch <= num_train_epochs:
    suffix = "at_epoch_" + str(current_epoch)
  else:
    suffix = "evaluation"

  #Compute F1 score per class
  f1_score_per_class = f1_score(true_labels, predictions, average=None).tolist()
  #Compute the macro F1 score
  f1_macro = f1_score(true_labels, predictions, average="macro")

  #Compute accuracy
  accuracy = accuracy_score(true_labels, predictions)

  #Generate the confusion matrix
  confusion = confusion_matrix(true_labels, predictions)
  confusion_flattened = confusion.flatten().tolist()

  #Generate confusion matrix heatmap on its own figure and always release it,
  #even when saving fails
  fig, ax = plt.subplots()
  try:
    sns.heatmap(confusion, annot=True, fmt="d", cmap="Greens", ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    fig.savefig("confusion_matrix_" + suffix + ".png")
  finally:
    plt.close(fig)

  #Print classification report to file
  class_report = classification_report(true_labels,predictions)
  with open("classification_report_" + suffix + ".txt", "w") as out_file:
    out_file.write(class_report)

  compute_evaluation_metrics.current_epoch = current_epoch + 1

  return {"accuracy": accuracy, "F1 per class": f1_score_per_class,
          "Macro F1": f1_macro, "Confusion matrix": confusion_flattened}

In [None]:
#Load the trainer
#test_data is the validation split used for the per-epoch evaluation.
#The tokenizer is passed as processing_class because the `tokenizer` keyword
#of Trainer is deprecated in recent transformers releases.
#NOTE(review): processing_class requires a transformers version that already
#emits that deprecation warning - confirm against the pinned version.
trainer = Trainer(
    model = model,
    args = training_arguments,
    train_dataset = train_data,
    eval_dataset = test_data,
    compute_metrics = compute_evaluation_metrics,
    processing_class=tokenizer
)

  trainer = Trainer(


In [None]:
#Disable WANDB
os.environ["WANDB_DISABLED"] = "true"

#Initialize the current_epoch
#compute_evaluation_metrics reads this counter to name the per-epoch report
#files, so it is reset to 1 before every training run.
compute_evaluation_metrics.current_epoch = 1

#Train the model
#Kept as the last expression so the training summary is displayed by the cell.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 per class,Macro f1,Confusion matrix
1,0.2551,0.243036,0.900609,"[0.7986832986832987, 0.6333165322580645, 0.9586546700942588]",0.796885,"[2305, 483, 37, 588, 2513, 1131, 54, 708, 22375]"
2,0.2208,0.235763,0.90614,"[0.8077344035023714, 0.657914292791677, 0.960454390160574]",0.808701,"[2214, 570, 41, 412, 2656, 1164, 31, 616, 22490]"
3,0.1903,0.2418,0.905379,"[0.8082313287209508, 0.6522608043967024, 0.96044496737619]",0.806979,"[2278, 511, 36, 497, 2611, 1124, 37, 652, 22448]"


TrainOutput(global_step=3303, training_loss=0.23102499938465917, metrics={'train_runtime': 4421.7337, 'train_samples_per_second': 95.6, 'train_steps_per_second': 0.747, 'total_flos': 4.37479315941744e+16, 'train_loss': 0.23102499938465917, 'epoch': 3.0})

In [None]:
#Evaluate model on the held-out final test set and show the metrics; without
#capturing the return value the test results never appear in the notebook
test_metrics = trainer.evaluate(final_test_dataset)
print(test_metrics)

#Get best model
print(trainer.state.best_model_checkpoint)

./results/checkpoint-2202


In [None]:
#Persist the trained model and the tokenizer to separate local folders
trainer.save_model(output_dir="./best_model")
tokenizer.save_pretrained(save_directory="./best_tokenizer")

('./best_tokenizer/tokenizer_config.json',
 './best_tokenizer/special_tokens_map.json',
 './best_tokenizer/vocab.txt',
 './best_tokenizer/added_tokens.json',
 './best_tokenizer/tokenizer.json')

In [None]:
#Mount Drive
#Makes Google Drive available under /content/drive so the cells below can
#copy the model, tokenizer and reports there (Colab only).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Save the best model and tokenizer to Drive
!cp -r /content/best_model /content/drive/MyDrive/FinalProject/
!cp -r /content/best_tokenizer /content/drive/MyDrive/FinalProject/



In [None]:
#Save the classification reports (.txt) and confusion-matrix images (.png) to Drive
#NOTE(review): the globs copy every .txt/.png file in /content, not only the
#ones written by compute_evaluation_metrics.
!cp -r /content/*.txt /content/drive/MyDrive/FinalProject/
!cp -r /content/*.png /content/drive/MyDrive/FinalProject/