**Author**: Victor Teixidó López

In [21]:
!pip install -q transformers
!pip install -q torchvision

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

import torch

from matplotlib.pyplot import plot

In [23]:
import torch
import gc

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Collect unreachable Python objects first so any tensors they hold are freed,
# then ask PyTorch to release its cached, unused GPU memory.
gc.collect()
if device == 'cuda':
    torch.cuda.empty_cache()

In [24]:
# Location of the preprocessed reviews (presumably a mounted Google Drive).
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
DATA_PATH = '/content/drive/MyDrive/UNI/tfg/preprocessed_data.csv'

# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV
# was saved together with its index; confirm before relying on that column.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,ratings,reviews
0,4,ever disneyland anywhere find disneyland hong ...
1,4,since last time visit hk disneyland yet time s...
2,4,thanks god hot humid visit park otherwise woul...
3,4,hk disneyland great compact park unfortunately...
4,4,location city take around 1 hour kowlon kid li...


# Data splitting

We will split the data into training, validation and test sets.

In [25]:
from sklearn.model_selection import train_test_split

# Features are the review texts, targets are the ratings.
X, y = df['reviews'], df['ratings']

# First cut: hold out 30% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Second cut: take 20% of the remaining 70% as the validation set.
# Effective proportions of the full data: 56% train, 14% validation, 30% test.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Tokenization

Once the data has been preprocessed and split, we tokenize the reviews into (sub)word tokens.

In [26]:
from transformers import AutoTokenizer

# Identifier of the pretrained checkpoint. The tokenizer created here and the
# model loaded in a later cell both come from this same checkpoint.
# Alternative checkpoint (disabled):
#checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [27]:
# Tokenize every training review in a single call. The result is requested as
# PyTorch tensors (return_tensors='pt'), with truncation and padding enabled so
# the sequences can be stacked into rectangular tensors.
X_train_tokenized = tokenizer(X_train.tolist(), truncation=True, padding=True, return_tensors='pt')
# Disabled: tokenization of the validation set.
# NOTE(review): as written, this line would tokenize X_train again, not X_val.
#X_val_tokenized = tokenizer(X_train.tolist(), truncation=True, padding=True, return_tensors="pt")

# Try the model!

In [28]:
from transformers import AutoModelForSequenceClassification

# Load the sequence-classification model from the same checkpoint as the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Move the model to `device`, where the input batches are also sent before inference.
model = model.to(device)

In [29]:
from torch.utils.data import DataLoader, TensorDataset

# One tensor per tokenizer output field, in the same order as the tokenizer's keys.
X_train_tensors = list(X_train_tokenized.values())

# Wrap the tensors so that item i is the tuple of all fields for review i.
dataset = TensorDataset(*X_train_tensors)

# Small batches keep the memory footprint of each forward pass low.
batch_size = 4

# shuffle=False keeps the batches in X_train order; the cell that prints
# X_train[0:4] next to the first batch's predictions relies on this.
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)

In [30]:
model.eval()  # Evaluation mode (e.g. dropout is disabled)

results = []          # Logits of every batch, moved to the CPU
first_output = None   # Full model output of the first batch, inspected in later cells
num_batches = len(dataloader)
field_names = list(X_train_tokenized.keys())

# Inference only: no_grad() stops autograd from recording a graph for every
# forward pass, which would otherwise waste GPU memory and time.
with torch.no_grad():
    for progress, batch in enumerate(dataloader, start=1):
        # The dataset yields tensors in the same order as the tokenizer's keys,
        # so zipping them back together rebuilds the model's keyword arguments.
        inputs = {key: tensor.to(device) for key, tensor in zip(field_names, batch)}
        output = model(**inputs)

        if progress == 1:
            first_output = output

        # Keep only the logits, on the CPU, so GPU memory does not grow with
        # the number of batches processed.
        results.append(output.logits.cpu())

        if progress % 2000 == 0:
            print("Progress: " + str(progress) + " of " + str(num_batches))

# One row of logits per training review, in dataloader (i.e. X_train) order.
# None when the dataloader produced no batches.
eval_results = torch.cat(results, dim=0) if results else None

Progress: 2000 of 13096
Progress: 4000 of 13096
Progress: 6000 of 13096
Progress: 8000 of 13096
Progress: 10000 of 13096
Progress: 12000 of 13096


In [31]:
# Convert the first batch's logits into per-class probabilities (one row per review).
class_probs = torch.softmax(first_output.logits, dim=1)
# Most probable class per row; the +1 below maps the 0-based class index to a 1-based label.
predicted_labels = class_probs.argmax(dim=-1)
print(class_probs)
print(predicted_labels + 1)

tensor([[0.1353, 0.0788, 0.1120, 0.2116, 0.4622],
        [0.2668, 0.3868, 0.2453, 0.0824, 0.0188],
        [0.0067, 0.0245, 0.1920, 0.6124, 0.1644],
        [0.0032, 0.0070, 0.1374, 0.5611, 0.2913]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([5, 2, 4, 4], device='cuda:0')


In [33]:
# Show each of the first four training reviews next to the prediction for the
# matching row of the first batch.
for row, review in enumerate(X_train[:4]):
    print(
        f"Input: {review}",
        f"Predicted Class Probabilities: {class_probs[row]}",
        f"Predicted Label: {predicted_labels[row]+1}",
        "",  # blank separator line between reviews
        sep="\n",
    )

Input: disneyland park times adult without child time bring two year old daughter say bring kid see disneyland kid eye different experience see light see character priceless avoid come time take picture hilarious hahaha guess different see far away get close character amazing super patient
Predicted Class Probabilities: tensor([0.1353, 0.0788, 0.1120, 0.2116, 0.4622], device='cuda:0',
       grad_fn=<SelectBackward0>)
Predicted Label: 5

Input: time step symbolic globe uss every minute feel magical enter place lot visitor jut like we excite overwhelmed beautiful sight inside gigantic place actually book ticket advance fortunately experience hassle difficulty buy ticket booth fast though long queue enjoy attraction feel disappointed transformer water world attraction cancel due technical reason due bad weather condition overall experience instagrammable photo disappoint feel satisfied may suggest wear comfortable shoe walk patient queuing come early enjoy ride attraction
Predicted Class

In [None]:
#device = "cpu"
#X_train_tokenized = X_train_tokenized.to(device)

#output = model(**X_train_tokenized)

In [None]:
#outputs = model(**inputs)
#print(outputs.last_hidden_state.shape)

In [11]:
# Sanity check on two hand-written sentences with opposite sentiment.
inputs = tokenizer(["I love you", "I hate you"], truncation=True, padding=True, return_tensors='pt')
# Send the encoded batch to the same device as the model.
inputs = inputs.to(device)

# Inference only: skip autograd bookkeeping for this forward pass.
with torch.no_grad():
    outputs = model(**inputs)

In [19]:
# Convert the sanity-check logits into per-class probabilities (one row per sentence).
class_probs = torch.softmax(outputs.logits, dim=1)
# Most probable class per row; the +1 below maps the 0-based class index to a 1-based label.
predicted_labels = class_probs.argmax(dim=-1)
print(class_probs)
print(predicted_labels + 1)

tensor([[0.0055, 0.0050, 0.0204, 0.1145, 0.8547],
        [0.6346, 0.1560, 0.0513, 0.0405, 0.1175]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([5, 1], device='cuda:0')
