In [1]:
import pandas as pd
import torch

In [2]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [50]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# One checkpoint name shared by the tokenizer and the classifier so the two cannot drift apart.
MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Move the classifier's weights to the device selected earlier in the notebook.
model = model.to(device)

In [4]:
# Keep only the review text, wrapped in a one-column DataFrame so derived columns can be attached later.
data = pd.DataFrame(
    pd.read_csv('data/preprocessed_dataset.csv', sep=';')['reviewText']
)

In [5]:
# Preview the loaded reviews (pandas elides the middle rows of a long frame).
data

Unnamed: 0,reviewText
0,I have read a lot of the reviews and spoken to...
1,Or maybe not. The good news is that for less ...
2,Having previously owned the LG VX9800 (one of ...
3,3.5 stars. I was very excited about the launc...
4,I want to warn IT professionals that Verizon W...
...,...
9661,No instructions. Makes me question if is real...
9662,Finally using the phone. Works great and was ...
9663,does funny things rarely . like FB will just r...
9664,This phone is no good!! Keeps freezing up. Eve...


In [6]:
def _count_tokens(text):
    """Return the length of the first row of the tensor that `tokenizer.encode` produces for `text`."""
    encoded = tokenizer.encode(text, return_tensors='pt')
    return len(encoded[0])


# Token count per review, used below to see how many reviews exceed the model's maximum length.
data['#tokens'] = data['reviewText'].apply(_count_tokens)

Token indices sequence length is longer than the specified maximum sequence length for this model (801 > 512). Running this sequence through the model will result in indexing errors


In [7]:
# Order the reviews from longest to shortest and show the first 600 rows.
longest_first = data.sort_values(by='#tokens', ascending=False)
longest_first[:600]

Unnamed: 0,reviewText,#tokens
7920,"WARNING: This is a LONG, complicated review. I...",7936
5420,When you open the box your first reaction is o...,5974
149,Beware that this is lengthy! I figure that if...,4498
39,Beware that this is lengthy! I figure that if...,4498
3879,"I've been a long time Nexus brand user, owning...",4087
...,...,...
8930,This phone should terrify Apple and Samsung an...,500
4014,I got this through my carrier because my contr...,500
8813,UPDATE TO MY PREVIOUS 1 star review:\n\nSince ...,497
5577,I ordered this phone on March 27th from ProMob...,496


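Fewer than 10% of the reviews have #tokens > 512 (the model's maximum sequence length), so these reviews must be split into chunks before scoring.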

In [29]:
def get_input_ids_and_attention_mask_chunk(tokens, chunksize=512, cls_token_id=101,
                                           sep_token_id=102, pad_token_id=0):
    """
    Split one tokenized text into fixed-length chunks.

    The first row of ``tokens['input_ids']`` and ``tokens['attention_mask']`` is cut
    into pieces of at most ``chunksize - 2`` entries. Each piece is wrapped with
    ``cls_token_id`` at the start and ``sep_token_id`` at the end (attention 1 for
    both) and, when shorter than ``chunksize``, right-padded with ``pad_token_id``
    (attention 0) so that every returned chunk has exactly ``chunksize`` entries.

    Helper tensors are created on the same device as the incoming tensors, so the
    function does not depend on any module-level ``device`` variable.

    Args:
        tokens: Mapping with 2-D ``'input_ids'`` and ``'attention_mask'`` tensors.
            Only row 0 is used. The text should be tokenized without special
            tokens, because they are added here.
        chunksize (int): Length of every returned chunk, special tokens included.
            Defaults to 512.
        cls_token_id (int): Id placed at the start of each chunk (101, [CLS]).
        sep_token_id (int): Id placed at the end of each chunk (102, [SEP]).
        pad_token_id (int): Id used for right padding (0).

    Returns:
        input_id_chunks (List[torch.Tensor]): List of chunked input_ids.
        attention_mask_chunks (List[torch.Tensor]): List of chunked attention_masks.

    Raises:
        ValueError: If ``chunksize`` is smaller than 3, which would leave no room
            for content between the two special tokens.
    """
    if chunksize < 3:
        raise ValueError(f"chunksize must be at least 3, got {chunksize}")

    input_ids = tokens['input_ids'][0]
    attention_mask = tokens['attention_mask'][0]
    ids_device = input_ids.device
    mask_device = attention_mask.device
    body_size = chunksize - 2  # room left once the two special tokens are reserved

    input_id_chunks = []
    attention_mask_chunks = []

    for ids_piece, mask_piece in zip(input_ids.split(body_size), attention_mask.split(body_size)):
        id_parts = [
            torch.tensor([cls_token_id], device=ids_device),
            ids_piece,
            torch.tensor([sep_token_id], device=ids_device),
        ]
        # The special tokens are real input, so they are attended to.
        mask_parts = [
            torch.tensor([1], device=mask_device),
            mask_piece,
            torch.tensor([1], device=mask_device),
        ]

        pad_length = chunksize - (ids_piece.shape[0] + 2)
        # Only append padding when needed: an empty padding tensor would have a
        # float dtype and could change the dtype of the concatenated chunk.
        if pad_length > 0:
            id_parts.append(torch.tensor([pad_token_id] * pad_length, device=ids_device))
            mask_parts.append(torch.tensor([0] * pad_length, device=mask_device))

        input_id_chunks.append(torch.cat(id_parts))
        attention_mask_chunks.append(torch.cat(mask_parts))

    return input_id_chunks, attention_mask_chunks

In [30]:
def sentiment_score(review):
    """
    Classify a review of arbitrary length with the module-level ``model``.

    The review is tokenized without special tokens, split into model-sized chunks
    by ``get_input_ids_and_attention_mask_chunk``, and all chunks are scored in a
    single batch. The per-chunk class probabilities are averaged with equal weight
    (a short, mostly padded final chunk counts as much as a full one), and the
    most probable class is returned.

    Args:
        review (str): Review text to classify.

    Returns:
        int: 1-based index of the most probable class (``argmax + 1``). The
        recorded outputs in this notebook show values from 1 to 5.
    """
    tokens = tokenizer.encode_plus(review, add_special_tokens=False, return_tensors='pt').to(device)
    input_id_chunks, attention_mask_chunks = get_input_ids_and_attention_mask_chunk(tokens)

    # Stack the equally sized chunks into one batch: (number of chunks, chunk length).
    input_dict = {
        'input_ids': torch.stack(input_id_chunks).long(),
        'attention_mask': torch.stack(attention_mask_chunks).int(),
    }

    # Inference only: disabling autograd avoids building a graph on every call,
    # which saves memory and time when this runs once per review.
    with torch.no_grad():
        outputs = model(**input_dict)
        probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)
        mean_probabilities = probabilities.mean(dim=0)

    return torch.argmax(mean_probabilities).item() + 1
        

In [31]:
from tqdm import tqdm
# Register tqdm with pandas; the scoring cell below relies on the resulting `progress_apply`.
tqdm.pandas()

In [32]:
%%time
# Score every review with the chunked classifier and store the result; tqdm reports progress.
data['sentiment'] = data['reviewText'].progress_apply(sentiment_score)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████████████████████████████████████████████████████████████████████████| 9666/9666 [03:42<00:00, 43.46it/s]

CPU times: total: 3min 4s
Wall time: 3min 42s





In [33]:
# Inspect the frame again, now with the '#tokens' and 'sentiment' columns attached.
data

Unnamed: 0,reviewText,#tokens,sentiment
0,I have read a lot of the reviews and spoken to...,801,4
1,Or maybe not. The good news is that for less ...,557,3
2,Having previously owned the LG VX9800 (one of ...,483,4
3,3.5 stars. I was very excited about the launc...,398,3
4,I want to warn IT professionals that Verizon W...,405,1
...,...,...,...
9661,No instructions. Makes me question if is real...,84,1
9662,Finally using the phone. Works great and was ...,22,5
9663,does funny things rarely . like FB will just r...,68,4
9664,This phone is no good!! Keeps freezing up. Eve...,29,1


In [34]:
# Distribution of the predicted ratings across all reviews.
data['sentiment'].value_counts()

sentiment
5    3952
4    2275
1    1449
2    1023
3     967
Name: count, dtype: int64

In [36]:
# Spot-check the full text of one short review that was scored 1 (row 9664).
data['reviewText'][9664]

'This phone is no good!! Keeps freezing up. Even after touching the power and home button at the same time!!'

In [47]:
import nltk
import re
# Sentence tokenizer: the pre-trained English Punkt model shipped as NLTK data.
# This cell sits before the notebook's nltk.download cell, so on a fresh
# environment the resource may be missing; fetch it on demand and retry.
try:
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
except LookupError:
    nltk.download('punkt')
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [38]:
import nltk
# Download the NLTK data packages listed below; packages that are already
# installed are reported as up to date. The last call's return value is the cell output.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.download('universal_tagset')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Livio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Livio\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Livio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Livio\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Livio\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Livio\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already u

True

In [39]:
from nltk.corpus import stopwords
# English stop-word list, used by the AFINN preprocessing path to drop words before scoring.
stop_words = stopwords.words('english')

In [40]:
from nltk.stem import WordNetLemmatizer
# Single WordNet lemmatizer instance, reused for every word in the AFINN preprocessing path.
lemmatizer = WordNetLemmatizer()

In [54]:
# Split a review into sentences and score each sentence
def preprocess_analyze_sentences(review, sa):
    """
    Split ``review`` into sentences and attach a sentiment score to each one.

    Relies on module-level objects: ``sent_tokenizer``, ``lemmatizer``,
    ``stop_words`` and ``sentiment_score``, plus ``afinn`` and ``normalize_RSS``,
    which are not defined in the visible part of this notebook and must exist
    before the ``'afinn'`` path is used.

    Args:
        review (str): Raw review text.
        sa (str): Which analyzer to use.
            ``'afinn'``: strip punctuation, tokenize into words, lower-case, drop
            stop words and lemmatize; average the non-zero ``afinn.score`` values
            of the remaining words (0 when none scores) and pass the mean through
            ``normalize_RSS``.
            ``'bert'``: pass the untouched sentence to ``sentiment_score``.

    Returns:
        dict: Maps sentence text (preprocessed for ``'afinn'``, raw for ``'bert'``)
        to its score. Identical sentence strings share a single key, so a later
        occurrence overwrites an earlier one.

    Raises:
        ValueError: If ``sa`` is neither ``'afinn'`` nor ``'bert'``.
    """
    if sa not in ('afinn', 'bert'):
        raise ValueError(f"Unsupported sentiment analyzer {sa!r}; expected 'afinn' or 'bert'")

    sentences_sentiments = {}

    for sentence in sent_tokenizer.tokenize(review):
        if sa == 'bert':
            sentences_sentiments[sentence] = sentiment_score(sentence)
            continue

        # Remove punctuation before word tokenization.
        sentence_clean = re.sub(r'[^\w\s]', '', sentence)
        # Lower-case BEFORE the stop-word test: the membership check is
        # case-sensitive, so capitalised words such as "The" would otherwise
        # slip through a lower-case stop-word list.
        words = [word.lower() for word in nltk.word_tokenize(sentence_clean)]
        preprocessed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        # Rebuild the preprocessed sentence; it is used as the dictionary key.
        preprocessed_sentence = ' '.join(preprocessed_words)

        # Only words that carry a non-zero AFINN score take part in the mean.
        afinn_scores = [score for score in map(afinn.score, preprocessed_words) if score != 0]
        mean_score = sum(afinn_scores) / len(afinn_scores) if afinn_scores else 0
        sentences_sentiments[preprocessed_sentence] = normalize_RSS(mean_score)

    return sentences_sentiments


In [43]:
# Merge all the sentence dictionaries collected for each 'asin' into a single one
def combine_sentences(sent_dicts):
    """
    Merge an iterable of sentence -> score dictionaries into one new dictionary.

    The dictionaries are applied in order, so when the same sentence appears in
    more than one of them the value from the last dictionary wins.
    """
    return {
        sentence: score
        for sentence_scores in sent_dicts
        for sentence, score in sentence_scores.items()
    }

In [55]:
r = """This cell phone has passed the proof of time under really really tough conditions. . . . .Great
things: 1. Signal: Superb, I have had many cell phones before, including Nokia which I think has a great signal,
but HTC TYTN II has much better signal. This one sustains signal in elevators while my Nokia can’t . . . . 3.
Screen: Touch screen works really well. Tilting (40 degrees) screen is nice and comfortable to work with when
you are writing over the table. 4. Sliding QWERTY keyboard is the main reason to buy it for us who don’t like
front keyboards, this makes the phone a little bulky but is great. 5. Plenty of buttons: Has plenty of buttons that
make it easy to operate. The 360 degree 3 way jog wheel paired with OK button (left side) is fantastic, great option
to operate the phone while you are driving. 6. Setting e-mail/sms accounts was really easy and fast . . . ..Good
things: 1. Processor: 400 MHz, works OK, it is not super-fast but certainly it is not slow. Phone turns on fast
(less than 1 minute to operate). 2. Platform: Windows mobile 6 is good. Until date I have had to re-start the
phone 3 or 4 times due to system fail (unable to detect end call), besides this it has worked well. 3. Camera 3
mega-pixels: Has good definition, works precisely. 4. HTC Home screen is nice, very interactive. . . . Not so
Good: 1. Camera: Does not have flash, so don’t expect to get good inside pictures. 2. Battery: Weak point, don’t
expect your battery to last more than 24 hours, and much less if you use it heavily. Requires car charger, charge
through USB. 3. Speaker: It is not so loud . . ."""

# Try the BERT path on the hand-written review above. Fragments that repeat in the text
# (e.g. '.') end up under a single dictionary key, because the result is keyed by sentence text.
preprocess_analyze_sentences(r, sa='bert')

{'This cell phone has passed the proof of time under really really tough conditions.': 5,
 '.': 3,
 '.Great\nthings: 1.': 5,
 'Signal: Superb, I have had many cell phones before, including Nokia which I think has a great signal,\nbut HTC TYTN II has much better signal.': 5,
 'This one sustains signal in elevators while my Nokia can’t .': 5,
 '3.': 3,
 'Screen: Touch screen works really well.': 5,
 'Tilting (40 degrees) screen is nice and comfortable to work with when\nyou are writing over the table.': 4,
 '4.': 4,
 'Sliding QWERTY keyboard is the main reason to buy it for us who don’t like\nfront keyboards, this makes the phone a little bulky but is great.': 4,
 '5.': 5,
 'Plenty of buttons: Has plenty of buttons that\nmake it easy to operate.': 4,
 'The 360 degree 3 way jog wheel paired with OK button (left side) is fantastic, great option\nto operate the phone while you are driving.': 5,
 '6.': 5,
 'Setting e-mail/sms accounts was really easy and fast .': 5,
 '..Good\nthings: 1.': 5,