In [38]:
%%capture
!pip install stanza

In [39]:
# Forwarded as `use_gpu` to every stanza.Pipeline built in this notebook.
use_gpu = False
# When True, only the first 100 reviews are processed instead of the whole train split.
shrink_dataset = True

In [14]:
import stanza
stanza.download('en') # download English model
# NOTE: `nlp` is re-bound further down to a tokenize+lemma-only pipeline;
# this default pipeline is only used for the entity demo that follows.
nlp = stanza.Pipeline('en', use_gpu=use_gpu) # initialize English neural pipeline


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| constituency | wsj       |
| depparse     | combined  |
| sentiment    | sstplus   |
| ner          | ontonotes |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [15]:
# Smoke test of the pipeline on a single sentence.
doc = nlp("University Of Arizona is a great place to learn natural language processing") # run annotation over a sentence
# Show the entity spans (text, type, character offsets) found in the sentence.
print(doc.entities)

[{
  "text": "University Of Arizona",
  "type": "ORG",
  "start_char": 0,
  "end_char": 21
}]


In [16]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:
from datasets import load_dataset

In [18]:
# Load the 'Personal_Care_Appliances_v1_00' configuration of amazon_us_reviews;
# the rest of the notebook reads only its 'train' split.
dataset = load_dataset('amazon_us_reviews','Personal_Care_Appliances_v1_00') 



  0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
dataset['train'][0]

{'marketplace': 'US',
 'customer_id': '32114233',
 'review_id': 'R1QX6706ZWJ1P5',
 'product_id': 'B00OYRW4UE',
 'product_parent': '223980852',
 'product_title': 'Elite Sportz Exercise Sliders are Double Sided and Work Smoothly on Any Surface. Wide Variety of Low Impact Exercise’s You Can Do. Full Body Workout, Compact for Travel or Home Ab Workout',
 'product_category': 'Personal_Care_Appliances',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': 0,
 'verified_purchase': 1,
 'review_headline': 'Good quality. Shipped',
 'review_body': 'Exactly as described. Good quality. Shipped fast',
 'review_date': '2015-08-31'}

In [20]:
dataset['train'].shape

(85981, 15)

In [21]:
dataset['train'][0]

{'marketplace': 'US',
 'customer_id': '32114233',
 'review_id': 'R1QX6706ZWJ1P5',
 'product_id': 'B00OYRW4UE',
 'product_parent': '223980852',
 'product_title': 'Elite Sportz Exercise Sliders are Double Sided and Work Smoothly on Any Surface. Wide Variety of Low Impact Exercise’s You Can Do. Full Body Workout, Compact for Travel or Home Ab Workout',
 'product_category': 'Personal_Care_Appliances',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': 0,
 'verified_purchase': 1,
 'review_headline': 'Good quality. Shipped',
 'review_body': 'Exactly as described. Good quality. Shipped fast',
 'review_date': '2015-08-31'}

In [22]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [23]:
# Re-bind the global `nlp` to a lighter pipeline with only the tokenize and
# lemma processors; pre_process_review_texts reads this global.
nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma',use_gpu=use_gpu)

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| lemma     | combined |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


In [24]:
# Number of reviews to process: a 100-row sample when `shrink_dataset` is set,
# otherwise every row of the train split.
n_train_rows = dataset['train'].shape[0]
size = 100 if shrink_dataset else n_train_rows

In [25]:
from tqdm.auto import tqdm

In [26]:
def pre_process_review_texts(texts):
  """Lowercase, lemmatize and stop-word-filter each text with the global `nlp` pipeline.

  Args:
    texts: iterable of review strings.

  Returns:
    A list with one string per input text: the lemmas of its words, in order,
    joined by single spaces, with empty lemmas and English stop words removed.
  """
  # Build the stop-word set once: stopwords.words() re-creates a list on every
  # call, and membership tests on a list are linear.
  english_stopwords = set(stopwords.words('english'))
  processed_texts = []
  for text in tqdm(texts):
    doc = nlp(text.lower())
    lemmatized_tokens = [
      word.lemma
      for sentence in doc.sentences
      for word in sentence.words
      # word.lemma may be empty/None, so guard before the stop-word lookup.
      if word.lemma and word.lemma not in english_stopwords
    ]
    processed_texts.append(' '.join(lemmatized_tokens))
  return processed_texts

# Lemmatize the first `size` review bodies with Stanza.
# NOTE: `processed_texts` is re-bound by the NLTK preprocessing cell further
# down, so the cells that consume it depend on execution order.
processed_texts = pre_process_review_texts(dataset['train']['review_body'][0:size])

  0%|          | 0/100 [00:00<?, ?it/s]

In [27]:
def pre_process_ratings(ratings):
  """Convert star ratings into three sentiment classes.

  A rating of 2 or less becomes 0, a rating of 4 or more becomes 2, and
  anything else becomes 1. Returns the classes as a list, in input order.
  """
  return [
    0 if stars <= 2 else (2 if stars >= 4 else 1)
    for stars in ratings
  ]
# Gold labels for the same first `size` reviews, derived from their star ratings.
true_labels = pre_process_ratings(dataset['train']['star_rating'][0:size])

In [28]:
from collections import defaultdict
# Separate pipeline with only the tokenize and sentiment processors; get_sentiment reads this global.
nlp_sentiment = stanza.Pipeline(lang='en', processors='tokenize,sentiment',use_gpu=use_gpu)


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| sentiment | sstplus  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: sentiment
INFO:stanza:Done loading processors!


In [29]:
def get_sentiment(text):
  """Return the most frequent sentence-level sentiment in `text`.

  The text is run through the global `nlp_sentiment` pipeline and each
  sentence's `sentiment` value is counted; the value with the highest count is
  returned. On a tie, the value that appeared first in the text wins.

  Raises:
    IndexError: if the pipeline yields no sentences for `text` (callers rely
      on this exception type to skip such texts).
  """
  doc = nlp_sentiment(text)
  counts = {}
  for sentence in doc.sentences:
    counts[sentence.sentiment] = counts.get(sentence.sentiment, 0) + 1
  if not counts:
    # Keep the exception type the sorted-list lookup used to raise implicitly.
    raise IndexError("no sentences found in text; cannot determine sentiment")
  # Dividing by the sentence total does not change the ranking, so compare raw
  # counts; max() returns the first maximal key in insertion order.
  return max(counts, key=counts.get)

In [30]:
def get_sentiment_using_stanza(texts,labels):
  """Predict a sentiment for each text with get_sentiment and pair it with its label.

  Args:
    texts: sequence of (pre-processed) review strings.
    labels: gold sentiment labels, aligned with `texts`.

  Returns:
    (predicted_sentiments, true_sentiments): two lists of equal length. Texts
    for which get_sentiment raises IndexError (no sentences found) are skipped
    in both lists.
  """
  predicted_sentiments = []
  true_sentiments = []
  # zip pairs each text with its label up front, so a prediction can never be
  # appended without its label (which would silently misalign the two lists).
  for text, label in tqdm(zip(texts, labels), total=len(texts)):
    try:
      sentiment = get_sentiment(text)
    except IndexError:
      # Best-effort: skip texts that yield no sentences.
      continue
    predicted_sentiments.append(sentiment)
    true_sentiments.append(label)
  return predicted_sentiments, true_sentiments
# NOTE(review): the sentiment model is fed the lowercased, stop-word-stripped
# lemmas in `processed_texts`, not the raw review text — confirm this is intended.
predicted_sentiments_stanza,true_sentiments = get_sentiment_using_stanza(processed_texts,true_labels)

  0%|          | 0/100 [00:00<?, ?it/s]

In [31]:
from sklearn.metrics import classification_report
# Stanza predictions vs. rating-derived labels (0: rating <= 2, 2: rating >= 4, 1: otherwise).
print(classification_report(true_sentiments, predicted_sentiments_stanza))

              precision    recall  f1-score   support

           0       0.60      0.55      0.57        11
           1       0.08      0.75      0.15         4
           2       0.96      0.60      0.74        85

    accuracy                           0.60       100
   macro avg       0.55      0.63      0.49       100
weighted avg       0.89      0.60      0.70       100



In [32]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [33]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [34]:
def preprocess_review_texts_using_nltk(texts):
  """Lowercase, tokenize, stop-word-filter and lemmatize each text with NLTK.

  Args:
    texts: iterable of review strings.

  Returns:
    A list with one string per input text: the WordNet lemmas of its
    non-stop-word tokens, in order, joined by single spaces.
  """
  # Both objects are loop-invariant: build the stop-word set and the
  # lemmatizer once instead of once per token / once per text.
  english_stopwords = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()
  processed_texts = []
  for text in tqdm(texts):
    tokens = word_tokenize(text.lower())
    lemmatized_tokens = [
      lemmatizer.lemmatize(token)
      for token in tokens
      if token not in english_stopwords
    ]
    processed_texts.append(' '.join(lemmatized_tokens))
  return processed_texts
# NOTE: this re-binds `processed_texts`, replacing the Stanza-lemmatized texts
# produced earlier; later cells see the NLTK-processed versions.
processed_texts = preprocess_review_texts_using_nltk(dataset['train']['review_body'][0:size])

  0%|          | 0/100 [00:00<?, ?it/s]

In [35]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [36]:
# Module-level VADER analyzer; get_sentiment_using_nltk reads this global.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_using_nltk(texts,labels):
  """Predict a sentiment class for each text with the global VADER `analyzer`.

  The 'compound' score is shifted by +1 and rounded to the nearest integer to
  obtain a class id. Assuming compound lies in [-1, 1] (as VADER documents),
  that gives 0 for compound <= -0.5, 2 for compound >= 0.5 and 1 in between
  (Python rounds exact halves to the nearest even integer).

  Args:
    texts: sequence of (pre-processed) review strings.
    labels: gold sentiment labels, aligned with `texts`.

  Returns:
    (predicted_sentiments, true_sentiments): two lists of equal length.
  """
  predicted_sentiments = []
  true_sentiments = []
  # zip pairs each text with its label up front, so a prediction can never be
  # appended without its label (which would silently misalign the two lists).
  for text, label in tqdm(zip(texts, labels), total=len(texts)):
    try:
      sentiment_score = analyzer.polarity_scores(text)['compound']
    except IndexError:
      # Best-effort, as before: skip a text the analyzer cannot score.
      continue
    predicted_sentiments.append(round(sentiment_score + 1))
    true_sentiments.append(label)
  return predicted_sentiments, true_sentiments
# `processed_texts` here holds the NLTK-preprocessed reviews; this call also
# re-binds `true_sentiments` from the Stanza evaluation.
predicted_sentiments_nltk,true_sentiments = get_sentiment_using_nltk(processed_texts,true_labels)

  0%|          | 0/100 [00:00<?, ?it/s]

In [37]:
# VADER predictions vs. rating-derived labels (0: rating <= 2, 2: rating >= 4, 1: otherwise).
print(classification_report(true_sentiments, predicted_sentiments_nltk))

              precision    recall  f1-score   support

           0       0.50      0.18      0.27        11
           1       0.05      0.50      0.09         4
           2       0.96      0.65      0.77        85

    accuracy                           0.59       100
   macro avg       0.51      0.44      0.38       100
weighted avg       0.88      0.59      0.69       100

