## 1. Import Dependencies

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np

## 2. Instantiate Model

In [2]:
# Hugging Face Hub id of the checkpoint. The tokenizer and the classifier are
# loaded from the same id so the vocabulary matches the model weights.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

## 3. Encode and Calculate Sentiment

In [7]:
# Encode a sample sentence into token ids, returned as a PyTorch tensor
# (return_tensors='pt'), and display the ids.
example = "Hi my name is nadzmi! I love to eat banana"
tokens = tokenizer.encode(example, return_tensors='pt')
tokens

tensor([[  101, 11463, 11153, 11221, 10127, 12145, 75520,   106,   151, 11157,
         10114, 39999, 64916,   102]])

In [8]:
# Decode the first (only) row of ids back to text; the result shows the
# lower-casing and the [CLS]/[SEP] markers that encode() added.
tokenizer.decode(tokens[0])

'[CLS] hi my name is nadzmi! i love to eat banana [SEP]'

In [9]:
# Run the classifier on the token ids. The output object carries `logits`
# with one raw score per class (five scores for this model).
result = model(tokens)
result

SequenceClassifierOutput(loss=None, logits=tensor([[-0.9407, -1.4766, -0.5453,  0.6227,  1.9150]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [10]:
# argmax returns the 0-based index of the highest logit; adding 1 turns it
# into the star rating.
torch.argmax(result.logits) + 1 # Rating from 1 to 5

tensor(5)

In [11]:
# Sanity check with a clearly positive sentence: encode, classify, and
# convert the winning class index to a 1-5 rating.
tokens = tokenizer.encode("She is as pretty as the moon!", return_tensors='pt')
result = model(tokens)
torch.argmax(result.logits) + 1

tensor(5)

In [12]:
# Sanity check with a neutral sentence: encode, classify, and convert the
# winning class index to a 1-5 rating.
tokens = tokenizer.encode("It is meh", return_tensors='pt')
result = model(tokens)
torch.argmax(result.logits) + 1

tensor(3)

In [13]:
# Sanity check with a clearly negative sentence: encode, classify, and
# convert the winning class index to a 1-5 rating.
tokens = tokenizer.encode("The product is terrible", return_tensors='pt')
result = model(tokens)
torch.argmax(result.logits) + 1

tensor(1)

## 4. Collect Reviews

In [14]:
# Download the business page and extract the review text from its HTML.
url = 'https://www.yelp.com/biz/pepper-lunch-melbourne'
# Without a timeout requests waits indefinitely on an unresponsive server,
# which would stall the whole notebook run.
r = requests.get(url, timeout=30)
soup = BeautifulSoup(r.text, 'html.parser')
# Keep the <p> tags whose class attribute matches the pattern "comment".
regex = re.compile("comment")
results = soup.find_all('p', {"class":regex})
reviews = [result.text for result in results]

## 5. Load Reviews into DataFrame

In [15]:
# Display the list of scraped review texts.
reviews

['I was walking past and I was thirsty so I stopped in to grab a bottle of water. And then that is when I smelt the delicious smell of steak. I had a look around and then I realised that you could cook it yourself. So instantly I sat down and ordered. I had the wagyu steak and it was great. Of course I had a side of chips with it.Everything about this meal was amazing good price as well. Thanks Pepper Lunch!',
 "3.5 stars!A chain of Japanese restaurants specializing in rice and pasta dishes on iron plates, Pepper Lunch is a casual, quick, and efficient place to try out this unique dish. At this location, there's a self-ordering machine, where you simply grab a number, scan it in, and select the items you want on the screen. Then you grab a seat, and everything is brought out! On my visit, I tried the pepper rice plate with seafood, which came with salmon, scallops, and shrimp surrounding a mound of rice with pepper, scallions, and corn. You press the rice down into the plate and wait f

In [16]:
# Build the frame directly from the Python list. Going through np.array first
# stores the strings as fixed-width unicode padded to the longest review, and
# turns an empty list into a float64 column instead of an object one.
df = pd.DataFrame({'review': reviews})

In [19]:
def predict_review(review, max_length=512):
    """Predict a 1-5 star rating for a single piece of review text.

    Args:
        review: The review text to score.
        max_length: Maximum number of tokens (special tokens included) fed
            to the model. Longer inputs are truncated by the tokenizer so the
            model never receives a sequence beyond its limit. Defaults to 512.

    Returns:
        int: The predicted rating, from 1 to 5.
    """
    # Truncate at the token level: a character-based cut by the caller cannot
    # guarantee the token count, since one character may be one token and the
    # tokenizer adds special tokens on top.
    tokens = tokenizer.encode(
        review,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        result = model(tokens)
    # The class index is 0-based; +1 maps it onto the 1-5 rating scale.
    return int(torch.argmax(result.logits) + 1)

In [20]:
# Score every review. NOTE: x[:512] keeps the first 512 *characters*, not
# tokens, so it only approximates the model's 512-token input limit.
df['sentiment'] = df['review'].apply(lambda x: predict_review(x[:512])) #512 tokens limit

In [21]:
# Display each review next to its predicted 1-5 rating.
df

Unnamed: 0,review,sentiment
0,I was walking past and I was thirsty so I stop...,5
1,3.5 stars!A chain of Japanese restaurants spec...,3
2,I like Pepper Lunch because it reminds me of g...,4
3,Came here on a Thursday night around 9pm and i...,4
4,Thinking of quick but hearty lunch along Melbo...,3
5,"Pepper Lunch is all about that sizzle, bout th...",4
6,"If you're after a quick, efficient and tasty m...",5
7,I went here for dinner with my partner on a Su...,5
8,"Today was my first visit here, so I didn't kno...",4
9,Pepper lunch is so good. The beef pepper lunch...,5


## 6. Pipeline

In [22]:
def collect_reviews(url, css_selector, timeout=30):
    """Download a page and return the text of every matching ``<p>`` element.

    Args:
        url: Address of the page to fetch.
        css_selector: Despite the name, this is a regular-expression pattern
            (compiled with ``re.compile``) that is matched against the
            ``class`` attribute of each ``<p>`` tag, not a CSS selector.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        list of str: The text content of each matching paragraph, in
        document order. Empty if nothing matches.
    """
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    regex = re.compile(css_selector)
    results = soup.find_all('p', {"class": regex})
    return [result.text for result in results]

def predict_reviews(urls, css_selector):
    """Scrape reviews from one or more pages and rate each of them.

    Args:
        urls: A single URL string or an iterable of URL strings.
        css_selector: Pattern forwarded to ``collect_reviews`` to pick the
            review paragraphs on each page.

    Returns:
        pandas.DataFrame: One row per review with the columns ``review``
        (the text) and ``sentiment`` (the value from ``predict_review``).
    """
    if isinstance(urls, str):  # a single URL was given; treat it as a list of one
        urls = [urls]

    reviews = []
    for url in urls:
        reviews.extend(collect_reviews(url, css_selector))

    # Build the frame straight from the list. A np.array round-trip would pad
    # every string to the longest review and give an empty result a float64
    # column.
    df = pd.DataFrame({'review': reviews})
    # x[:512] keeps the first 512 characters (not tokens) of each review as a
    # rough guard for the model's input limit.
    df['sentiment'] = df['review'].apply(lambda x: predict_review(x[:512]))

    return df