# Install and Import Dependencies

In [7]:
# Install PyTorch first, using the command for your platform from https://pytorch.org/get-started/locally/

In [5]:
!pip install transformers requests beautifulsoup4 pandas numpy

Collecting transformers
  Downloading transformers-4.37.1-py3-none-any.whl.metadata (129 kB)
     -------------------------------------- 129.4/129.4 kB 3.8 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.12.25-cp38-cp38-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 42.0/42.0 kB 2.0 MB/s eta 0:00:00
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.1-cp38-none-win_amd64.whl.metadata (6.8 kB)
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.2-cp38-none-win_amd64.whl.metadata (3.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.19.3->transformers)
  Downloading fsspec-2023.12.2-py3-none-any.whl.metadata (6.8 kB)
Downloading transformers-4.37.1-py3-none-any.whl (8.4 MB)
   --------------------------------------


[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: C:\Users\owner\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


    Found existing installation: huggingface-hub 0.12.1
    Uninstalling huggingface-hub-0.12.1:
      Successfully uninstalled huggingface-hub-0.12.1
Successfully installed fsspec-2023.12.2 huggingface-hub-0.20.3 regex-2023.12.25 safetensors-0.4.2 tokenizers-0.15.1 transformers-4.37.1


In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import numpy as np 
import pandas as pd 

# Instantiate and Setup the Model

In [8]:
# Hugging Face Hub ID of the pretrained checkpoint. Kept in one place so the
# tokenizer and the model are always loaded from the same checkpoint.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'

# The tokenizer converts text to input IDs; the model is loaded through the
# sequence-classification auto class.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

# Encode and Calculate a Sentiment

In [10]:
# Turn a sample sentence into token IDs, returned as a PyTorch tensor
tokens = tokenizer.encode(
    'It was good but couldve been better. Great',
    return_tensors='pt',
)

# Show the encoded IDs
tokens

tensor([[  101, 10197, 10140, 12050, 10502, 12296, 10598, 10662, 16197,   119,
         11838,   102]])

In [11]:
# Decoding: map the first row of token IDs back to text to check the round trip
tokenizer.decode(tokens[0])

'[CLS] it was good but couldve been better. great [SEP]'

In [12]:
# Run the model on the encoded sentence
result = model(tokens)
result # the logits are raw per-class scores (they can be negative), not probabilities

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7768, -1.2353,  1.4419,  1.9804,  0.4584]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [14]:
# The position of the largest logit is 0-based; add one to get the Sentiment Score from 1 to 5
SS = result.logits.argmax().item() + 1
SS

4

# Collect Reviews from Yelp

In [15]:
# Download the Yelp business page. The timeout stops the cell from hanging
# forever on an unresponsive server, and raise_for_status() surfaces HTTP
# errors instead of silently parsing an error page into an empty review list.
r = requests.get('https://www.yelp.com/biz/social-brew-cafe-pyrmont', timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser') # Parse the returned HTML
regex = re.compile('.*comment.*') # Matches any class name that contains 'comment'
results = soup.find_all('p', {'class':regex}) # All <p> tags whose class matches the regex
reviews = [tag.text for tag in results] # Keep only the text of each matched paragraph

# Loading the Reviews into a DataFrame and Scoring Them

In [24]:
# Function to calculate sentiment
def sentiment_score(review, max_length=512):
    """Return the predicted sentiment of ``review`` as an integer score.

    Args:
        review: Text to score.
        max_length: Maximum number of tokens passed to the model; longer
            inputs are truncated by the tokenizer. Defaults to 512.

    Returns:
        The 1-based index of the highest-scoring class, i.e. the argmax of
        the model's logits plus one.
    """
    # Truncate at the token level so a long review cannot exceed the model's input limit.
    tokens = tokenizer.encode(review, return_tensors='pt', truncation=True, max_length=max_length)
    # Inference only: there is no need to track gradients.
    with torch.no_grad():
        result = model(tokens)
    return int(torch.argmax(result.logits))+1

In [26]:
# Creating a Dataframe
# Build the single 'Review' column straight from the list of scraped review
# strings; a NumPy round-trip is not needed.
df = pd.DataFrame({'Review': reviews})
df.head()

Unnamed: 0,Review
0,Very cute coffee shop and restaurant. They hav...
1,Six of us met here for breakfast before our wa...
2,"Great service, lovely location, and really ama..."
3,Great place with delicious food and friendly s...
4,Some of the best Milkshakes me and my daughter...


In [28]:
# Score every entry of the 'Review' column, keeping the row order
score = [sentiment_score(text) for text in df['Review']]

In [34]:
# Store the computed scores in the 'Sentiment' column (one score per review row)
df['Sentiment'] = score

In [35]:
# Preview the first rows, now including the 'Sentiment' column
df.head()

Unnamed: 0,Review,Sentiment
0,Very cute coffee shop and restaurant. They hav...,5
1,Six of us met here for breakfast before our wa...,4
2,"Great service, lovely location, and really ama...",5
3,Great place with delicious food and friendly s...,5
4,Some of the best Milkshakes me and my daughter...,5


# Alternative: Scoring with `apply`

Instead of the explicit loop, we can score every review with `Series.apply` and a lambda:

In [36]:
# Creating a Dataframe
# Re-create the frame from the scraped review strings so it holds only the
# 'Review' column; a NumPy round-trip is not needed.
df = pd.DataFrame({'Review': reviews})
df.head()

Unnamed: 0,Review
0,Very cute coffee shop and restaurant. They hav...
1,Six of us met here for breakfast before our wa...
2,"Great service, lovely location, and really ama..."
3,Great place with delicious food and friendly s...
4,Some of the best Milkshakes me and my daughter...


In [38]:
# Score each review with sentiment_score via Series.apply.
# NOTE(review): x[:512] is a character slice - it keeps the first 512 characters of
# each review, not 512 tokens - confirm whether token-level truncation is intended.
df['Sentiment'] = df['Review'].apply(lambda x: sentiment_score(x[:512])) # The model pipeline is limited to 512 tokens
df.head()

Unnamed: 0,Review,Sentiment
0,Very cute coffee shop and restaurant. They hav...,4
1,Six of us met here for breakfast before our wa...,4
2,"Great service, lovely location, and really ama...",5
3,Great place with delicious food and friendly s...,5
4,Some of the best Milkshakes me and my daughter...,5
