In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## BERT MODEL

In [57]:
# We'll load a pre-trained model from Hugging Face.
# A tokenizer converts a string into a sequence of numbers (token ids),
# which is then passed to the NLP model.
# AutoModelForSequenceClassification is the model architecture class from transformers.

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# we'll use argmax from torch to extract the index of the highest score in the results
import torch
# requests is used to fetch the webpage
import requests
# beautiful soup allows us to extract the data, that we want
from bs4 import BeautifulSoup
import re

In [None]:
# Instantiate the tokenizer that belongs to the pre-trained sentiment checkpoint
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [59]:
# Load the pre-trained sequence-classification model from the same checkpoint name as the tokenizer
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

Downloading:   0%|          | 0.00/638M [00:00<?, ?B/s]

### Encode and Calculate Sentiment

In [60]:
# encode converts the string into a sequence of numbers (token ids)
# the result is a 2-D tensor, i.e. a list inside a list
# return_tensors='pt' asks for a PyTorch tensor
tokens = tokenizer.encode('It was good but couldve been better. Great', return_tensors='pt')
tokens

tensor([[  101, 10197, 10140, 12050, 10502, 12296, 10598, 10662, 16197,   119,
         11838,   102]])

In [61]:
# to convert the token ids back to a string,
# take the inner sequence (row 0 of the tensor) and decode it
tokenizer.decode(tokens[0])

'[CLS] it was good but couldve been better. great [SEP]'

In [62]:
# Calling the model returns a SequenceClassifierOutput object.
# Its `logits` field holds one raw score per class (these are not one-hot values).
# The position with the highest score represents our sentiment rating.
# Ratings are [1, 2, 3, 4, 5],
# while the PyTorch positions (indices) are 0-based: [0, 1, 2, 3, 4].
# 1 - negative, 3 - neutral,  5 - positive sentiment
result = model(tokens)
result

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7768, -1.2353,  1.4419,  1.9804,  0.4584]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [63]:
result.logits

tensor([[-2.7768, -1.2353,  1.4419,  1.9804,  0.4584]],
       grad_fn=<AddmmBackward0>)

In [64]:
# argmax returns the 0-based position of the highest logit; +1 converts it to a 1-5 rating
# for our string we get a positive sentiment of 4
# (the maximum sentiment is 5)
int(torch.argmax(result.logits))+1

4

### Collect Reviews

In [65]:
# Fetch the Yelp business page that contains the reviews.
# timeout: without one, requests may wait indefinitely on an unresponsive server.
# raise_for_status: fail here on a 4xx/5xx response instead of parsing an error page later.
r = requests.get('https://www.yelp.com/biz/social-brew-cafe-pyrmont', timeout=30)
r.raise_for_status()


In [66]:
# response code
r

<Response [200]>

In [None]:
# to get the webpage text:
# this is everything that the webpage comprises (the raw HTML)
r.text

In [68]:
# parse the downloaded HTML so that it can be searched by tag and attribute
soup = BeautifulSoup(r.text, 'html.parser')

In [None]:
# the parsed page, in the format that BeautifulSoup
# is actually able to search through
soup

In [70]:
# we extract only the specific components that we want from this webpage:
# anything whose class contains the word "comment"
regex = re.compile('.*comment.*')

In [71]:
# soup.find_all() finds all the tags within the soup
# that match the given criteria
# 'p' - paragraph; in this case we are looking for paragraph tags
# whose class matches our regex,
# i.e. contains 'comment'
# on the site every review is wrapped in a <p> tag with a comment class
results = soup.find_all('p', {'class':regex})


In [72]:
# we now have all the different reviews, still wrapped in their HTML tags
results

[<p class="comment__09f24__gu0rG css-qgunke"><span class="raw__09f24__T4Ezm" lang="en">Great food amazing coffee and tea. Short walk from the harbor. Staff was very friendly</span></p>,
 <p class="comment__09f24__gu0rG css-qgunke"><span class="raw__09f24__T4Ezm" lang="en">It was ok. Had coffee with my friends. I'm new in the area, still need to discover new places.</span></p>,
 <p class="comment__09f24__gu0rG css-qgunke"><span class="raw__09f24__T4Ezm" lang="en">Great staff and food.  Must try is the pan fried Gnocchi!  The staff were really friendly and the coffee was good as well</span></p>,
 <p class="comment__09f24__gu0rG css-qgunke"><span class="raw__09f24__T4Ezm" lang="en">Ricotta hot cakes! These were so yummy. I ate them pretty fast and didn't share with anyone because they were that good ;). <br/><br/>I ordered a green smoothie to balance it all out. Smoothie was a nice way to end my brekkie at this restaurant. <br/><br/>Others with me ordered the salmon Benedict and the smoke

In [73]:
# the reviews in results are still wrapped in html tags
# we don't want the tags, just the review text
# .text gives just the text inside the html tag
results[0].text

'Great food amazing coffee and tea. Short walk from the harbor. Staff was very friendly'

In [74]:
# extract text from results
# keep only the text of every matched <p> tag
reviews = [paragraph.text for paragraph in results]

In [75]:
# we only get the text and all html tags removed
reviews

['Great food amazing coffee and tea. Short walk from the harbor. Staff was very friendly',
 "It was ok. Had coffee with my friends. I'm new in the area, still need to discover new places.",
 'Great staff and food. \xa0Must try is the pan fried Gnocchi! \xa0The staff were really friendly and the coffee was good as well',
 "Ricotta hot cakes! These were so yummy. I ate them pretty fast and didn't share with anyone because they were that good ;). I ordered a green smoothie to balance it all out. Smoothie was a nice way to end my brekkie at this restaurant. Others with me ordered the salmon Benedict and the smoked salmon flatbread. They were all delicious and all plates were empty. Cheers!",
 'I came to Social brew cafe for brunch while exploring the city and on my way to the aquarium. I sat outside. The service was great and the food was good too!I ordered smoked salmon, truffle fries, black coffee and beer.',
 "We came for brunch twice in our week-long visit to Sydney. Everything on the 

### Load Reviews into DataFrame and Score

In [77]:
# Build the frame straight from the Python list of review strings.
# Going through np.array first is unnecessary: it creates a fixed-width unicode
# array sized to the longest review, and for an empty list it yields a float64 column.
df = pd.DataFrame({'review': reviews})

In [78]:
df.head()

Unnamed: 0,review
0,Great food amazing coffee and tea. Short walk ...
1,It was ok. Had coffee with my friends. I'm new...
2,Great staff and food. Must try is the pan fri...
3,Ricotta hot cakes! These were so yummy. I ate ...
4,I came to Social brew cafe for brunch while ex...


In [79]:
df.shape

(11, 1)

In [80]:
df['review'].iloc[1]

"It was ok. Had coffee with my friends. I'm new in the area, still need to discover new places."

In [81]:
def sentiment_score(review, max_length=512):
    """Return the predicted sentiment rating (1-5) for one review string.

    The review is tokenized with the module-level ``tokenizer``, passed through
    the module-level ``model``, and the class with the highest logit is taken as
    the prediction.

    Parameters
    ----------
    review : str
        Text to score.
    max_length : int, optional
        Maximum number of tokens (special tokens included) sent to the model.
        Longer inputs are truncated at the token level. Defaults to 512, the
        limit noted for this model elsewhere in the notebook.

    Returns
    -------
    int
        Rating from 1 (negative) to 5 (positive).
    """
    # Truncate by tokens, not characters, so an over-long review cannot exceed
    # the model's input limit and raise inside the forward pass.
    tokens = tokenizer.encode(
        review,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: no gradients are needed, so skip building the autograd graph.
    with torch.no_grad():
        result = model(tokens)
    # argmax is a 0-based class index; +1 maps it onto the 1-5 rating scale.
    return int(torch.argmax(result.logits)) + 1

In [82]:
# score the review stored under index label 1
sentiment_score(df.loc[1, 'review'])

3

In [83]:
# The NLP model is limited in how much text it can take in one call
# (512 tokens here, per the note that originally accompanied this cell).
# The slice below keeps the first 512 *characters* of each review, not 512 tokens;
# cutting a long review short can influence the predicted sentiment.
# Alternatives: score a long review in several chunks and average the results.
df['sentiment'] = df['review'].str[:512].apply(sentiment_score)

In [84]:
df

Unnamed: 0,review,sentiment
0,Great food amazing coffee and tea. Short walk ...,5
1,It was ok. Had coffee with my friends. I'm new...,3
2,Great staff and food. Must try is the pan fri...,5
3,Ricotta hot cakes! These were so yummy. I ate ...,5
4,I came to Social brew cafe for brunch while ex...,5
5,We came for brunch twice in our week-long visi...,4
6,It was ok. The coffee wasn't the best but it w...,3
7,I went here a little while ago- a beautiful mo...,2
8,Great coffee and vibe. That's all you need. C...,5
9,Great coffee and vibe. That's all you need. C...,4


In [85]:
# The predicted sentiment is negative, but the review reads as sarcastic and is overall 5 stars;
# the mismatch could be due to the review being cut to its first 512 characters before scoring.
df['review'].iloc[7]

'I went here a little while ago- a beautiful morning,a lovely little brew house on a quaint street corner- perfection.I went to this cafe with my step-daughter Lucille.She was always raving about how great it was to her mother, so I thought it would be a nice idea to go here with her for her birthday... boy was I wrong.She announced her hatred for me while I was waiting for my extra large iced frappé. It felt like hours of awkward silence once she said those four words; "you\'re a low-life."Was it in my mind, or was my drink taking ages to arrive? The hands on the clock didn\'t budge from the last time I glanced at them- 7:43AM, where the fuck is my drink?"Why do you always feel you have to be my friend? You\'re not my dad!" She fired.I could only sit there, my head facing down towards the floral tablecloth that lay beneath my quivering arms. The bullet lodged in my heart.I don\'t understand why she hates me so much; is it my jokes? The funny way I walk? The fact that I often scream my

## Hugging face pipeline

In [86]:
# Method 1 - use pipeline
from transformers import pipeline

In [None]:
# Pin the checkpoint explicitly: with no `model` argument the pipeline falls back to
# whatever default the installed transformers version ships, so results are not reproducible.
# NOTE(review): this is believed to be the default 'sentiment-analysis' checkpoint —
# confirm it matches the run that produced the outputs shown below.
sentiment_classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

In [88]:
# example
sentiment_classifier(['this was just fine, a bit long though','lame, total waste of money'])

[{'label': 'POSITIVE', 'score': 0.9989022016525269},
 {'label': 'NEGATIVE', 'score': 0.9998000264167786}]

In [89]:
# NOTE(review): train_df is not created in any of the cells visible in this notebook —
# make sure it is loaded before this cell, otherwise a fresh Restart & Run All fails here.
train_df['Phrase'][1]

'This quiet , introspective and entertaining independent is worth seeking .'

In [90]:
sentiment_classifier(train_df['Phrase'][1])

[{'label': 'POSITIVE', 'score': 0.9998384714126587}]

In [91]:
# run the classifier on every phrase, one call per row;
# each cell of the new 'Pipeline' column holds the raw pipeline output for that phrase
train_df['Pipeline'] = train_df['Phrase'].apply(lambda x: sentiment_classifier(x))

In [92]:
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Pipeline
0,1,1,A series of escapades demonstrating the adage ...,1,"[{'label': 'NEGATIVE', 'score': 0.998948395252..."
1,64,2,"This quiet , introspective and entertaining in...",4,"[{'label': 'POSITIVE', 'score': 0.999838471412..."
2,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1,"[{'label': 'NEGATIVE', 'score': 0.999007523059..."
3,117,4,A positively thrilling combination of ethnogra...,3,"[{'label': 'POSITIVE', 'score': 0.999826610088..."
4,157,5,Aggressive self-glorification and a manipulati...,1,"[{'label': 'NEGATIVE', 'score': 0.999470055103..."
