In [2]:
#!pip install transformers

In [4]:
import pandas as pd
from transformers import pipeline

In [5]:
# load the data
df = pd.read_csv('Hotel_Reviews.csv')

# keep only the reviews that are greater than or equal to 8.5 
# or less than or equal to 6.5
df = df.loc[(df.Reviewer_Score>=8.5) | (df.Reviewer_Score<=6.5)].copy()

# create a function for the label

def review_label(x):
    if x>=8.5:
        return 1
    else:
        return 0
    

# create the label column    
df['label'] = df['Reviewer_Score'].apply(lambda x: review_label(x))


# replace the 'No Positive' and the 'No Negative' 
# from the corresponding columns with an empty string

df['Positive_Review'].replace('^No Positive$', '', inplace=True, regex=True)
df['Negative_Review'].replace('^No Negative$', '', inplace=True, regex=True)

# concatenate the Positive and Negative Review columns into 'Reviews'
df['reviews'] = df['Negative_Review'] + df['Positive_Review']

# remove the empty reviews
df = df.loc[df['reviews'].apply(lambda x:len(x.split()))>0]
df = df[['reviews', 'label']].copy()
df.reset_index(drop = True, inplace = True)
df.head(10)

Unnamed: 0,reviews,label
0,I am so angry that i made this post available...,0
1,My room was dirty and I was afraid to walk ba...,0
2,Cleaner did not change our sheet and duvet ev...,0
3,Apart from the price for the brekfast Everyth...,1
4,Even though the pictures show very clean room...,0
5,Nothing all great Rooms were stunningly deco...,1
6,6 30 AM started big noise workers loading woo...,0
7,The floor in my room was filfy dirty Very bas...,0
8,This hotel is being renovated with great care...,1
9,The staff in the restaurant could of been mor...,1


In [6]:
df.shape

(361844, 2)

In [7]:
# undersample the data

df_grouped_by = df.groupby(['label'])
 
df_balanced = df_grouped_by.apply(lambda x: x.sample(df_grouped_by.size().min()).reset_index(drop=True))
 
df_balanced = df_balanced.droplevel(['label'])

# shuffle the data frame
df = df_balanced.sample(frac=1, random_state=1).reset_index(drop=True)
df.head(10) 

Unnamed: 0,reviews,label
0,Going to London Central took us an hour every...,1
1,Nothing specific Location,0
2,The room cleanliness we had to ask everyday f...,0
3,Room was to hot for babies It s all on a sens...,0
4,It s old and not a 5 star,0
5,So far everything is okay Really a great loca...,1
6,Nice place to stay,1
7,location,1
8,Noise Staff were so helpful,1
9,No free wifi even its 5 star hotel Location o...,0


In [8]:
# create the train and test datasets
train = df.sample(frac=0.75, random_state=1)
test = df.drop(train.index)

In [9]:
# import the "sentiment-analysis" classifier
from transformers import pipeline
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [15]:
classifier("I hate you")[0]['label']

'NEGATIVE'

In [16]:
predictions = []
for r in test['reviews']:
  try:
    pred = classifier(r)[0]['label']
    if pred == 'NEGATIVE':
      predictions.append(0)
    else:
      predictions.append(1)


  except:
    predictions.append(0)
    print(r)


 We really didn t like that we couldn t get picked up from the train station and that the shuttle was not 24 hours The neighbourhood was a little bit sketchy and the hotel is well hidden in between buildings and back alleys so it would have been nice to get a ride We eventually figured out the bus system but it was a pain the first couple of days We went to the hotel restaurant one of the nights and it was awful They had one waiter on which would have been fine considering there were only 3 or 4 tables to serve But he spent 15 minutes talking to the table next to us and then it took another 10 for him to come over to us to talk to us for the first time since we had been seated I ordered off of the kids menu a margarita pizza and fries which turned out to be half of a frozen store bought pizza which I figured out when I asked for less sauce on the pizza and he told me they were pre made My husband got a burger and the meat was the cheapest you could buy It was very fatty and very grey T

In [18]:
from sklearn.metrics import classification_report

print(classification_report(test['label'], predictions))

              precision    recall  f1-score   support

           0       0.77      0.84      0.80     17107
           1       0.82      0.74      0.78     16893

    accuracy                           0.79     34000
   macro avg       0.79      0.79      0.79     34000
weighted avg       0.79      0.79      0.79     34000

