# ABSA using existing pre-trained model from HuggingFace
##### The model that is being used is yangheng/deberta-v3-large-absa-v1.1. Additional details about the model can be found here: https://huggingface.co/yangheng/deberta-v3-base-absa-v1.1

## Load the required libraries

In [38]:
# Import all the required libraries

from transformers import pipeline
import pandas as pd
import ast
from pandas import json_normalize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from optimum.bettertransformer import BetterTransformer
from timeit import default_timer as timer
import math 

import warnings
warnings.filterwarnings("ignore")

## Get the data and clean the data for analysis

In [39]:
# Load the 2000 sample CA restuarants data set. 
temp_ca_resturants_df = pd.read_csv('random_2k.csv')
ca_resturants_df = temp_ca_resturants_df.copy()

In [40]:
# Expand the attributes column
ca_resturants_df['attributes'] = ca_resturants_df['attributes'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)

#  flatten Json 
flattened_attributes = json_normalize(ca_resturants_df['attributes'])

# Combine the flattened attributes with the original DataFrame
ca_resturants_df = pd.concat([ca_resturants_df, flattened_attributes], axis=1)

for column in ['Ambience', 'GoodForMeal']:
    ca_resturants_df[column] = ca_resturants_df[column].fillna('{}')
    ca_resturants_df[column] = ca_resturants_df[column].apply(ast.literal_eval)
    flattened_attributes = json_normalize(ca_resturants_df[column])

ca_resturants_df = ca_resturants_df.drop(['attributes','Ambience', 'GoodForMeal'], axis=1)

ca_resturants_df[['business_id','review_id','RestaurantsGoodForGroups','RestaurantsPriceRange2',\
    'NoiseLevel','GoodForKids','text']].to_csv('random_2k_expanded.csv')

## Load the expanded data set for using in Deberta model

In [41]:
# The expanded data set
df = pd.read_csv('random_2k_expanded.csv')

In [42]:
# print the first five rows
df[['business_id','review_id','text']].head(1)

Unnamed: 0,business_id,review_id,text
0,bp482wqF6v80AltDNlPDfw,QlrkbeGxv3Tna7iAP9eP9A,I'm only giving one star because it's the lowe...


In [43]:
# Load the pre-trained model from huggingface. This model was trained based on the FAST-LCF-BERT model with microsoft/deberta-v3-base

model_name = "yangheng/deberta-v3-large-absa-v1.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name) # use the pre-trained model

# Build the classifier which will be used to classify text data for a given aspect
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [44]:
# In the initial model, we used all the sentences and noticed that the model was hallucinating and scoring very high for certain 
# aspects without any mention of that aspect in the review

# To ensure that the model is not hallucinating and that the model is scoring aspects with high confidence scores only when an aspect is mentioned,
# we used the approach of only sentences with aspects or aspect-related words

# To obtain aspect-related sentences we relied on ChatGPT 3.5 to give us synonymns for our specific aspect words

# We then manually removed words which isn't usually used in reviews type writing or any duplicates 
aspect_synonymns = {
                    'ambiance':['ambiance', 'Environment', 'Setting', 'Mood', 'Vibes', 'Decor', 'Surroundings'],
                    'atmosphere': ['atmosphere', 'Environment', 'Ambiance', 'Vibe', 'Mood', 'Setting', 'Aura', 'Feel'],
                    'dessert':['dessert', 'Sweet treats', 'Confections', 'Pastries', 'Sweets', 'After-dinner delights', 'Pudding', 'Treats', 'Goodies', 'Delicacies', 'Sundaes'],
                    'drinks':['drinks', 'Beverages', 'Cocktails', 'Refreshments', 'Brews'],
                    'entertainment':['entertainment', 'Amusement', 'Recreation', 'Leisure', 'Fun', 'Enjoyment', 'Pleasure', 'Pastime'],
                    'experience':['experience', 'Event', 'Adventure', 'Impressions', 'Occasion', 'Observation', 'Encounter', 'Interaction'],
                    'food':['food', 'flavor', 'taste', 'palate', 'delicious', 'savoriness', 'tastiness', 'flavorsome', 'yumminess', 'appetizing'],
                    'food portion':['food portion', 'Serving size', 'Quantity', 'Portion size', 'Size', 'Serving'],
                    'kid friendly':['kid friendly', 'Child friendly', 'Family friendly', 'Suitable for kids', 'Safe for children', 'Geared toward children', 'Appropriate for families','Friendly for kids'],
                    'location':['location', 'Place', 'Position', 'Locale', 'venue', 'site', 'area'],
                    'price':['price', 'cost', 'fee', 'pricing', 'rate', 'value', 'expensive', 'rate', 'affordability'],
                    'seating':['seating', 'seats', 'chairs', 'accomodation', 'places to sit'],
                    'service':['service', 'assistance', 'help', 'support', 'customer service', 'care', 'treatment'],
                    'setting':['setting', 'environement','ambiance', 'surroundings', 'scene', 'location', 'atmosphere', 'background'],
                    'space':['space', 'area','room','place','spaciousness'],
                    'utilities':['utilities', 'utility', 'service', 'amenities', 'facilities', 'features', 'resources', 'necessities'],
                    'waiting time':['wait', 'waiting time', 'delay', 'time to wait', 'delay', 'time to wait'],
                    'group friendly':['group friendly', 'friendly for groups', 'suitable for groups', 'groups'],
                    'noise level':['noise level', 'noise'],
                    'food expensive':['food expensive', 'price of food', 'food price']
                    }

### The next code block is where we find the sentiment and confidence score for each aspect of each review.
#### The algorithm is as follows
1) For each review, find the individual sentences first
2) Out of all the sentences, keep only the ones that have aspects or aspect-related sentences
3) Then, for that given sentence that contains the aspects (or related words) pass it through the classifier and obtain the sentiment and confidence score

In [45]:

# Given this block of code runs for a long time, upwards of two hours, please use sample of "df' dataframe to execute this code block
start = timer()

# The original set of aspects
aspects = ['ambiance','atmosphere','dessert','drinks','entertainment','experience','food','food portion','kid friendly',\
          'location','price','seating','service','setting','space','utilities','waiting time', 'group friendly', 'noise level',\
          'food expensive']

temp_result_df = pd.DataFrame()

# If this code is going to be executed, please use the sample of the data frame to run it to avoid any issues
review_df = df[['business_id','review_id', 'text']].dropna()#.sample(10, random_state=42)

for i in range(len(review_df)):
    y=review_df[i:i+1]
    sequence_to_classify = y['text'].values[0]
    review_id = y['review_id'].values[0]
    business_id	= y['business_id'].values[0]

    #print(i)
    for aspect in aspects: # for each aspect
        for txt in sequence_to_classify.split('.'): # split the review text by period operator
            for asp_syn in aspect_synonymns[aspect]: # for a given aspect, find all the related words using the 'aspect_synonymns' dictionary  
                
                if asp_syn.lower() in txt.lower(): # if the aspect related words is in the review sentence

                    results = classifier(sequence_to_classify,  text_pair=aspect) # find the sentiment and confidence score

                    # Add the result to a dataframe
                    
                    df2 = pd.DataFrame([[business_id,
                                             review_id, 
                                             aspect, # use the asepect word for easy aggregation later
                                             results[0]['label'],
                                             results[0]['score']]])
                    # Store all the results
                    temp_result_df = pd.concat([temp_result_df,df2]) 
end = timer()

print(end - start) # to keep tab of the time it takes to run this code block

In [10]:
# Print the top 5 rows
temp_result_df.head(1)

Unnamed: 0,0,1,2,3,4
0,bp482wqF6v80AltDNlPDfw,QlrkbeGxv3Tna7iAP9eP9A,experience,Negative,0.999823


In [11]:
# print the total number of rows in the data frame
len(temp_result_df)

10987

In [12]:
# Given that in a given review, there can be multiple sentences with the aspect as follows:
# for example. 'The food was very good. I really like how nicely they cooked the food'
# There are two sentences for the aspect 'food'. So we will have scores for both these sentences 
# which we need to aggregate since we only need one aspect per review. This is what 
# we are doing below.

temp_result_gped_df = temp_result_df.groupby(by=[0, 1, 2, 3]).mean().reset_index()

In [13]:
# We want to pivot the data frame so that aspects become columns
result_df = temp_result_gped_df.pivot_table(values=[3, 4], \
                                       index=[temp_result_gped_df[0], temp_result_gped_df[1]], \
                                       columns=[2], \
                                       aggfunc='first').reset_index()

In [14]:
# Print the top row in the data frame 
result_df.head(1)

Unnamed: 0_level_0,0,1,3,3,3,3,3,3,3,3,...,4,4,4,4,4,4,4,4,4,4
2,Unnamed: 1_level_1,Unnamed: 2_level_1,ambiance,atmosphere,dessert,drinks,entertainment,experience,food,food portion,...,kid friendly,location,noise level,price,seating,service,setting,space,utilities,waiting time
0,-3AooxIkg38UyUdlz5oXdw,Bl1VqhkkGx7EoUrDOCFW8w,,Negative,,,,,Positive,,...,,0.517108,,,,0.453472,,0.537013,,0.680859


In [15]:
# The result dataframe is multi-index so drop one level
result_df.columns = result_df.columns.droplevel()

In [17]:
# Set the columns of the result data frame

result_df.columns = ['business_id', 'review_id', \
                     'ambiance_label', 'atmosphere_label', 'dessert_label', 'drinks_label', 'entertainment_label',	\
                     'experience_label', 'food_label', 'food_portion_label', 'group_friendly_label', 'kid_friendly_label', 'location_label', \
                     'noise_level_label', 'price_label',	'seating_label', 'service_label', 'setting_label', 'space_label', \
                     'utilities_label', 'waiting_time_label', 'ambiance_score', 'atmosphere_score', 'dessert_score', 'drinks_score',\
                     'entertainment_score',	'experience_score',	'food_score', 'food_portion_score', 'group_friendly_score', 'kid_friendly_score',\
                     'location_score', 'noise_level_score', 'price_score', 'seating_score', 'service_score', 'setting_score', 'space_score',	\
                     'utilities_score', 'waiting_time_score']


In [18]:
# Print one row of the result data frame
result_df.head(1)

Unnamed: 0,business_id,review_id,ambiance_label,atmosphere_label,dessert_label,drinks_label,entertainment_label,experience_label,food_label,food_portion_label,...,kid_friendly_score,location_score,noise_level_score,price_score,seating_score,service_score,setting_score,space_score,utilities_score,waiting_time_score
0,-3AooxIkg38UyUdlz5oXdw,Bl1VqhkkGx7EoUrDOCFW8w,,Negative,,,,,Positive,,...,,0.517108,,,,0.453472,,0.537013,,0.680859


In [19]:
# Save the results from the model with business id, review id and aspect names to the csv
result_df.to_csv('deberta_aspect_score_sentiment_results.csv')

## Processing of results from Deberta model

In [20]:
# Create global variables to store results

global model_result_temp_df
global model_result_df

model_result_temp_df = pd.DataFrame(result_df.business_id.unique(), columns=['business_id'])

In [21]:

def findSentimentPerBusiness(x):
    """ 
    Purpose of the function is to find sentiment per business per aspect
    For each business per aspect:
    - the sentiment is marked negative, if more than 50% of reviews are negative
    - the sentiment is marked positive, if more than 50% of reviews are positive
    - the sentiment is marked neutral, if equal number of reviews are negative and positive
    - the sentiment is marked no mention, if there is no mention of that aspect and in all other cases

    Input: Dataframe with aspect label and negative and positive scores for each business id
    Output: Dataframe with sentiment per aspect per business
    """
    if x.dr!=0:
        if x.negative/(x.negative+x.positive)==0.5:
            return 'Neutral'
        elif (x.negative/(x.negative+x.positive)) > 0.5:
            return 'Negative'
        else:
            return 'Positive'
    else:
        return 'No mention'


def findSentimentPerBusinessModelBased(temp_df, aspect):
    """ 
    Purpose of the function is to find sentiment per business per aspect starting with sentiment per review per aspect
    For each review per aspect:
    - the label is marked negative, if the confidence score is >=0.7 and the label is negative
    - the label is marked positive, if the confidence score is >=0.7 and the label is positive

    Aggregate the review at the business level and then find the sentiment per business per aspect using the function 
    'findSentimentPerBusiness'.
    
    Input: Dataframe with aspect label and aspect score for each review id
    Output: Dataframe with sentiment for all aspects and all business
    """
    global model_result_temp_df
    global model_result_df
    
    label_col = aspect+'_label'
    score_col = aspect+'_score'
    
    # For each review and aspect, the label is marked negative, if the confidence score is >=0.7 and the label is negative
    temp_df['negative'] = temp_df.apply(lambda x: 1 if (x[label_col]=='Negative') & \
                                        (x[score_col] >= 0.7) else 0, axis=1)
    
    # For each review and aspect, the label is marked positive, if the confidence score is >=0.7 and the label is positive
    temp_df['positive'] = temp_df.apply(lambda x: 1 if (x[label_col]=='Positive') & \
                                        (x[score_col] >= 0.7) else 0, axis=1)

    # Aggregate the review at business level
    
    temp_gp_df = temp_df[['business_id','negative','positive']].groupby(by=['business_id']).sum().reset_index()
    temp_gp_df['dr'] = temp_gp_df['negative'] + temp_gp_df['positive']
    temp_gp_df['aspect'] = aspect

    #Aggregate the review at the business level and then find the sentiment per business per aspect
    temp_gp_df[score_col] = temp_gp_df.apply(lambda x: findSentimentPerBusiness(x), axis=1)

    # store the result in final data frame
    model_result_df = pd.merge(model_result_temp_df, temp_gp_df[['business_id',score_col]], on='business_id', how='left')
    model_result_temp_df = model_result_df

    # return the final data frame
    return model_result_df

In [22]:
# the aspect columns to consider 
columns = ['ambiance','atmosphere','dessert','drinks','entertainment','experience','food','food_portion','kid_friendly',\
           'location','noise_level', 'price','seating','service','setting','space','waiting_time']

# find sentiment per aspect for all business
for aspect in columns:
    findSentimentPerBusinessModelBased(result_df[['business_id',aspect+'_label', aspect+'_score']], aspect)

In [23]:
# Print the top row of the model result data frame
model_result_df.head(1)

Unnamed: 0,business_id,ambiance_score,atmosphere_score,dessert_score,drinks_score,entertainment_score,experience_score,food_score,food_portion_score,kid_friendly_score,location_score,noise_level_score,price_score,seating_score,service_score,setting_score,space_score,waiting_time_score
0,-3AooxIkg38UyUdlz5oXdw,No mention,No mention,No mention,No mention,No mention,Positive,Neutral,No mention,No mention,No mention,No mention,Negative,No mention,No mention,No mention,No mention,No mention


## Get sentiments that restaurant owners provided for their restaurant

In [25]:
# Get copy of the original data set which has the restaurant owner provided information about the data set
review_df = df.copy()

In [29]:

def findSentimentPerBusinessProvidedOwners(x):
    """ 
    Purpose of the function is to find sentiment for aspect since it is provided as a categorical data set.
    
    Input: Dataframe with aspect and its values
    Output: Dataframe with sentiment created for each aspect 
    """
    if math.isnan(x):
        return 'No mention'
    elif x:
        return 'Positive'
    else:
        return 'Negative'

def findNoiseLevel(x):
    """ 
    Purpose of the function is to find sentiment for noise level since it is provided as a categorical data set.
    
    Input: Dataframe with 'noise level' aspect and its values
    Output: Dataframe with sentiment created for noise level aspect 
    """
    if pd.isnull(x):
        return 'No mention'
        
    elif 'average' in x or 'quiet' in x:
        return 'Positive'
        
    elif 'loud' in x:
        return 'Negative'
        
    else:
        return 'No mention'
        
# convert the sentiment of the 'Restaurants_good_for_groups_owner_provided' aspect
review_df['Restaurants_good_for_groups_owner_provided'] = review_df['RestaurantsGoodForGroups'].\
                                            apply(lambda x : findSentimentPerBusinessProvidedOwners(x))

# convert the sentiment of the 'good_for_kids_owner_provided' aspect
review_df['good_for_kids_owner_provided'] = review_df['GoodForKids'].\
                                            apply(lambda x : findSentimentPerBusinessProvidedOwners(x))

# convert the sentiment of the 'restaurant_is_expensive_owner_provided' aspect
review_df['restaurant_is_expensive_owner_provided'] = review_df['RestaurantsPriceRange2'].\
                                            apply(lambda x : 'No mention' if math.isnan(x) \
                                                  else ('Positive' if x>=3 else 'Negative'))

# convert the sentiment of the 'noise_level_owner_provided' aspect
review_df['noise_level_owner_provided'] = review_df['NoiseLevel'].\
                                            apply(lambda x : findNoiseLevel(x))

# Group the review data frame
review_gp_df = review_df[['business_id','Restaurants_good_for_groups_owner_provided', 'good_for_kids_owner_provided',\
                         'restaurant_is_expensive_owner_provided','noise_level_owner_provided']].\
                    groupby(by=['business_id','Restaurants_good_for_groups_owner_provided', 'good_for_kids_owner_provided',\
                               'restaurant_is_expensive_owner_provided','noise_level_owner_provided']).count().reset_index()

In [30]:
# print top one row per review
review_gp_df.head(1)

Unnamed: 0,business_id,Restaurants_good_for_groups_owner_provided,good_for_kids_owner_provided,restaurant_is_expensive_owner_provided,noise_level_owner_provided
0,-3AooxIkg38UyUdlz5oXdw,Positive,Positive,Negative,Positive


In [31]:
# Final data set, which is a join of results from the Deberta model and results from the restaurant review data set
enriched_df = pd.merge(model_result_df, review_gp_df, on='business_id', how='left')

In [33]:
# print top one row per review
enriched_df.head(1)

Unnamed: 0,business_id,ambiance_score,atmosphere_score,dessert_score,drinks_score,entertainment_score,experience_score,food_score,food_portion_score,kid_friendly_score,...,price_score,seating_score,service_score,setting_score,space_score,waiting_time_score,Restaurants_good_for_groups_owner_provided,good_for_kids_owner_provided,restaurant_is_expensive_owner_provided,noise_level_owner_provided
0,-3AooxIkg38UyUdlz5oXdw,No mention,No mention,No mention,No mention,No mention,Positive,Neutral,No mention,No mention,...,Negative,No mention,No mention,No mention,No mention,No mention,Positive,Positive,Negative,Positive


In [36]:
# Results after merging results from the Deberta model results and restaurant 
enriched_df.to_csv('deberta_aspect_score_sentiment_with_owner_provided_info_results_final.csv')

In [35]:
# Done