In [26]:
!pip install -U sentence-transformers



In [2]:
from sentence_transformers import SentenceTransformer
import spacy
from textblob import TextBlob
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
import pandas as pd

In [3]:
train = pd.read_csv("/content/MAMS_train.csv")
train.head()

Unnamed: 0,ID,Review,Aspects,Sentiment
0,1,It might be the best sit down food I've had in...,"food, place","positive, neutral"
1,2,Hostess was extremely accommodating when we ar...,"staff, miscellaneous","positive, neutral"
2,3,We were a couple of minutes late for our reser...,"miscellaneous, staff","neutral, negative"
3,4,"Though the service might be a little slow, the...","service, staff","negative, positive"
4,5,Although we arrived at the restaurant 10 min l...,"staff, miscellaneous","negative, neutral"


In [4]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') #word embeddings for comparison
#pre-trained sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis") #by default diltiled bert will be used

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

## Understanding Data

In [16]:
print("Review: ", train['Review'][0])
print("Aspect: ", train['Aspects'][0])
print("Sentiment: ", train['Sentiment'][0])

Review:  It might be the best sit down food I've had in the area, so if you are going to the upright citizen brigade, or the garden, it could be just the place for you.
Aspect:  food, place
Sentiment:  positive, neutral


In [8]:
aspect_counts = train['Aspects'].value_counts()

print("\nAspect Counts:")
print(aspect_counts)
#the data is stored as a single single
#need to seperate each aspect individually


Aspect Counts:
staff, food                          386
food, staff                          200
food, miscellaneous                  169
food, service                        126
miscellaneous, food                  123
                                    ... 
place, staff, ambience                 1
ambience, food, service                1
miscellaneous, food, menu, staff       1
food, staff, miscellaneous, menu       1
staff, food, miscellaneous, price      1
Name: Aspects, Length: 301, dtype: int64


## Extract the fixed aspects as a list

In [6]:
aspects_df = train['Aspects'].str.split(',', expand=True) #extract each individual unqiue aspect (not sets)

# Reshape the DataFrame
stacked_aspects = aspects_df.stack().str.strip()
aspect_counts = stacked_aspects.value_counts()

print("Aspect Counts:")
print(aspect_counts)

Aspect Counts:
food             2307
staff            1383
miscellaneous     954
place             694
service           631
menu              475
ambience          324
price             322
dtype: int64


In [10]:
# Get the total number of unique aspects
total_unique_aspects = stacked_aspects.unique().shape[0]

print("Total Number of Unique Aspects:", total_unique_aspects)

AspectsList = stacked_aspects.unique().tolist() #get the list of all unique aspects to treat them as a fixed label

print("Unique Aspects List:")
print(AspectsList)

Total Number of Unique Aspects: 8
Unique Aspects List:
['food', 'place', 'staff', 'miscellaneous', 'service', 'price', 'menu', 'ambience']


# ABSA

In [20]:
AspectsList= ['food', 'place', 'staff', 'miscellaneous', 'service', 'price', 'menu', 'ambience']

## Extracting Aspects

In [11]:
def extract_and_map_aspects(text, model, AspectsList):
    nlp = spacy.load('en_core_web_sm')
    label_embeddings = model.encode(AspectsList) #get the embeddings of the Aspects

    doc = nlp(text)
    real_aspects = set()  # Using a set to avoid duplicates for real aspects
    mapped_aspects = {}  # To store mapping of real aspects to their closest aspect label

    for chunk in doc.noun_chunks:
        root_text = chunk.root.text  # The actual aspect mentioned in the text
        aspect_embedding = model.encode([root_text])
        similarities = cosine_similarity(aspect_embedding, label_embeddings)
        closest_label_index = similarities.argmax()
        mapped_label = AspectsList[closest_label_index]

        real_aspects.add(root_text)
        mapped_aspects[root_text] = mapped_label  # Map the real aspect to its closest aspect label

    # Restrict to no more than 4 aspects
    if len(real_aspects) > 4:
        real_aspects = list(real_aspects)[:4]  # Convert to list and take first 4
        # Update mapped_aspects to only include these 4 real aspects
        mapped_aspects = {ra: mapped_aspects[ra] for ra in real_aspects}
    else:
        real_aspects = list(real_aspects)

    return real_aspects, mapped_aspects

## Extracting Opinion (Feature)

In [12]:
def extract_features(text, real_aspects, mapped_aspects):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    aspect_adjectives = {mapped_aspects[aspect].lower(): [] for aspect in real_aspects}  # Initialize with mapped aspects in lowercase

    for chunk in doc.noun_chunks:
        for real_aspect in real_aspects:
            mapped_aspect = mapped_aspects[real_aspect].lower()  # Work with lowercase mapped aspect
            if real_aspect.lower() in chunk.text.lower():  # Use real_aspect to find features
                for token in chunk.root.head.children:
                    if token.pos_ == "ADJ":
                        sequence = [token.text.lower()]  # Lowercase for consistency
                        if token.nbor(1).text.lower() == "and" and token.nbor(2).pos_ == "ADJ":
                            sequence += [token.nbor(1).text.lower(), token.nbor(2).text.lower()]
                        aspect_adjectives[mapped_aspect].append(" ".join(sequence))

    return aspect_adjectives

## Get sentiment on the basis of the aspect's opinion

In [13]:
def analyze_sentiment(features_opinions, sentiment_pipeline):
    sentiments = {}

    for aspect, feature_opinion in features_opinions.items():
        #Check if feature_opinion is not empty
        if feature_opinion:
            #Convert feature_opinion list to string, if it's not already
            feature_opinion_str = ", ".join(feature_opinion) if isinstance(feature_opinion, list) else feature_opinion
            result = sentiment_pipeline(feature_opinion_str)
            if result:  # Check if result is non-empty
                sentiment = result[0]['label']
                if sentiment == 'POSITIVE':
                    sentiments[aspect] = 'Positive'
                elif sentiment == 'NEGATIVE':
                    sentiments[aspect] = 'Negative'
                else:
                    sentiments[aspect] = 'Neutral'
            else:
                sentiments[aspect] = 'Neutral'  # Default to Neutral if no sentiment is detected
        else:
            sentiments[aspect] = 'Neutral'  # Default to Neutral if feature_opinion is empty

    return sentiments

## Single test input

In [28]:
# Example text
text= "Hostess was extremely accommodating when we arrived an hour early for our reservation"
# Extract real and mapped aspects
real_aspects, mapped_aspects = extract_and_map_aspects(text, model, AspectsList)

# Extract features using real aspects and their mappings
feature_opinions = extract_features(text, real_aspects, mapped_aspects)

# Analyze sentiment for the mapped aspects based on their features
sentiments = analyze_sentiment(feature_opinions, sentiment_pipeline)

#'feature_opinions' and 'sentiments' use the mapped aspect labels,
# but the extraction of features is based on the actual text content (real aspects).

In [29]:
print("Review: \n", text)
print("\nAspects from Review: ", real_aspects)
print("mapped_aspects from Review: ", mapped_aspects)
print("Opinion of aspect from Review: ", feature_opinions)
print("Sentiments in Review: ", sentiments)

Review: 
 Hostess was extremely accommodating when we arrived an hour early for our reservation

Aspects from Review:  ['reservation', 'Hostess', 'we']
mapped_aspects from Review:  {'Hostess': 'staff', 'we': 'service', 'reservation': 'place'}
Opinion of aspect from Review:  {'place': [], 'staff': ['accommodating'], 'service': []}
Sentiments in Review:  {'place': 'Neutral', 'staff': 'Positive', 'service': 'Neutral'}


## testing on dataframe

### making a sample test dataframe from train dataframe

In [39]:
#first 400 rows from train set
train_subset = train.head(400)

# Extract the Reviews column from the subset
reviews_column = train_subset['Review']

# Create a new DataFrame with the Reviews column
SampleTest = pd.DataFrame(reviews_column)

# Add empty columns for Aspects, Sentiment, and Feature
SampleTest['Aspects'] = ""
SampleTest['Sentiment'] = ""
SampleTest['Feature'] = ""

# Display the new DataFrame
SampleTest.head()

Unnamed: 0,Review,Aspects,Sentiment,Feature
0,It might be the best sit down food I've had in...,,,
1,Hostess was extremely accommodating when we ar...,,,
2,We were a couple of minutes late for our reser...,,,
3,"Though the service might be a little slow, the...",,,
4,Although we arrived at the restaurant 10 min l...,,,


### applying ABSA on sample test

In [40]:
def process_review(review, model, sentiment_pipeline, aspect_labels, nlp):
    # Extract real aspects and their mapped labels
    real_aspects, mapped_aspects = extract_and_map_aspects(review, model, aspect_labels)

    # Extract features/opinions based on real aspects and their mappings
    feature_opinions_dict = extract_features(review, real_aspects, mapped_aspects)

    # Analyze sentiment based on mapped aspects and extracted feature opinions
    sentiments_dict = analyze_sentiment(feature_opinions_dict, sentiment_pipeline)

    # Convert all mapped aspects to lowercase for 'Aspects' column
    aspects_list = [mapped_aspect.lower() for mapped_aspect in mapped_aspects.values()]

    # Convert all feature opinions to lowercase for 'Feature_Opinion' list
    feature_opinions_list = [", ".join([opinion.lower() for opinion in feature_opinions_dict.get(mapped_aspect, [])]) for mapped_aspect in aspects_list]

    # Convert all sentiments to lowercase for 'Sentiment' list
    sentiments_list = [sentiments_dict.get(mapped_aspect, 'neutral').lower() for mapped_aspect in aspects_list]

    return aspects_list, feature_opinions_list, sentiments_list



# Initialize 'Aspects', 'Feature_Opinion', and 'Sentiment' columns to hold empty lists instead of NaN
SampleTest['Aspects'] = SampleTest['Aspects'].apply(lambda x: [] if pd.isna(x) else x)
SampleTest['Feature'] = SampleTest['Feature'].apply(lambda x: [] if pd.isna(x) else x)
SampleTest['Sentiment'] = SampleTest['Sentiment'].apply(lambda x: [] if pd.isna(x) else x)


nlp = spacy.load('en_core_web_sm')

# Loop to process and update the DataFrame
i = 0
for index, row in SampleTest.iterrows():
    aspects, feature_opinions, sentiments = process_review(row['Review'], model, sentiment_pipeline, AspectsList, nlp)
    SampleTest.at[index, 'Aspects'] = aspects
    SampleTest.at[index, 'Feature'] = feature_opinions
    SampleTest.at[index, 'Sentiment'] = sentiments
    i += 1
    if i % 50 == 0:
        print("\nprocessed", i, "rows")


processed 50 rows

processed 100 rows

processed 150 rows

processed 200 rows

processed 250 rows

processed 300 rows

processed 350 rows

processed 400 rows


In [41]:
SampleTest

Unnamed: 0,Review,Aspects,Sentiment,Feature
0,It might be the best sit down food I've had in...,"[place, place, price, food]","[neutral, neutral, neutral, neutral]","[, , , ]"
1,Hostess was extremely accommodating when we ar...,"[staff, service, place]","[positive, neutral, neutral]","[accommodating, , ]"
2,We were a couple of minutes late for our reser...,"[miscellaneous, service, staff, staff]","[neutral, negative, neutral, neutral]","[, late, , ]"
3,"Though the service might be a little slow, the...","[service, food, staff]","[neutral, neutral, positive]","[, , friendly]"
4,Although we arrived at the restaurant 10 min l...,"[place, staff, food, service]","[neutral, neutral, neutral, neutral]","[, , , ]"
...,...,...,...,...
395,Since the menu is not organized in the way to ...,"[menu, food, food, miscellaneous]","[neutral, neutral, neutral, negative]","[, , , allergic]"
396,A reservation did not save us from a wait at t...,"[food, place, miscellaneous, place]","[neutral, neutral, neutral, neutral]","[, , , ]"
397,"The bartender was skilled, the owners were ver...","[place, food, food, price]","[negative, positive, positive, negative]","[longer, skilled, skilled, longer]"
398,Their cappuccino's are served in a generous mu...,"[staff, food, food, miscellaneous]","[neutral, neutral, neutral, positive]","[, , , much]"


In [42]:
#save sample test to be comparison testing
SampleTest.to_csv('/content/SampleTest1.csv', index=False)