In [None]:
# installs dependencies and imports required libraries.
%pip install -q pandas torch nltk sentence-transformers transformers tqdm
import re
from pathlib import Path

import nltk
import pandas as pd
import torch
from nltk.corpus import words
from sentence_transformers import SentenceTransformer, util
from tqdm.auto import tqdm
from transformers import pipeline


Note: you may need to restart the kernel to use updated packages.


In [None]:
# specifying the input data location.
# Relative path to the review CSV that is loaded with pd.read_csv further down;
# being relative, it resolves against the kernel's current working directory.
DATA_PATH = Path('../data/BDFoodSent-334k.csv')  # Adjust this path as needed


In [None]:
# Aspect setup: one natural-language question per aspect label. The label (dict
# key) identifies the aspect in the results; the question (dict value) is what
# gets posed to the extractive QA model for every review.
aspect_questions = {
    'Taste_Aspect': 'What is the opinion about the taste, quantity or food quality?',
    'Service_Aspect': 'What is the feedback on the service or delivery?',
    "Ambiance_Aspect": "What is the description of the restaurant's atmosphere or environment?",
}

# Load the raw reviews, then build the question-answering pipeline.
df_reviews = pd.read_csv(filepath_or_buffer=DATA_PATH)
qa_pipeline = pipeline(
    task='question-answering',
    model='deepset/bert-base-cased-squad2',
)


Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: b2e4521f-a55c-4196-bdee-e0814cf0e350)')' thrown while requesting HEAD https://huggingface.co/deepset/bert-base-cased-squad2/resolve/main/processor_config.json
Retrying in 1s [Retry 1/5].
'(ProtocolError('Connec

In [None]:
# running qa extraction for each review and aspect.
# Every (review, aspect) pair produces exactly one record. A pair that cannot be
# answered still gets a record with an empty answer and zero confidence, so the
# output stays aligned with the input, but failures are counted and reported
# at the end instead of being discarded silently.
results = []
qa_failures = 0
first_qa_error = None

# Only three columns are needed, so iterate them directly instead of building a
# Series per row with iterrows().
review_columns = zip(df_reviews['text'], df_reviews['name'], df_reviews['city'])

for review_text, restaurant_id, city in tqdm(review_columns, total=len(df_reviews), desc='QA extraction'):
    # The QA model needs a non-empty string context. Missing values (e.g. NaN)
    # and blank reviews are recognised up front rather than by catching an
    # exception once per aspect.
    has_context = isinstance(review_text, str) and bool(review_text.strip())

    for aspect, question_text in aspect_questions.items():
        answer = ''
        score = 0.0

        if has_context:
            try:
                output = qa_pipeline(question=question_text, context=review_text)
                answer = output.get('answer', '')
                score = output.get('score', 0.0)
            except Exception as exc:
                # Best-effort: keep going, but remember that something went wrong.
                qa_failures += 1
                if first_qa_error is None:
                    first_qa_error = exc

        results.append({
            'review': review_text,
            'restaurant_id': restaurant_id,
            'city': city,
            # The 'question' column holds the aspect label, not the question text.
            'question': aspect,
            'answer': answer,
            'confidence': score
        })

df_results = pd.DataFrame(results)

if qa_failures:
    print(f'QA extraction failed for {qa_failures} review/aspect pairs; first error: {first_qa_error!r}')


QA extraction: 100%|██████████| 100/100 [00:04<00:00, 21.89it/s]
QA extraction: 100%|██████████| 100/100 [00:04<00:00, 21.89it/s]


In [None]:
# Quick sanity check: display the first five extracted (review, aspect) records.
df_results.head(n=5)


Unnamed: 0,review,restaurant_id,city,question,answer,confidence
0,too much small amount,Restaurant 2914,Dhaka,Taste_Aspect,too much small amount,0.136931
1,too much small amount,Restaurant 2914,Dhaka,Service_Aspect,too much small amount,0.082128
2,too much small amount,Restaurant 2914,Dhaka,Ambiance_Aspect,too much small amount,0.071102
3,very small in amount,Restaurant 2914,Dhaka,Taste_Aspect,very small,0.365033
4,very small in amount,Restaurant 2914,Dhaka,Service_Aspect,very small in amount,0.000749


In [None]:
# computing cosine similarity between questions and answers.
# tqdm.pandas() must run before the progress_apply calls used later in the
# notebook; it is what makes that method available on pandas objects.
tqdm.pandas()
# Sentence-embedding model used to embed both texts of each comparison.
sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Cache of already-computed embeddings for the 'question' field, keyed by its
# text. The same few values repeat on every row, so each one is encoded once
# instead of once per row. Re-executing this definition resets the cache.
_QUESTION_EMBEDDINGS = {}

def calculate_similarity(row):
    """Calculate cosine similarity between a row's 'question' and 'answer' texts.

    Both texts are embedded with the module-level ``sentence_model`` and
    compared with ``util.cos_sim``. Embeddings of the 'question' text are
    cached in ``_QUESTION_EMBEDDINGS``; answers are encoded on every call.

    NOTE(review): the extraction loop earlier in this notebook stores the
    aspect label (e.g. 'Taste_Aspect') in the 'question' column, not the
    question sentence, so that label is what gets embedded here. Confirm this
    is the intended comparison.

    Args:
        row (pd.Series): A row (or any mapping) with 'question' and 'answer' entries.

    Returns:
        float: Cosine similarity between the two embeddings, or 0.0 when the
        answer is not a string or is empty/whitespace-only.
    """
    question = row['question']
    answer = row['answer']

    # Nothing to compare against: skip the (expensive) encoding entirely.
    if not isinstance(answer, str) or not answer.strip():
        return 0.0

    # Only plain strings are used as cache keys; anything else is encoded
    # directly, exactly as before.
    cacheable = isinstance(question, str)
    q_emb = _QUESTION_EMBEDDINGS.get(question) if cacheable else None
    if q_emb is None:
        q_emb = sentence_model.encode(question, convert_to_tensor=True)
        if q_emb.dim() == 1:
            q_emb = q_emb.unsqueeze(0)
        if cacheable:
            _QUESTION_EMBEDDINGS[question] = q_emb

    a_emb = sentence_model.encode(answer, convert_to_tensor=True)
    if a_emb.dim() == 1:
        a_emb = a_emb.unsqueeze(0)

    # cos_sim returns a 1x1 matrix here; .item() unwraps it to a Python float.
    return util.cos_sim(q_emb, a_emb).item()

# Row-wise apply with a progress bar: adds one similarity value per
# (review, aspect) record as the new 'cosine_sim' column.
df_results['cosine_sim'] = df_results.progress_apply(calculate_similarity, axis=1)


100%|██████████| 300/300 [00:11<00:00, 26.93it/s]
100%|██████████| 300/300 [00:11<00:00, 26.93it/s]


In [None]:
# downloading vocabulary resources and defining filtering helpers.
nltk.download('words')

# The English-ratio filter lower-cases every token before looking it up, so the
# vocabulary needs a lower-case form of every entry. Entries that the corpus
# lists only in capitalised form could otherwise never match. The original
# spellings are kept as well, so existing membership checks are unaffected.
_word_list = words.words()
ENGLISH_WORDS = set(_word_list)
ENGLISH_WORDS.update(entry.lower() for entry in _word_list)

# A token is a maximal run of ASCII letters; digits, punctuation and characters
# from other scripts never form tokens.
WORD_RE = re.compile(r'[A-Za-z]+')

def filter_short_sentences(df, column, min_words=5):
    """Keep only the rows whose text has at least ``min_words`` words.

    Each value in ``column`` is converted with ``str()`` and split on
    whitespace; the number of resulting pieces is its word count.

    Args:
        df (pd.DataFrame): Frame holding the text to check.
        column (str): Column of ``df`` that contains the text.
        min_words (int, optional): Smallest word count that is still kept. Defaults to 5.

    Returns:
        pd.DataFrame: The surviving rows, re-indexed from 0.
    """
    def long_enough(text):
        word_count = len(str(text).split())
        return word_count >= min_words

    keep = df[column].apply(long_enough)
    kept_rows = df[keep]
    return kept_rows.reset_index(drop=True)

def filter_english_heavy(df, column, threshold=0.5):
    """Keep only the rows whose text is made up of enough English words.

    A text's tokens are the matches of ``WORD_RE`` in its lower-cased string
    form. Its English ratio is the share of those tokens found in
    ``ENGLISH_WORDS``; a text without any token has ratio 0.

    Args:
        df (pd.DataFrame): Frame holding the text to check.
        column (str): Column of ``df`` that contains the text.
        threshold (float, optional): Smallest English ratio that is still kept. Defaults to 0.5.

    Returns:
        pd.DataFrame: The surviving rows, re-indexed from 0.
    """
    def is_english_enough(text):
        found = WORD_RE.findall(str(text).lower())
        if found:
            known = sum(1 for word in found if word in ENGLISH_WORDS)
            share = known / len(found)
        else:
            share = 0
        return share >= threshold

    keep = df[column].apply(is_english_enough)
    kept_rows = df[keep]
    return kept_rows.reset_index(drop=True)


[nltk_data] Downloading package words to
[nltk_data]     /Users/manasvinsurya/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
# Refine the QA results on the review text: first drop records whose review has
# fewer than three words, then drop those whose review is less than half
# English vocabulary. The cell ends with summary statistics of what remains.
filtered_df = (
    df_results
    .pipe(filter_short_sentences, 'review', min_words=3)
    .pipe(filter_english_heavy, 'review', threshold=0.5)
)
filtered_df.describe()


Unnamed: 0,confidence,cosine_sim
count,255.0,255.0
mean,0.17288,0.083248
std,0.192067,0.150791
min,4e-06,-0.182185
25%,0.032478,-0.037973
50%,0.100414,0.070207
75%,0.242805,0.186579
max,0.907947,0.645607


In [None]:
# selecting top reviews per aspect using cosine similarity.
def filter_top_aspect(df, aspect_label, top_fraction=0.1):
    """Select one aspect's records and the best-ranked fraction of them.

    Records whose 'question' equals ``aspect_label`` are sorted by descending
    'cosine_sim'. The top subset holds the first ``int(len * top_fraction)``
    of them; when that count is 0 (too few records) all records are used
    instead, so the top subset is never empty unless the aspect has no records.

    The top subset is always returned as an independent copy. Callers can add
    columns to it without a SettingWithCopyWarning and without changing the
    full subset.

    Args:
        df (pd.DataFrame): Frame with at least 'question' and 'cosine_sim' columns.
        aspect_label (str): The aspect label to filter by.
        top_fraction (float, optional): Share of records to keep in the top
            subset, in the range (0, 1]. Defaults to 0.1 (top 10%).

    Returns:
        tuple: ``(subset, top_subset)``; all of the aspect's records sorted
        by descending cosine similarity, and the top-ranked part of them.

    Raises:
        ValueError: If ``top_fraction`` is not within (0, 1].
    """
    if not 0 < top_fraction <= 1:
        raise ValueError(f'top_fraction must be in (0, 1], got {top_fraction!r}')

    subset = df[df['question'] == aspect_label].copy()
    subset = subset.sort_values(by='cosine_sim', ascending=False)
    top_n = int(len(subset) * top_fraction)
    # head() hands back a slice and the fallback is the full subset itself;
    # copying decouples the result from `subset` in both cases.
    top_subset = (subset.head(top_n) if top_n > 0 else subset).copy()
    return subset, top_subset

# Split the filtered records by aspect label. Each call returns the aspect's
# records sorted by descending cosine similarity together with its top-ranked
# subset; the df_* names hold the top subsets that are scored and saved later.
food_full, df_food = filter_top_aspect(filtered_df, 'Taste_Aspect')
ambiance_full, df_ambiance = filter_top_aspect(filtered_df, 'Ambiance_Aspect')
service_full, df_service = filter_top_aspect(filtered_df, 'Service_Aspect')

# Report (rows, columns) of each full subset and of its top subset.
print(f"Food (Taste): Full Shape {food_full.shape}, Top 10% Shape {df_food.shape}")
print(f"Ambiance: Full Shape {ambiance_full.shape}, Top 10% Shape {df_ambiance.shape}")
print(f"Service: Full Shape {service_full.shape}, Top 10% Shape {df_service.shape}")


Food (Taste): Full Shape (85, 7), Top 10% Shape (8, 7)
Ambiance: Full Shape (85, 7), Top 10% Shape (8, 7)
Service: Full Shape (85, 7), Top 10% Shape (8, 7)


In [None]:
# Sentiment classifier applied to the aspect-specific reviews. It runs on CUDA
# device 0 when torch reports CUDA as available, and on the CPU (-1) otherwise.
sentiment_pipe = pipeline(
    task='sentiment-analysis',
    model='nlptown/bert-base-multilingual-uncased-sentiment',
    device=-1 if not torch.cuda.is_available() else 0,
)

def get_sentiment_score(review):
    """Return the classifier score of the top prediction for a review.

    The review is passed to the module-level ``sentiment_pipe`` and the
    ``'score'`` entry of its first result is returned. Over-long inputs are
    truncated to the model's maximum sequence length, so a single very long
    review no longer makes the pipeline raise.

    NOTE(review): for text-classification pipelines ``'score'`` is normally the
    probability of the predicted label, not a polarity value; the predicted
    label itself is discarded here. Confirm this is the quantity wanted in the
    'sentiment' column.

    Args:
        review (str): The review text to analyze. Non-string values are
            converted with ``str()`` first.

    Returns:
        float: The ``'score'`` of the first prediction.
    """
    if not isinstance(review, str):
        review = str(review)
    # truncation=True is forwarded to the tokenizer by the pipeline call.
    return sentiment_pipe(review, truncation=True)[0]['score']

# Score the full review text of every record in each aspect's top subset.
# assign() builds a new frame with the extra 'sentiment' column instead of
# writing into the existing one, which may be a slice of a larger frame; that
# avoids pandas' SettingWithCopyWarning and leaves the source frame untouched.
print('Processing Food Reviews...')
df_food = df_food.assign(sentiment=df_food['review'].progress_apply(get_sentiment_score))

print('Processing Ambiance Reviews...')
df_ambiance = df_ambiance.assign(sentiment=df_ambiance['review'].progress_apply(get_sentiment_score))

print('Processing Service Reviews...')
df_service = df_service.assign(sentiment=df_service['review'].progress_apply(get_sentiment_score))


Device set to use cpu


Processing Food Reviews...


100%|██████████| 8/8 [00:00<00:00, 15.01it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_food['sentiment'] = df_food['review'].progress_apply(get_sentiment_score)
100%|██████████| 8/8 [00:00<00:00, 15.01it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_food['sentiment'] = df_food['review'].progress_apply(get_sentiment_score)


Processing Ambiance Reviews...


100%|██████████| 8/8 [00:00<00:00, 27.41it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ambiance['sentiment'] = df_ambiance['review'].progress_apply(get_sentiment_score)
100%|██████████| 8/8 [00:00<00:00, 27.41it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ambiance['sentiment'] = df_ambiance['review'].progress_apply(get_sentiment_score)


Processing Service Reviews...


100%|██████████| 8/8 [00:00<00:00, 30.85it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_service['sentiment'] = df_service['review'].progress_apply(get_sentiment_score)
100%|██████████| 8/8 [00:00<00:00, 30.85it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_service['sentiment'] = df_service['review'].progress_apply(get_sentiment_score)


In [None]:
# Persist the three scored subsets as CSV files (without the index column) in
# the results directory, which is created on first use.
results_dir = Path('../results')
results_dir.mkdir(exist_ok=True)

df_food.to_csv(
    path_or_buf=results_dir.joinpath('Food_sentiment_bert_base_cased.csv'),
    index=False,
)
df_ambiance.to_csv(
    path_or_buf=results_dir.joinpath('Ambiance_sentiment_bert_base_cased.csv'),
    index=False,
)
df_service.to_csv(
    path_or_buf=results_dir.joinpath('Service_sentiment_bert_base_cased.csv'),
    index=False,
)

print('Saved Food_sentiment_bert_base_cased.csv, Ambiance_sentiment_bert_base_cased.csv, Service_sentiment_bert_base_cased.csv to results/')


Saved results_food_sentiment.csv, results_ambiance_sentiment.csv, results_service_sentiment.csv


In [None]:
# performing no operation in this cell.
