In [None]:
# installs dependencies and imports required libraries.
%pip install -q pandas torch nltk sentence-transformers transformers tqdm
import re
from pathlib import Path

import nltk
import pandas as pd
import torch
from nltk.corpus import words
from sentence_transformers import SentenceTransformer, util
from tqdm.auto import tqdm
from transformers import pipeline


In [None]:
# Location of the input review CSV, relative to the notebook's working directory.
# It is read with pd.read_csv in the next code cell.
DATA_PATH = Path('../review_datasets/BDFoodSent-334k.csv')  # Adjust this path as needed


aspect_questions maps each aspect label (taste, service, ambiance) to the question posed to the QA model; df_reviews is loaded from DATA_PATH, and qa_pipeline is initialized as a question-answering pipeline.


In [None]:
# preparing aspect questions and initializing the qa pipeline.
# Keys are the aspect labels stored with each result row; values are the
# natural-language questions sent to the QA model for that aspect.
aspect_questions = {
    'Taste_Aspect': 'What is the opinion about the taste, quantity or food quality?',
    'Service_Aspect': 'What is the feedback on the service or delivery?',
    'Ambiance_Aspect': "What is the description of the restaurant's atmosphere or environment?",
}

df_reviews = pd.read_csv(DATA_PATH)

# Select the device the same way sentiment_pipe does: first CUDA GPU when one is
# available, otherwise CPU (-1). Without this the QA model always runs on CPU.
qa_pipeline = pipeline(
    'question-answering',
    model='deepset/bert-base-cased-squad2',
    device=0 if torch.cuda.is_available() else -1,
)


In [None]:
# running qa extraction for each review and aspect.
results = []
failed_calls = 0    # number of QA calls that raised an exception
first_error = None  # kept so the failure summary can show what went wrong

# Iterating the three needed columns in lockstep avoids building a Series per
# row, which is what iterrows() does.
review_rows = zip(df_reviews['text'], df_reviews['name'], df_reviews['city'])

for review_text, restaurant_id, city in tqdm(review_rows, total=len(df_reviews), desc='QA extraction'):
    # Missing (NaN/None) or empty reviews cannot serve as a QA context; they get
    # the same empty answer / zero confidence that a failed call gets, without
    # paying for a model call that is bound to fail.
    has_context = isinstance(review_text, str) and len(review_text) > 0

    for aspect, question_text in aspect_questions.items():
        answer, score = '', 0.0
        if has_context:
            try:
                output = qa_pipeline(question=question_text, context=review_text)
                answer = output.get('answer', '')
                score = output.get('score', 0.0)
            except Exception as exc:
                # Best effort: one bad review must not abort a long run, but the
                # failure is counted and reported instead of vanishing silently.
                answer, score = '', 0.0
                failed_calls += 1
                if first_error is None:
                    first_error = exc

        results.append({
            'review': review_text,
            'restaurant_id': restaurant_id,
            'city': city,
            'question': aspect,  # the aspect label (dictionary key), not the question text
            'answer': answer,
            'confidence': score
        })

df_results = pd.DataFrame(results)

if failed_calls:
    print(f'QA extraction: {failed_calls} call(s) failed; first error: {first_error!r}')


df_results collects one row per review-aspect pair with the columns review, restaurant_id, city, question (the aspect label), answer, and confidence; the next cell previews its first rows.


In [None]:
# previewing the qa extraction results.
# Leaving head() as the last expression lets the notebook render the first rows.
df_results.head()


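df_results gains a cosine_sim column: the cosine similarity between the sentence embeddings of each row's question value (the aspect label) and its extracted answer; rows with an empty answer score 0.0.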


In [None]:
# computing cosine similarity between questions and answers.
# tqdm.pandas() must run before any progress_apply call on a DataFrame/Series.
tqdm.pandas()
# Sentence-embedding model used to embed the 'question' and 'answer' values.
sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Embeddings of 'question' values seen so far. Only a handful of distinct values
# exist, so caching them avoids re-encoding the same string for every row.
_QUESTION_EMBEDDING_CACHE = {}

def calculate_similarity(row):
    """Calculate cosine similarity between question and answer embeddings.

    NOTE(review): the QA extraction cell stores the aspect label (e.g.
    'Taste_Aspect') in the 'question' column, so the label itself is what gets
    embedded here, not the full question text. Confirm this is intended.

    Args:
        row (pd.Series): A row from the DataFrame containing 'question' and 'answer' columns.

    Returns:
        float: Cosine similarity score between question and answer embeddings,
            or 0.0 when the answer is not a string or is blank.
    """
    question = row['question']
    answer = row['answer']

    # Nothing to embed: missing (NaN/None) or whitespace-only answers score zero.
    if not isinstance(answer, str) or not answer.strip():
        return 0.0

    q_emb = _QUESTION_EMBEDDING_CACHE.get(question)
    if q_emb is None:
        q_emb = sentence_model.encode(question, convert_to_tensor=True)
        _QUESTION_EMBEDDING_CACHE[question] = q_emb
    a_emb = sentence_model.encode(answer, convert_to_tensor=True)

    # Promote single vectors to a batch of one. unsqueeze returns a new tensor,
    # so the cached embedding itself is never modified.
    if q_emb.dim() == 1:
        q_emb = q_emb.unsqueeze(0)
    if a_emb.dim() == 1:
        a_emb = a_emb.unsqueeze(0)

    return util.cos_sim(q_emb, a_emb).item()

# Row-wise apply with a progress bar: calculate_similarity is called once per row.
df_results['cosine_sim'] = df_results.progress_apply(calculate_similarity, axis=1)


In [None]:
# downloading vocabulary resources and defining filtering helpers.
nltk.download('words')
# english_ratio lowercases every token before the membership test, so the
# vocabulary is lowercased too; otherwise capitalised corpus entries (proper
# nouns and the like) could never match a token.
ENGLISH_WORDS = {word.lower() for word in words.words()}
# A token is a maximal run of ASCII letters; digits and punctuation split tokens.
WORD_RE = re.compile(r'[A-Za-z]+')

def filter_short_sentences(df, column, min_words=5):
    """Keep only the rows whose text has at least ``min_words`` words.

    Words are counted by splitting ``str(value)`` on whitespace, so a missing
    value is counted as the single word its string form produces.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column (str): Name of the column containing text to filter.
        min_words (int, optional): Minimum number of words required to keep a sentence. Defaults to 5.

    Returns:
        pd.DataFrame: New DataFrame (index reset) with the rows meeting the
            word count criteria. All columns are preserved, including when
            ``df`` is empty.
    """
    def has_enough_words(text):
        """Return True when ``text`` splits into at least ``min_words`` words."""
        return len(str(text).split()) >= min_words

    # Force a boolean dtype: on an empty column ``apply`` yields an empty,
    # non-boolean Series, which ``df[...]`` would not treat as a row mask and
    # the result would lose its columns.
    mask = df[column].apply(has_enough_words).astype(bool)
    return df[mask].reset_index(drop=True)

def filter_english_heavy(df, column, threshold=0.5):
    """Keep only the rows whose text has a high enough ratio of English words.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column (str): Name of the column containing text to filter.
        threshold (float, optional): Minimum ratio of English words required to keep a sentence. Defaults to 0.5.

    Returns:
        pd.DataFrame: New DataFrame (index reset) with the rows meeting the
            English word ratio criteria. All columns are preserved, including
            when ``df`` is empty.
    """
    def english_ratio(text):
        """Calculate the ratio of English words in the given text.

        Args:
            text (str): The text to analyze; non-strings are converted with ``str``.

        Returns:
            float: Share of the WORD_RE tokens (lowercased) that are found in
                ENGLISH_WORDS, or 0.0 when the text has no tokens.
        """
        tokens = WORD_RE.findall(str(text).lower())
        if not tokens:
            # No alphabetic tokens at all: treat as not English.
            return 0.0
        eng_count = sum(1 for token in tokens if token in ENGLISH_WORDS)
        return eng_count / len(tokens)

    # Force a boolean dtype: on an empty column ``apply`` yields an empty,
    # non-boolean Series, which ``df[...]`` would not treat as a row mask and
    # the result would lose its columns.
    mask = df[column].apply(lambda text: english_ratio(text) >= threshold).astype(bool)
    return df[mask].reset_index(drop=True)


filtered_df retains rows whose review has at least three words and whose english_ratio meets the 0.5 threshold.


In [None]:
# Refine the QA results in two passes over the 'review' column: drop reviews
# shorter than three words, then drop reviews that are less than half English.
filtered_df = (
    df_results
    .pipe(filter_short_sentences, 'review', min_words=3)
    .pipe(filter_english_heavy, 'review', threshold=0.5)
)
filtered_df.describe()


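filter_top_aspect selects the rows of filtered_df for one aspect, sorts them by cosine_sim in descending order, and returns both the full subset and its top 10%; df_food, df_ambiance, and df_service hold the top-10% subsets.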


In [None]:
# selecting top reviews per aspect using cosine similarity.
def filter_top_aspect(df, aspect_label, top_fraction=0.1):
    """Filter reviews by aspect and select the top share based on cosine similarity.

    Args:
        df (pd.DataFrame): DataFrame containing the review data; must have
            'question' and 'cosine_sim' columns.
        aspect_label (str): The aspect label to filter by (matched against 'question').
        top_fraction (float, optional): Share of the aspect's rows to keep in
            the top subset, in the range (0, 1]. Defaults to 0.1 (top 10%).

    Returns:
        tuple: ``(subset, top_subset)`` where ``subset`` is every row for the
            aspect sorted by descending cosine similarity and ``top_subset`` is
            an independent copy of its leading ``top_fraction`` rows. When the
            aspect has too few rows for the fraction to select even one, the
            top subset contains all of them.

    Raises:
        ValueError: If ``top_fraction`` is not in the range (0, 1].
    """
    if not 0 < top_fraction <= 1:
        raise ValueError(f'top_fraction must be in (0, 1], got {top_fraction!r}')

    subset = df[df['question'] == aspect_label].copy()
    subset = subset.sort_values(by='cosine_sim', ascending=False)
    top_n = int(len(subset) * top_fraction)
    top_subset = subset.head(top_n) if top_n > 0 else subset
    # Return a copy: callers add columns to the top subset, which must neither
    # leak into ``subset`` (the two are the same object when top_n == 0) nor
    # write into a slice of it.
    return subset, top_subset.copy()

# Each call returns (all rows for the aspect, top-ranked rows by cosine_sim);
# the df_* names hold the top-ranked subsets used by the following cells.
food_full, df_food = filter_top_aspect(filtered_df, 'Taste_Aspect')
ambiance_full, df_ambiance = filter_top_aspect(filtered_df, 'Ambiance_Aspect')
service_full, df_service = filter_top_aspect(filtered_df, 'Service_Aspect')

# Report the (rows, columns) shape of the full and top-ranked frame per aspect.
print(f"Food (Taste): Full Shape {food_full.shape}, Top 10% Shape {df_food.shape}")
print(f"Ambiance: Full Shape {ambiance_full.shape}, Top 10% Shape {df_ambiance.shape}")
print(f"Service: Full Shape {service_full.shape}, Top 10% Shape {df_service.shape}")


df_food, df_ambiance, and df_service each gain a sentiment column holding the score that sentiment_pipe returns for the review text.


In [None]:
# performing sentiment scoring on the aspect-specific reviews.
# Sentiment-analysis pipeline backed by the nlptown multilingual BERT model.
sentiment_pipe = pipeline(
    'sentiment-analysis',
    model='nlptown/bert-base-multilingual-uncased-sentiment',
    # Use the first CUDA GPU when one is available, otherwise the CPU (-1).
    device=0 if torch.cuda.is_available() else -1
)

def get_sentiment_score(review):
    """Get sentiment score for a given review text.

    Args:
        review (str): The review text to analyze; non-strings are converted
            with ``str`` first.

    Returns:
        float: The ``'score'`` field of the first prediction returned by
            ``sentiment_pipe``.
            NOTE(review): for a classification pipeline this is presumably the
            confidence of the predicted label rather than a polarity value;
            confirm that downstream consumers expect that.
    """
    if not isinstance(review, str):
        review = str(review)
    # truncation=True clips over-long reviews to the model's maximum sequence
    # length; without it a single long review raises and aborts the whole apply.
    return sentiment_pipe(review, truncation=True)[0]['score']

# Score the three aspect-specific frames in turn. Each gets a 'sentiment' column
# computed from its 'review' text, preceded by a status message.
sentiment_jobs = (
    ('Processing Food Reviews...', df_food),
    ('Processing Ambiance Reviews...', df_ambiance),
    ('Processing Service Reviews...', df_service),
)

for status_message, aspect_frame in sentiment_jobs:
    print(status_message)
    aspect_frame['sentiment'] = aspect_frame['review'].progress_apply(get_sentiment_score)


food_output, ambiance_output, and service_output are the CSV paths under ../results where the scored df_food, df_ambiance, and df_service frames are saved for downstream notebooks.


In [None]:
# Persist the scored frames as CSV files in the results directory, creating the
# directory on first use.
results_dir = Path('../results')
results_dir.mkdir(exist_ok=True)

food_output = results_dir.joinpath('food_sentiment_bert_base_cased.csv')
ambiance_output = results_dir.joinpath('ambiance_sentiment_bert_base_cased.csv')
service_output = results_dir.joinpath('service_sentiment_bert_base_cased.csv')

# One (frame, destination) pair per aspect; the index is not written.
export_jobs = (
    (df_food, food_output),
    (df_ambiance, ambiance_output),
    (df_service, service_output),
)
for scored_frame, destination in export_jobs:
    scored_frame.to_csv(destination, index=False)

print(f'Saved {food_output.name}, {ambiance_output.name}, {service_output.name} to {results_dir}')


In [None]:
# performing no operation in this cell.
