# Biasing

We study three types of bias towards highly rated reviews:
- "filter": Retrieval bias: the star rating is blended into the similarity score that decides which reviews are selected for the LLM.
- "ranking": Ordering bias: the selected reviews are sorted by star rating inside the context we provide to the LLM.
- "prompt": Instruction bias: the LLM is biased by an explicit instruction added to the system prompt.

In [None]:
import pandas as pd
import numpy as np

# Column used as the "Title" of each review in prompts and embeddings.
# In the frame displayed below, "title.y" holds the product title, while
# "title.x" holds the title the reviewer gave to the review.
TITLE = "title.y"

# the meta csv joins meta data with the reviews
df = pd.read_csv('electronics_reviews_with_meta.csv')
# Keep only the rows whose 'training' flag equals 1.
df = df[df['training'] == 1]

product_code = "B00004ZCJJ"
# .copy() makes matching_reviews an independent frame. Without it the result is
# a slice of df, and adding a column to it later (e.g. a similarity score)
# raises pandas' SettingWithCopyWarning.
matching_reviews = df[df['parent_asin'] == product_code].copy()
display(matching_reviews)

Unnamed: 0,parent_asin,rating,title.x,text,asin,user_id,timestamp,helpful_vote,verified_purchase,main_category,title.y,rating_number,store,features,average_rating,description,price,images,training
0,B00004ZCJJ,5,Tiffen 62 UV,Tiffen offers quality filters for good value p...,B00004ZCJJ,AFCYRKRZW6EPSPQEOKEPGIZQBJFA,1.419668e+12,0,True,Camera & Photo,Tiffen 62UVP 62mm UV Protection Filter,1238,Tiffen,"['Most popular protection filter', 'Provides b...",4.7,"['Product Description', 'Tiffen UV Protection ...",6.88,[{'thumb': 'https://m.media-amazon.com/images/...,1
2,B00004ZCJJ,5,Good protector of my big lens.,Use this to protect my $600.00 lens. Works as ...,B00004ZCJJ,AGV7ZLMTDVDPKLG7VCW3IRTBAL4Q,1.588800e+12,0,True,Camera & Photo,Tiffen 62UVP 62mm UV Protection Filter,1238,Tiffen,"['Most popular protection filter', 'Provides b...",4.7,"['Product Description', 'Tiffen UV Protection ...",6.88,[{'thumb': 'https://m.media-amazon.com/images/...,1
3,B00004ZCJJ,5,Quality.,Very good quality filter...price is afordable....,B00004ZCJJ,AGBW3FXSVW5N25ZREDJUT3IDZK4Q,1.606961e+12,0,True,Camera & Photo,Tiffen 62UVP 62mm UV Protection Filter,1238,Tiffen,"['Most popular protection filter', 'Provides b...",4.7,"['Product Description', 'Tiffen UV Protection ...",6.88,[{'thumb': 'https://m.media-amazon.com/images/...,1
4,B00004ZCJJ,5,Good filtet,Good,B00004ZCJJ,AEZB7FXVN3E6WXTYVXWIDWJNDHPQ,1.536951e+12,0,True,Camera & Photo,Tiffen 62UVP 62mm UV Protection Filter,1238,Tiffen,"['Most popular protection filter', 'Provides b...",4.7,"['Product Description', 'Tiffen UV Protection ...",6.88,[{'thumb': 'https://m.media-amazon.com/images/...,1
6,B00004ZCJJ,5,I cannot tell the difference but it did save m...,I cannot tell the difference but it did save m...,B00004ZCJJ,AFVYRFMDG3Y2WX22F6BFDAWY2DWQ,1.481837e+12,0,True,Camera & Photo,Tiffen 62UVP 62mm UV Protection Filter,1238,Tiffen,"['Most popular protection filter', 'Provides b...",4.7,"['Product Description', 'Tiffen UV Protection ...",6.88,[{'thumb': 'https://m.media-amazon.com/images/...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,B00004ZCJJ,3,BROKEN CASE.,"Package came broken, case include as if it bee...",B00004ZCJJ,AESVTGJVRDB55WWDTYZO76WPCA4A,1.558586e+12,0,True,Camera & Photo,Tiffen 62UVP 62mm UV Protection Filter,1238,Tiffen,"['Most popular protection filter', 'Provides b...",4.7,"['Product Description', 'Tiffen UV Protection ...",6.88,[{'thumb': 'https://m.media-amazon.com/images/...,1
196,B00004ZCJJ,5,Great quality filter,Great quality filter and it fit perfectly on m...,B00004ZCJJ,AELMN2O42WB2Q5VDSMTV75U6BCPQ,1.520658e+12,0,True,Camera & Photo,Tiffen 62UVP 62mm UV Protection Filter,1238,Tiffen,"['Most popular protection filter', 'Provides b...",4.7,"['Product Description', 'Tiffen UV Protection ...",6.88,[{'thumb': 'https://m.media-amazon.com/images/...,1
197,B00004ZCJJ,5,Five Stars,Great Tiffen quality.,B00004ZCJJ,AHRIYEQFXD42YCN2DZHGBSHJL6LA,1.461763e+12,0,True,Camera & Photo,Tiffen 62UVP 62mm UV Protection Filter,1238,Tiffen,"['Most popular protection filter', 'Provides b...",4.7,"['Product Description', 'Tiffen UV Protection ...",6.88,[{'thumb': 'https://m.media-amazon.com/images/...,1
198,B00004ZCJJ,5,Tiffen 62mm UV Protection Filter,"Nice filter, reasonably priced. Recommend.",B00004ZCJJ,AGUEXRNHLLNRQELSJYSPBOFIBIJQ,1.420736e+12,0,False,Camera & Photo,Tiffen 62UVP 62mm UV Protection Filter,1238,Tiffen,"['Most popular protection filter', 'Provides b...",4.7,"['Product Description', 'Tiffen UV Protection ...",6.88,[{'thumb': 'https://m.media-amazon.com/images/...,1


In [None]:
import os
import openai
from pprint import pprint
from dotenv import load_dotenv

from sentence_transformers import SentenceTransformer
from sentence_transformers import util
from tqdm.notebook import tqdm

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Hand the API key to the openai client; it is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

class AI_Summarizer:
    """Summarize product reviews for a query aspect with an OpenAI chat model.

    Reviews are scored by embedding similarity to the query, the ``topk`` best
    ones are concatenated into a context, and the chat model is asked for a
    two-sentence summary. A bias towards highly rated reviews can optionally be
    injected at one of three stages (see ``get_summary``).

    The reviews DataFrame must provide the title column (``title_column``) plus
    the ``'text'`` and ``'rating'`` columns.
    """

    def __init__(self, reviews,
                 alpha=0.9,
                 max_rating=5.0,
                 similarity_model_name = "sentence-transformers/all-MiniLM-L6-v2",
                 topk=10,
                 model_openai="gpt-4o-mini",
                 max_context_length=4096,
                 title_column=None):
        """
        Args:
            reviews: DataFrame with the reviews to summarize.
            alpha: Weight of the cosine similarity in the biased retrieval
                score; ``1 - alpha`` is the weight of the normalized rating.
            max_rating: Value the rating is divided by to normalize it.
            similarity_model_name: SentenceTransformer model used for embeddings.
            topk: Maximum number of reviews placed into the context.
            model_openai: Name of the OpenAI chat model.
            max_context_length: Maximum size of the context, in characters.
            title_column: Column used as the title of a review. Defaults to the
                notebook-level ``TITLE`` constant.
        """
        self.model_openai = model_openai
        self.similarity_model = SentenceTransformer(similarity_model_name)
        self.alpha = alpha
        self.max_rating = max_rating
        self.reviews = reviews
        # Resolved lazily so the global is only needed when no column is given.
        self.title_column = TITLE if title_column is None else title_column

        self.system_prompt = "You are an expert AI product review summarizer."
        self.topk = topk
        self.max_context_length = max_context_length

    def create_context(self, reviews, biased=False):
        """Concatenate the first ``topk`` reviews into one context string.

        Reviews are added in the given order until the next one would push the
        context past ``max_context_length`` characters.

        Args:
            reviews: DataFrame of reviews, already ordered by relevance.
            biased: If True, the selected reviews are reordered by rating
                (highest first) before being concatenated.

        Returns:
            The context string; empty if no review fits.
        """
        selected = []
        used_chars = 0
        for _, review in reviews.iloc[:self.topk].iterrows():
            review_text = f"Title: {review[self.title_column]}\nReview: {review['text']}\n\n"
            # Budget is counted in characters of text already selected, not in
            # number of reviews.
            if used_chars + len(review_text) > self.max_context_length:
                break
            selected.append((review_text, review['rating']))
            used_chars += len(review_text)

        if biased:
            # Stable sort: reviews with equal rating keep their relevance order.
            selected.sort(key=lambda item: item[1], reverse=True)
        return "".join(text for text, _ in selected)

    def get_response(self, prompt, system_prompt=None):
        """Send the prompt to the OpenAI chat model and return the reply text.

        Args:
            prompt: Content of the user message.
            system_prompt: Content of the system message; defaults to
                ``self.system_prompt``.

        Returns:
            The message content of the first choice in the response.
        """
        if system_prompt is None:
            system_prompt = self.system_prompt
        response = openai.chat.completions.create(
            model=self.model_openai,
            # The system message goes first so it frames the user message.
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content

    def get_prompt(self, query, context, biased=False):
        """Build the user prompt and the system prompt for one request.

        Args:
            query: Aspect of the product the summary should focus on.
            context: Concatenated review texts (see ``create_context``).
            biased: If True, the system prompt additionally instructs the model
                to favour reviews with higher ratings.

        Returns:
            Tuple ``(prompt, system_prompt)``.
        """
        query = f"Aspect: {query}"

        instructions = """
        Given the aspect highlighted, summarize the reviews that were given to you in two short sentences. 
        Stay factual and do not include your opinion. Do not use additional information, besides the reviews provided.
        """
        instructions_system = "You are an expert AI product review summarizer. "

        if biased:
            instructions_system += "You are biased towards the reviews with higher ratings."

        prompt = f"{query}\n\n{context}\n\n{instructions}"
        return prompt, instructions_system

    def get_reviews(self, query, reviews, biased=False):
        """Score every review against the query and sort by that score.

        The score is the cosine similarity between the query embedding and the
        embedding of "<title> <text>". With ``biased=True`` it becomes
        ``alpha * cosine + (1 - alpha) * rating / max_rating``.

        Args:
            query: Text the reviews are compared with.
            reviews: DataFrame of reviews; it is not modified.
            biased: Whether to blend the star rating into the score.

        Returns:
            A new DataFrame with an added ``'similarity'`` column, sorted by it
            in descending order.
        """
        if len(reviews) == 0:
            # Nothing to embed; still return the expected column.
            return reviews.assign(similarity=[])

        texts = [
            f"{title} {text}"
            for title, text in zip(reviews[self.title_column], reviews['text'])
        ]

        query_embedding = self.similarity_model.encode(query)
        # One batched call instead of one model call per review.
        review_embeddings = self.similarity_model.encode(texts, show_progress_bar=True)

        # cos_sim returns a (1, n_reviews) matrix; take its only row.
        cosine_scores = util.cos_sim(query_embedding, review_embeddings)[0].tolist()

        if biased:
            similarities = [
                self.alpha * cosine + (1 - self.alpha) * (rating / self.max_rating)
                for cosine, rating in zip(cosine_scores, reviews['rating'])
            ]
        else:
            similarities = cosine_scores

        # assign() returns a new frame, so the caller's DataFrame stays untouched.
        scored = reviews.assign(similarity=similarities)
        return scored.sort_values(by='similarity', ascending=False)

    def get_summary(self, query, bias_type="None"):
        """Retrieve reviews for the query and summarize them with the LLM.

        ``bias_type`` selects the single stage at which a bias towards highly
        rated reviews is injected:

        - "filter": the star rating is blended into the similarity score used
          to select the reviews.
        - "ranking": the selected reviews are sorted by star rating inside the
          context given to the LLM.
        - "prompt": the system prompt explicitly tells the LLM to favour
          reviews with higher ratings.

        Any other value (e.g. "none") runs the unbiased pipeline.

        Returns:
            Dict with the keys "query", "answer", "context", "topk",
            "bias_type" and "prompt".
        """
        # TODO: review embeddings do not depend on the bias type but are
        # recomputed on every call; cache them if this becomes a bottleneck.
        reviews_sorted = self.get_reviews(query, self.reviews, biased=(bias_type == "filter"))
        context = self.create_context(reviews_sorted, biased=(bias_type == "ranking"))
        prompt, system_prompt = self.get_prompt(query, context, biased=(bias_type == "prompt"))
        answer = self.get_response(prompt, system_prompt=system_prompt)

        answer_obj = {
            "query": query,
            "answer": answer,
            "context": context,
            "topk": self.topk,
            "bias_type": bias_type,
            # "reviews": reviews_sorted, # Keep for debug
            "prompt": prompt,}
        return answer_obj

query = "image quality"
# One result list per bias type; "none" runs the unbiased pipeline as baseline.
bias_types = {
    "none": [],
    "filter": [],
    "ranking": [], 
    "prompt": [],
}


if __name__ ==  "__main__":
    model = AI_Summarizer(matching_reviews)

    # Generate one summary per bias type.
    for bias_type, bias_objs in bias_types.items():
        print(f"Bias Type: {bias_type}")
        answer_obj = model.get_summary(query, bias_type=bias_type)
        bias_objs.append(answer_obj)

    # Display the results
    for bias_type, bias_objs in bias_types.items():
        answer_obj = bias_objs[0]
        print(f"Bias Type: {answer_obj['bias_type']}")
        print(f"Query: {answer_obj['query']}")
        # print (not pprint): pprint would show the string's Python repr,
        # i.e. quotes and parentheses around the wrapped answer.
        print(f"Answer: {answer_obj['answer']}")
        print("\n")

Bias Type: none


  0%|          | 0/138 [00:00<?, ?it/s]

Bias Type: filter


  0%|          | 0/138 [00:00<?, ?it/s]

Bias Type: ranking


  0%|          | 0/138 [00:00<?, ?it/s]

Bias Type: prompt


  0%|          | 0/138 [00:00<?, ?it/s]

Bias Type: none
Query: image quality
('Answer: The Tiffen 62UVP 62mm UV Protection Filter is widely regarded as a '
 'good value for its protective qualities, with several users noting no '
 'significant impact on image quality. Some reviews highlight its '
 'effectiveness in shielding lenses from scratches and damage, while a few '
 'caution about potential vignetting when used with specific wide-angle '
 'lenses.')


Bias Type: filter
Query: image quality
('Answer: The Tiffen 62UVP 62mm UV Protection Filter is generally praised for '
 'its ability to protect lenses without causing noticeable degradation in '
 'image quality. Users appreciate its affordability and effectiveness as a '
 'protective measure, though some note potential vignetting issues when used '
 'with specific wide-angle lenses.')


Bias Type: ranking
Query: image quality
('Answer: The Tiffen 62UVP 62mm UV Protection Filter generally does not '
 'degrade image quality and is seen as an effective lens protector by use