<a href="https://colab.research.google.com/github/kr5red/automated-customer-reviews/blob/main/Model3_Ki.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1) Summarize product reviews with Mistral-7B-Instruct

In [None]:
!pip install -q transformers accelerate torch sentencepiece bitsandbytes tqdm

In [None]:
import pandas as pd
from tqdm import tqdm
import re

# Path of the reviews CSV to load -- adjust the filename to match your data.
REVIEWS_CSV = "reviews_with_sentiment.csv"
# Column whose values identify the product each review belongs to.
PRODUCT_COL = "name"

df = pd.read_csv(REVIEWS_CSV)

def light_clean(s):
    """Normalize whitespace in one review text.

    Leading/trailing whitespace is removed and every internal run of whitespace
    (spaces, tabs, newlines) is collapsed to a single space.

    Args:
        s: The value to clean. Non-string values are converted with ``str``.

    Returns:
        The cleaned string. ``None`` and float NaN yield ``""`` instead of the
        literal texts ``"None"`` / ``"nan"``.
    """
    # NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    return re.sub(r"\s+", " ", str(s).strip())
# Whitespace-normalized copy of each review; missing texts become "".
df["text_for_sum"] = df["text_merged"].fillna("").apply(light_clean)

# One row per product, holding all of its reviews joined by a blank line.
# Reviews that are empty after cleaning are skipped so they do not add
# empty paragraphs to the text that is later sent to the model.
df_prod = (
    df.groupby(PRODUCT_COL)["text_for_sum"]
      .apply(lambda xs: "\n\n".join(x for x in xs.tolist() if x))
      .reset_index()
      .rename(columns={"text_for_sum": "all_reviews"})
)

# Register `progress_apply` on pandas objects so the per-product loop shows a progress bar.
tqdm.pandas()
# NOTE(review): `mistral_summarize` is defined in a LATER cell of this notebook (as are
# the tokenizer and model it uses). On a fresh kernel this line raises NameError unless
# those cells are executed first -- run them before this cell, or move this step to the end.
df_prod["review_summary"] = df_prod["all_reviews"].progress_apply(lambda t: mistral_summarize(t))

# Persist the per-product table (including the new summary column) without the index.
df_prod.to_csv("product_review_summaries_mistral_colab.csv", index=False)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Hub identifier of the checkpoint loaded below for both tokenizer and model.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"

# 4-bit quantized loading -> fits in Colab GPU memory
# (weights are loaded in 4 bit; float16 is used as the compute dtype).
bnb_cfg = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Module-level tokenizer/model pair; `mistral_summarize` reads both as globals.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_cfg,
    device_map="auto"  # let the library choose where to place the weights
)

In [None]:
def mistral_summarize(text, max_new_tokens=220, temperature=0.3, max_input_tokens=4096):
    """Summarize customer reviews with the loaded Mistral instruct model.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Review text to summarize (e.g. all reviews of one product joined
            together).
        max_new_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature. A value > 0 enables sampling with
            ``top_p=0.9``; ``0`` or ``None`` selects greedy decoding.
        max_input_tokens: Maximum number of review tokens kept in the prompt.
            Longer inputs are cut off after this many tokens so the prompt
            stays bounded. Pass ``None`` to disable truncation.

    Returns:
        The generated summary as a stripped string, or ``""`` when ``text`` is
        ``None`` or contains only whitespace.
    """
    text = "" if text is None else str(text)
    if not text.strip():
        # Nothing to summarize: skip the model call rather than let it invent content.
        return ""

    if max_input_tokens is not None:
        review_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
        if len(review_ids) > max_input_tokens:
            # Truncate the reviews themselves (not the finished prompt) so the
            # instruction wrapper added by the chat template is never cut off.
            text = tokenizer.decode(review_ids[:max_input_tokens], skip_special_tokens=True)

    system = ("You are a helpful assistant that writes coherent, fair product review summaries. "
              "Write a single paragraph that naturally blends positive and negative opinions.")
    user = f"Summarize the following customer reviews:\n\n{text}"

    # Build the prompt with the tokenizer's own chat template instead of hand-written
    # Llama-2 style `<<SYS>>` markup with a manual `<s>` (which would double the BOS token).
    # The instructions are folded into the user turn so this works even with
    # templates that reject a separate "system" role.
    messages = [{"role": "user", "content": f"{system}\n\n{user}"}]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        # Explicit pad id avoids the "setting pad_token_id to eos_token_id" warning.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        # temperature/top_p only take effect when sampling is switched on.
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
    else:
        gen_kwargs["do_sample"] = False

    outputs = model.generate(**inputs, **gen_kwargs)

    # Keep only the newly generated tokens. Slicing by prompt length is robust
    # where splitting the decoded text on "[/INST]" is not: the marker may be
    # stripped as a special token, or may occur inside the reviews themselves.
    prompt_len = inputs["input_ids"].shape[1]
    reply_ids = outputs[0][prompt_len:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()