## News Personalization: Leveraging RAG for Targeted Content Delivery

## Setup

In [None]:
!pip install --upgrade pip

In [None]:
!pip install -qU \
    kaggle \
    sagemaker \
    pinecone-client==2.2.1 \
    ipywidgets==7.0.0\
    seaborn\
    sentence-transformers\
    torch==1.13.1 \
    transformers==4.27.2

## 1. Extracting the archived news dataset from Kaggle

In [None]:
#!pip install --q kaggle 

[Note: According to the Kaggle API documentation, the credentials file is expected at ~/.kaggle/kaggle.json.]

In [None]:
!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

In [None]:
import os

# Kaggle API credentials. Values are taken from the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables when they are set, so real secrets do not have to be
# typed into (and saved with) the notebook. The original placeholder values are
# kept as fallbacks so the variable always has the same shape.
api_token = {
    "username": os.environ.get("KAGGLE_USERNAME", "abc"),
    "key": os.environ.get("KAGGLE_KEY", "1234"),
}

In [None]:
import json
import os

# Write the credentials to ~/.kaggle/kaggle.json. The home directory is resolved
# with expanduser instead of being hard-coded to /root, so the path matches the
# `~/.kaggle` used by the shell commands even when the notebook is not run as root.
kaggle_json_path = os.path.expanduser("~/.kaggle/kaggle.json")
os.makedirs(os.path.dirname(kaggle_json_path), exist_ok=True)

with open(kaggle_json_path, 'w') as file:
    json.dump(api_token, file)

# Owner read/write only -- the same permissions as `chmod 600`.
os.chmod(kaggle_json_path, 0o600)

In [None]:
!kaggle datasets download -d rmisra/news-category-dataset --unzip

In [None]:
# Display the path (under the current working directory) from which the dataset is read.
f"{os.getcwd()}/Data/News_Category_Dataset_v3.json"

### READ IN THE DATASET

In [None]:
import os
import pandas as pd

# Load the JSON-lines file (one JSON record per line) into a DataFrame.
df = pd.read_json(
    f"{os.getcwd()}/Data/News_Category_Dataset_v3.json",
    lines=True,
)
#print(df.shape)
#df

In [None]:
#df.groupby('category').agg(_num_articles=('headline','count'),
#                           _min_dt=('date','min'),
#                          _max_dt=('date','max')).reset_index().sort_values('_num_articles',ascending=False)[:5]

## 2. Encode and upsert the data into a vector database

### Setup - Sentence encoder

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the pretrained sentence encoder.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
encoder = SentenceTransformer(model_name)

# Sanity check: embed two toy sentences and inspect the result.
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]
embeddings = encoder.encode(sentences)

print(
    f"Number of sentences embedded = {len(embeddings)}",
    f"Length of emberddings embedded = {len(embeddings[0])}",
    f"First 10 elements of the embedding =\n {embeddings[0][:10]}",
    sep="\n",
)

In [None]:
# Work on a small leading portion of the data. Label-based .loc slicing includes
# the end label, so 0:1000 selects labels 0 through 1000.
# .copy() makes the subset an independent frame, so the column assignments in the
# following cells do not raise pandas' SettingWithCopyWarning.
df_subset_test = df.loc[0:1000].copy()

In [None]:
def generate_item_sentence(item: "pd.Series",
                           text_columns: "list[str]") -> str:
    """
    Concatenate the selected columns of one row into a single space-separated string.

    No embedding is computed here; the returned text is what gets encoded later.

    Args:
        item (pd.Series): row of a pandas dataframe (anything indexable by column name works)
        text_columns (list[str]): names of the columns to concatenate, in order
    Returns:
        str: the selected values joined by a single space ('' when no columns are given)
    """
    # str() guards against non-string cells (e.g. numbers), which would otherwise
    # make str.join raise TypeError.
    return ' '.join(str(item[column]) for column in text_columns)

In [None]:
# Text to embed for every row: "<headline> <short_description>".
df_subset_test["sentence"] = df_subset_test.apply(
    lambda row: generate_item_sentence(row, ["headline", "short_description"]),
    axis=1,
)

# Encode all sentences in one batched call instead of one encoder call per row.
# encode() on a list returns a 2-D array; list() splits it back into one
# embedding vector per row for the new column.
df_subset_test["sentence_embedding"] = list(
    encoder.encode(df_subset_test["sentence"].tolist())
)

In [None]:
# Inspect the helper's annotations and docstring. The annotations are printed
# explicitly because a bare expression is only displayed when it is the last
# statement of a cell.
print(generate_item_sentence.__annotations__)
help(generate_item_sentence)

#### PINECONE SETUP

In [None]:
import pinecone
import os

from getpass import getpass

# Do not store the API key in the notebook: reuse PINECONE_API_KEY from the
# environment when it is set, otherwise prompt for it without echoing the input.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
# Respect a pre-configured environment name; default to "gcp-starter".
os.environ.setdefault("PINECONE_API_ENV", "gcp-starter")

pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_API_ENV"],
)

# List all indexes
pinecone.list_indexes()

In [None]:
import time

index_name = 'news-articles-rag-aws'

# Start from a clean slate: drop the index if it already exists.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# Show the indexes that remain after the (possible) deletion.
print(pinecone.list_indexes())

# The index dimension must match the size of the vectors the encoder produces.
pinecone.create_index(
    name=index_name,
    dimension=encoder.get_sentence_embedding_dimension(),
    metric='cosine'
)

# Wait for the index to finish initialization, but give up after a bounded time
# instead of looping forever if it never reports ready.
index_ready_timeout_s = 300
wait_started = time.monotonic()
while not pinecone.describe_index(index_name).status['ready']:
    if time.monotonic() - wait_started > index_ready_timeout_s:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after {index_ready_timeout_s} seconds"
        )
    time.sleep(1)

In [None]:
# Check the index creation: list the indexes again so the result is displayed.
pinecone.list_indexes()

### Upsert data into Pinecone

In [None]:
from tqdm.auto import tqdm

batch_size = 2  # can increase but needs larger instance size otherwise instance runs out of memory
vector_limit = df_subset_test.shape[0]  # upper bound on the number of rows to upsert (here: all of them)

answers = df_subset_test[:vector_limit]
index = pinecone.Index(index_name)

for i in tqdm(range(0, len(answers), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(answers))
    # IDs are the positional row numbers, as strings
    ids = [str(x) for x in range(i, i_end)]
    if i % 100 == 0:
        print(f"i = {i}, i_end = {i_end}, ids = {ids}")
    # .iloc makes the positional slicing explicit
    texts = answers["sentence"].iloc[i:i_end].tolist()
    # the original text is stored as metadata so it can be returned by queries
    metadatas = [{'text': text} for text in texts]
    # encode the whole batch in one call; .tolist() yields plain lists of floats
    embeddings = encoder.encode(texts).tolist()
    # materialise the (id, vector, metadata) tuples before handing them to the client
    records = list(zip(ids, embeddings, metadatas))
    # upsert to Pinecone
    index.upsert(vectors=records)

In [None]:
# Check the number of records in the index: get a handle to it and display the
# statistics returned by describe_index_stats().
index = pinecone.Index(index_name)
index.describe_index_stats()

## 3. Leveraging RAG for Targeted Content Delivery

### LLM Setup

In [None]:
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig
import torch
import pandas as pd

# Checkpoint used for generation; tokenizer and model are loaded from the same name.
model_name = 'google/flan-t5-base'

tokenizer_flan = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model_flan = AutoModelForSeq2SeqLM.from_pretrained(model_name)

### RAG Setup

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used to embed queries -- the same checkpoint that was used
# earlier in this notebook to embed the articles.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
encoder = SentenceTransformer(model_name)

In [None]:
import pinecone
import os

from getpass import getpass

# Do not store the API key in the notebook: reuse PINECONE_API_KEY from the
# environment when it is set, otherwise prompt for it without echoing the input.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
# Respect a pre-configured environment name; default to "gcp-starter".
os.environ.setdefault("PINECONE_API_ENV", "gcp-starter")

pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_API_ENV"],
)

# List all indexes
pinecone.list_indexes()

In [None]:
# Check the number of records in the index: bind `index` to the named Pinecone
# index (the helper functions below query through this variable) and display
# its statistics.
index_name='news-articles-rag-aws'
index = pinecone.Index(index_name)
index.describe_index_stats()

### Helper Functions

In [None]:
def retriver(query_text: str, top_k: int) -> list[str]:
    """
    Retrieve the stored text of the articles most similar to a query from the Pinecone index.

    Relies on the notebook-level ``encoder`` and ``index`` objects being defined
    before it is called.

    Args:
         query_text (str): User query
         top_k (int): Number of matches to request
    Returns:
         list[str]: The 'text' metadata of each returned match, in the order Pinecone returns them
    """
    # Embed the query with the same encoder that produced the indexed vectors.
    query_vector = encoder.encode(query_text).tolist()

    # The vector is passed by keyword so the call does not depend on the
    # positional parameter order of Index.query.
    res = index.query(vector=query_vector, top_k=top_k, include_metadata=True)

    return [match.metadata['text'] for match in res.matches]

In [None]:
# Inspect the helper's annotations and docstring. The annotations are printed
# explicitly because a bare expression is only displayed when it is the last
# statement of a cell.
print(retriver.__annotations__)
help(retriver)

In [None]:
from typing import List



def construct_context(contexts: List[str],max_section_len: int,separator: str) -> str:
    """
    Build the context string from the retrieved passages, within a length budget.

    Passages are stripped and appended in order until adding the next one
    (plus the separator that joins it to the previous passage) would exceed
    ``max_section_len`` characters; that passage and all later ones are dropped.

    Args:
         contexts (List[str]): Retrieved passages, most relevant first
         max_section_len (int): Maximum length, in characters, of the returned string
         separator (str): String placed between passages (e.g. a space or a newline)
    Returns:
         str: The selected passages joined by ``separator`` ('' if none fit)
    """
    chosen_sections = []
    chosen_sections_len = 0

    for text in contexts:
        text = text.strip()
        # Joining adds one separator before every passage except the first, so
        # charge the real separator length (not a fixed 2) and only when needed.
        added_len = len(text) + (len(separator) if chosen_sections else 0)
        # Add contexts until we run out of space.
        if chosen_sections_len + added_len > max_section_len:
            break
        chosen_sections.append(text)
        chosen_sections_len += added_len

    return separator.join(chosen_sections)

In [None]:
# Inspect the helper's annotations and docstring. The annotations are printed
# explicitly because a bare expression is only displayed when it is the last
# statement of a cell.
print(construct_context.__annotations__)
help(construct_context)

In [None]:
def construct_payload(prompt_template: str,
                      question: str,
                      context_str: str,
                      padding:str="longest")-> str:
    """
    Construct the prompt for the LLM by filling in the template placeholders.

    Every ``{context}`` in the template is replaced by ``context_str`` and every
    ``{question}`` by ``question``. Both substitutions happen in a single pass,
    so placeholder-like text inside the inserted context or question is left
    untouched.

    Args:
        prompt_template (str): Template containing {context} and/or {question}
        question (str): Question for the LLM
        context_str (str): Context information for the LLM
        padding (str): Not used by this function; accepted so existing calls keep working
    Returns:
         str: LLM prompt
    """
    import re

    replacements = {"context": context_str, "question": question}
    # A replacement function (instead of a replacement string) inserts the values
    # verbatim, without interpreting backslashes or group references in them.
    return re.sub(r"\{(context|question)\}",
                  lambda match: replacements[match.group(1)],
                  prompt_template)
    

In [None]:
# Inspect the helper's annotations and docstring. The annotations are printed
# explicitly because a bare expression is only displayed when it is the last
# statement of a cell.
print(construct_payload.__annotations__)
help(construct_payload)

In [None]:
# Prompt skeleton; {context} and {question} are the placeholders that
# construct_payload fills in. (A stray `".` after the first sentence, which was
# being sent to the model verbatim, has been removed.)
prompt_template = """Answer the following QUESTION without hallucination.

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

### LLM Base response without RAG

In [None]:
max_source_length = 512
# Integer division: round(max_source_length/2, 0) returns the float 256.0,
# whereas max_new_tokens is a token count and should be an int.
max_target_length = max_source_length // 2
padding = "longest"

question = "What is the news about air travel?"

# Baseline run: no retrieved context is supplied to the model.
context_str = ""

prompt = construct_payload(prompt_template, question, context_str, padding="longest")

inputs = tokenizer_flan(prompt,
    max_length=max_source_length,
    return_tensors='pt',
    padding=padding,
    truncation=True)

# Forward the whole encoding (input_ids and attention_mask) to generate(), and
# let decode() drop special tokens such as <pad> and </s> instead of removing
# them with string replacement.
base_output = tokenizer_flan.decode(
    model_flan.generate(**inputs, max_new_tokens=max_target_length)[0],
    skip_special_tokens=True,
)

print(f'LLM RESPONSE WITHOUT RAG CONTEXT:\n{base_output}')

### LLM response with RAG - FLAN-T5-Base

In [None]:
max_source_length = 512
max_target_length = 1000
padding = "longest"

question = "What is the news about air travel?"

# Retrieve the passages most similar to the question
contexts = retriver(query_text=question, top_k=5)
print(f"{contexts=}")

# Join the retrieved passages into one context string (at most 2000 characters)
context_str = construct_context(contexts=contexts, max_section_len=2000, separator='\n')
print(f"\n{context_str=}")

# Create the prompt
prompt = construct_payload(prompt_template, question, context_str, padding="longest")
print(f"\n{prompt=}")

inputs = tokenizer_flan(prompt,
    max_length=max_source_length,
    return_tensors='pt',
    padding=padding,
    truncation=True)

# Forward the whole encoding (input_ids and attention_mask) to generate(), and
# let decode() drop special tokens such as <pad> and </s> instead of removing
# them with string replacement.
rag_output = tokenizer_flan.decode(
    model_flan.generate(**inputs, max_new_tokens=max_target_length)[0],
    skip_special_tokens=True,
)

print(f'\n\nLLM RESPONSE WITH RAG CONTEXT:\n{rag_output}')


### LLM response with RAG - FLAN-T5-Small

In [None]:
# Smaller FLAN-T5 checkpoint: load the model and its tokenizer.
model_flan_small = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
tokenizer_flan_small = AutoTokenizer.from_pretrained("google/flan-t5-small")

In [None]:
max_source_length = 512
max_target_length = 1000
padding = "longest"

question = "What is the news about air travel?"

# Retrieve the passages most similar to the question
contexts = retriver(query_text=question, top_k=5)
print(f"{contexts=}")

# Join the retrieved passages into one context string (at most 2000 characters)
context_str = construct_context(contexts=contexts, max_section_len=2000, separator='\n')
print(f"\n{context_str=}")

# Create the prompt
prompt = construct_payload(prompt_template, question, context_str, padding="longest")
print(f"\n{prompt=}")

inputs = tokenizer_flan_small(prompt,
    max_length=max_source_length,
    return_tensors='pt',
    padding=padding,
    truncation=True)

# Special tokens (<pad>, </s>) are dropped while decoding so the printed answer
# is clean, consistent with the FLAN-T5-Base cells of this notebook; the whole
# encoding (input_ids and attention_mask) is forwarded to generate().
rag_output = tokenizer_flan_small.decode(
    model_flan_small.generate(**inputs, max_new_tokens=max_target_length)[0],
    skip_special_tokens=True,
)

print(f'\n\nLLM RESPONSE WITH RAG CONTEXT:\n{rag_output}')


### LLM response with RAG - Falconsai/text_summarization

In [None]:
# Summarization checkpoint: load the model and its tokenizer.
model_falcon = AutoModelForSeq2SeqLM.from_pretrained("Falconsai/text_summarization")
tokenizer_falcon = AutoTokenizer.from_pretrained("Falconsai/text_summarization")

In [None]:
from transformers import pipeline

# NOTE: these two values are assigned but not referenced anywhere in this cell.
max_source_length = 2000
max_target_length = 20000

# Retrieve the passages most similar to the question.
# `question` is the variable set in the earlier generation cells.
contexts = retriver(query_text=question, top_k=5)
print(f"{contexts=}")

# Join the retrieved passages into one context string (at most 2000 characters)
context_str = construct_context(contexts=contexts,
                                max_section_len=2000, separator='\n')
print(f"\n{context_str=}")

# Reuse the model and tokenizer already loaded as model_falcon / tokenizer_falcon
# instead of having the pipeline load the same checkpoint a second time.
summarizer = pipeline("summarization", model=model_falcon, tokenizer=tokenizer_falcon)

# NOTE(review): max_length is derived from the character count of the context
# (plus 10), not from a token count -- confirm this is the intended bound.
rag_sumamrizer = summarizer(context_str, max_length=len(context_str)+10, min_length=30, do_sample=False)[0]['summary_text']

print(f"\n\nLLM RESPONSE WITH RAG CONTEXT USING Falconsai/text_summarization =\n{rag_sumamrizer}")