# Script to Create Explanations to Recommendations - [GOOGLE COLAB]

In [1]:
# Install missing libraries
!pip install -q bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
#@title Import Libraries
import os
import pickle
import numpy as np
import random
import itertools
import json

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# If you don't have it, you need to create a token directly from the model page in colab
# https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
!huggingface-cli login

In [5]:
# Function to download the data - Google Colab
def load_dict_from_drive(file_path):
    """
    Load dictionary from Google Drive pickle file.
    """
    with open(file_path, 'rb') as file:
        data_dict = pickle.load(file)

    return data_dict

## Define LLM Task to create Explanations

In [7]:
# Google Colab
class ChatBot:
    def __init__(self, model_name="meta-llama/Llama-3.1-8B-Instruct"):
        """
        Initialize the ChatBot with the specified model name.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Optimize model loading with 4-bit quantization
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=bnb_config
        )

    def get_response(self, message):
        """
        Generate a response from the model based on the given prompt.
        """
        try:
            formatted_prompt = f"[INST] {message} [/INST]"
            inputs = self.tokenizer(
                formatted_prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=1024
            ).to("cuda")

            output = self.model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=300,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            response = self.tokenizer.decode(output[0], skip_special_tokens=True)

            # Remove input prompt from response if echoed
            return response[len(formatted_prompt):].strip()

        except Exception as e:
            print(f"Error in get_response: {e}")
            return "Error: Unable to generate a response."

    def chat_user_profile(self, user_item_dict, item_info_dict, user_id, user_preferences, user_feedback):
        """
        Generate a user profile based on the user's reading history, preferences, and feedback
        """

        # 1. System Instructions (Formal, Structured)
        content_begin = (
            "[INST] <<SYS>>\n"
            "Generate a structured user profile based on the provided reading history, preferences, and feedback.\n"
            "The response should be **formal, objective, and data-driven** with clear sections.\n"
            "Avoid conversational language. Focus on summarizing the user's reading behavior concisely.\n"
            "Keep the response **under 300 tokens** and avoid unnecessary details.\n"
            "<</SYS>>\n\n"
        )

        # 2. User's Historical Interactions
        content_user_history = "### User's Reading History:\n"
        user_categories = set()

        for item_id in user_item_dict.get(user_id, []):
            book_info = item_info_dict.get(item_id, {})
            title = book_info.get('standard_title', 'Unknown Title')
            categories = book_info.get('parsed_categories', [])

            if isinstance(categories, str):
                categories = [categories]

            user_categories.update(categories)
            content_user_history += f"- {title} ({', '.join(categories)})\n"

        # 3. User's Onboarding Preferences
        user_pref_categories = set(user_preferences.get(user_id, []))
        content_user_prefs = (
            f"\n### User's Preferences:\n"
            f"- Favorite Genres: {', '.join(user_pref_categories) if user_pref_categories else 'No preferences recorded'}\n"
        )

        # 4. User Profile Task
        content_request = (
            "\n### User Profile Analysis:\n"
            "Based on the provided data, generate a structured analysis of the user's reading behavior:\n"
            "- **Primary Reading Interests**: The most frequently read genres or topics.\n"
            "- **Thematic Preferences**: Common themes appearing in past selections.\n"
            "- **Reading Patterns**: Trends in book selection (e.g., focus on non-fiction, recurring authors, etc.).\n"
            "Ensure the profile remains concise and does not exceed 300 tokens.\n"
            "[/INST]"
        )

        # Combine all parts
        content = content_begin + content_user_history + content_user_prefs + content_request

        print("SEND: ")
        print(content)

        # Get response
        response = self.get_response(content)

        print("GPT Profile: ")
        print(response)
        print()

        return response


    def chat_recommendation_explanation(self, user_item_dict, item_info_dict, user_id, recommended_item, user_preferences, user_feedback):
        """
        Explain why a specific book was recommended based on user behavior
        """

        # 1. System Instructions
        content_begin = (
            "[INST] <<SYS>>\n"
            "Generate a direct and concise explanation for why a specific book was recommended to the user.\n"
            "The explanation should focus strictly on the connection between the user's reading history and preferences and the recommended book.\n"
            "Avoid any pleasantries or mannerisms, and focus only on the relevant reasoning.\n"
            "Keep the response under 300 tokens.\n"
            "<</SYS>>\n\n"
            )

        # 2. User's Historical Interactions
        content_user_history = "### User's Past Reads:\n"
        user_categories = set()

        for item_id in user_item_dict.get(user_id, []):
            book_info = item_info_dict.get(item_id, {})
            title = book_info.get('standard_title', 'Unknown Title')
            categories = book_info.get('parsed_categories', [])

            if isinstance(categories, str):
                categories = [categories]

            user_categories.update(categories)
            content_user_history += f"- {title} ({', '.join(categories)})\n"

        # 3. Recommended Book Details
        book_info = item_info_dict.get(recommended_item, {})
        title = book_info.get('standard_title', 'Unknown Title')
        categories = book_info.get('parsed_categories', [])

        if isinstance(categories, str):
            categories = [categories]

        # Determine recommendation reason
        if user_categories.intersection(categories) or set(user_preferences.get(user_id, [])).intersection(categories):
            recommendation_reason = "because it aligns with your past reading choices."
        else:
            recommendation_reason = "since readers with similar interests have enjoyed it."

        content_recommendation = (
            f"\n### Recommended Book: {title}\n"
            f"- Genre(s): {', '.join(categories)}\n"
            f"- Reason: This book was suggested {recommendation_reason}\n"
        )

        # 4. Final Task Instructions
        content_request = (
            "\n### Task: Provide a Short Explanation\n"
            "Explain to the user directly why this book was recommended based on their reading history and preferences.\n"
            "Keep the explanation focused and concise, with no unnecessary language.\n\n"
            "[/INST]"
            )

        # Combine all parts
        content = content_begin + content_user_history + content_recommendation + content_request

        print("SEND: ")
        print(content)

        # Get response
        response = self.get_response(content)

        print("GPT Explanation: ")
        print(response)
        print()

        return response

## Upload the Data Sources

### User-Item Historical Interactions

In [8]:
# Import User-Item interactions dictionary - Colab
file_key = '/content/drive/My Drive/Capstone - Spring 2025 Personal/user_profiles/user_item_dict.pkl'
user_item_dict = load_dict_from_drive(file_key)

In [9]:
for i, (key, value) in enumerate(user_item_dict.items()):
    if i >= 5:
        break
    print(f"User ID: {key}, Items: {value}")

User ID: A01038432MVI9JXYTTK5T, Items: [15693, 17623, 20231, 23671, 25313, 26282, 34776, 34901, 51220, 55669, 56795, 60236]
User ID: A100NGGXRQF0AQ, Items: [141, 3330, 5426, 8736, 15511, 21557, 25655, 26605, 38825, 45116, 59372, 60614, 63574, 65047, 70451]
User ID: A100V1W0C8BWOL, Items: [21901, 24501, 28429, 29020, 30221, 32803, 36113, 10022, 47988, 58373, 64427]
User ID: A100YHBWL4TR4D, Items: [7209, 7396, 11726, 14023, 19355, 21738, 27003, 30061, 33300, 45369, 47419, 45667, 50844, 68750]
User ID: A101446I5AWY0Z, Items: [2219, 2269, 3322, 4343, 5987, 8056, 10827, 15499, 16602, 17109, 23670, 31893, 32068, 32259, 32596, 39109, 40873, 41154, 48451, 51637, 55990, 66303, 67211, 68516, 72230]


In [10]:
###### FOR TESTING ONLY ######
N = 1
sampled_keys = random.sample(list(user_item_dict.keys()), N)
sampled_dict = {key: user_item_dict[key] for key in sampled_keys}

print(sampled_dict)

{'A2T1WFGFHXO0N8': [10216, 13810, 38127, 38464, 43309, 46196, 58516, 58567, 61727, 67369, 71912]}


### Books Metadata

In [11]:
# Import Items (books) metadata dictionary - Colab
file_key ='/content/drive/My Drive/Capstone - Spring 2025 Personal/user_profiles/item_info_dict.pkl'
item_info_dict = load_dict_from_drive(file_key)

In [12]:
for key, value in itertools.islice(item_info_dict.items(), 5):
    print(key, ":", value)

0 : {'standard_title': 'dr seuss american icon', 'parsed_categories': ['Biography & Autobiography']}
1 : {'standard_title': 'wonderful worship in smaller churches', 'parsed_categories': ['Religion']}
2 : {'standard_title': 'rising sons and daughters life among japans new young', 'parsed_categories': ['Social Science']}
3 : {'standard_title': 'muslim womens choices religious belief and social reality', 'parsed_categories': ['Religion']}
4 : {'standard_title': 'dramatica for screenwriters', 'parsed_categories': ['Reference']}


### User's Onboarding Input

In [13]:
file_key = '/content/drive/My Drive/Capstone - Spring 2025 Personal/user_profiles/onboarding_dict.pkl'
onboarding_dict = load_dict_from_drive(file_key)

In [14]:
for key, value in itertools.islice(onboarding_dict.items(), 5):
    print(key, ":", value)

A01038432MVI9JXYTTK5T : ['American fiction', 'POETRY', 'Materia medica', 'Medieval']
A100NGGXRQF0AQ : ['Fruit', 'French drama']
A100V1W0C8BWOL : ['Athens (Greece)', 'Imagination', 'ices', 'Cross-country (Horsemanship)', 'Coming of age']
A100YHBWL4TR4D : ['Israel', 'Nathaniel']
A101446I5AWY0Z : ['Kuṇḍalinī']


### User's Feedback Data

In [15]:
file_key = '/content/drive/My Drive/Capstone - Spring 2025 Personal/user_profiles/feedback_dict.pkl'
feedback_dict = load_dict_from_drive(file_key)

In [16]:
for key, value in itertools.islice(feedback_dict.items(), 5):
    print(key, ":", value)

A01038432MVI9JXYTTK5T : {'liked': [], 'disliked': [25313]}
A100NGGXRQF0AQ : {'liked': [3330], 'disliked': [141]}
A100V1W0C8BWOL : {'liked': [], 'disliked': [30221, 10022, 58373, 64427]}
A100YHBWL4TR4D : {'liked': [7209, 11726, 47419], 'disliked': [27003]}
A101446I5AWY0Z : {'liked': [], 'disliked': []}


### Recommendations

In [17]:
file_key = '/content/drive/My Drive/Capstone - Spring 2025 Personal/user_profiles/cf_dict.pkl'
cf_dict = load_dict_from_drive(file_key)

In [18]:
for key, value in itertools.islice(cf_dict.items(), 5):
    print(key, ":", value)

A01038432MVI9JXYTTK5T : [70013, 65089, 48129, 48178, 33184, 56203]
A100NGGXRQF0AQ : [39119, 38995, 38952, 68863, 2071, 40865]
A100V1W0C8BWOL : [54344, 18725, 33473, 49862]
A100YHBWL4TR4D : [57027, 17324, 67786]
A101446I5AWY0Z : [9830, 69737, 24746, 5877]


### Load the LLM model

In [19]:
# Pick a model
model_name = "meta-llama/Llama-2-7b-chat-hf"

In [20]:
# Create an instance of the ChatBot class
chatbot = ChatBot(model_name=model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [21]:
print(f"Number of unique users: {len(user_item_dict)}")
print(f"Number of unique books: {len(item_info_dict)}")

Number of unique users: 14402
Number of unique books: 72308


In [22]:
# Create dictionary to store results
result_dict = {}
counter = 0

In [23]:
# Loop through sampled users
for user_id in sampled_dict:
    recommended_book = cf_dict.get(user_id, [None])[0]  # Get the first recommendation

    if recommended_book is None:
        print(f"User {user_id} has no recommendations.")
        continue

    # Generate user profile
    user_profile = chatbot.chat_user_profile(
        user_item_dict,   # User-item interactions
        item_info_dict,   # Book metadata
        user_id,          # Current user
        onboarding_dict,  # User preferences
        feedback_dict     # User feedback
    )

    # Generate the recommendation explanation
    explanation = chatbot.chat_recommendation_explanation(
        user_item_dict,   # User-item interactions
        item_info_dict,   # Book metadata
        user_id,          # Current user
        recommended_book, # Recommended book ID
        onboarding_dict,  # User preferences
        feedback_dict     # User feedback
    )

    # Ensure user_id exists in result_dict before storing responses
    if user_id not in result_dict:
        result_dict[user_id] = {}

    # Store both user profile and explanation separately
    result_dict[user_id]["user_profile"] = user_profile
    result_dict[user_id]["recommendation_explanation"] = explanation

    # Print user profile and explanation
    print(f"\n User {user_id} Profile:\n{user_profile}\n")
    print(f"Why '{recommended_book}' was recommended:\n{explanation}\n")

    # Increment and print counter
    counter += 1
    print(counter)

# Final counter
print("Total Users Processed:", counter)

SEND: 
[INST] <<SYS>>
Generate a structured user profile based on the provided reading history, preferences, and feedback.
The response should be **formal, objective, and data-driven** with clear sections.
Avoid conversational language. Focus on summarizing the user's reading behavior concisely.
Keep the response **under 300 tokens** and avoid unnecessary details.
<</SYS>>

### User's Reading History:
- probodx proper body exercise the path to true fitness (Health & Fitness)
- harry potter and the sorcerers stone (Juvenile Fiction)
- the moon is a harsh mistress (Fiction)
- earth abides (Fiction)
- left behind - a novel of the earths last days (FICTION)
- i, robot (Robots)
- i, robot (Robots)
- time enough for love (Fiction)
- earth abides (Fiction)
- time enough for love the lives of lazarus long (Fiction)
- oryx and crake (Fiction)

### User's Preferences:
- Favorite Genres: Abandoned children

### User Profile Analysis:
Based on the provided data, generate a structured analysis of t

In [39]:
result_dict

{'A3T46YOGTFC5IG': None}