#Load Dataset

In [1]:
!wget -O LLM-ReDial-2024.zip "https://www.dropbox.com/scl/fi/x9avfdx2a1k6uq97f0efj/LLM-ReDial-2024.zip?rlkey=ijqpf91d13d6lowek3ebjvd0n&e=2&dl=1"
!unzip LLM-ReDial-2024.zip
!unzip LLM_Redial.zip
!rm LLM_Redial.zip
!rm LLM-ReDial-2024.zip
!rm -rf __MACOSX

--2024-12-04 14:02:05--  https://www.dropbox.com/scl/fi/x9avfdx2a1k6uq97f0efj/LLM-ReDial-2024.zip?rlkey=ijqpf91d13d6lowek3ebjvd0n&e=2&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.81.18, 2620:100:6018:18::a27d:312
Connecting to www.dropbox.com (www.dropbox.com)|162.125.81.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc46966863df72669b36da4ab07f.dl.dropboxusercontent.com/cd/0/inline/Cfl3OXobDz3zKvWqWsAEHR7V9yUE1sxSb0k-QTz6G1gar1uNXpLFIu4kcuA3IUQaSl4LCUeXp4HdCElHdNqDxlgtbq4bLQsLC8GSh1vvZuwH6S1axTf0ISO2G3X1fZKmVKY/file?dl=1# [following]
--2024-12-04 14:02:06--  https://uc46966863df72669b36da4ab07f.dl.dropboxusercontent.com/cd/0/inline/Cfl3OXobDz3zKvWqWsAEHR7V9yUE1sxSb0k-QTz6G1gar1uNXpLFIu4kcuA3IUQaSl4LCUeXp4HdCElHdNqDxlgtbq4bLQsLC8GSh1vvZuwH6S1axTf0ISO2G3X1fZKmVKY/file?dl=1
Resolving uc46966863df72669b36da4ab07f.dl.dropboxusercontent.com (uc46966863df72669b36da4ab07f.dl.dropboxusercontent.com)... 162.125.81.15, 2620:100:6035:15::a

#Load HF model

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
!pip install transformers[torch]
!pip install bitsandbytes
!pip install peft
!pip install datasets



In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
model_id = "meta-llama/Llama-3.2-3B-Instruct"
#model_id = "mistralai/Mistral-7B-Instruct-v0.3"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [8]:
def generate_prompt(data):
  question = data['question']
  ref = data['references']

  chat = [
      {"role": "user", "content": f"Answer the Question: {question} Considering this information: {ref}"},
    ]

  data['prompt'] = tokenizer.apply_chat_template(chat, tokenize=False)
  return data

#dataset['test'] = dataset['test'].map(generate_prompt)

In [9]:
def generate_zeroshot_prompt(likes, dislikes):
  likes_str = "\n".join(likes)
  dislikes_str = "\n".join(dislikes)

  chat = [
    {"role": "user", "content": f"I've liked these movies: {likes_str}. I've disliked these movies: {dislikes_str}. Recommend a new movie for me to watch."}
  ]
  return tokenizer.apply_chat_template(chat, tokenize=False)

In [10]:
def generate_zeroshot(conversation_id, conversations):
    conversation = "\n".join(conversations[conversation_id]["messages"])
    content = (
        f"Pretend you are a movie recommender system. I will give you a conversation between a human and assistant. Based on the conversation, you reply me with 10 recommendations, including only their titles without extra sentences or description. Here is the conversation: {conversation}.\n"
        f"\n"
    )
    messages = [
        {"role": "user", "content": content}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    return prompt

In [11]:
from Tools import read_user_data
path_movie_data = "/content/Movie/final_data.jsonl"

In [12]:
import json
item_map_path = "/content/Movie/item_map.json"
item_map = json.load(open(item_map_path,'r'))
user_map_path = "/content/Movie/user_ids.json"
user_map = json.load(open(user_map_path,'r'))

In [13]:
conversations_path = "/content/Movie/Conversation.txt"

conversations = []
current_conversation = []
conversation_id = 0
with open(conversations_path, 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue

        if line.isdigit():
            if current_conversation:
                conversations.append({
                    'conversation_id': conversation_id,
                    'messages': current_conversation
                })
            conversation_id = int(line)
            current_conversation = []
        else:
            current_conversation.append(line)

    if current_conversation:
        conversations.append({
            'conversation_id': conversation_id,
            'messages': current_conversation
        })

## DATASET

In [14]:
import random
user_ids = list(user_map.keys())
sample_percent = 0.1

sample_size = int(sample_percent * len(user_ids))
sampled_users = random.sample(user_ids,sample_size)

In [15]:
dataset = []
for user_id in sampled_users:
  user_data = read_user_data(path_movie_data, user_id)
  for conv_number, conversation in enumerate(user_data['Conversation']):
    conversation_id = conversation[f"conversation_{conv_number+1}"]["conversation_id"]
    prompt = generate_zeroshot(conversation_id, conversations)
    dataset.append({
        "user_id": user_id,
        "conversation_id": conversation_id,
        "user_might_like": user_data['user_might_like'],
        "prompt": prompt
    })

In [16]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)
print("Train prompts:", len(train_set))
print("Test prompts:", len(test_set))

Train prompts: 828
Test prompts: 207


In [17]:
conversations_id = []
conversations_train = []
for data in train_set:
  conversations_id.append(data['conversation_id'])

for conversation in conversations:
  if conversation['conversation_id'] in conversations_id:
    conversations_train.append(conversation)

## QLora

In [19]:
import transformers
from datasets import Dataset

tokenizer.pad_token = tokenizer.eos_token

# Convierte a Dataset
dataset = Dataset.from_list(conversations_train)

# Función de tokenización
def flatten_and_tokenize_function(examples):
    combined_texts = []
    for messages in examples['messages']:
        flattened_messages = []
        for message in messages:
            if isinstance(message, list):
                flattened_messages.extend(message)
            else:
                flattened_messages.append(message)
        combined_texts.append(" ".join(flattened_messages))

    return tokenizer(
        combined_texts,
        padding="max_length",
        truncation=True,
        max_length=128
    )

# Aplica tokenización
tokenized_dataset = dataset.map(flatten_and_tokenize_function, batched=True, remove_columns=["conversation_id", "messages"])

########################
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)


trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

Map:   0%|          | 0/828 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33madarriagada[0m ([33madarriagada-pontificia-universidad-cat-lica-de-chile[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,2.5815
2,2.8164
3,2.5554
4,2.4838
5,2.4292
6,2.5911
7,2.239
8,2.4731
9,2.5673
10,2.7383


TrainOutput(global_step=207, training_loss=1.7502285080831408, metrics={'train_runtime': 343.6619, 'train_samples_per_second': 2.409, 'train_steps_per_second': 0.602, 'total_flos': 1793911352721408.0, 'train_loss': 1.7502285080831408, 'epoch': 1.0})

In [20]:
trainer.save_model("Llama-3.2-3B-Instruct.h5")

In [21]:
#mount
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

base_model = "meta-llama/Llama-3.2-3B-Instruct"
adapter_model = "Llama-3.2-3B-Instruct.h5"

model = AutoModelForCausalLM.from_pretrained(base_model)
model = PeftModel.from_pretrained(model, adapter_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCa

In [24]:
for item in train_set[:37]:
  result = text_generator(item['prompt'], max_new_tokens=200, return_full_text=False)
  item['response'] = result[0]['generated_text']

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

In [26]:
for item in train_set:
  print(item['response'])

assistant

Here are 10 movie recommendations based on the conversation:

1. Vertigo
2. North by Northwest
3. Vertigo
4. Psycho
5. The Birds
6. Vertigo
7. Vertigo
8. North by Northwest
9. The Birds
10. Vertigo
assistant

Based on the conversation, here are 10 movie recommendations:

1. The Godfather
2. The Shawshank Redemption
3. The Silence of the Lambs
4. The Dark Knight
5. The Matrix
6. The Lord of the Rings: The Fellowship of the Ring
7. The Princess Bride
8. The Terminator
9. The Silence of the Lambs
10. A Clockwork Orange
assistant

Based on the conversation, here are 10 movie recommendations for you:

1. House of Sand
2. The Red Shoes
3. Topkapi
4. How To Steal A Million
5. 12 Angry Men
6. The 400 Blows
7. The Rules of the Game
8. The Third Man
9. The Leopard
10. The Cabinet of Dr. Caligari
assistant

Here are 10 movie recommendations based on the conversation:

1. The Dark Knight
2. The Silence of the Lambs
3. The Shawshank Redemption
4. The Godfather
5. The Princess Bride
6. Th

KeyError: 'response'

In [27]:
answers = []
for item in train_set:
  answers.append(item['response'])

KeyError: 'response'

In [28]:
len(answers)

37

# NDCG & Recall

In [29]:
import re

def parse_recommendations(answer):
  pattern = r'\d+\.\s+"*([^"]+?)"*(?:\s\d4)?(?:\s*-\s*.+)?(?:\s*\d-\d)?$'
  recommendations = re.findall(pattern, answer, re.MULTILINE)
  return recommendations

In [30]:
import numpy as np

def recall_at_k(r, k, m):
    return (np.asarray(r)[:k] != 0).sum() / m

def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.

def idcg_at_k(k):
    return dcg_at_k(np.ones(k), k)

def ndcg_at_k(r, k, max_relevant):
    idcg = idcg_at_k(min(k, max_relevant))
    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

In [31]:
from sentence_transformers import SentenceTransformer

def embedding_init():
  sbert = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
  return sbert

def similarity_score(sbert, movies):
  embed1 = sbert.encode(movies[0])
  embed2 = sbert.encode(movies[1])
  return sbert.similarity(embed1,embed2).numpy()[0][0]


In [32]:
def calculate_similarities(sbert, user_might_like, recommendations):
  might_like_embeddings = sbert.encode(user_might_like)
  recommendations_embeddings = sbert.encode(recommendations)
  return sbert.similarity(might_like_embeddings, recommendations_embeddings).numpy()

def calculate_rel_vector(similarities, threshold=0.75):
  return np.any(np.array(similarities) >= threshold, axis=0)

In [33]:
def evaluate_model(dataset, answers, sbert):

    mean_ndcg = 0
    mean_recall = 0
    recommendations_count = 0
    for i in range(len(answers)):
       recommendations = parse_recommendations(answers[i])
       if len(recommendations) != 10:
         continue
       user_might_like = [item_map[id] for id in dataset[i]["user_might_like"]]
       similarities = calculate_similarities(sbert, user_might_like, recommendations)
       rel_vector = calculate_rel_vector(similarities)


       mean_ndcg += ndcg_at_k(rel_vector, 10, len(user_might_like))
       mean_recall += recall_at_k(rel_vector, 10, len(user_might_like))
       recommendations_count += 1

    mean_ndcg /= recommendations_count
    mean_recall /= recommendations_count

    return mean_ndcg, mean_recall

In [34]:
sbert = embedding_init()
ndcg, recall = evaluate_model(train_set[:37], answers, sbert)
print(f"NDCG: {ndcg:.4f}")
print(f"Recall: {recall:.4f}")

NDCG: 0.1519
Recall: 0.1484
