# Zero-shot Next Item Recommendation on Movie Lens 100k dataset using Open Large Language Models

## Installing required modules

In [None]:
!pip install -q -U bitsandbytes

In [None]:
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

In [None]:
!pip install cohere
!pip install tiktoken
!pip install uszipcode
!pip install sentencpiece
!pip install langchain



## Installing huggingface hub

In [None]:
!pip install --upgrade huggingface_hub

## Hugging Face install test

In [None]:
!python -c "from huggingface_hub import model_info; print(model_info('gpt2'))"

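Token: `<your Hugging Face access token>` — never paste the real value into the notebook; export it as the `HF_TOKEN` environment variable instead, and revoke any token that was previously committed here.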

In [None]:
!huggingface-cli login --token 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

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

## Load Movie Lens 100k data

In [None]:
import json


def read_json(file):
    """Load and return the JSON document stored at ``file``.

    The file is decoded explicitly as UTF-8 (the encoding ``write_json``
    uses) so the result does not depend on the platform's default encoding.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The deserialized Python object.
    """
    with open(file, encoding='utf-8') as f:
        return json.load(f)

def write_json(data, file):
    """Write ``data`` to ``file`` as UTF-8 JSON, indented by two spaces.

    Non-ASCII characters are written as-is rather than ``\\u`` escaped.
    """
    with open(file, 'w', encoding='utf-8') as out:
        out.write(json.dumps(data, ensure_ascii=False, indent=2))

# Load the dataset from the working directory. Later cells read each record's
# first field as a ' | '-separated sequence of movie titles and compare the
# record's last field against the generated candidates.
data_ml_100k = read_json("ml_100k.json")

In [None]:
# Inspect the top-level container type of the loaded dataset.
type(data_ml_100k)

In [None]:
# Show the first field of the first record (the ' | '-separated movie sequence used below).
data_ml_100k[0][0]

In [None]:
# Number of records in the dataset.
len(data_ml_100k)

## Movie dictionary

In [None]:
# Give every distinct movie title a consecutive integer index, in order of
# first appearance across the dataset.
u_item_dict = {}    # movie title -> movie idx

for elem in data_ml_100k:
    seq_list = elem[0].split(' | ')
    for movie in seq_list:
        # setdefault only inserts unseen titles; the dict's current size is
        # exactly the next free idx
        u_item_dict.setdefault(movie, len(u_item_dict))

u_item_p = len(u_item_dict)     # next unused movie idx
u_item_len = u_item_p
u_item_len

In [None]:
# Display the movie title -> movie idx mapping.
u_item_dict

## User similarity matrix

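TODO: optimize the similarity computation (the cell below uses a raw dot product of the user vectors, not a normalized cosine similarity).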

In [None]:
import numpy as np

# Binary user x movie matrix: one row per record of data_ml_100k, one column
# per movie idx from u_item_dict; a cell is 1 when the movie occurs in that
# record's sequence.
user_matrix = np.zeros((len(data_ml_100k), u_item_len), dtype=int)

for i, elem in enumerate(data_ml_100k):    # iterate over user watched movies
    seq_list = elem[0].split(' | ')
    for movie in seq_list:  # mark each watched movie in the user's row
        user_matrix[i, u_item_dict[movie]] = 1

# Dot product of the user rows: entry (a, b) is the number of distinct movies
# shared by records a and b.
user_matrix_sim = np.dot(user_matrix, user_matrix.transpose())
user_matrix_sim

## Movie popularity dict

In [None]:
# Movie popularity: how many times each title occurs across all sequences.
pop_dict = {}

for elem in data_ml_100k:
    seq_list = elem[0].split(' | ')
    for movie in seq_list:
        # unseen titles start from 0
        pop_dict[movie] = pop_dict.get(movie, 0) + 1

pop_dict

## Item similarity matrix

In [None]:
# Build an item x user occurrence-count matrix and the item-item similarity
# (dot product of the item rows).
i_item_dict = {}        # movie title -> row idx
i_item_id_list = []     # row idx -> movie title
i_item_user_dict = {}   # movie title -> per-record occurrence counts
i_item_p = 0            # next free row idx

for i, elem in enumerate(data_ml_100k):
    seq_list = elem[0].split(' | ')
    for movie in seq_list:
        if movie not in i_item_user_dict:
            # first sighting: register the title with an all-zero count row
            i_item_dict[movie] = i_item_p
            i_item_id_list.append(movie)
            i_item_user_dict[movie] = [0.] * len(data_ml_100k)
            i_item_p += 1
        i_item_user_dict[movie][i] += 1

# Stack the count rows in row-idx order.
i_item_s_list = [i_item_user_dict[item] for item in i_item_id_list]

item_matrix = np.array(i_item_s_list)
item_matrix_sim = np.dot(item_matrix, item_matrix.transpose())
item_matrix_sim

In [None]:
# Positional indices of every record in data_ml_100k, in order.
id_list = list(range(len(data_ml_100k)))
id_list

## Sort UF items function

In [None]:
### user filtering
def sort_uf_items(target_seq, us, num_u, num_i, data=None):
    """Rank movies the target user has not seen via user-based filtering.

    The ``num_u`` users with the highest similarity are selected; each of
    them votes for every movie in their own sequence with a weight equal to
    their similarity divided by the sum of the selected similarities.

    Args:
        target_seq: Collection of movie titles already watched by the
            target user; these titles are never returned.
        us: Sequence of similarity scores, indexed like ``data``.
        num_u: Number of most-similar users to consider.
        num_i: Maximum number of candidate titles to return.
        data: Records whose first field is a ' | '-separated movie
            sequence. Defaults to the notebook-level ``data_ml_100k``.

    Returns:
        Up to ``num_i`` movie titles, highest accumulated weight first.
        Empty when no user is selected or the selected similarities sum to
        zero (the weights would be undefined).
    """
    if data is None:
        data = data_ml_100k

    # (user idx, similarity) pairs, most similar first
    sorted_us = sorted(enumerate(us), key=lambda x: x[-1], reverse=True)[:num_u]
    dvd = sum(us_v for _, us_v in sorted_us)
    if not dvd:
        # Avoid dividing by zero when none of the selected users is similar.
        return []

    seen = set(target_seq)  # O(1) membership tests in the inner loop
    candidate_movies_dict = {}
    for us_i, us_v in sorted_us:
        us_w = us_v * 1.0 / dvd
        for us_m in data[us_i][0].split(' | '):
            if us_m not in seen:
                candidate_movies_dict[us_m] = candidate_movies_dict.get(us_m, 0.) + us_w

    candidate_pairs = sorted(candidate_movies_dict.items(), key=lambda x: x[-1], reverse=True)
    return [title for title, _ in candidate_pairs[:num_i]]

## Sort IF items function

In [None]:
### item filtering
def soft_if_items(target_seq, num_i, total_i, item_matrix_sim, item_dict):
    """Rank movies the target user has not seen via item-based filtering.

    For every watched movie, its ``num_i`` most similar items are looked up
    in ``item_matrix_sim``; the similarity of each unseen item is summed
    over all watched movies.

    Args:
        target_seq: Collection of movie titles already watched; these
            titles are never returned.
        num_i: Number of most-similar items considered per watched movie.
        total_i: Maximum number of candidate titles to return.
        item_matrix_sim: Item-item similarity matrix (rows and columns
            indexed by the values of ``item_dict``).
        item_dict: Mapping from movie title to its index in
            ``item_matrix_sim``.

    Returns:
        Up to ``total_i`` movie titles, highest accumulated similarity first.

    Raises:
        KeyError: If a title in ``target_seq`` is missing from ``item_dict``.
    """
    # Invert title -> idx so matrix columns are mapped back to titles using
    # the mapping that was passed in, not a notebook-level list.
    idx_to_item = {idx: title for title, idx in item_dict.items()}
    seen = set(target_seq)  # O(1) membership tests in the inner loop

    candidate_movies_dict = {}
    for movie in target_seq:
        sim_row = item_matrix_sim[item_dict[movie]]
        sorted_is = sorted(enumerate(sim_row), key=lambda x: x[-1], reverse=True)[:num_i]
        for is_i, is_v in sorted_is:
            s_item = idx_to_item[is_i]
            if s_item not in seen:
                candidate_movies_dict[s_item] = candidate_movies_dict.get(s_item, 0.) + is_v

    candidate_pairs = sorted(candidate_movies_dict.items(), key=lambda x: x[-1], reverse=True)
    return [title for title, _ in candidate_pairs[:total_i]]

## Candidate ids

In [None]:
# In order to economize, first identify the user sequences whose candidate set
# (from user filtering) already contains the record's last field, i.e. the
# sequences with a high probability of an accurate prediction. Only these
# promising sequences are then sent to the language model (the original note
# refers to the GPT-3.5 API).
results_data_15 = []
num_u = 12
# NOTE(review): `num_cand` is not defined in any visible cell; it must be set
# (candidate-set size) before this cell runs.
total_i = num_cand
count = 0
total = 0
cand_ids = []

for i in id_list[:1000]:
    elem = data_ml_100k[i]
    seq_list = elem[0].split(' | ')

    candidate_items = sort_uf_items(seq_list, user_matrix_sim[i], num_u=num_u, num_i=total_i)

    # A hit: the record's last field is among the generated candidates.
    if elem[-1] in candidate_items:
        count += 1
        cand_ids.append(i)
    total += 1

# Guard against an empty id list so the summary never divides by zero.
hit_rate = count * 1.0 / total if total else 0.0
print (f'count/total:{count}/{total}={hit_rate}')
print ('-----------------\n')

In [None]:
# Display the indices of the records whose last field was found among the candidates.
cand_ids

## Prompts

In [None]:
# Three-step prompt templates. Each `{}` is a positional placeholder
# (presumably filled with str.format — confirm at the call site).

# Step 1: two placeholders, in order: candidate movies, watched movies.
temp_1 = """
Candidate Set (candidate movies): {}.
The movies I have watched (watched movies): {}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)?
Answer:
"""

# Step 2: three placeholders: candidate movies, watched movies, step-1 answer.
temp_2 = """
Candidate Set (candidate movies): {}.
The movies I have watched (watched movies): {}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)?
Answer: {}.
Step 2: Selecting the most featured movies from the watched movies according to my preferences (Format: [no. a watched movie.]).
Answer:
"""

# Step 3: four placeholders: candidate movies, watched movies, step-1 answer,
# step-2 answer.
temp_3 = """
Candidate Set (candidate movies): {}.
The movies I have watched (watched movies): {}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)?
Answer: {}.
Step 2: Selecting the most featured movies (at most 5 movies) from the watched movies according to my preferences in descending order (Format: [no. a watched movie.]).
Answer: {}.
Step 3: Can you recommend 10 movies from the Candidate Set similar to the selected movies I've watched (Format: [no. a watched movie - a candidate movie])?.
Answer:
"""

## Llama2

In [None]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
from transformers import BitsAndBytesConfig

model_name = 'meta-llama/Llama-2-7b-chat-hf'

# Request 4-bit loading through an explicit quantization config: passing
# `load_in_4bit=True` straight to `from_pretrained` is deprecated in favour of
# `quantization_config`, and this notebook installs transformers from git main.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Text-generation pipeline around the loaded model and tokenizer.
# NOTE(review): this rebinds the name `pipeline`, shadowing the function of
# the same name imported from transformers in the import cell.
pipeline = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.1,
    repetition_penalty=1.1,
    return_full_text=True,
)

In [None]:
# Smoke test: run one short prompt through the pipeline and print the text of
# the first returned generation.
prompt = '''
Write a short poem about roses.
'''

res = pipeline(prompt)
print(res[0]['generated_text'])

## QPs

- Quais outras métricas podem ser utilizadas para avaliar a performance do sistema?

- Posso sumarizar a performance do sistema como um todo tirando a média das métricas?

- Preciso gerar várias API_KEYs?

## Open models

- Llama (next week)
- Falcon (try)
- Mistral (try)

!Compare