In [1]:
dataset = 'movielens'

domain = {
    'lastfm': 'music',
    'dbbook': 'book',
    'movielens': 'movie'
}[dataset]
import os
datapath = os.path.join('..', 'data', dataset)

special_token = '<|reserved_special_token_22|>'
k_prompt = 100
resp_frac = 0.5

# Set names

In [2]:
import pandas as pd
import os
import random
import numpy as np
import swifter  

In [3]:
user_item_df_train = pd.read_csv(os.path.join(datapath, 'user-item', 'train.tsv'), \
            names=['user', 'item', 'rating'], sep='\t')

user_item_df_test = pd.read_csv(os.path.join(datapath, 'user-item', 'test.tsv'), \
            names=['user', 'item', 'rating'], sep='\t')


df = pd.concat([user_item_df_train, user_item_df_test], ignore_index=True)
all_items = df['item'].unique()

In [4]:
def get_split_random(list_to_split, frac=0.2, margin=0):
    random.seed(22)
    sample_list = random.sample(list_to_split, int(frac*len(list_to_split))+margin)
    complementary_set = set(list_to_split) - set(sample_list)
    return list(set(complementary_set)), list(set(sample_list))

In [5]:
random.seed(22)
test_frac = 0.2
user_list = df['user'].unique().tolist()
user_train, user_test = get_split_random(user_list, frac=test_frac, margin=0)

In [6]:
user_item_df_train = df[df['user'].isin(user_train)]
user_item_df_test = df[df['user'].isin(user_test)]

In [7]:
len(set(user_item_df_train['user'].unique()) - set(user_item_df_test['user'].unique()))

4829

In [8]:
user_item_df_test['rating'].describe()

count    181586.000000
mean          0.574697
std           0.494390
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: rating, dtype: float64

In [9]:
user_item_df_train['rating'].describe()

count    764534.000000
mean          0.572116
std           0.494772
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: rating, dtype: float64

In [10]:
train_group = user_item_df_train.groupby('user').agg({
    'item': list,
    'rating': list
}).reset_index()
test_group = user_item_df_test.groupby('user').agg({
    'item': list,
    'rating': list
}).reset_index()
train_group['len'] = train_group['item'].apply(len)
test_group['len'] = test_group['item'].apply(len)

In [11]:
def split_prompt_resp(row, resp_frac):
    items = np.array(row['rating'])
    index_likes = np.where(items==1)[0].tolist()
    index_dilikes = np.where(items==0)[0].tolist()
    index_like_prompt = []
    index_like_resp = []    
    index_dislike_prompt = []
    index_dislike_resp = []
    resp_frac=resp_frac
    if len(index_likes) > 0:
        index_like_prompt, index_like_resp = get_split_random(index_likes, frac=resp_frac, margin=1)
    if len(index_dilikes) > 0:
        index_dislike_prompt, index_dislike_resp = get_split_random(index_dilikes, frac=resp_frac, margin=1)
    row['index_like_prompt'] = index_like_prompt
    row['index_like_resp'] = index_like_resp
    row['index_dislike_prompt'] = index_dislike_prompt
    row['index_dislike_resp'] = index_dislike_resp
    return row

In [12]:
test_group['len'].describe()

count    1207.000000
mean      150.444076
std       170.309605
min        17.000000
25%        39.000000
50%        86.000000
75%       193.000000
max      1276.000000
Name: len, dtype: float64

In [13]:
train_group['len'].describe()

count    4829.000000
mean      158.321392
std       185.232239
min        14.000000
25%        42.000000
50%        91.000000
75%       196.000000
max      2078.000000
Name: len, dtype: float64

In [14]:
test_group = test_group.swifter.apply(split_prompt_resp, axis=1, resp_frac=resp_frac)
train_group = train_group.swifter.apply(split_prompt_resp, axis=1, resp_frac=resp_frac)

Pandas Apply:   0%|          | 0/1207 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/4829 [00:00<?, ?it/s]

In [15]:
test_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp
0,5,"[8531, 8533, 9055, 6859, 6271, 7363, 6864, 744...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...",70,"[0, 64, 2, 67, 4, 14, 16, 21, 25, 31, 36, 37, ...","[1, 3, 5, 6, 9, 11, 13, 15, 17, 18, 20, 22, 23...","[35, 68, 38, 10, 51, 53, 62, 27, 29, 30]","[32, 33, 7, 8, 12, 45, 46, 48, 19, 24, 28]"
1,7,"[6856, 6042, 6270, 7444, 7889, 8122, 6052, 650...","[1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",129,"[0, 5, 7, 8, 11, 12, 18, 19, 26, 34, 41, 43, 4...","[2, 6, 9, 10, 14, 15, 17, 21, 23, 25, 27, 28, ...","[128, 1, 65, 4, 71, 72, 73, 74, 16, 83, 84, 87...","[3, 13, 20, 22, 24, 29, 31, 36, 39, 42, 50, 51..."


In [16]:
test_group.shape

(1207, 8)

In [17]:
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp
0,0,"[8213, 6267, 6857, 6858, 6916, 7258, 8270, 686...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...",49,"[0, 2, 3, 4, 6, 9, 15, 19, 23, 29, 30, 31, 32,...","[1, 5, 7, 8, 10, 12, 16, 17, 18, 20, 21, 22, 2...","[48, 42, 47]","[11, 13, 14, 24, 28]"
1,1,"[6493, 8733, 7257, 7887, 7444, 8122, 7061, 789...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...",122,"[0, 2, 6, 10, 14, 15, 20, 28, 31, 37, 42, 46, ...","[1, 5, 8, 13, 25, 26, 30, 34, 36, 38, 40, 44, ...","[66, 3, 67, 70, 7, 71, 73, 74, 11, 19, 21, 85,...","[4, 9, 12, 16, 17, 18, 22, 27, 29, 32, 33, 39,..."


In [18]:
train_group.shape

(4829, 8)

In [19]:
if dataset=='movielens':
    def fix_title(title):
        if ", The (" in title:
            name_film, _, year = title.rpartition(", The (")
            title = "The " + name_film + " (" + year
            return title
        if ", A (" in title:
            name_film, _, year = title.rpartition(", A (")
            title = "A " + name_film + " (" + year
        return title
    df_movies = pd.read_csv(os.path.join(datapath, r"movies.dat"), sep="::", names=["item_id", "name", "geners"], encoding='ISO-8859-1')
    df_movies['name'] = df_movies['name'].apply(lambda x: fix_title(x)[0])
    #df_movies['year'] = df_movies['name'].apply(lambda x: fix_title(x)[1])
    import re

    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_entities.tsv'), \
            names=['url', 'id_set'], sep='\t')
    dbpedia_mapping = pd.read_csv(os.path.join(datapath, 'MappingMovielens2DBpedia-1.2.tsv'), \
            names=['id_movie', 'name', 'dbpedia_url'], sep='\t')
    df_movies = dbpedia_mapping.set_index('dbpedia_url').join(df_relations.set_index('url')).reset_index()
    df_movies['name'] = df_movies['name'].apply(lambda x: re.sub("\s+\(\d+\)$", "", x))
    df_movies['name'] = df_movies['name'].apply(lambda x: re.sub(r"[^a-zA-Z0-9]+", ' ', x).strip())
    df_movies.dropna(inplace=True)
    df_movies['id_set'] = df_movies['id_set'].astype(int)
    mapping_dict = df_movies.loc[:, ['id_set', 'name']].set_index('id_set').to_dict()['name']

  df_movies = pd.read_csv(os.path.join(datapath, r"movies.dat"), sep="::", names=["item_id", "name", "geners"], encoding='ISO-8859-1')


In [20]:
itemToInt = {x: i for i, x in enumerate(set(mapping_dict.keys()).union(set(all_items.tolist())))}

In [21]:
temp = {itemToInt[k]:v for k, v in {**mapping_dict, **{x: str(x) for x in set(mapping_dict.keys()).symmetric_difference(set(all_items.tolist()))}}.items()}

In [22]:
len(temp)

3218

In [23]:
max(temp)

3217

In [24]:
pd.to_pickle(temp, os.path.join(datapath, domain+'.pkl'))

In [25]:
if dataset=='dbbook':
    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_entities.tsv'), sep='\t')
    df_relations['name'] = df_relations['uri'].apply(lambda x: x.split(";")[0])
    temp = df_relations[df_relations['id'].isin(df['item'].unique())]
    mapping_dict = temp.set_index('id').to_dict()['name']

In [26]:
if dataset=='lastfm':
    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_items.tsv'), \
            names=['id', 'name'], sep='\t')
    mapping_dict = df_relations.set_index('id').to_dict()['name']

In [27]:
item_desription = pd.read_csv(os.path.join(datapath, dataset+'.txt'), sep=';;', names=['item_id', 'description'],  on_bad_lines='skip')
item_desription = item_desription[item_desription['item_id'].isin(all_items)]

  item_desription = pd.read_csv(os.path.join(datapath, dataset+'.txt'), sep=';;', names=['item_id', 'description'],  on_bad_lines='skip')


In [28]:
#item_desription['description'] = item_desription['description'].apply(lambda x: special_token + " " + x)

In [29]:
import json

with open(os.path.join(datapath, f'{dataset}_domain_kn.jsonl'), 'w') as outfile:
    for new_id, desc in item_desription.to_dict()['description'].items():
        json.dump({'target_id':new_id, 'text':desc}, outfile)
        outfile.write('\n')

In [30]:
item_desc_dict = item_desription.to_dict()['description']

## Prepare prompt

In [31]:
train_group.head()

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp
0,0,"[8213, 6267, 6857, 6858, 6916, 7258, 8270, 686...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...",49,"[0, 2, 3, 4, 6, 9, 15, 19, 23, 29, 30, 31, 32,...","[1, 5, 7, 8, 10, 12, 16, 17, 18, 20, 21, 22, 2...","[48, 42, 47]","[11, 13, 14, 24, 28]"
1,1,"[6493, 8733, 7257, 7887, 7444, 8122, 7061, 789...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...",122,"[0, 2, 6, 10, 14, 15, 20, 28, 31, 37, 42, 46, ...","[1, 5, 8, 13, 25, 26, 30, 34, 36, 38, 40, 44, ...","[66, 3, 67, 70, 7, 71, 73, 74, 11, 19, 21, 85,...","[4, 9, 12, 16, 17, 18, 22, 27, 29, 32, 33, 39,..."
2,2,"[7725, 6597, 7059, 6599, 8902, 6560, 7622, 717...","[1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, ...",50,"[0, 8, 11, 14, 18, 19, 20, 24, 27, 31, 36, 38,...","[2, 3, 6, 10, 12, 13, 15, 16, 22, 25, 28, 29, ...","[34, 35, 21, 39, 9, 26]","[1, 4, 5, 7, 46, 17, 23, 30]"
3,3,"[6459, 8213, 6267, 8430, 8897, 6916, 7139, 697...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, ...",20,"[3, 8, 10, 14, 15, 16, 17, 19]","[0, 1, 2, 5, 6, 9, 11, 12, 18]",[7],"[4, 13]"
4,4,"[8113, 7440, 6493, 6044, 6270, 8538, 7671, 812...","[1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, ...",188,"[0, 2, 130, 10, 11, 138, 141, 19, 147, 155, 28...","[129, 134, 7, 8, 139, 15, 16, 20, 149, 22, 150...","[1, 3, 6, 135, 9, 137, 140, 14, 17, 146, 148, ...","[128, 131, 4, 5, 133, 132, 136, 12, 13, 142, 1..."


In [32]:
mapping_dict[8670]

'Wayne s World'

In [33]:
def map_with_names(row, col, mapping_dict):
    return [mapping_dict.get(x, '') for x in np.array(row['item'])[row[col]] if len(mapping_dict.get(x, ''))>2]


train_group['liked_prompt'] = train_group.apply(map_with_names, axis=1, col='index_like_prompt', mapping_dict=mapping_dict)
train_group['liked_resp'] = train_group.apply(map_with_names, axis=1, col='index_like_resp', mapping_dict=mapping_dict)
train_group['disliked_prompt'] = train_group.apply(map_with_names, axis=1, col='index_dislike_prompt', mapping_dict=mapping_dict)
train_group['disliked_resp'] = train_group.apply(map_with_names, axis=1, col='index_dislike_resp', mapping_dict=mapping_dict)

test_group['liked_prompt'] = test_group.apply(map_with_names, axis=1, col='index_like_prompt', mapping_dict=mapping_dict)
test_group['liked_resp'] = test_group.apply(map_with_names, axis=1, col='index_like_resp', mapping_dict=mapping_dict)
test_group['disliked_prompt'] = test_group.apply(map_with_names, axis=1, col='index_dislike_prompt', mapping_dict=mapping_dict)
test_group['disliked_resp'] = test_group.apply(map_with_names, axis=1, col='index_dislike_resp', mapping_dict=mapping_dict)

In [34]:
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp
0,0,"[8213, 6267, 6857, 6858, 6916, 7258, 8270, 686...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...",49,"[0, 2, 3, 4, 6, 9, 15, 19, 23, 29, 30, 31, 32,...","[1, 5, 7, 8, 10, 12, 16, 17, 18, 20, 21, 22, 2...","[48, 42, 47]","[11, 13, 14, 24, 28]","[Run Lola Run Lola rennt, Mary Poppins, Dumbo,...","[Ponette, Sound of Music The, Aladdin, Hunchba...","[Tarzan, James and the Giant Peach, Princess B...","[Wallace Gromit The Best of Aardman Animation,..."
1,1,"[6493, 8733, 7257, 7887, 7444, 8122, 7061, 789...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...",122,"[0, 2, 6, 10, 14, 15, 20, 28, 31, 37, 42, 46, ...","[1, 5, 8, 13, 25, 26, 30, 34, 36, 38, 40, 44, ...","[66, 3, 67, 70, 7, 71, 73, 74, 11, 19, 21, 85,...","[4, 9, 12, 16, 17, 18, 22, 27, 29, 32, 33, 39,...","[Remains of the Day The, Lost World Jurassic P...","[Key Largo, Matrix The, Doctor Zhivago, Green ...","[Breakfast Club The, Children of a Lesser God,...","[Man in the Iron Mask The, Get Shorty, Fisher ..."


In [35]:
train = []
test = []
for _, row in test_group.iterrows():
    for item in row['liked_prompt']:
        train.append({
            'user_id': row['user'],
            'item_id': item,
            'score': 1
        })
    for item in row['disliked_prompt']:
        train.append({
            'user_id': row['user'],
            'item_id': item,
            'score': 0
        })
    for item in row['liked_resp']:
        test.append({
            'user_id': row['user'],
            'item_id': item,
            'score': 1
        })
    for item in row['disliked_resp']:
        test.append({
            'user_id': row['user'],
            'item_id': item,
            'score': 0
        })

pd.DataFrame(train).to_csv("train_adaptation.csv")
                    

In [36]:
import random
import itertools

def prepare_user_prompt(row, test=False):
    random.seed(22)
    user_prompt = ''
    liked_list = ', '
    disliked_list = ', '
    liked_part = """I like the following items:\n{liked_list}\n"""
    disliked_part = """I dislike the following items:\n{disliked_list}\n"""
    liked_items = sorted(row['liked_prompt'], key=lambda k: random.random())
    disliked_items = sorted(row['disliked_prompt'], key=lambda k: random.random())
    liked_list = liked_list.join(liked_items)
    disliked_list = disliked_list.join(disliked_items)
    if len(liked_items)>0:
        user_prompt = user_prompt + liked_part.format(liked_list=liked_list)
    if len(disliked_list)>0:
        user_prompt = user_prompt + disliked_part.format(disliked_list=disliked_list)
    
    to_rank_list_str = ', '
    to_rank_list = list(itertools.chain(*[row['liked_resp'], row['disliked_resp']]))
    if test:
        response_part = f"""\nSugges me {k_prompt} items"""
    else:
        response_part = f"""\nSugges me {len(to_rank_list)} items"""
    to_rank_list = sorted(to_rank_list, key=lambda k: random.random())
    to_rank_list = to_rank_list_str.join(to_rank_list)
    user_prompt = user_prompt + response_part#.format(to_rank_list=to_rank_list)
    return user_prompt
        
def get_assistant_prompt(row):
    ranked_list_str = ''
    rank_part = """Here a list:\n{ranked_list}\n"""
    prepared_prompt_inter = []
    for i, x in enumerate(row['liked_resp']):
        ranked_list_str = ranked_list_str + f"{i+1}. {x} <|reserved_special_token_0|>"
        #prepared_prompt_inter.append(rank_part.format(ranked_list=ranked_list_str))
    for i, x in enumerate(row['disliked_resp'], len(row['liked_resp'])):
        ranked_list_str = ranked_list_str + f"{i+1}. {x} <|reserved_special_token_0|>"
    return rank_part.format(ranked_list=ranked_list_str)[:-1]



def get_assistant_prompt(row):
    ranked_list_str = ''
    rank_part = """Here a list:\n{ranked_list}"""
    prepared_prompt_inter = [{'text': rank_part.format(ranked_list="")}]
    itemToInt
    for i, (x, id_) in enumerate(zip(row['liked_resp'], row['index_like_resp'])):
        ranked_list_str = ranked_list_str + f"{i+1}. {x} <|reserved_special_token_0|>"
        prepared_prompt_inter[-1]["target"] = itemToInt[row['item'][id_]]
        prepared_prompt_inter[-1]["target_text"] = f"{i+1}. {x} <|reserved_special_token_0|>"
        prepared_prompt_inter.append({'text': rank_part.format(ranked_list=ranked_list_str)})
    #for i, x in enumerate(row['disliked_resp'], len(row['liked_resp'])):
    #    ranked_list_str = ranked_list_str + f"{i+1}. {x} <|reserved_special_token_0|>"
    return prepared_prompt_inter


train_group['user_prompt'] = train_group.apply(prepare_user_prompt, axis=1)
train_group['assistant_prompt'] = train_group.apply(get_assistant_prompt, axis=1)

test_group['user_prompt'] = test_group.apply(prepare_user_prompt, axis=1, test=True)
test_group['assistant_prompt'] = test_group.apply(get_assistant_prompt, axis=1)

In [37]:
print(train_group.loc[150, 'user_prompt'])

I like the following items:
Maltese Falcon The, Monty Python and the Holy Grail, Magnum Force, Run Silent Run Deep, Who Framed Roger Rabbit, Bell Book and Candle, Bringing Out the Dead, From Here to Eternity, Gladiator, Ravenous, Taking of Pelham One Two Three The, Messenger The Story of Joan of Arc The
I dislike the following items:
Entrapment, Boiler Room, Braddock Missing in Action III, Clerks, Gone in 60 Seconds, Get Carter, Bowfinger, Star Trek The Motion Picture, Space Cowboys, Wild Wild West, Arachnophobia, Mystery Men, Teaching Mrs Tingle, Pitch Black, Sixth Sense The, Cell The, Civil Action A, Reindeer Games, Payback, Back to the Future

Sugges me 34 items


In [38]:
print(train_group.loc[150, 'assistant_prompt'])

[{'text': 'Here a list:\n', 'target': 1925, 'target_text': '1. Rushmore <|reserved_special_token_0|>'}, {'text': 'Here a list:\n1. Rushmore <|reserved_special_token_0|>', 'target': 3010, 'target_text': '2. Soapdish <|reserved_special_token_0|>'}, {'text': 'Here a list:\n1. Rushmore <|reserved_special_token_0|>2. Soapdish <|reserved_special_token_0|>', 'target': 2793, 'target_text': '3. Romeo Must Die <|reserved_special_token_0|>'}, {'text': 'Here a list:\n1. Rushmore <|reserved_special_token_0|>2. Soapdish <|reserved_special_token_0|>3. Romeo Must Die <|reserved_special_token_0|>', 'target': 2597, 'target_text': '4. Dead Calm <|reserved_special_token_0|>'}, {'text': 'Here a list:\n1. Rushmore <|reserved_special_token_0|>2. Soapdish <|reserved_special_token_0|>3. Romeo Must Die <|reserved_special_token_0|>4. Dead Calm <|reserved_special_token_0|>', 'target': 2084, 'target_text': '5. Matrix The <|reserved_special_token_0|>'}, {'text': 'Here a list:\n1. Rushmore <|reserved_special_token_0

In [39]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("../llama3.21B")
tokenizer.pad_token = tokenizer.eos_token
#tokenizer.chat_template =  "{% set ns = namespace(i=0) %}" \
#                            "{% for message in messages %}" \
#                                "{% if message['role'] == 'system' and ns.i == 0 %}" \
#                                       "{{ bos_token +' [INST] <<SYS>>\n' }}" \
#                                       "{{ message['content'] + ' <</SYS>>\n'}}" \
#                                "{% elif message['role'] == 'user' %}" \
#                                    "{{ message['content'] + ' [/INST]\n'}}" \
#                                "<|reserved_special_token_22|>\n"\
#                                "{% elif message['role'] == 'assistant' %}" \
#                                    "{{ message['content'] + ' ' + eos_token }}" \
#                                "{% endif %}" \
#                                "{% set ns.i = ns.i+1 %}" \
#                            "{% endfor %}"
#
tokenizer.chat_template =  "{% set ns = namespace(i=0) %}" \
                            "{% for message in messages %}" \
                                "{% if message['role'] == 'system' and ns.i == 0 %}" \
                                       "{{ bos_token +' [INST] <<SYS>>\n' }}" \
                                       "{{ message['content'] + ' <</SYS>>\n'}}" \
                                "{% elif message['role'] == 'user' %}" \
                                    "{{ message['content'] + ' [/INST]\n'}}" \
                                "{% elif message['role'] == 'assistant' %}" \
                                    "{{ message['content'] + '' + eos_token }}" \
                                "{% endif %}" \
                                "{% set ns.i = ns.i+1 %}" \
                            "{% endfor %}"


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\petru\.pyenv\pyenv-win\versions\3.10.0\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\petru\.pyenv\pyenv-win\versions\3.10.0\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\petru\Documents\python_envs\transformerVenv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\petru\Documents\python_envs\transformerVenv\lib\site-packages\traitlets\config\applic

In [40]:
def get_prompt(row, is_train=True):
    messages = [
    {"role": "system", "content": f"You're operating as a {domain} recommendation system. Your task involves receiving a user's list of liked and disliked items, based on that information, you need to return a ranked list of recommended items the user is most likely to enjoy"},
    {"role": "user", "content": row['user_prompt']}
    ]
    if is_train:
        messages.append({"role": "assistant", "content": row['assistant_prompt']})
    return tokenizer.apply_chat_template(messages, tokenize=False)



mapping_dict_rev = {v:k for k, v in mapping_dict.items()}
def get_prompt_train(row, is_train=True):
    istaces_dict = []
    if is_train:
        for x in row['assistant_prompt'][:-1]:
            messages = [
            {"role": "system", "content": f"You're operating as a {domain} recommendation system. Your task involves receiving a user's list of liked and disliked items, based on that information, you need to return a ranked list of recommended items the user is most likely to enjoy"},
            {"role": "user", "content": row['user_prompt']}
            ]
            messages.append({"role": "assistant", "content": x['text']})
            target_text = []
            targets = []
            for liked_items in row['liked_resp']:
                if liked_items not in x['text']:
                    targets.append(str(itemToInt[mapping_dict_rev[liked_items]]))
                    target_text.append(liked_items)
            istaces_dict.append({'text': tokenizer.apply_chat_template(messages, tokenize=False), "targets_text": ";;".join(target_text), 'target_id': ";;".join(targets), 'target': x['target_text']})
    for ist in istaces_dict[:-1]:
        ist['text'] = ist['text'].replace('<|end_of_text|>', '')
    return istaces_dict[:-1]

In [41]:
train_group['prompt'] = train_group.apply(get_prompt_train, axis=1, is_train=True)
test_group['prompt'] = test_group.apply(get_prompt, axis=1, is_train=False)

In [42]:
mapping_dict[13294]

'Never Cry Wolf'

In [43]:
itemToInt[13294]

3214

In [44]:
print(train_group['prompt'][10])

[{'text': "<|begin_of_text|> [INST] <<SYS>>\nYou're operating as a movie recommendation system. Your task involves receiving a user's list of liked and disliked items, based on that information, you need to return a ranked list of recommended items the user is most likely to enjoy <</SYS>>\nI like the following items:\nSixth Sense The, Being John Malkovich, Spaceballs, Shakespeare in Love, Amadeus\nI dislike the following items:\nHalloween, Fight Club, 10 Things I Hate About You, Breakfast Club The\n\nSugges me 12 items [/INST]\nHere a list:\n", 'targets_text': 'Run Lola Run Lola rennt;;Pulp Fiction;;Children of Paradise Les enfants du paradis;;Sleepy Hollow;;Deer Hunter The;;Gladiator', 'target_id': '2175;;261;;2365;;2499;;994;;2897', 'target': '1. Run Lola Run Lola rennt <|reserved_special_token_0|>'}, {'text': "<|begin_of_text|> [INST] <<SYS>>\nYou're operating as a movie recommendation system. Your task involves receiving a user's list of liked and disliked items, based on that inf

In [45]:
tokenizer(train_group['prompt'][0][0]['text'])['input_ids']

[128000,
 128000,
 510,
 65562,
 60,
 1134,
 39031,
 40171,
 2675,
 2351,
 10565,
 439,
 264,
 5818,
 28782,
 1887,
 13,
 4718,
 3465,
 18065,
 12588,
 264,
 1217,
 596,
 1160,
 315,
 15262,
 323,
 99733,
 3673,
 11,
 3196,
 389,
 430,
 2038,
 11,
 499,
 1205,
 311,
 471,
 264,
 21682,
 1160,
 315,
 11349,
 3673,
 279,
 1217,
 374,
 1455,
 4461,
 311,
 4774,
 366,
 524,
 39031,
 40171,
 40,
 1093,
 279,
 2768,
 3673,
 512,
 35,
 49709,
 11,
 6898,
 89,
 11,
 5124,
 485,
 1565,
 274,
 1796,
 11,
 10455,
 393,
 80496,
 11,
 68077,
 47329,
 578,
 11,
 22331,
 15457,
 220,
 17,
 11,
 8155,
 21882,
 315,
 94729,
 578,
 11,
 77064,
 389,
 220,
 1958,
 339,
 6825,
 11,
 49538,
 9877,
 13960,
 11,
 31601,
 274,
 9601,
 362,
 11,
 19435,
 5929,
 323,
 279,
 31048,
 423,
 11710,
 3933,
 11,
 393,
 511,
 1494,
 546,
 300,
 11,
 31685,
 323,
 279,
 34282,
 11,
 6690,
 19563,
 11,
 426,
 89502,
 11,
 11617,
 22933,
 291,
 11,
 89862,
 11,
 6588,
 97457,
 6588,
 97457,
 5790,
 406,
 11,
 480,
 30637

In [46]:
print(test_group['prompt'][0])

<|begin_of_text|> [INST] <<SYS>>
You're operating as a movie recommendation system. Your task involves receiving a user's list of liked and disliked items, based on that information, you need to return a ranked list of recommended items the user is most likely to enjoy <</SYS>>
I like the following items:
Gone in 60 Seconds, Tequila Sunrise, To Gillian on Her 37th Birthday, Erin Brockovich, Legends of the Fall, Shanghai Noon, Pocahontas, Mary Poppins, Aladdin, Hercules, My Best Friend s Wedding, Top Gun, Straight Story The, Shakespeare in Love, Where the Heart Is, Murphy s Romance, Gladiator, Babe, Grease, Mask of Zorro The, Moonstruck, Umbrellas of Cherbourg The Parapluies de Cherbourg Les, Beauty and the Beast
I dislike the following items:
Pulp Fiction, Speed, Arthur, Peggy Sue Got Married, Dances with Wolves, Other Sister The, Splash, Apple Dumpling Gang The

Sugges me 100 items [/INST]



In [47]:
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp,user_prompt,assistant_prompt,prompt
0,0,"[8213, 6267, 6857, 6858, 6916, 7258, 8270, 686...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...",49,"[0, 2, 3, 4, 6, 9, 15, 19, 23, 29, 30, 31, 32,...","[1, 5, 7, 8, 10, 12, 16, 17, 18, 20, 21, 22, 2...","[48, 42, 47]","[11, 13, 14, 24, 28]","[Run Lola Run Lola rennt, Mary Poppins, Dumbo,...","[Ponette, Sound of Music The, Aladdin, Hunchba...","[Tarzan, James and the Giant Peach, Princess B...","[Wallace Gromit The Best of Aardman Animation,...","I like the following items:\nDumbo, Antz, Schi...","[{'text': 'Here a list: ', 'target': 232, 'tar...",[{'text': '<|begin_of_text|> [INST] <<SYS>> Yo...
1,1,"[6493, 8733, 7257, 7887, 7444, 8122, 7061, 789...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...",122,"[0, 2, 6, 10, 14, 15, 20, 28, 31, 37, 42, 46, ...","[1, 5, 8, 13, 25, 26, 30, 34, 36, 38, 40, 44, ...","[66, 3, 67, 70, 7, 71, 73, 74, 11, 19, 21, 85,...","[4, 9, 12, 16, 17, 18, 22, 27, 29, 32, 33, 39,...","[Remains of the Day The, Lost World Jurassic P...","[Key Largo, Matrix The, Doctor Zhivago, Green ...","[Breakfast Club The, Children of a Lesser God,...","[Man in the Iron Mask The, Get Shorty, Fisher ...","I like the following items:\nGandhi, Strictly ...","[{'text': 'Here a list: ', 'target': 2694, 'ta...",[{'text': '<|begin_of_text|> [INST] <<SYS>> Yo...


In [48]:
def get_description_liked(row):
    liked_item_index = row['index_like_resp']
    likex_items = np.array(row['item'])[liked_item_index]
    return '\n'.join([item_desc_dict.get(y, '') for y in likex_items])

In [49]:
train_group['descriptions'] = train_group['item'].apply(lambda x: [item_desc_dict.get(y, '') for y in x if len(item_desc_dict.get(y, ''))>2])
#train_group['descriptions'] = train_group.apply(get_description_liked, axis=1)

In [50]:
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp,user_prompt,assistant_prompt,prompt,descriptions
0,0,"[8213, 6267, 6857, 6858, 6916, 7258, 8270, 686...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...",49,"[0, 2, 3, 4, 6, 9, 15, 19, 23, 29, 30, 31, 32,...","[1, 5, 7, 8, 10, 12, 16, 17, 18, 20, 21, 22, 2...","[48, 42, 47]","[11, 13, 14, 24, 28]","[Run Lola Run Lola rennt, Mary Poppins, Dumbo,...","[Ponette, Sound of Music The, Aladdin, Hunchba...","[Tarzan, James and the Giant Peach, Princess B...","[Wallace Gromit The Best of Aardman Animation,...","I like the following items:\nDumbo, Antz, Schi...","[{'text': 'Here a list: ', 'target': 232, 'tar...",[{'text': '<|begin_of_text|> [INST] <<SYS>> Yo...,[]
1,1,"[6493, 8733, 7257, 7887, 7444, 8122, 7061, 789...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...",122,"[0, 2, 6, 10, 14, 15, 20, 28, 31, 37, 42, 46, ...","[1, 5, 8, 13, 25, 26, 30, 34, 36, 38, 40, 44, ...","[66, 3, 67, 70, 7, 71, 73, 74, 11, 19, 21, 85,...","[4, 9, 12, 16, 17, 18, 22, 27, 29, 32, 33, 39,...","[Remains of the Day The, Lost World Jurassic P...","[Key Largo, Matrix The, Doctor Zhivago, Green ...","[Breakfast Club The, Children of a Lesser God,...","[Man in the Iron Mask The, Get Shorty, Fisher ...","I like the following items:\nGandhi, Strictly ...","[{'text': 'Here a list: ', 'target': 2694, 'ta...",[{'text': '<|begin_of_text|> [INST] <<SYS>> Yo...,[]


In [51]:
train_group.to_pickle(os.path.join(datapath, f'{dataset}_train_dataset_all_items.pkl'))
test_group.to_pickle(os.path.join(datapath, f'{dataset}_test_dataset_all_items.pkl'))

In [52]:
prompts = train_group['prompt'].tolist() * 3

In [53]:
descriptions = list(item_desc_dict.values())

In [54]:
descriptions = [f'{tokenizer.bos_token} ' + x + f' {tokenizer.eos_token}' for x in descriptions]

In [55]:
import random 

random.seed(22)
train_set = prompts + descriptions

random.shuffle(train_set)

In [56]:
len(train_set)

17555

In [57]:
len(list(itertools.chain(*train_group['prompt'].tolist()))[:20])

20

In [59]:
import json

with open(os.path.join(datapath, f'{dataset}_train_set_no_kn_all_items_special_target_temp.jsonl'), 'w') as outfile:
    for x in list(itertools.chain(*train_group['prompt'].tolist()))[:20]:
        json.dump(x, outfile)
        outfile.write('\n')

In [50]:
import json

with open(os.path.join(datapath, f'{dataset}_train_set_all_items.jsonl'), 'w') as outfile:
    for x in train_set:
        json.dump({'text':x}, outfile)
        outfile.write('\n')

In [51]:
prompts = test_group['prompt'].tolist()

In [52]:
import json

with open(os.path.join(datapath, f'{dataset}_test_set_all_items.jsonl'), 'w') as outfile:
    for x in prompts:
        json.dump({'text':x}, outfile)
        outfile.write('\n')

# Baseline

In [53]:
"""train_bs = pd.concat([test_group.loc[:, ['user', 'index_like_prompt', 'index_dislike_prompt', 'item']],\
                      train_group.loc[:, ['user', 'index_like_prompt', 'index_dislike_prompt', 'item']]])
test_bs = test_group.loc[:, ['user', 'index_like_resp', 'index_dislike_resp', 'item']]"""

"train_bs = pd.concat([test_group.loc[:, ['user', 'index_like_prompt', 'index_dislike_prompt', 'item']],                      train_group.loc[:, ['user', 'index_like_prompt', 'index_dislike_prompt', 'item']]])\ntest_bs = test_group.loc[:, ['user', 'index_like_resp', 'index_dislike_resp', 'item']]"

In [54]:
"""train_bs['bs_format'] = train_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 1} for item in x['index_like_prompt']], axis=1)
train_bs['bs_format'] = train_bs['bs_format'] + train_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 0} for item in x['index_dislike_prompt']], axis=1)

test_bs['bs_format'] = test_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 1} for item in x['index_like_resp']], axis=1)
test_bs['bs_format'] = test_bs['bs_format'] + test_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 0} for item in x['index_dislike_resp']], axis=1)"""

"train_bs['bs_format'] = train_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 1} for item in x['index_like_prompt']], axis=1)\ntrain_bs['bs_format'] = train_bs['bs_format'] + train_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 0} for item in x['index_dislike_prompt']], axis=1)\n\ntest_bs['bs_format'] = test_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 1} for item in x['index_like_resp']], axis=1)\ntest_bs['bs_format'] = test_bs['bs_format'] + test_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 0} for item in x['index_dislike_resp']], axis=1)"

In [55]:
"""import itertools

os.makedirs(os.path.join(datapath, 'baseline'), exist_ok=True)
pd.DataFrame(list(itertools.chain(*test_bs['bs_format'].tolist()))).to_csv(os.path.join(datapath, 'baseline', f'{dataset}.part3.inter'), sep='\t', index=False)
pd.DataFrame(list(itertools.chain(*train_bs['bs_format'].tolist()))).to_csv(os.path.join(datapath, 'baseline', f'{dataset}.part1.inter'), sep='\t', index=False)"""

"import itertools\n\nos.makedirs(os.path.join(datapath, 'baseline'), exist_ok=True)\npd.DataFrame(list(itertools.chain(*test_bs['bs_format'].tolist()))).to_csv(os.path.join(datapath, 'baseline', f'{dataset}.part3.inter'), sep='\t', index=False)\npd.DataFrame(list(itertools.chain(*train_bs['bs_format'].tolist()))).to_csv(os.path.join(datapath, 'baseline', f'{dataset}.part1.inter'), sep='\t', index=False)"