In [1]:
import os

# Which dataset this notebook prepares; must be a key of the domain table below.
dataset = 'lastfm'

# Domain word for each supported dataset (interpolated into the system prompt).
_domain_by_dataset = dict(lastfm='music', dbbook='book', movielens='movie')
domain = _domain_by_dataset[dataset]

# Every input is read from, and every output written to, ../data/<dataset>.
datapath = os.path.join('..', 'data', dataset)

# In the visible part of the notebook this is only referenced from commented-out code.
special_token = '<|reserved_special_token_22|>'

# Passed as `frac` when each user's ratings are split into prompt and response parts.
resp_frac = 0.5

# Set names

In [2]:
import pandas as pd
import os
import random
import numpy as np
import swifter  

In [3]:
# Both interaction files are read without a header row; the three columns are
# named user / item / rating here.
_interaction_columns = ['user', 'item', 'rating']
_interaction_dir = os.path.join(datapath, 'user-item')

user_item_df_train = pd.read_csv(
    os.path.join(_interaction_dir, 'train.tsv'), sep='\t', names=_interaction_columns
)
user_item_df_test = pd.read_csv(
    os.path.join(_interaction_dir, 'test.tsv'), sep='\t', names=_interaction_columns
)

# Pool the two files into one frame with a fresh index; the notebook re-splits
# this pooled frame by user further down.
df = pd.concat([user_item_df_train, user_item_df_test], ignore_index=True)
all_items = df['item'].unique()

In [4]:
def get_split_random(list_to_split, frac=0.2, margin=0):
    """Split a list into a random sample and its complement, reproducibly.

    The global ``random`` generator is reseeded with 22 on every call, so the
    same input always yields the same split.

    Args:
        list_to_split: list of hashable elements to split.
        frac: fraction of the list to draw into the sample.
        margin: number of elements added to ``int(frac * len(list_to_split))``
            (may be negative).

    Returns:
        ``(complement, sample)`` as two lists. Both are built from sets, so
        duplicates are collapsed and the order is the sets' iteration order.
    """
    random.seed(22)
    population_size = len(list_to_split)
    # Clamp the requested size into [0, population_size]: random.sample raises
    # ValueError for a negative size or one larger than the population, which
    # `margin` could previously trigger (e.g. frac=1.0 with margin=1).
    sample_size = min(max(int(frac * population_size) + margin, 0), population_size)
    sample_list = random.sample(list_to_split, sample_size)
    complementary_set = set(list_to_split) - set(sample_list)
    # The set(...) round-trips are kept as in the original so the ordering of
    # the returned lists does not change for inputs that already worked.
    return list(set(complementary_set)), list(set(sample_list))

In [5]:
# Hold out a fixed share of *users* (not of interactions) as the test side.
test_frac = 0.2
user_list = df['user'].unique().tolist()

# get_split_random reseeds with the same value itself; the explicit seed is kept
# so the global generator state after this cell is unchanged.
random.seed(22)
user_train, user_test = get_split_random(user_list, frac=test_frac)

In [6]:
# Re-split the pooled interactions so that each user lands entirely on one side.
is_train_user = df['user'].isin(user_train)
is_test_user = df['user'].isin(user_test)
user_item_df_train = df[is_train_user]
user_item_df_test = df[is_test_user]

In [None]:
# Number of users present in the train frame but absent from the test frame.
# NOTE(review): this equals the train-user count only when the two user sets are
# disjoint; checking that the intersection is empty would test that directly.
len(set(user_item_df_train['user'].unique()) - set(user_item_df_test['user'].unique()))

In [None]:
# Display the interactions of the held-out (test) users.
user_item_df_test

In [None]:
# Summary statistics of the rating column for the test users.
user_item_df_test['rating'].describe()

In [None]:
# Summary statistics of the rating column for the train users.
user_item_df_train['rating'].describe()

In [11]:
# One row per user: the user's items and ratings collected into two lists
# (same row order in both), plus the number of interactions in 'len'.
_per_user_lists = {'item': list, 'rating': list}

train_group = user_item_df_train.groupby('user').agg(_per_user_lists).reset_index()
test_group = user_item_df_test.groupby('user').agg(_per_user_lists).reset_index()

train_group['len'] = train_group['item'].apply(len)
test_group['len'] = test_group['item'].apply(len)

In [12]:
def split_prompt_resp(row, resp_frac):
    """Split a user's liked and disliked positions into prompt and response parts.

    Positions index into ``row['rating']``: rating 1 counts as liked, rating 0
    as disliked. Each non-empty group is split with
    ``get_split_random(..., frac=resp_frac, margin=1)``; the sampled part
    becomes the response, the remainder the prompt.

    Args:
        row: mapping with a 'rating' sequence; it is modified in place.
        resp_frac: fraction forwarded to ``get_split_random``.

    Returns:
        The same ``row`` with four added entries: 'index_like_prompt',
        'index_like_resp', 'index_dislike_prompt' and 'index_dislike_resp'.
    """
    ratings = np.array(row['rating'])
    for label, rating_value in (('like', 1), ('dislike', 0)):
        positions = np.where(ratings == rating_value)[0].tolist()
        prompt_positions, resp_positions = [], []
        if positions:
            prompt_positions, resp_positions = get_split_random(positions, frac=resp_frac, margin=1)
        row[f'index_{label}_prompt'] = prompt_positions
        row[f'index_{label}_resp'] = resp_positions
    return row

In [None]:
# Distribution of the number of interactions per test user.
test_group['len'].describe()

In [None]:
# Distribution of the number of interactions per train user.
train_group['len'].describe()

In [None]:
# Add the four prompt/response index columns to every user row. The swifter
# accessor's apply is used instead of DataFrame.apply; resp_frac is forwarded to
# split_prompt_resp as a keyword argument.
# Each split reseeds `random` with 22 inside get_split_random, so the result is
# reproducible per user.
test_group = test_group.swifter.apply(split_prompt_resp, axis=1, resp_frac=resp_frac)
train_group = train_group.swifter.apply(split_prompt_resp, axis=1, resp_frac=resp_frac)

In [None]:
# Preview two test users, including the new index_* columns.
test_group.head(2)

In [None]:
# (number of test users, number of columns)
test_group.shape

In [None]:
# Preview two train users, including the new index_* columns.
train_group.head(2)

In [None]:
# (number of train users, number of columns)
train_group.shape

In [20]:
if dataset=='movielens':
    import re

    def fix_title(title):
        """Move a trailing article of a title back to the front.

        "Matrix, The (1999)" becomes "The Matrix (1999)"; the same is done for
        ", A (". Titles containing neither marker are returned unchanged.
        ", The (" is checked first, as before.
        """
        for article in ("The", "A"):
            marker = f", {article} ("
            if marker in title:
                name_film, _, year = title.rpartition(marker)
                return f"{article} {name_film} ({year}"
        return title

    # NOTE(review): this frame is replaced a few lines below by the DBpedia
    # mapping, so movies.dat only has to be readable here; confirm whether the
    # read is still needed.
    # "::" is a multi-character separator, which pandas only handles with the
    # python engine; naming the engine avoids the fallback warning.
    df_movies = pd.read_csv(os.path.join(datapath, r"movies.dat"), sep="::",
                            names=["item_id", "name", "geners"], encoding='ISO-8859-1',
                            engine='python')
    # fix_title returns the whole title string; the former `fix_title(x)[0]`
    # kept only its first character.
    df_movies['name'] = df_movies['name'].apply(fix_title)

    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_entities.tsv'),
                               names=['url', 'id_set'], sep='\t')
    dbpedia_mapping = pd.read_csv(os.path.join(datapath, 'MappingMovielens2DBpedia-1.2.tsv'),
                                  names=['id_movie', 'name', 'dbpedia_url'], sep='\t')
    # Left join on the DBpedia URL: every mapped movie is kept, id_set is
    # missing where the URL has no entity id.
    df_movies = dbpedia_mapping.set_index('dbpedia_url').join(df_relations.set_index('url')).reset_index()
    # Strip a trailing " (<digits>)" suffix, then collapse every run of
    # non-alphanumeric characters into a single space. Raw strings keep the
    # regex escapes from being read as (invalid) string escapes.
    df_movies['name'] = df_movies['name'].apply(lambda x: re.sub(r"\s+\(\d+\)$", "", x))
    df_movies['name'] = df_movies['name'].apply(lambda x: re.sub(r"[^a-zA-Z0-9]+", ' ', x).strip())
    # Rows without an entity id cannot be cast to int below, so drop them first.
    df_movies.dropna(inplace=True)
    df_movies['id_set'] = df_movies['id_set'].astype(int)
    # id_set -> cleaned movie name
    mapping_dict = df_movies.loc[:, ['id_set', 'name']].set_index('id_set').to_dict()['name']
    property_mapping = pd.read_json(os.path.join(datapath, 'entities_names.json'), typ='series')
    mapping_relation = pd.read_csv(os.path.join(datapath, 'mapping_relations.tsv'),
                                   names=['rel', 'id'], sep='\t')

In [21]:
if dataset == 'dbbook':
    # mapping_entities.tsv is read with its own header row; the code below
    # relies on it providing 'uri' and 'id' columns.
    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_entities.tsv'), sep='\t')

    def _name_from_uri(uri):
        """Return the part of the 'uri' field before the first ';'."""
        return uri.split(";")[0]

    df_relations['name'] = df_relations['uri'].apply(_name_from_uri)

    # Keep only the entities that occur as items in the interaction data.
    rated_item_ids = df['item'].unique()
    temp = df_relations[df_relations['id'].isin(rated_item_ids)]
    # id -> name
    mapping_dict = temp.set_index('id').to_dict()['name']

In [22]:
if dataset == 'lastfm':
    # mapping_items.tsv is read without a header row as (id, name) pairs.
    items_path = os.path.join(datapath, 'mapping_items.tsv')
    df_relations = pd.read_csv(items_path, sep='\t', names=['id', 'name'])
    # id -> name
    mapping_dict = df_relations.set_index('id').to_dict()['name']

In [None]:
# <dataset>.txt is parsed as "item_id;;description" records; lines that cannot
# be parsed are skipped rather than raising.
_description_path = os.path.join(datapath, dataset + '.txt')
item_desription = pd.read_csv(
    _description_path,
    sep=';;',
    names=['item_id', 'description'],
    on_bad_lines='skip',
)

# Discard descriptions of items that never occur in the interaction data.
_is_known_item = item_desription['item_id'].isin(all_items)
item_desription = item_desription[_is_known_item]

In [25]:
#item_desription['description'] = item_desription['description'].apply(lambda x: special_token + " " + x)

In [26]:
import json

# Write one JSON object per line: {'target_id': <row label>, 'text': <description>}.
# NOTE(review): DataFrame.to_dict() keys each column's values by index label, so
# `new_id` is the row's index label in item_desription, not its `item_id` value.
# Confirm that this is what consumers of the file expect.
with open(os.path.join(datapath, f'{dataset}_domain_kn.jsonl'), 'w') as outfile:
    for new_id, desc in item_desription.to_dict()['description'].items():
        json.dump({'target_id':new_id, 'text':desc}, outfile)
        outfile.write('\n')

In [27]:
# item_id -> description.
# The dict is looked up with item ids (see get_description_liked and the
# 'descriptions' column), so it has to be keyed by the `item_id` column.
# DataFrame.to_dict()['description'] is keyed by the frame's index labels
# instead, which only matches item ids by coincidence.
item_desc_dict = item_desription.set_index('item_id')['description'].to_dict()

In [None]:
# Number of entries in the description lookup.
len(item_desc_dict)

## Prepare prompt

In [None]:
# Preview the per-user rows before the name and prompt columns are added.
train_group.head()

In [30]:
def map_with_names(row, col, mapping_dict):
    """Translate the items selected by ``row[col]`` into their names.

    ``row[col]`` holds positions into ``row['item']``. Items that are missing
    from ``mapping_dict``, or whose name is two characters or shorter, are
    left out of the result.

    Returns:
        List of names, in the order given by ``row[col]``.
    """
    selected_items = np.array(row['item'])[row[col]]
    names = []
    for item_id in selected_items:
        name = mapping_dict.get(item_id, '')
        if len(name) > 2:
            names.append(name)
    return names


# Resolve each of the four index lists into item names, on both splits.
# Keys are the new column names, values the index columns they are built from.
_name_columns = {
    'liked_prompt': 'index_like_prompt',
    'liked_resp': 'index_like_resp',
    'disliked_prompt': 'index_dislike_prompt',
    'disliked_resp': 'index_dislike_resp',
}

for _group in (train_group, test_group):
    for _target_col, _index_col in _name_columns.items():
        _group[_target_col] = _group.apply(
            map_with_names, axis=1, col=_index_col, mapping_dict=mapping_dict
        )

In [None]:
# Preview two train users with the liked_*/disliked_* name columns.
train_group.head(2)

In [32]:
# Flatten the *test* users into one record per item: prompt-side items go into
# `train`, response-side items into `test`. 'item_id' carries the item name here,
# because the liked_*/disliked_* columns hold names produced by map_with_names.
train = []
test = []

# (source column, destination list, score), in the order records are appended per user.
_record_plan = (
    ('liked_prompt', train, 1),
    ('disliked_prompt', train, 0),
    ('liked_resp', test, 1),
    ('disliked_resp', test, 0),
)

for _, row in test_group.iterrows():
    for _column, _records, _score in _record_plan:
        _records.extend(
            {'user_id': row['user'], 'item_id': item, 'score': _score}
            for item in row[_column]
        )
                    

In [33]:
import random
import itertools

def prepare_user_prompt(row):
    """Build the user turn: liked items, disliked items, then the items to rank.

    ``random`` is reseeded with 22 on every call and the three lists are
    shuffled in a fixed order (liked prompt, disliked prompt, candidates), so
    the text is reproducible for a given row.

    Args:
        row: mapping with the name lists 'liked_prompt', 'disliked_prompt',
            'liked_resp' and 'disliked_resp'.

    Returns:
        The prompt text. The "I like" section appears only if there are liked
        prompt items, the "I dislike" section only if the joined disliked list
        is non-empty; the "Rank the following items" section is always present.
    """
    random.seed(22)

    def shuffled(names):
        # One random key per element, drawn in input order.
        return sorted(names, key=lambda _: random.random())

    separator = ', '
    liked_part = """I like the following items:\n{liked_list}\n"""
    disliked_part = """I dislike the following items:\n{disliked_list}\n"""
    response_part = """\nRank the following items:\n{to_rank_list}"""

    # The order of these three calls fixes which random numbers each list gets.
    liked_items = shuffled(row['liked_prompt'])
    disliked_items = shuffled(row['disliked_prompt'])
    candidates = shuffled(list(itertools.chain(row['liked_resp'], row['disliked_resp'])))

    sections = []
    if liked_items:
        sections.append(liked_part.format(liked_list=separator.join(liked_items)))
    disliked_text = separator.join(disliked_items)
    if disliked_text:
        sections.append(disliked_part.format(disliked_list=disliked_text))
    sections.append(response_part.format(to_rank_list=separator.join(candidates)))
    return ''.join(sections)
        
def get_assistant_prompt(row):
    """Build the assistant turn: a numbered list with liked items first.

    Items from ``row['liked_resp']`` are numbered from 1 and followed by those
    from ``row['disliked_resp']``. One of the two trailing newlines of the
    formatted text is removed before returning.
    """
    ranked_items = [*row['liked_resp'], *row['disliked_resp']]
    ranked_list_str = ''.join(
        f"{position}. {name}\n" for position, name in enumerate(ranked_items, start=1)
    )
    rank_part = """Here a list:\n{ranked_list}\n"""
    return rank_part.format(ranked_list=ranked_list_str)[:-1]


# Add the user and assistant turn texts to both splits.
for _group in (train_group, test_group):
    _group['user_prompt'] = _group.apply(prepare_user_prompt, axis=1)
    _group['assistant_prompt'] = _group.apply(get_assistant_prompt, axis=1)

In [None]:
# Spot-check the user prompt of the row labelled 150 (print keeps the newlines readable).
# NOTE(review): .loc is label-based, so this raises KeyError if train_group has no row 150.
print(train_group.loc[150, 'user_prompt'])

In [None]:
# Spot-check the matching assistant answer for the same row.
print(train_group.loc[150, 'assistant_prompt'])

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer from the local ../llama3 directory.
tokenizer = AutoTokenizer.from_pretrained("../llama3")
# Padding reuses the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token
# Disabled variant of the template assigned below: it has an extra
# "<|reserved_special_token_22|>\n" fragment after the user branch and puts a
# space before eos_token in the assistant branch.
#tokenizer.chat_template =  "{% set ns = namespace(i=0) %}" \
#                            "{% for message in messages %}" \
#                                "{% if message['role'] == 'system' and ns.i == 0 %}" \
#                                       "{{ bos_token +' [INST] <<SYS>>\n' }}" \
#                                       "{{ message['content'] + ' <</SYS>>\n'}}" \
#                                "{% elif message['role'] == 'user' %}" \
#                                    "{{ message['content'] + ' [/INST]\n'}}" \
#                                "<|reserved_special_token_22|>\n"\
#                                "{% elif message['role'] == 'assistant' %}" \
#                                    "{{ message['content'] + ' ' + eos_token }}" \
#                                "{% endif %}" \
#                                "{% set ns.i = ns.i+1 %}" \
#                            "{% endfor %}"
#
# Active chat template (Jinja), assembled from adjacent string literals:
#   - a system message in first position renders as
#       bos_token + ' [INST] <<SYS>>\n' + content + ' <</SYS>>\n'
#   - every user message renders as content + ' [/INST]\n'
#   - every assistant message renders as content + eos_token
#   - anything else (other roles, a system message that is not first) renders nothing.
# The "\n" escapes are interpreted by Python, so the template text contains real newlines.
tokenizer.chat_template =  "{% set ns = namespace(i=0) %}" \
                            "{% for message in messages %}" \
                                "{% if message['role'] == 'system' and ns.i == 0 %}" \
                                       "{{ bos_token +' [INST] <<SYS>>\n' }}" \
                                       "{{ message['content'] + ' <</SYS>>\n'}}" \
                                "{% elif message['role'] == 'user' %}" \
                                    "{{ message['content'] + ' [/INST]\n'}}" \
                                "{% elif message['role'] == 'assistant' %}" \
                                    "{{ message['content'] + '' + eos_token }}" \
                                "{% endif %}" \
                                "{% set ns.i = ns.i+1 %}" \
                            "{% endfor %}"

In [37]:
def get_prompt(row, is_train=True):
    """Render one user row as a single prompt string via the tokenizer's chat template.

    Args:
        row: mapping with 'user_prompt' and, when ``is_train`` is true,
            'assistant_prompt'.
        is_train: if true the assistant answer is appended as a third message;
            otherwise the conversation ends with the user turn.

    Returns:
        The value of ``tokenizer.apply_chat_template(..., tokenize=False)``.
    """
    system_message = {
        "role": "system",
        "content": f"You're operating as a {domain} recommendation system. Your task involves receiving a user's list of liked and disliked items, along with a set of candidate items, and then reordering them based on preferences.",
    }
    conversation = [system_message, {"role": "user", "content": row['user_prompt']}]
    if is_train:
        conversation += [{"role": "assistant", "content": row['assistant_prompt']}]
    return tokenizer.apply_chat_template(conversation, tokenize=False)

In [38]:
def get_prompt_gpt(row, is_train=True):
    """Return one user row as a list of chat messages (role/content dicts).

    Same conversation as ``get_prompt``, but returned as message dicts rather
    than rendered through the tokenizer's chat template.

    Args:
        row: mapping with 'user_prompt' and, when ``is_train`` is true,
            'assistant_prompt'.
        is_train: if true the assistant answer is appended as a third message.
    """
    system_text = (
        f"You're operating as a {domain} recommendation system. "
        "Your task involves receiving a user's list of liked and disliked items, "
        "along with a set of candidate items, "
        "and then reordering them based on preferences."
    )
    turns = [("system", system_text), ("user", row['user_prompt'])]
    if is_train:
        turns.append(("assistant", row['assistant_prompt']))
    return [{"role": role, "content": content} for role, content in turns]

In [39]:
# Message-list prompts for the test users, without the assistant answer,
# pickled as a plain list with one entry per user.
gpt_prompts = test_group.apply(get_prompt_gpt, axis=1, is_train=False).tolist()
pd.to_pickle(gpt_prompts, os.path.join(datapath, "gpt_prompts.pkl"))

In [38]:
# Rendered prompt strings: training prompts include the assistant answer,
# test prompts end after the user turn.
train_group['prompt'] = train_group.apply(get_prompt, axis=1, is_train=True)
test_group['prompt'] = test_group.apply(get_prompt, axis=1, is_train=False)

In [None]:
# Preview two train users with the rendered 'prompt' column.
train_group.head(2)

In [40]:
def get_description_liked(row):
    """Join the descriptions of the row's liked response items, one per line.

    ``row['index_like_resp']`` holds positions into ``row['item']``; an item
    without an entry in ``item_desc_dict`` contributes an empty line.
    """
    items = np.array(row['item'])
    liked_resp_items = items[row['index_like_resp']]
    descriptions = [item_desc_dict.get(item_id, '') for item_id in liked_resp_items]
    return '\n'.join(descriptions)

In [41]:
def _known_descriptions(items):
    """Descriptions of `items` from item_desc_dict, skipping those of two characters or fewer."""
    looked_up = (item_desc_dict.get(item_id, '') for item_id in items)
    return [text for text in looked_up if len(text) > 2]


# Descriptions of *all* of each train user's items.
train_group['descriptions'] = train_group['item'].apply(_known_descriptions)
# Disabled alternative: only the liked response items, joined into one string.
#train_group['descriptions'] = train_group.apply(get_description_liked, axis=1)

In [None]:
# Preview two train users with the 'descriptions' column.
train_group.head(2)

In [43]:
# Persist both per-user frames as <dataset>_<split>_dataset.pkl.
for _split_name, _frame in (('train', train_group), ('test', test_group)):
    _frame.to_pickle(os.path.join(datapath, f'{dataset}_{_split_name}_dataset.pkl'))

In [44]:
# Training prompts, one per train user. The threefold repetition hinted at by the
# trailing comment is applied where train_set is built.
prompts = train_group['prompt'].tolist() #* 3

In [45]:
# All item descriptions, i.e. the values of item_desc_dict in dict order.
descriptions = list(item_desc_dict.values())

In [46]:
# Wrap every description as "<bos> text <eos>". Entries whose type is not exactly
# str (for example missing values) are dropped.
_bos, _eos = tokenizer.bos_token, tokenizer.eos_token
descriptions = [f'{_bos} {text} {_eos}' for text in descriptions if type(text) is str]

In [47]:
import json

# Item descriptions only, one {"text": ...} JSON object per line.
_text_kn_path = os.path.join(datapath, f'{dataset}_text_kn_train_set.jsonl')
with open(_text_kn_path, 'w') as outfile:
    outfile.writelines(json.dumps({'text': text}) + '\n' for text in descriptions)

In [None]:
import random 

# Mixed training corpus: every training prompt three times plus every wrapped item
# description once, shuffled with a fixed seed.
# NOTE(review): this relies on `prompts` still holding the *training* prompts. A
# later cell rebinds `prompts` to the test prompts, so re-running this cell after
# that one silently builds train_set from test prompts.
random.seed(22)
train_set = prompts*3 + descriptions

random.shuffle(train_set)

In [None]:
# Size of the mixed training corpus (3 x prompts + descriptions).
len(train_set)

In [None]:
import json

# Training prompts only (no item descriptions, no repetition), one JSON object per line.
_no_kn_path = os.path.join(datapath, f'{dataset}_train_set_no_kn.jsonl')
with open(_no_kn_path, 'w') as outfile:
    for prompt_text in train_group['prompt'].tolist():
        outfile.write(json.dumps({'text': prompt_text}) + '\n')

In [None]:
import json

# The shuffled mix of prompts and descriptions, one JSON object per line.
_train_set_path = os.path.join(datapath, f'{dataset}_train_set.jsonl')
with open(_train_set_path, 'w') as outfile:
    outfile.writelines(json.dumps({'text': text}) + '\n' for text in train_set)

In [None]:
# NOTE: rebinds `prompts`, which held the training prompts until here, to the test prompts.
prompts = test_group['prompt'].tolist()

In [None]:
import json

# Test prompts (without the assistant answer), one JSON object per line.
_test_set_path = os.path.join(datapath, f'{dataset}_test_set.jsonl')
with open(_test_set_path, 'w') as outfile:
    for prompt_text in prompts:
        outfile.write(json.dumps({'text': prompt_text}) + '\n')

# Baseline

In [48]:
_prompt_side_cols = ['user', 'index_like_prompt', 'index_dislike_prompt', 'item']
_resp_side_cols = ['user', 'index_like_resp', 'index_dislike_resp', 'item']

# Baseline training data: the prompt-side interactions of all users, test users first.
train_bs = pd.concat([
    test_group.loc[:, _prompt_side_cols],
    train_group.loc[:, _prompt_side_cols],
])
# Baseline evaluation data: the response-side interactions of the test users only.
test_bs = test_group.loc[:, _resp_side_cols]

In [49]:
def _bs_records(row, index_col, rating):
    """One interaction record per position listed in ``row[index_col]``.

    Positions index into ``row['item']``; every record gets the given rating.
    """
    return [
        {'user_id:token': row['user'], 'item_id:token': row['item'][position], 'rating:float': rating}
        for position in row[index_col]
    ]


# Per user: liked records (rating 1) followed by disliked records (rating 0).
# Adding two Series of lists concatenates the lists row by row.
train_bs['bs_format'] = train_bs.apply(_bs_records, axis=1, index_col='index_like_prompt', rating=1)
train_bs['bs_format'] = train_bs['bs_format'] + train_bs.apply(_bs_records, axis=1, index_col='index_dislike_prompt', rating=0)

test_bs['bs_format'] = test_bs.apply(_bs_records, axis=1, index_col='index_like_resp', rating=1)
test_bs['bs_format'] = test_bs['bs_format'] + test_bs.apply(_bs_records, axis=1, index_col='index_dislike_resp', rating=0)

In [50]:
# Every item id that occurs in a baseline interaction record, test or train.
item_set_bs = {
    record['item_id:token']
    for record in itertools.chain(
        itertools.chain.from_iterable(test_bs['bs_format'].tolist()),
        itertools.chain.from_iterable(train_bs['bs_format'].tolist()),
    )
}

In [51]:
# Item table for the baseline: the description of every item used in the
# interaction records, left-joined with its name from mapping_dict.
_described_items = item_desription[item_desription['item_id'].isin(item_set_bs)].set_index('item_id')
_item_names = pd.DataFrame(
    [{'id': item_id, 'name': name} for item_id, name in mapping_dict.items() if item_id in item_set_bs]
).set_index('id')
ent_bs = _described_items.join(_item_names).reset_index()

In [52]:
# Rename the columns to the '<name>:<type>' headers used in the baseline files.
ent_bs = ent_bs.rename(columns={
    'item_id':'item_id:token',
    'name': 'title:token_seq',
    'description': 'description:token_seq',
})

# The 'baseline' directory is otherwise only created by a later cell, so on a
# fresh Restart & Run All this write failed with a missing-directory error.
# Create it here before writing.
baseline_dir = os.path.join(datapath, 'baseline')
os.makedirs(baseline_dir, exist_ok=True)
ent_bs.to_csv(os.path.join(baseline_dir, f'{dataset}.item'), sep='\t', index=False)

In [53]:
import itertools

baseline_dir = os.path.join(datapath, 'baseline')
os.makedirs(baseline_dir, exist_ok=True)

# Flatten the per-user record lists into one table per file: the test users'
# response-side records go to part3, the prompt-side records to part1.
for _part, _frame in (('part3', test_bs), ('part1', train_bs)):
    _records = list(itertools.chain.from_iterable(_frame['bs_format'].tolist()))
    pd.DataFrame(_records).to_csv(
        os.path.join(baseline_dir, f'{dataset}.{_part}.inter'), sep='\t', index=False
    )