In [1]:
dataset = 'dbbook'

domain = {
    'lastfm': 'music',
    'dbbook': 'book',
    'movielens': 'movie'
}[dataset]
import os
datapath = os.path.join('..', 'data', dataset)

special_token = '<|reserved_special_token_22|>'

resp_frac = 0.5

# Set names

In [2]:
import pandas as pd
import os
import random
import numpy as np
import swifter  

In [3]:
user_item_df_train = pd.read_csv(os.path.join(datapath, 'user-item', 'train.tsv'), \
            names=['user', 'item', 'rating'], sep='\t')

user_item_df_test = pd.read_csv(os.path.join(datapath, 'user-item', 'test.tsv'), \
            names=['user', 'item', 'rating'], sep='\t')


df = pd.concat([user_item_df_train, user_item_df_test], ignore_index=True)
all_items = df['item'].unique()

In [4]:
def get_split_random(list_to_split, frac=0.2, margin=0):
    random.seed(22)
    sample_list = random.sample(list_to_split, int(frac*len(list_to_split))+margin)
    complementary_set = set(list_to_split) - set(sample_list)
    return list(set(complementary_set)), list(set(sample_list))

In [5]:
random.seed(22)
test_frac = 0.2
user_list = df['user'].unique().tolist()
user_train, user_test = get_split_random(user_list, frac=test_frac, margin=0)

In [6]:
user_item_df_train = df[df['user'].isin(user_train)]
user_item_df_test = df[df['user'].isin(user_test)]

In [7]:
len(set(user_item_df_train['user'].unique()) - set(user_item_df_test['user'].unique()))

4528

In [8]:
user_item_df_test['rating'].describe()

count    25955.000000
mean         0.526373
std          0.499314
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: rating, dtype: float64

In [9]:
user_item_df_train['rating'].describe()

count    103558.000000
mean          0.526555
std           0.499297
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: rating, dtype: float64

In [10]:
train_group = user_item_df_train.groupby('user').agg({
    'item': list,
    'rating': list
}).reset_index()
test_group = user_item_df_test.groupby('user').agg({
    'item': list,
    'rating': list
}).reset_index()
train_group['len'] = train_group['item'].apply(len)
test_group['len'] = test_group['item'].apply(len)

In [11]:
def split_prompt_resp(row, resp_frac):
    items = np.array(row['rating'])
    index_likes = np.where(items==1)[0].tolist()
    index_dilikes = np.where(items==0)[0].tolist()
    index_like_prompt = []
    index_like_resp = []    
    index_dislike_prompt = []
    index_dislike_resp = []
    resp_frac=resp_frac
    if len(index_likes) > 0:
        index_like_prompt, index_like_resp = get_split_random(index_likes, frac=resp_frac, margin=1)
    if len(index_dilikes) > 0:
        index_dislike_prompt, index_dislike_resp = get_split_random(index_dilikes, frac=resp_frac, margin=1)
    row['index_like_prompt'] = index_like_prompt
    row['index_like_resp'] = index_like_resp
    row['index_dislike_prompt'] = index_dislike_prompt
    row['index_dislike_resp'] = index_dislike_resp
    return row

In [12]:
test_group['len'].describe()

count    1132.000000
mean       22.928445
std         7.870594
min         7.000000
25%        16.000000
50%        23.000000
75%        29.000000
max        42.000000
Name: len, dtype: float64

In [13]:
train_group['len'].describe()

count    4528.000000
mean       22.870583
std         7.754149
min         7.000000
25%        17.000000
50%        23.000000
75%        29.000000
max        42.000000
Name: len, dtype: float64

In [14]:
test_group = test_group.swifter.apply(split_prompt_resp, axis=1, resp_frac=resp_frac)
train_group = train_group.swifter.apply(split_prompt_resp, axis=1, resp_frac=resp_frac)

Pandas Apply:   0%|          | 0/1132 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/4528 [00:00<?, ?it/s]

In [15]:
test_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp
0,13,"[8062, 9705, 6314, 8587, 9996, 12354, 7374, 10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...",23,"[16, 19, 21, 11, 14]","[10, 12, 13, 15, 18, 20, 22]","[17, 4, 6, 8, 9]","[0, 1, 2, 3, 5, 7]"
1,15,"[12373, 12374, 12140, 11953, 10049, 10347, 743...","[1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0]",12,"[9, 2, 6]","[0, 1, 10, 5]","[8, 7]","[3, 11, 4]"


In [16]:
test_group.shape

(1132, 8)

In [17]:
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp
0,0,"[8706, 9303, 6847, 6921, 6606, 11057, 9129, 12...","[1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1]",12,"[8, 9, 4]","[11, 0, 3, 7]","[5, 6]","[1, 2, 10]"
1,1,"[10489, 7463, 12345, 10154, 9875, 12337, 6765,...","[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",20,"[16, 18, 6, 7]","[1, 2, 3, 4, 5, 17]","[19, 12, 13, 14]","[0, 8, 9, 10, 11, 15]"


In [18]:
train_group.shape

(4528, 8)

In [19]:
if dataset=='movielens':
    def fix_title(title):
        if ", The (" in title:
            name_film, _, year = title.rpartition(", The (")
            title = "The " + name_film + " (" + year
            return title
        if ", A (" in title:
            name_film, _, year = title.rpartition(", A (")
            title = "A " + name_film + " (" + year
        return title, year
    df_movies = pd.read_csv(os.path.join(datapath, r"movies.dat"), sep="::", names=["item_id", "name", "geners"], encoding='ISO-8859-1')
    df_movies['name'] = df_movies['name'].apply(lambda x: fix_title(x)[0])
    df_movies['year'] = df_movies['name'].apply(lambda x: fix_title(x)[1])
    import re

    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_entities.tsv'), \
            names=['url', 'id_set'], sep='\t')
    dbpedia_mapping = pd.read_csv(os.path.join(datapath, 'MappingMovielens2DBpedia-1.2.tsv'), \
            names=['id_movie', 'name', 'dbpedia_url'], sep='\t')
    df_movies = dbpedia_mapping.set_index('dbpedia_url').join(df_relations.set_index('url')).reset_index()
    df_movies['name'] = df_movies['name'].apply(lambda x: re.sub("\s+\(\d+\)$", "", x))
    df_movies['name'] = df_movies['name'].apply(lambda x: re.sub(r"[^a-zA-Z0-9]+", ' ', x).strip())
    df_movies.dropna(inplace=True)
    df_movies['id_set'] = df_movies['id_set'].astype(int)
    mapping_dict = df_movies.loc[:, ['id_set', 'name', 'year']].set_index('id_set').to_dict()['name']

In [20]:
if dataset=='dbbook':
    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_entities.tsv'), sep='\t')
    df_relations['name'] = df_relations['uri'].apply(lambda x: x.split(";")[0])
    temp = df_relations[df_relations['id'].isin(df['item'].unique())]
    mapping_dict = temp.set_index('id').to_dict()['name']

In [21]:
if dataset=='lastfm':
    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_items.tsv'), \
            names=['id', 'name'], sep='\t')
    mapping_dict = df_relations.set_index('id').to_dict()['name']

In [22]:
item_desription = pd.read_csv(os.path.join(datapath, dataset+'.txt'), sep=';;', names=['item_id', 'description'],  on_bad_lines='skip')
item_desription = item_desription[item_desription['item_id'].isin(all_items)]

  item_desription = pd.read_csv(os.path.join(datapath, dataset+'.txt'), sep=';;', names=['item_id', 'description'],  on_bad_lines='skip')


In [23]:
#item_desription['description'] = item_desription['description'].apply(lambda x: special_token + " " + x)

In [24]:
import json

with open(os.path.join(datapath, f'{dataset}_domain_kn.jsonl'), 'w') as outfile:
    for new_id, desc in item_desription.to_dict()['description'].items():
        json.dump({'target_id':new_id, 'text':desc}, outfile)
        outfile.write('\n')

In [25]:
item_desc_dict = item_desription.to_dict()['description']

## Prepare prompt

In [28]:
train_group.head()

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp
0,0,"[8706, 9303, 6847, 6921, 6606, 11057, 9129, 12...","[1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1]",12,"[8, 9, 4]","[11, 0, 3, 7]","[5, 6]","[1, 2, 10]","[7305, 7599, 6606]","[8670, 8706, 6921, 12020]","[11057, 9129]","[9303, 6847, 9021]"
1,1,"[10489, 7463, 12345, 10154, 9875, 12337, 6765,...","[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",20,"[16, 18, 6, 7]","[1, 2, 3, 4, 5, 17]","[19, 12, 13, 14]","[0, 8, 9, 10, 11, 15]","[6664, 6665, 6765, 12336]","[7463, 12345, 10154, 9875, 12337, 6666]","[9212, 11136, 10509, 11107]","[10489, 10131, 11810, 11257, 8355, 10890]"
2,2,"[6753, 10369, 9000, 8997, 8413, 7323, 12041, 1...","[0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",21,"[16, 6, 9, 11, 13, 15]","[1, 7, 8, 10, 12, 14, 17]","[18, 19, 20]","[0, 2, 3, 4, 5]","[12104, 12041, 12859, 8587, 12150, 8780]","[10369, 12174, 7327, 12142, 8771, 8752, 12120]","[9527, 8563, 7333]","[6753, 9000, 8997, 8413, 7323]"
3,3,"[11383, 12338, 7463, 6323, 6321, 11151, 12777,...","[1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, ...",19,"[8, 9, 18, 11]","[0, 1, 3, 4, 5, 17]","[16, 13, 6, 14]","[2, 7, 10, 12, 15]","[10631, 6304, 12912, 6302]","[11383, 12338, 6323, 6321, 11151, 6316]","[6318, 8957, 12777, 9307]","[7463, 8472, 7374, 7866, 7323]"
4,4,"[11318, 11629, 12769, 10190, 10754, 8522, 9208...","[0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...",17,"[8, 11, 3]","[16, 1, 2, 4]","[10, 12, 13, 15]","[0, 5, 6, 7, 9, 14]","[10523, 9985, 10190]","[9822, 11629, 12769, 10754]","[8549, 9313, 10003, 8140]","[11318, 8522, 9208, 9249, 7603, 10409]"


In [36]:
mapping_dict[8670]

'A Heartbreaking Work of Staggering Genius'

In [37]:
def map_with_names(row, col, mapping_dict):
    return [mapping_dict.get(x, '') for x in np.array(row['item'])[row[col]] if len(mapping_dict.get(x, ''))>2]


train_group['liked_prompt'] = train_group.apply(map_with_names, axis=1, col='index_like_prompt', mapping_dict=mapping_dict)
train_group['liked_resp'] = train_group.apply(map_with_names, axis=1, col='index_like_resp', mapping_dict=mapping_dict)
train_group['disliked_prompt'] = train_group.apply(map_with_names, axis=1, col='index_dislike_prompt', mapping_dict=mapping_dict)
train_group['disliked_resp'] = train_group.apply(map_with_names, axis=1, col='index_dislike_resp', mapping_dict=mapping_dict)

test_group['liked_prompt'] = test_group.apply(map_with_names, axis=1, col='index_like_prompt', mapping_dict=mapping_dict)
test_group['liked_resp'] = test_group.apply(map_with_names, axis=1, col='index_like_resp', mapping_dict=mapping_dict)
test_group['disliked_prompt'] = test_group.apply(map_with_names, axis=1, col='index_dislike_prompt', mapping_dict=mapping_dict)
test_group['disliked_resp'] = test_group.apply(map_with_names, axis=1, col='index_dislike_resp', mapping_dict=mapping_dict)

In [38]:
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp
0,0,"[8706, 9303, 6847, 6921, 6606, 11057, 9129, 12...","[1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1]",12,"[8, 9, 4]","[11, 0, 3, 7]","[5, 6]","[1, 2, 10]","[The Bell Jar, The Structure of Scientific Rev...","[A Heartbreaking Work of Staggering Genius, A ...","[Me Talk Pretty One Day, Sophie's World]","[Darwin's Black Box, The Solitaire Mystery, Ph..."
1,1,"[10489, 7463, 12345, 10154, 9875, 12337, 6765,...","[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",20,"[16, 18, 6, 7]","[1, 2, 3, 4, 5, 17]","[19, 12, 13, 14]","[0, 8, 9, 10, 11, 15]","[The Lion, the Witch and the Wardrobe, Prince ...","[The Magician's Nephew, Executive Orders, An E...","[A Letter of Mary, The Shelters of Stone, Gods...","[Troubling a Star, The Young Unicorns, Perelan..."


In [39]:
train = []
test = []
for _, row in test_group.iterrows():
    for item in row['liked_prompt']:
        train.append({
            'user_id': row['user'],
            'item_id': item,
            'score': 1
        })
    for item in row['disliked_prompt']:
        train.append({
            'user_id': row['user'],
            'item_id': item,
            'score': 0
        })
    for item in row['liked_resp']:
        test.append({
            'user_id': row['user'],
            'item_id': item,
            'score': 1
        })
    for item in row['disliked_resp']:
        test.append({
            'user_id': row['user'],
            'item_id': item,
            'score': 0
        })

pd.DataFrame(train).to_csv("train_adaptation.csv")
                    

In [40]:
import random
import itertools

def prepare_user_prompt(row):
    random.seed(22)
    user_prompt = ''
    liked_list = ', '
    disliked_list = ', '
    liked_part = """I like the following items:\n{liked_list}\n"""
    disliked_part = """I dislike the following items:\n{disliked_list}\n"""
    liked_items = sorted(row['liked_prompt'], key=lambda k: random.random())
    disliked_items = sorted(row['disliked_prompt'], key=lambda k: random.random())
    liked_list = liked_list.join(liked_items)
    disliked_list = disliked_list.join(disliked_items)
    if len(liked_items)>0:
        user_prompt = user_prompt + liked_part.format(liked_list=liked_list)
    if len(disliked_list)>0:
        user_prompt = user_prompt + disliked_part.format(disliked_list=disliked_list)
    
    to_rank_list_str = ', '
    response_part = """\nRank the following items:\n{to_rank_list}"""
    to_rank_list = list(itertools.chain(*[row['liked_resp'], row['disliked_resp']]))
    to_rank_list = sorted(to_rank_list, key=lambda k: random.random())
    to_rank_list = to_rank_list_str.join(to_rank_list)
    user_prompt = user_prompt + response_part.format(to_rank_list=to_rank_list)
    return user_prompt
        
def get_assistant_prompt(row):
    ranked_list_str = ''
    rank_part = """Here a list:\n{ranked_list}\n"""
    for i, x in enumerate(row['liked_resp']):
        ranked_list_str = ranked_list_str + f"{i+1}. {x}\n"
    for i, x in enumerate(row['disliked_resp'], len(row['liked_resp'])):
        ranked_list_str = ranked_list_str + f"{i+1}. {x}\n"
    return rank_part.format(ranked_list=ranked_list_str)[:-1]


train_group['user_prompt'] = train_group.apply(prepare_user_prompt, axis=1)
train_group['assistant_prompt'] = train_group.apply(get_assistant_prompt, axis=1)

test_group['user_prompt'] = test_group.apply(prepare_user_prompt, axis=1)
test_group['assistant_prompt'] = test_group.apply(get_assistant_prompt, axis=1)

In [41]:
print(train_group.loc[150, 'user_prompt'])

I like the following items:
The Crow Road, The Wasp Factory, Jonathan Strange & Mr Norrell, Ibid: A Life, Espedair Street, Something Wicked This Way Comes
I dislike the following items:
The Boy in the Striped Pyjamas, The Angel of Darkness, American Gods

Rank the following items:
Last Orders, American Psycho, The Razor's Edge, Ella Minnow Pea, The Da Vinci Code, Notes from a Small Island, The Lovely Bones, Winter in Madrid, A Big Boy Did It and Ran Away, The Alienist, Not the End of the World, The Curious Incident of the Dog in the Night-Time


In [42]:
print(train_group.loc[150, 'assistant_prompt'])

Here a list:
1. American Psycho
2. The Curious Incident of the Dog in the Night-Time
3. Ella Minnow Pea
4. Notes from a Small Island
5. Not the End of the World
6. The Lovely Bones
7. The Alienist
8. Winter in Madrid
9. Last Orders
10. A Big Boy Did It and Ran Away
11. The Da Vinci Code
12. The Razor's Edge



In [43]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer.pad_token = tokenizer.eos_token
#tokenizer.chat_template =  "{% set ns = namespace(i=0) %}" \
#                            "{% for message in messages %}" \
#                                "{% if message['role'] == 'system' and ns.i == 0 %}" \
#                                       "{{ bos_token +' [INST] <<SYS>>\n' }}" \
#                                       "{{ message['content'] + ' <</SYS>>\n'}}" \
#                                "{% elif message['role'] == 'user' %}" \
#                                    "{{ message['content'] + ' [/INST]\n'}}" \
#                                "<|reserved_special_token_22|>\n"\
#                                "{% elif message['role'] == 'assistant' %}" \
#                                    "{{ message['content'] + ' ' + eos_token }}" \
#                                "{% endif %}" \
#                                "{% set ns.i = ns.i+1 %}" \
#                            "{% endfor %}"
#
tokenizer.chat_template =  "{% set ns = namespace(i=0) %}" \
                            "{% for message in messages %}" \
                                "{% if message['role'] == 'system' and ns.i == 0 %}" \
                                       "{{ bos_token +' [INST] <<SYS>>\n' }}" \
                                       "{{ message['content'] + ' <</SYS>>\n'}}" \
                                "{% elif message['role'] == 'user' %}" \
                                    "{{ message['content'] + ' [/INST]\n'}}" \
                                "{% elif message['role'] == 'assistant' %}" \
                                    "{{ message['content'] + '' + eos_token }}" \
                                "{% endif %}" \
                                "{% set ns.i = ns.i+1 %}" \
                            "{% endfor %}"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [44]:
def get_prompt(row, is_train=True):
    messages = [
    {"role": "system", "content": f"You're operating as a {domain} recommendation system. Your task involves receiving a user's list of liked and disliked items, along with a set of candidate items, and then reordering them based on preferences."},
    {"role": "user", "content": row['user_prompt']}
    ]
    if is_train:
        messages.append({"role": "assistant", "content": row['assistant_prompt']})
    return tokenizer.apply_chat_template(messages, tokenize=False)

In [45]:
train_group['prompt'] = train_group.apply(get_prompt, axis=1, is_train=True)
test_group['prompt'] = test_group.apply(get_prompt, axis=1, is_train=False)

In [46]:
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp,user_prompt,assistant_prompt,prompt
0,0,"[8706, 9303, 6847, 6921, 6606, 11057, 9129, 12...","[1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1]",12,"[8, 9, 4]","[11, 0, 3, 7]","[5, 6]","[1, 2, 10]","[The Bell Jar, The Structure of Scientific Rev...","[A Heartbreaking Work of Staggering Genius, A ...","[Me Talk Pretty One Day, Sophie's World]","[Darwin's Black Box, The Solitaire Mystery, Ph...","I like the following items:\nContingency, Iron...",Here a list:\n1. A Heartbreaking Work of Stagg...,<|begin_of_text|> [INST] <<SYS>>\nYou're opera...
1,1,"[10489, 7463, 12345, 10154, 9875, 12337, 6765,...","[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",20,"[16, 18, 6, 7]","[1, 2, 3, 4, 5, 17]","[19, 12, 13, 14]","[0, 8, 9, 10, 11, 15]","[The Lion, the Witch and the Wardrobe, Prince ...","[The Magician's Nephew, Executive Orders, An E...","[A Letter of Mary, The Shelters of Stone, Gods...","[Troubling a Star, The Young Unicorns, Perelan...",I like the following items:\nThe Language of G...,Here a list:\n1. The Magician's Nephew\n2. Exe...,<|begin_of_text|> [INST] <<SYS>>\nYou're opera...


In [47]:
def get_description_liked(row):
    liked_item_index = row['index_like_resp']
    likex_items = np.array(row['item'])[liked_item_index]
    return '\n'.join([item_desc_dict.get(y, '') for y in likex_items])

In [48]:
train_group['descriptions'] = train_group['item'].apply(lambda x: [item_desc_dict.get(y, '') for y in x if len(item_desc_dict.get(y, ''))>2])
#train_group['descriptions'] = train_group.apply(get_description_liked, axis=1)

In [49]:
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp,user_prompt,assistant_prompt,prompt,descriptions
0,0,"[8706, 9303, 6847, 6921, 6606, 11057, 9129, 12...","[1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1]",12,"[8, 9, 4]","[11, 0, 3, 7]","[5, 6]","[1, 2, 10]","[The Bell Jar, The Structure of Scientific Rev...","[A Heartbreaking Work of Staggering Genius, A ...","[Me Talk Pretty One Day, Sophie's World]","[Darwin's Black Box, The Solitaire Mystery, Ph...","I like the following items:\nContingency, Iron...",Here a list:\n1. A Heartbreaking Work of Stagg...,<|begin_of_text|> [INST] <<SYS>>\nYou're opera...,[]
1,1,"[10489, 7463, 12345, 10154, 9875, 12337, 6765,...","[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",20,"[16, 18, 6, 7]","[1, 2, 3, 4, 5, 17]","[19, 12, 13, 14]","[0, 8, 9, 10, 11, 15]","[The Lion, the Witch and the Wardrobe, Prince ...","[The Magician's Nephew, Executive Orders, An E...","[A Letter of Mary, The Shelters of Stone, Gods...","[Troubling a Star, The Young Unicorns, Perelan...",I like the following items:\nThe Language of G...,Here a list:\n1. The Magician's Nephew\n2. Exe...,<|begin_of_text|> [INST] <<SYS>>\nYou're opera...,[]


In [50]:
train_group.to_pickle(os.path.join(datapath, f'{dataset}_train_dataset.pkl'))
test_group.to_pickle(os.path.join(datapath, f'{dataset}_test_dataset.pkl'))

In [51]:
prompts = train_group['prompt'].tolist() * 3

In [52]:
descriptions = list(item_desc_dict.values())

In [53]:
descriptions = [f'{tokenizer.bos_token} ' + x + f' {tokenizer.eos_token}' for x in descriptions]

In [54]:
import random 

random.seed(22)
train_set = prompts + descriptions

random.shuffle(train_set)

In [55]:
len(train_set)

20070

In [56]:
import json

with open(os.path.join(datapath, f'{dataset}_train_set_no_kn.jsonl'), 'w') as outfile:
    for x in train_group['prompt'].tolist():
        json.dump({'text':x}, outfile)
        outfile.write('\n')

In [57]:
import json

with open(os.path.join(datapath, f'{dataset}_train_set.jsonl'), 'w') as outfile:
    for x in train_set:
        json.dump({'text':x}, outfile)
        outfile.write('\n')

In [58]:
prompts = test_group['prompt'].tolist()

In [59]:
import json

with open(os.path.join(datapath, f'{dataset}_test_set.jsonl'), 'w') as outfile:
    for x in prompts:
        json.dump({'text':x}, outfile)
        outfile.write('\n')

In [60]:
train_bs = pd.concat([test_group.loc[:, ['user', 'index_like_prompt', 'index_dislike_prompt', 'item']],\
                      train_group.loc[:, ['user', 'index_like_prompt', 'index_dislike_prompt', 'item']]])
test_bs = test_group.loc[:, ['user', 'index_like_resp', 'index_dislike_resp', 'item']]

In [61]:
train_bs['bs_format'] = train_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 1} for item in x['index_like_prompt']], axis=1)
train_bs['bs_format'] = train_bs['bs_format'] + train_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 0} for item in x['index_dislike_prompt']], axis=1)

test_bs['bs_format'] = test_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 1} for item in x['index_like_resp']], axis=1)
test_bs['bs_format'] = test_bs['bs_format'] + test_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 0} for item in x['index_dislike_resp']], axis=1)

In [62]:
import itertools

os.makedirs(os.path.join(datapath, 'baseline'), exist_ok=True)
pd.DataFrame(list(itertools.chain(*test_bs['bs_format'].tolist()))).to_csv(os.path.join(datapath, 'baseline', f'{dataset}.part3.inter'), sep='\t', index=False)
pd.DataFrame(list(itertools.chain(*train_bs['bs_format'].tolist()))).to_csv(os.path.join(datapath, 'baseline', f'{dataset}.part1.inter'), sep='\t', index=False)

# EXTRA

In [264]:
splits = np.array_split(train_group['descriptions'].to_list(), 10)

In [265]:
lens = []
for split in splits:
    tok = tokenizer(split.tolist(), return_length=True)
    lens.append(tok['length'])

In [266]:
display(pd.Series(list(chain(*lens))).describe())

count     1505.000000
mean      5111.520266
std       3790.858846
min          1.000000
25%       2230.000000
50%       4281.000000
75%       7250.000000
max      24073.000000
dtype: float64

In [243]:
tokenizer.encode("<|reserved_special_token_22|>", add_special_tokens=False)

[128027]

In [None]:
if dataset in ['dbbook', 'movielens']:
    item_prop_df = pd.read_csv(os.path.join(datapath, 'item-prop', 'train.tsv'), \
                names=['item', 'prop', 'rel'], sep='\t')
else:
    item_prop_df = pd.read_csv(os.path.join(datapath, 'item-prop', 'train.tsv'), \
                names=['item', 'rel', 'prop'], sep='\t')

In [None]:
item_prop_df

In [None]:
if dataset == 'dbbook':
    mapping_rel = pd.read_csv(os.path.join(datapath, 'mapping_relations.tsv'), \
            names=['id_rel', 'name_rel'], sep='\t')
else:
    mapping_rel = pd.read_csv(os.path.join(datapath, 'mapping_relations.tsv'), \
                names=['name_rel', 'id_rel'], sep='\t')
mapping_rel.head()

In [None]:
mapping_rel['name_rel_red']=mapping_rel['name_rel'].apply(lambda x: x.split("/")[-1])

In [None]:
mapping_rel

In [None]:
mapping_rel[mapping_rel['id_rel'].isin(item_prop_df['rel'].unique())]

In [None]:
item_desription = pd.read_csv(os.path.join(datapath, dataset+'.txt'), sep=';;', names=['item_id', 'description'],  on_bad_lines='skip')

In [None]:
item_desription.head()

In [None]:
len(item_desription['description'].unique())

In [None]:
item_desription.shape

In [None]:
len(item_desription[item_desription['item_id'].isin(user_item_df['item'].unique())]['description'].unique())