In [1]:
import os

# Dataset to process. Cells further down branch on this value and contain
# handling for 'lastfm', 'dbbook', 'movielens' and 'boardgamegeek'.
dataset = 'dbbook'

# Domain noun interpolated into the system prompt
# ("You're operating as a {domain} recommendation system ...").
# NOTE(review): the 'boardgamegeek' entries were missing, so selecting that
# dataset raised KeyError here; the wording 'board game' is an assumption.
DOMAIN_BY_DATASET = {
    'lastfm': 'music',
    'dbbook': 'book',
    'movielens': 'movie',
    'boardgamegeek': 'board game',
}
domain = DOMAIN_BY_DATASET[dataset]

# Singular item noun used when lexicalising the knowledge graph
# ("<item name> is a {items}.").
ITEM_NOUN_BY_DATASET = {
    'lastfm': 'artist',
    'dbbook': 'book',
    'movielens': 'movie',
    'boardgamegeek': 'board game',
}
items = ITEM_NOUN_BY_DATASET[dataset]

# All input/output files of this notebook live under ../data/<dataset>.
datapath = os.path.join('..', 'data', dataset)

special_token = '<|reserved_special_token_22|>'

# Fraction passed to split_prompt_resp: share of each user's liked/disliked
# items that goes to the response side (the rest goes to the prompt side).
resp_frac = 0.5

# Set names

In [2]:
import pandas as pd
import os
import random
import numpy as np
import swifter  

In [3]:
# Both interaction files are read without a header row and share one schema.
_interaction_columns = ['user', 'item', 'rating']
_interactions_dir = os.path.join(datapath, 'user-item')

user_item_df_train = pd.read_csv(
    os.path.join(_interactions_dir, 'train.tsv'),
    sep='\t',
    names=_interaction_columns,
)
user_item_df_test = pd.read_csv(
    os.path.join(_interactions_dir, 'test.tsv'),
    sep='\t',
    names=_interaction_columns,
)

# Pool the two files into a single frame with a fresh 0..n-1 index and record
# every distinct item id that appears in it.
df = pd.concat([user_item_df_train, user_item_df_test], ignore_index=True)
all_items = df['item'].unique()

In [4]:
def get_split_random(list_to_split, frac=0.2, margin=0):
    """Split a list into a randomly sampled part and its complement.

    Args:
        list_to_split: list of hashable elements to partition.
        frac: fraction of the list to draw into the sample.
        margin: number of extra elements added to the sample size.

    Returns:
        A tuple ``(complement, sample)`` of two lists. Both are built from
        sets, so duplicates are collapsed and element order is whatever set
        iteration yields.

    The sample size is ``int(frac * len(list_to_split)) + margin``, clamped to
    ``[0, len(list_to_split)]``; without the clamp ``random.sample`` raises
    ValueError for an empty list with a positive margin, or when frac/margin
    ask for more elements than are available.
    """
    # Re-seeding the global RNG on every call makes the result depend only on
    # the input list (this also resets the RNG state for any later caller).
    random.seed(22)
    population_size = len(list_to_split)
    sample_size = int(frac * population_size) + margin
    sample_size = max(0, min(sample_size, population_size))
    sample_list = random.sample(list_to_split, sample_size)
    complementary_set = set(list_to_split) - set(sample_list)
    return list(set(complementary_set)), list(set(sample_list))

In [5]:
# Seed the global RNG before the user-level split (get_split_random seeds it
# again on every call).
random.seed(22)
test_frac = 0.2  # fraction of distinct users sampled into user_test
user_list = df['user'].unique().tolist()
# get_split_random returns (complement, sample): user_test is the sampled 20%,
# user_train is every other user.
user_train, user_test = get_split_random(user_list, frac=test_frac, margin=0)

In [6]:
# Re-partition the pooled interactions by user membership; these assignments
# overwrite the frames that were read from train.tsv / test.tsv.
user_item_df_train = df[df['user'].isin(user_train)]
user_item_df_test = df[df['user'].isin(user_test)]

In [7]:
# Sanity check: number of train users that do not appear among the test users.
len(set(user_item_df_train['user'].unique()) - set(user_item_df_test['user'].unique()))

134987

In [8]:
# Summary statistics of the rating column in the test split.
user_item_df_test['rating'].describe()

count    1.259207e+06
mean     5.314051e-01
std      4.990129e-01
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
Name: rating, dtype: float64

In [9]:
# Summary statistics of the rating column in the train split.
user_item_df_train['rating'].describe()

count    5.020177e+06
mean     5.303371e-01
std      4.990789e-01
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
Name: rating, dtype: float64

In [10]:
# Collapse each split to one row per user: 'item' and 'rating' become parallel
# lists, and 'len' stores how many interactions the user has.
_per_user_lists = {'item': list, 'rating': list}

train_group = user_item_df_train.groupby('user').agg(_per_user_lists).reset_index()
train_group['len'] = train_group['item'].map(len)

test_group = user_item_df_test.groupby('user').agg(_per_user_lists).reset_index()
test_group['len'] = test_group['item'].map(len)

In [11]:
def split_prompt_resp(row, resp_frac):
    """Split one user's liked and disliked positions into prompt/response parts.

    Args:
        row: mapping with a 'rating' entry holding the user's ratings as a
            list; positions refer to indices in that list.
        resp_frac: fraction forwarded to get_split_random as ``frac`` (one
            extra element is always requested via ``margin=1``).

    Returns:
        The same ``row`` with four added entries: 'index_like_prompt',
        'index_like_resp', 'index_dislike_prompt', 'index_dislike_resp'.
        Each is a list of positions; a side with no ratings of that kind
        gets two empty lists.
    """
    ratings = np.array(row['rating'])
    liked_positions = np.where(ratings == 1)[0].tolist()
    disliked_positions = np.where(ratings == 0)[0].tolist()

    # get_split_random returns (complement, sample): the complement feeds the
    # prompt, the sample feeds the response.
    if liked_positions:
        like_prompt, like_resp = get_split_random(liked_positions, frac=resp_frac, margin=1)
    else:
        like_prompt, like_resp = [], []

    if disliked_positions:
        dislike_prompt, dislike_resp = get_split_random(disliked_positions, frac=resp_frac, margin=1)
    else:
        dislike_prompt, dislike_resp = [], []

    row['index_like_prompt'] = like_prompt
    row['index_like_resp'] = like_resp
    row['index_dislike_prompt'] = dislike_prompt
    row['index_dislike_resp'] = dislike_resp
    return row

In [12]:
# Distribution of the number of interactions per test user.
test_group['len'].describe()

count    33746.000000
mean        37.314259
std         23.108289
min         11.000000
25%         18.000000
50%         30.000000
75%         52.000000
max         99.000000
Name: len, dtype: float64

In [13]:
# Distribution of the number of interactions per train user.
train_group['len'].describe()

count    134987.000000
mean         37.190078
std          23.121372
min          11.000000
25%          18.000000
50%          30.000000
75%          51.000000
max          99.000000
Name: len, dtype: float64

In [14]:
# Add the four index_* columns to every user row. The call goes through the
# swifter accessor; resp_frac is passed along as a keyword argument for
# split_prompt_resp.
test_group = test_group.swifter.apply(split_prompt_resp, axis=1, resp_frac=resp_frac)
train_group = train_group.swifter.apply(split_prompt_resp, axis=1, resp_frac=resp_frac)

Pandas Apply:   0%|          | 0/33746 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/134987 [00:00<?, ?it/s]

In [15]:
# Preview the first two test users with their prompt/response index columns.
test_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp
0,010Dennis,"[253215, 77076, 155068, 180974, 91514, 113294,...","[1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, ...",40,"[33, 34, 35, 7, 39, 10, 15, 17, 23]","[0, 2, 3, 8, 9, 12, 21, 22, 30, 31]","[32, 36, 5, 37, 38, 14, 18, 19, 25, 26]","[1, 4, 6, 11, 13, 16, 20, 24, 27, 28, 29]"
1,01schafi,"[18, 37380, 875, 13308, 11, 132, 45986, 40760,...","[1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, ...",80,"[0, 2, 4, 6, 70, 71, 73, 11, 12, 75, 76, 78, 1...","[1, 3, 5, 8, 9, 13, 14, 17, 18, 19, 21, 26, 28...","[64, 34, 67, 69, 40, 15, 52, 25, 29, 31]","[36, 68, 7, 39, 72, 10, 46, 20, 23, 24, 27, 63]"


In [16]:
# (number of test users, number of columns)
test_group.shape

(33746, 8)

In [17]:
# Preview the first two train users with their prompt/response index columns.
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp
0,mycroft,"[10547, 5029, 15987, 17226, 9609, 7479, 5, 328...","[1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1]",14,"[10, 13, 6]","[0, 2, 3, 4, 5]","[9, 11]","[8, 1, 12, 7]"
1,-Loren-,"[700, 27627, 478, 2471, 138788, 17166, 133038,...","[0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, ...",79,"[69, 70, 71, 74, 13, 23, 26, 35, 37, 42, 43, 4...","[1, 66, 38, 7, 41, 10, 9, 45, 46, 15, 78, 54, ...","[64, 3, 5, 8, 73, 76, 16, 18, 22, 29, 33, 36, ...","[0, 2, 4, 6, 11, 12, 14, 17, 19, 20, 21, 27, 2..."


In [18]:
# (number of train users, number of columns)
train_group.shape

(134987, 8)

In [19]:
if dataset=='movielens':
    import re

    def fix_title(title):
        """Move a trailing article of a MovieLens title back to the front.

        'Name, The (1999)' becomes 'The Name (1999)' and 'Name, A (1998)'
        becomes 'A Name (1998)'. ', The (' is checked before ', A ('. Titles
        containing neither marker are returned unchanged.

        Args:
            title: raw title string.

        Returns:
            The rewritten title as a string.
        """
        for article in ("The", "A"):
            marker = ", " + article + " ("
            if marker in title:
                name_film, _, year = title.rpartition(marker)
                return article + " " + name_film + " (" + year
        return title

    # '::' is a multi-character separator, which only the python parser engine
    # supports; naming the engine avoids pandas' ParserWarning fallback.
    df_movies = pd.read_csv(os.path.join(datapath, r"movies.dat"), sep="::", names=["item_id", "name", "geners"],
                            encoding='ISO-8859-1', engine='python')
    # fix_title returns the whole title string; the previous `fix_title(x)[0]`
    # kept only its first character.
    df_movies['name'] = df_movies['name'].apply(fix_title)

    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_entities.tsv'), \
            names=['url', 'id_set'], sep='\t')
    dbpedia_mapping = pd.read_csv(os.path.join(datapath, 'MappingMovielens2DBpedia-1.2.tsv'), \
            names=['id_movie', 'name', 'dbpedia_url'], sep='\t')
    # NOTE: df_movies is rebuilt here from the DBpedia mapping joined with the
    # entity ids; the movies.dat frame above is not used by the lines below.
    df_movies = dbpedia_mapping.set_index('dbpedia_url').join(df_relations.set_index('url')).reset_index()
    # Drop a trailing "(<digits>)" suffix, then collapse every run of
    # non-alphanumeric characters into a single space.
    df_movies['name'] = df_movies['name'].apply(lambda x: re.sub(r"\s+\(\d+\)$", "", x))
    df_movies['name'] = df_movies['name'].apply(lambda x: re.sub(r"[^a-zA-Z0-9]+", ' ', x).strip())
    # Rows without a matching entity id have NaN id_set after the join and are removed.
    df_movies.dropna(inplace=True)
    df_movies['id_set'] = df_movies['id_set'].astype(int)
    # id_set -> cleaned movie name
    mapping_dict = df_movies.loc[:, ['id_set', 'name']].set_index('id_set').to_dict()['name']
    property_mapping = pd.read_json(os.path.join(datapath, 'entities_names.json'), typ='series')
    mapping_relation = pd.read_csv(os.path.join(datapath, 'mapping_relations.tsv'), \
            names=['rel', 'id'], sep='\t')

In [20]:
if dataset=='dbbook':
    # mapping_entities.tsv is read with its first row as header; the code below
    # relies on it providing 'id' and 'uri' columns.
    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_entities.tsv'), sep='\t')
    # The display name is the part of 'uri' before the first ';'.
    df_relations['name'] = df_relations['uri'].map(lambda uri: uri.split(";")[0])
    # Keep only entities that occur as rated items, then build id -> name.
    is_rated_item = df_relations['id'].isin(df['item'].unique())
    temp = df_relations[is_rated_item]
    mapping_dict = temp.set_index('id')['name'].to_dict()
    property_mapping = pd.read_json(
        os.path.join(datapath, 'entities_names.json'),
        typ='series',
    )
    mapping_relation = pd.read_csv(
        os.path.join(datapath, 'mapping_relations.tsv'),
        sep='\t',
        names=['id', 'rel'],
    )

In [21]:
if dataset=='lastfm':
    # mapping_items.tsv is read without a header row as (id, name) pairs.
    df_relations = pd.read_csv(
        os.path.join(datapath, 'mapping_items.tsv'),
        sep='\t',
        names=['id', 'name'],
    )
    # id -> name
    mapping_dict = df_relations.set_index('id')['name'].to_dict()
    property_mapping = pd.read_json(
        os.path.join(datapath, 'entities_names.json'),
        typ='series',
    )
    mapping_relation = pd.read_csv(
        os.path.join(datapath, 'mapping_relations.tsv'),
        sep='\t',
        names=['rel', 'id'],
    )

In [22]:
if dataset=='boardgamegeek':
    # mapped_entities.csv is ';'-separated with a header row; the code relies
    # on it providing 'id' and 'primary' columns.
    df_relations = pd.read_csv(os.path.join(datapath, 'mapped_entities.csv'), sep=';')
    df_relations['name'] = df_relations['primary']
    # id -> name
    mapping_dict = df_relations.set_index('id').to_dict()['name']
    # ';;' is a multi-character separator, which only the python parser engine
    # supports. Without engine='python' pandas falls back to it anyway and
    # emits a ParserWarning on every run.
    property_mapping = pd.read_csv(os.path.join(datapath, 'mapped_prop.csv'), sep=';;', names=['id', 'name'],
                                   engine='python')
    # Turn the (id, name) table into a Series indexed by id, matching the
    # Series that the other datasets load from entities_names.json.
    property_mapping = property_mapping.set_index('id').to_dict()['name']
    property_mapping = pd.Series(property_mapping)
    mapping_relation = pd.read_csv(os.path.join(datapath, 'mapped_rel.csv'), \
            names=['id', 'rel'], sep=';;', index_col=False, engine='python')

  property_mapping = pd.read_csv(os.path.join(datapath, 'mapped_prop.csv'), sep=';;', names=['id', 'name'])
  mapping_relation = pd.read_csv(os.path.join(datapath, 'mapped_rel.csv'), \


In [23]:
import re 

def parse_name_from_url(url):
    """Derive a readable relation name from a URL or prefixed identifier.

    Steps: keep the text after the last '/', turn underscores into spaces,
    keep the text after the last ':', drop parenthesised parts and anything
    from the first comma on, upper-case the first character, then keep only
    the words that start with an upper-case ASCII letter followed by
    lower-case ASCII letters. The result is joined with spaces and
    capitalised (first letter upper, rest lower).

    Args:
        url: relation URL/identifier string, e.g. '.../ontology/previousWork'.

    Returns:
        The derived name (e.g. 'Previous work'), or '' when nothing is left
        after cleaning. Previously an empty remainder raised IndexError.
    """
    name = url.split("/")[-1]
    name = name.replace("_", " ")
    name = name.split(':')[-1]
    name = re.sub(r"\([^()]*\)", "", name)
    name = re.sub(r",.*$", "", name)
    if not name:
        # e.g. a URL ending in '/' or a last segment made only of "(...)"
        return ""
    # Upper-casing the first character makes a leading camelCase word match
    # the '[A-Z][a-z]*' pattern below.
    name = name[0].upper() + name[1:]
    name = " ".join(re.findall('[A-Z][a-z]*', name))
    return name.strip().capitalize()

# Human-readable name for every relation.
mapping_relation['name_rel'] = mapping_relation['rel'].apply(parse_name_from_url)
# Relation ids whose derived name is 'Like' or 'Dislike'; triples using them
# are removed from the property graph below.
like_dislike_ids = mapping_relation[mapping_relation['name_rel'].isin(['Like', 'Dislike'])]['id'].tolist()

# Sorted so the concatenation order (and thus the order of properties in the
# generated descriptions) does not depend on the order os.listdir returns.
files_prop = sorted(x for x in os.listdir(os.path.join(datapath, 'item-prop')) if 'tsv' in x)
property_graph = pd.concat([pd.read_csv(os.path.join(datapath, 'item-prop', f), \
            names=['item','prop','rel'], sep='\t') for f in files_prop])

# For lastfm the second and third columns are swapped back here. The swap has
# to happen BEFORE filtering on 'rel'; otherwise the Like/Dislike filter is
# applied to the column that actually holds the property ids.
if dataset == 'lastfm':
    property_graph = property_graph.rename(columns={
        'prop': 'rel', 'rel': 'prop'
    })
property_graph = property_graph[~property_graph['rel'].isin(like_dislike_ids)]
# Keep only triples whose property id has an entry in property_mapping.
property_graph = property_graph[property_graph['prop'].isin(property_mapping.index)]

In [24]:
# Sentence templates keyed by relation name (as produced by
# parse_name_from_url). Each template contains the placeholder "[<key>]",
# which the description builder replaces with the property names, so the
# placeholder text must match the key exactly.
lessicalized_prop = {}
if dataset=='dbbook':
    lessicalized_prop['Author'] = 'The book was written by [Author].'
    lessicalized_prop['Series'] =  'This book is part of a series called [Series].'
    lessicalized_prop['Genre'] =  'This is a [Genre] novel.'
    lessicalized_prop['Publisher'] = 'This book was published by [Publisher].'
    lessicalized_prop['Preceded by'] = 'This book is preceded by [Preceded by].'
    lessicalized_prop['Subject'] = 'The story is centered around the theme of [Subject].'
    lessicalized_prop['Illustrator'] = 'The illustrations were done by [Illustrator].'
    lessicalized_prop['Writers'] = 'The screenplay was written by [Writers].'
    lessicalized_prop['Artists'] = 'The artwork was created by [Artists].'
    lessicalized_prop['Letterers'] = 'The lettering was done by [Letterers].'
    lessicalized_prop['Director'] = 'The film adaptation was directed by [Director].'
    lessicalized_prop['Screenplay'] = 'The screenplay was written by [Screenplay].'
    lessicalized_prop['Starring'] = 'The film stars [Starring].'
    lessicalized_prop['Cinematography'] = 'The cinematography was handled by [Cinematography].'
    lessicalized_prop['Producer'] = 'The film was produced by [Producer].'
    lessicalized_prop['Distributor'] = 'The film was distributed by [Distributor].'
    lessicalized_prop['Editor'] = 'This book was edited by [Editor].'
    lessicalized_prop['Category'] = 'This book falls under the category of [Category].'
    lessicalized_prop['Company'] = 'This book was published by [Company].'
    lessicalized_prop['Writer'] = 'This book was written by [Writer].'
    lessicalized_prop['Place'] = 'The story takes place in [Place].'
    lessicalized_prop['Characters'] = 'The main characters include [Characters].'
    lessicalized_prop['Setting'] = 'The story is set in [Setting].'
    lessicalized_prop['Film adaption'] = 'This book has been adapted into a film titled [Film adaption].'
    lessicalized_prop['Political'] = 'This book explores themes of [Political] politics.'
    lessicalized_prop['Title'] = 'The title of the book is [Title].'
    lessicalized_prop['Editors'] = 'This book was edited by [Editors].'
    lessicalized_prop['Creators'] = 'This book was created by [Creators].'
    lessicalized_prop['Magazine'] = 'This story was originally serialized in [Magazine].'
    lessicalized_prop['Studio'] = 'The film adaptation was produced by [Studio].'
    lessicalized_prop['Network'] = 'This book has been adapted into a TV series on [Network].'
    lessicalized_prop['Narrated'] = 'The audiobook is narrated by [Narrated].'
    lessicalized_prop['Creator'] = 'This work was created by [Creator].'
    lessicalized_prop['Published'] = 'The book was published in [Published].'
    lessicalized_prop['Authors'] = 'This book was written by the following authors: [Authors].'
    # The placeholder used to read "[Co-author name]", which never matched the
    # "[Co author]" replacement and so was left verbatim in the output text.
    lessicalized_prop['Co author'] = 'The author co-authored this book with [Co author].'

if dataset == 'lastfm':
    # Artist templates: relation name -> sentence with a "[<relation name>]" placeholder.
    lessicalized_prop.update({
        'Subject': 'The artist\'s work focuses primarily on themes of [Subject].',
        'Genre': 'The artist is known for their unique contribution to the [Genre] genre.',
        'Current members': 'Currently, the artist\'s lineup includes [Current members].',
        'Origin': 'The artist hails from [Origin], bringing a distinctive sound to their music.',
        'Past members': 'Over the years, the artist has seen several changes, with past members including [Past members].',
        'Occupation': 'In addition to being a musician, the artist also works as [Occupation].',
        'Instrument': 'The artist is proficient in playing the [Instrument].',
        'Genres': 'Their music spans across multiple genres, including [Genres].',
        'Home town': 'The artist grew up in [Home town], which has greatly influenced their music.',
        'Religion': 'The artist\'s work is often inspired by their religious beliefs in [Religion].',
        'Partner': 'In their personal life, the artist is partnered with [Partner].',
        'Alias': 'The artist is also known by the alias [Alias].',
        'Title': 'The artist holds the title of [Title] in the music industry.',
        'Author': 'They are also an accomplished author, having written [Author].',
        'Work': 'The artist\'s notable works include [Work].',
        'Notable instruments': 'They are renowned for their skill with notable instruments such as [Notable instruments].',
        'Manager': 'The artist is managed by [Manager].',
        'Former members': 'Former members of the artist\'s ensemble include [Former members].',
        'Voice type': 'The artist is recognized for their [Voice type] voice.',
        'Siblings': 'The artist has [Siblings] siblings who have also influenced their music career.',
        'Nationality': 'They proudly represent their [Nationality] heritage in their music.',
        'Instruments': 'The artist is adept at playing several instruments, including [Instruments].',
        'Television': 'In addition to music, the artist has appeared on television shows such as [Television].',
        'Influences': 'Their musical influences include [Influences].',
        'Agency': 'The artist is represented by the agency [Agency].',
        'Record labels': 'They have been signed with record labels such as [Record labels].',
        'Touring members': 'When touring, the artist is accompanied by members such as [Touring members].',
        'Former labels': 'The artist has previously been associated with former labels like [Former labels].',
        'Trainer': 'Their skills have been honed under the guidance of their trainer, [Trainer].',
    })

if dataset=='movielens':
    # Movie templates: relation name -> sentence with a "[<relation name>]" placeholder.
    lessicalized_prop.update({
        'Starring': 'In this movie, [Starring]  take on a prominent role.',
        'Producer': 'Producing this movie is [Producer].',
        'Language': 'This movie is primarily in [Language].',
        'Editing': 'The editing in this movie is handled by [Editing].',
        'Country': 'This movie is set in [Country].',
        'Music composer': 'The musics of this movie is composed by [Music composer].',
        'Based on': 'This movie is based on [Based on].',
        'Subject': 'this movie explores [Subject] themes.',
        'Cinematography': 'The visuals of this movie are captured by [Cinematography].',
        'Writer': 'The narrative of this movie is penned by [Writer].',
        'Director': 'Guiding the vision of this movie is the director, [Director].',
    })

if dataset=='boardgamegeek':
    # Board-game templates: relation name -> sentence with a "[<relation name>]" placeholder.
    lessicalized_prop.update({
        'Type': 'This game falls under the [Type] category.',
        'Primary': 'The primary feature of this game is [Primary].',
        'Alternate': 'An alternate name for this game is [Alternate].',
        'Yearpublished': 'This game was published in the year [Yearpublished].',
        'Minplayers': 'The minimum number of players required is [Minplayers].',
        'Maxplayers': 'The maximum number of players allowed is [Maxplayers].',
        'Playingtime': 'The average playing time for this game is [Playingtime] minutes.',
        'Minplaytime': 'The minimum playtime for this game is [Minplaytime] minutes.',
        'Maxplaytime': 'The maximum playtime for this game is [Maxplaytime] minutes.',
        'Minage': 'The minimum age recommended to play this game is [Minage] years.',
        'Boardgamecategory': 'This game belongs to the [Boardgamecategory] category.',
        'Boardgamemechanic': 'The main mechanic of this game is [Boardgamemechanic].',
        'Boardgamefamily': 'This game is part of the [Boardgamefamily] family.',
        'Boardgameexpansion': '[Boardgameexpansion] is an expansion for this game.',
        'Boardgameimplementation': 'This game implements [Boardgameimplementation].',
        'Boardgamedesigner': 'The designer of this game is [Boardgamedesigner].',
        'Boardgameartist': 'The artist for this game is [Boardgameartist].',
        'Boardgamepublisher': 'This game is published by [Boardgamepublisher].',
    })


In [25]:
# Look up the relation name of every triple by joining the graph's 'rel'
# column against the relation table indexed by 'id'.
_relation_names = mapping_relation.loc[:, ['name_rel', 'id']].set_index('id')
mapping_name_rel = (
    property_graph
    .set_index('rel')
    .join(_relation_names)
    .reset_index()
)
# One row per (item, relation name) carrying the list of property ids.
goupd_mapping = mapping_name_rel.groupby(['item', 'name_rel']).agg({'prop': list})

In [26]:
# Build one textual description per item: "<name> is a <items>." followed by
# one templated sentence per relation, in alphabetical order of relation name.
temp = goupd_mapping.reset_index()
kn_lex = {}
property_mapping_dict = property_mapping.to_dict()
# A single groupby pass replaces re-filtering the whole frame once per item;
# sort=False keeps items in order of first appearance, as unique() did.
for item, item_df in temp.groupby('item', sort=False):
    item_name = mapping_dict.get(item)
    if item_name is None:
        # Items without a known name get no description.
        continue
    sentences = [f'{item_name} is a {items}.']
    for _, row in item_df.sort_values('name_rel').iterrows():
        names_property = [
            property_mapping_dict[x]
            for x in row['prop']
            if property_mapping_dict.get(x) is not None
        ]
        if not names_property:
            # No resolvable property name: skip the sentence instead of
            # failing on names_property[0] as the previous version did.
            continue
        # A relation without a template still raises KeyError here, which
        # surfaces a missing lessicalized_prop entry.
        template = lessicalized_prop[row['name_rel']]
        # Substitute inside the template only, so text already in the
        # description (item name, earlier property values) is never rewritten.
        sentences.append(template.replace(f"[{row['name_rel']}]", ", ".join(names_property)))
    kn_lex[item] = " ".join(sentences).strip()

In [27]:
import json

# Persist the descriptions as JSON Lines: one {"target_id", "text"} object per
# item, with the id stored as a string.
_kn_graph_path = os.path.join(datapath, f'{dataset}_domain_kn_graph.jsonl')
with open(_kn_graph_path, 'w') as outfile:
    for new_id, desc in kn_lex.items():
        record = {'target_id': str(new_id), 'text': desc}
        outfile.write(json.dumps(record) + '\n')

In [28]:
# Load the JSON Lines file of item descriptions, cast the ids to int and
# build an id -> description lookup.
item_desc_df = pd.read_json(
    os.path.join(datapath, f'{dataset}_domain_kn_graph.jsonl'),
    lines=True,
)
item_desc_df['target_id'] = item_desc_df['target_id'].astype(int)
item_desc_dict = item_desc_df.set_index('target_id')['text'].to_dict()

## Prepare prompt

In [29]:
# Preview the per-user rows that the prompt construction below starts from.
train_group.head()

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp
0,mycroft,"[10547, 5029, 15987, 17226, 9609, 7479, 5, 328...","[1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1]",14,"[10, 13, 6]","[0, 2, 3, 4, 5]","[9, 11]","[8, 1, 12, 7]"
1,-Loren-,"[700, 27627, 478, 2471, 138788, 17166, 133038,...","[0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, ...",79,"[69, 70, 71, 74, 13, 23, 26, 35, 37, 42, 43, 4...","[1, 66, 38, 7, 41, 10, 9, 45, 46, 15, 78, 54, ...","[64, 3, 5, 8, 73, 76, 16, 18, 22, 29, 33, 36, ...","[0, 2, 4, 6, 11, 12, 14, 17, 19, 20, 21, 27, 2..."
2,-LucaS-,"[478, 27162, 826, 7976, 12942, 21241, 1261, 13...","[1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, ...",23,"[1, 18, 5, 8, 10, 14]","[0, 2, 3, 7, 9, 13, 19]","[16, 17, 20, 22]","[4, 6, 11, 12, 15, 21]"
3,-Morphling-,"[28143, 307002, 163412, 3076, 30549, 70919, 24...","[1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, ...",52,"[32, 33, 39, 44, 45, 14, 48, 19, 20, 51, 23, 2...","[0, 1, 34, 36, 4, 7, 9, 41, 11, 42, 15, 16, 50...","[35, 5, 38, 40, 12, 46, 47, 17, 18, 22, 28]","[2, 3, 37, 6, 8, 10, 43, 13, 49, 25, 26, 30]"
4,-mik-,"[6249, 822, 43443, 15, 204, 70323, 28720, 1883...","[0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, ...",45,"[35, 4, 39, 42, 11, 44, 18, 19, 27, 28]","[1, 33, 3, 6, 7, 8, 14, 21, 22, 29, 30]","[32, 2, 38, 41, 13, 16, 17, 20, 24, 25, 31]","[0, 34, 36, 37, 5, 40, 9, 10, 43, 12, 15, 23, 26]"


In [30]:
# Spot check of a single id -> name lookup. The id is hard-coded, so this
# raises KeyError if 8670 is not a key of mapping_dict.
mapping_dict[8670]

'Der Herr der Ringe: Die Rückkehr des Königs'

In [31]:
def map_with_names(row, col, mapping_dict):
    """Translate the items selected by an index column into their names.

    Args:
        row: mapping with an 'item' entry (list of item ids) and a ``col``
            entry (list of positions into that list).
        col: name of the entry holding the positions to select.
        mapping_dict: item id -> name lookup.

    Returns:
        Names of the selected items, in the order of the positions. Items
        without a name, or whose name has two characters or fewer, are left out.
    """
    selected_ids = np.array(row['item'])[row[col]]
    names = []
    for item_id in selected_ids:
        name = mapping_dict.get(item_id, '')
        if len(name) > 2:
            names.append(name)
    return names


# Name column to create -> index column it is derived from.
_name_columns = {
    'liked_prompt': 'index_like_prompt',
    'liked_resp': 'index_like_resp',
    'disliked_prompt': 'index_dislike_prompt',
    'disliked_resp': 'index_dislike_resp',
}

# Same four columns on both frames, train first.
for _group in (train_group, test_group):
    for _name_col, _index_col in _name_columns.items():
        _group[_name_col] = _group.apply(
            map_with_names, axis=1, col=_index_col, mapping_dict=mapping_dict
        )

In [32]:
# Preview the first two train users including the new name columns.
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp
0,mycroft,"[10547, 5029, 15987, 17226, 9609, 7479, 5, 328...","[1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1]",14,"[10, 13, 6]","[0, 2, 3, 4, 5]","[9, 11]","[8, 1, 12, 7]","[Catan, Talisman (Third Edition), Acquire]","[Betrayal at House on the Hill, Arkham Horror,...","[The Lord of the Rings, Runebound]","[Risk: Star Wars – Clone Wars Edition, Monopol..."
1,-Loren-,"[700, 27627, 478, 2471, 138788, 17166, 133038,...","[0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, ...",79,"[69, 70, 71, 74, 13, 23, 26, 35, 37, 42, 43, 4...","[1, 66, 38, 7, 41, 10, 9, 45, 46, 15, 78, 54, ...","[64, 3, 5, 8, 73, 76, 16, 18, 22, 29, 33, 36, ...","[0, 2, 4, 6, 11, 12, 14, 17, 19, 20, 21, 27, 2...","[Aton, Othello, Star Wars Miniatures, Middle-E...","[Talisman: Revised 4th Edition, Dungeon Twiste...",[Dungeons & Dragons: Conquest of Nerath Board ...,"[Battle Masters, Citadels, Dungeon Roll, Pathf..."


In [33]:
# Flatten the test users into (user, item name, score) records. Prompt-side
# items go to `train`, response-side items go to `test`; liked items get
# score 1 and disliked items score 0. Only `train` is written out here.
# NOTE: 'item_id' holds the item *name*, because the *_prompt / *_resp columns
# contain names.
train = []
test = []
_routes = (
    ('liked_prompt', 1, train),
    ('disliked_prompt', 0, train),
    ('liked_resp', 1, test),
    ('disliked_resp', 0, test),
)
for _, row in test_group.iterrows():
    for _column, _score, _records in _routes:
        for item in row[_column]:
            _records.append({
                'user_id': row['user'],
                'item_id': item,
                'score': _score
            })

pd.DataFrame(train).to_csv("train_adaptation.csv")
                    

In [34]:
import random
import itertools

def prepare_user_prompt(row):
    """Build the user message: liked items, disliked items, items to rank.

    Args:
        row: mapping with 'liked_prompt', 'disliked_prompt', 'liked_resp'
            and 'disliked_resp' entries, each a list of item names.

    Returns:
        A string with an optional "I like ..." section, an optional
        "I dislike ..." section and a final "Rank the following items:"
        section listing liked and disliked response items together. Every
        list is shuffled; the RNG is re-seeded with 22 on each call, so the
        same row always yields the same text.
    """
    random.seed(22)

    def _shuffled(values):
        # Shuffle by sorting on one fresh random key per element.
        return sorted(values, key=lambda _: random.random())

    # Order of the three shuffles matters: each consumes the seeded RNG.
    liked_items = _shuffled(row['liked_prompt'])
    disliked_items = _shuffled(row['disliked_prompt'])
    liked_text = ', '.join(liked_items)
    disliked_text = ', '.join(disliked_items)

    sections = []
    if liked_items:
        sections.append("I like the following items:\n{liked_list}\n".format(liked_list=liked_text))
    if disliked_text:
        sections.append("I dislike the following items:\n{disliked_list}\n".format(disliked_list=disliked_text))

    candidates = _shuffled(list(itertools.chain(row['liked_resp'], row['disliked_resp'])))
    sections.append("\nRank the following items:\n{to_rank_list}".format(to_rank_list=', '.join(candidates)))
    return ''.join(sections)
        
def get_assistant_prompt(row):
    """Build the assistant message: a numbered list, liked items first.

    Args:
        row: mapping with 'liked_resp' and 'disliked_resp' entries, each a
            list of item names.

    Returns:
        "Here a list:\\n" followed by one "<n>. <name>\\n" line per item,
        numbered from 1, with all liked items before all disliked items.
    """
    ordered = list(row['liked_resp']) + list(row['disliked_resp'])
    numbered = ''.join(
        f"{position}. {title}\n" for position, title in enumerate(ordered, start=1)
    )
    # The template ends with an extra newline that is sliced off again.
    return "Here a list:\n{ranked_list}\n".format(ranked_list=numbered)[:-1]


# Add the user and assistant message columns to both frames, train first.
for _group in (train_group, test_group):
    _group['user_prompt'] = _group.apply(prepare_user_prompt, axis=1)
    _group['assistant_prompt'] = _group.apply(get_assistant_prompt, axis=1)

In [35]:
# Show the user prompt of the train row labelled 150.
print(train_group.loc[150, 'user_prompt'])

I like the following items:
Neta-Tanka: Deluxe Edition, Zombicide: Black Plague, The Brigade, Evolution: Climate, Burgle Bros., The Gallerist, Pandemic Legacy: Season 1, Trickerion: Legends of Illusion, Pax Pamir: Second Edition, Funny Friends, Chimera Station, Too Many Bones
I dislike the following items:
Dead Men Tell No Tales, Merchants & Marauders, Agricola, Eclipse, Caverna: The Cave Farmers, Terraforming Mars, Le Havre, The Manhattan Project, Coffee Traders, Gaia Project, Tzolk'in: The Mayan Calendar, Power Grid: The First Sparks, Grog Island, Food Chain Magnate, Race for the Galaxy, Beasty Bar, Splendor, Great Western Trail, DinoGenics, On Mars

Rank the following items:
Quirky Circuits, Captains of the Gulf, Igel Ärgern, Pangea, Spirit Island, Patchwork, Lords of Waterdeep, The 7th Continent, 7 Wonders, The Castles of Burgundy, Brass: Birmingham, Pandemic Legacy: Season 2, Terra Mystica, Time of Crisis: The Roman Empire in Turmoil, 235-284 AD, Through the Ages: A New Story of C

In [36]:
# Show the assistant prompt (numbered list) of the train row labelled 150.
print(train_group.loc[150, 'assistant_prompt'])

Here a list:
1. The 7th Continent
2. Lewis & Clark: The Expedition
3. Clans of Caledonia
4. Altiplano
5. Pangea
6. Root
7. Evolution
8. Spirit Island
9. Captains of the Gulf
10. Bios: Megafauna (Second Edition)
11. Quirky Circuits
12. Cottage Garden
13. Mechs vs. Minions
14. Palm Island
15. Terra Mystica
16. The Voyages of Marco Polo
17. Azul
18. Lords of Waterdeep
19. The Castles of Burgundy
20. Wingspan
21. Patchwork
22. Betrayal at House on the Hill
23. A Feast for Odin
24. Through the Ages: A New Story of Civilization
25. Orléans
26. Viral
27. Igel Ärgern
28. Drum Roll
29. 7 Wonders
30. Time of Crisis: The Roman Empire in Turmoil, 235-284 AD
31. Pandemic Legacy: Season 2
32. Power Grid: Factory Manager
33. Aeon's End
34. HeroQuest
35. Brass: Birmingham



In [37]:
from transformers import AutoTokenizer
# Load the tokenizer from the local "../llama3" directory and reuse the
# end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained("../llama3")
tokenizer.pad_token = tokenizer.eos_token
# Disabled alternative template: identical to the active one below except that
# it emits "<|reserved_special_token_22|>\n" after the user message and puts a
# space between the assistant content and eos_token.
#tokenizer.chat_template =  "{% set ns = namespace(i=0) %}" \
#                            "{% for message in messages %}" \
#                                "{% if message['role'] == 'system' and ns.i == 0 %}" \
#                                       "{{ bos_token +' [INST] <<SYS>>\n' }}" \
#                                       "{{ message['content'] + ' <</SYS>>\n'}}" \
#                                "{% elif message['role'] == 'user' %}" \
#                                    "{{ message['content'] + ' [/INST]\n'}}" \
#                                "<|reserved_special_token_22|>\n"\
#                                "{% elif message['role'] == 'assistant' %}" \
#                                    "{{ message['content'] + ' ' + eos_token }}" \
#                                "{% endif %}" \
#                                "{% set ns.i = ns.i+1 %}" \
#                            "{% endfor %}"
#
# Active chat template (Jinja), built from adjacent string literals:
#   * a system message is rendered only when it is the first message
#     (ns.i == 0); that branch is also the only place where bos_token and
#     ' [INST] <<SYS>>' are emitted;
#   * a user message is followed by ' [/INST]' and a newline;
#   * an assistant message is followed directly by eos_token;
#   * messages with any other role produce no output.
tokenizer.chat_template =  "{% set ns = namespace(i=0) %}" \
                            "{% for message in messages %}" \
                                "{% if message['role'] == 'system' and ns.i == 0 %}" \
                                       "{{ bos_token +' [INST] <<SYS>>\n' }}" \
                                       "{{ message['content'] + ' <</SYS>>\n'}}" \
                                "{% elif message['role'] == 'user' %}" \
                                    "{{ message['content'] + ' [/INST]\n'}}" \
                                "{% elif message['role'] == 'assistant' %}" \
                                    "{{ message['content'] + '' + eos_token }}" \
                                "{% endif %}" \
                                "{% set ns.i = ns.i+1 %}" \
                            "{% endfor %}"


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\petru\.pyenv\pyenv-win\versions\3.10.0\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\petru\.pyenv\pyenv-win\versions\3.10.0\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\petru\Documents\python_envs\transformerVenv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\petru\Documents\python_envs\transformerVenv\lib\site-packages\traitlets\config\applic

In [38]:
def get_prompt(row, is_train=True):
    """Render one row as a chat-formatted prompt string.

    Args:
        row: mapping / DataFrame row providing 'user_prompt' and, when
            ``is_train`` is true, 'assistant_prompt'.
        is_train: when true the assistant answer is appended as a final turn;
            otherwise the conversation ends after the user turn.

    Returns:
        The untokenized text produced by ``tokenizer.apply_chat_template``.
    """
    system_text = f"You're operating as a {domain} recommendation system. Your task involves receiving a user's list of liked and disliked items, along with a set of candidate items, and then reordering them based on preferences."
    conversation = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": row['user_prompt']},
    ]
    if is_train:
        # The expected answer is only part of the text for training rows.
        conversation.append({"role": "assistant", "content": row['assistant_prompt']})
    return tokenizer.apply_chat_template(conversation, tokenize=False)

In [39]:
# Build the chat-formatted prompt text for every row: training rows include the
# assistant answer, test rows stop after the user turn.
train_group['prompt'] = train_group.apply(lambda row: get_prompt(row, is_train=True), axis=1)
test_group['prompt'] = test_group.apply(lambda row: get_prompt(row, is_train=False), axis=1)

In [40]:
# Preview the first two rows to check the newly added 'prompt' column.
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp,user_prompt,assistant_prompt,prompt
0,mycroft,"[10547, 5029, 15987, 17226, 9609, 7479, 5, 328...","[1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1]",14,"[10, 13, 6]","[0, 2, 3, 4, 5]","[9, 11]","[8, 1, 12, 7]","[Catan, Talisman (Third Edition), Acquire]","[Betrayal at House on the Hill, Arkham Horror,...","[The Lord of the Rings, Runebound]","[Risk: Star Wars – Clone Wars Edition, Monopol...","I like the following items:\nAcquire, Talisman...",Here a list:\n1. Betrayal at House on the Hill...,<|begin_of_text|> [INST] <<SYS>>\nYou're opera...
1,-Loren-,"[700, 27627, 478, 2471, 138788, 17166, 133038,...","[0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, ...",79,"[69, 70, 71, 74, 13, 23, 26, 35, 37, 42, 43, 4...","[1, 66, 38, 7, 41, 10, 9, 45, 46, 15, 78, 54, ...","[64, 3, 5, 8, 73, 76, 16, 18, 22, 29, 33, 36, ...","[0, 2, 4, 6, 11, 12, 14, 17, 19, 20, 21, 27, 2...","[Aton, Othello, Star Wars Miniatures, Middle-E...","[Talisman: Revised 4th Edition, Dungeon Twiste...",[Dungeons & Dragons: Conquest of Nerath Board ...,"[Battle Masters, Citadels, Dungeon Roll, Pathf...",I like the following items:\nStar Wars Miniatu...,Here a list:\n1. Talisman: Revised 4th Edition...,<|begin_of_text|> [INST] <<SYS>>\nYou're opera...


In [41]:
def get_description_liked(row, desc_dict=None):
    """Join the descriptions of the items selected by ``row['index_like_resp']``.

    Args:
        row: mapping / DataFrame row with 'item' (sequence of item ids) and
            'index_like_resp' (positions into 'item').
        desc_dict: optional item-id -> description mapping. When omitted, the
            notebook-level ``item_desc_dict`` is used, so existing
            ``df.apply(get_description_liked, axis=1)`` calls keep working.

    Returns:
        The selected items' descriptions joined with newlines; an item without
        a description contributes an empty string.
    """
    if desc_dict is None:
        desc_dict = item_desc_dict
    # NumPy fancy indexing picks all liked positions in one step.
    liked_items = np.array(row['item'])[row['index_like_resp']]
    return '\n'.join(desc_dict.get(item_id, '') for item_id in liked_items)

In [42]:
# For each row, look up the description of every item id in 'item' and keep only
# descriptions longer than two characters (ids without a description resolve to ''
# and are therefore dropped).
train_group['descriptions'] = train_group['item'].apply(
    lambda item_ids: [
        text
        for text in (item_desc_dict.get(item_id, '') for item_id in item_ids)
        if len(text) > 2
    ]
)
# Disabled alternative, kept for reference:
#train_group['descriptions'] = train_group.apply(get_description_liked, axis=1)

In [43]:
# Preview the first two rows, now including the 'descriptions' column.
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp,user_prompt,assistant_prompt,prompt,descriptions
0,mycroft,"[10547, 5029, 15987, 17226, 9609, 7479, 5, 328...","[1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1]",14,"[10, 13, 6]","[0, 2, 3, 4, 5]","[9, 11]","[8, 1, 12, 7]","[Catan, Talisman (Third Edition), Acquire]","[Betrayal at House on the Hill, Arkham Horror,...","[The Lord of the Rings, Runebound]","[Risk: Star Wars – Clone Wars Edition, Monopol...","I like the following items:\nAcquire, Talisman...",Here a list:\n1. Betrayal at House on the Hill...,<|begin_of_text|> [INST] <<SYS>>\nYou're opera...,[Betrayal at House on the Hill is a board game...
1,-Loren-,"[700, 27627, 478, 2471, 138788, 17166, 133038,...","[0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, ...",79,"[69, 70, 71, 74, 13, 23, 26, 35, 37, 42, 43, 4...","[1, 66, 38, 7, 41, 10, 9, 45, 46, 15, 78, 54, ...","[64, 3, 5, 8, 73, 76, 16, 18, 22, 29, 33, 36, ...","[0, 2, 4, 6, 11, 12, 14, 17, 19, 20, 21, 27, 2...","[Aton, Othello, Star Wars Miniatures, Middle-E...","[Talisman: Revised 4th Edition, Dungeon Twiste...",[Dungeons & Dragons: Conquest of Nerath Board ...,"[Battle Masters, Citadels, Dungeon Roll, Pathf...",I like the following items:\nStar Wars Miniatu...,Here a list:\n1. Talisman: Revised 4th Edition...,<|begin_of_text|> [INST] <<SYS>>\nYou're opera...,[Battle Masters is a board game. An alternate ...


In [44]:
# Persist both prepared frames as pickles inside the dataset folder.
pickle_targets = {
    f'{dataset}_train_dataset_graph.pkl': train_group,
    f'{dataset}_test_dataset_graph.pkl': test_group,
}
for pickle_name, frame in pickle_targets.items():
    frame.to_pickle(os.path.join(datapath, pickle_name))

In [45]:
# Training prompts as a plain Python list. The trailing "#* 3" is disabled code
# (presumably an experiment that repeated the prompts three times).
prompts = train_group['prompt'].tolist() #* 3

In [46]:
# All description texts stored in item_desc_dict, one entry per dictionary key.
descriptions = list(item_desc_dict.values())

In [47]:
# Wrap every item description in the tokenizer's BOS/EOS tokens so each one forms a
# standalone training text. The list is built straight from item_desc_dict rather than
# by re-wrapping the existing `descriptions` list, so running this cell more than once
# does not add a second pair of tokens.
descriptions = [f'{tokenizer.bos_token} ' + text + f' {tokenizer.eos_token}' for text in item_desc_dict.values()]

In [48]:
import json

# Knowledge-only training file: one JSON object per line, each holding a single
# entry of `descriptions` under the "text" key.
kn_train_path = os.path.join(datapath, f'{dataset}_graph_kn_train_set.jsonl')
with open(kn_train_path, 'w') as outfile:
    outfile.writelines(json.dumps({'text': text}) + '\n' for text in descriptions)

In [48]:
import random 

# Fix the RNG seed so the shuffle below is reproducible.
random.seed(22)
# Mixed training set: every prompt once plus every description three times.
# NOTE(review): `prompts` is reassigned to the test prompts in a later cell, so
# re-running this cell out of order would mix test prompts into the training set.
train_set = prompts + descriptions* 3

# In-place shuffle so prompts and descriptions are interleaved.
random.shuffle(train_set)

In [49]:
# Total number of training texts (prompts plus the tripled descriptions).
len(train_set)

199670

In [50]:
import json

# Shuffled mix of prompts and descriptions, one JSON object per line under the "text" key.
kgraph_train_path = os.path.join(datapath, f'{dataset}_train_set_kgraph.jsonl')
with open(kgraph_train_path, 'w') as outfile:
    outfile.writelines(json.dumps({'text': text}) + '\n' for text in train_set)

In [51]:
import json

# Prompt-only training file (no item descriptions), one JSON object per line
# under the "text" key.
no_kn_train_path = os.path.join(datapath, f'{dataset}_train_set_no_kn.jsonl')
with open(no_kn_train_path, 'w') as outfile:
    outfile.writelines(
        json.dumps({'text': text}) + '\n' for text in train_group['prompt'].tolist()
    )

In [53]:
# Test prompts as a plain Python list.
# NOTE(review): this reuses the name `prompts`, which an earlier cell assigns to the
# training prompts; cells reading `prompts` therefore depend on execution order.
prompts = test_group['prompt'].tolist()

In [54]:
import json

# Test prompts, one JSON object per line under the "text" key.
# The prompts are read straight from test_group (the same way the
# *_train_set_no_kn.jsonl cell reads train_group) instead of from the shared
# `prompts` variable, which holds the training prompts earlier in the notebook;
# a stale `prompts` can therefore never leak training prompts into the test file.
with open(os.path.join(datapath, f'{dataset}_test_set.jsonl'), 'w') as outfile:
    for prompt in test_group['prompt'].tolist():
        json.dump({'text': prompt}, outfile)
        outfile.write('\n')

# Baseline

In [49]:
# Interaction frames for the baseline export.
#   train_bs: the prompt-side like/dislike indices of BOTH groups (test rows first).
#   test_bs:  the response-side like/dislike indices of the test group only.
prompt_side_cols = ['user', 'index_like_prompt', 'index_dislike_prompt', 'item']
# ignore_index=True gives train_bs unique row labels; without it the labels of
# test_group and train_group are simply stacked and may repeat.
train_bs = pd.concat(
    [test_group.loc[:, prompt_side_cols], train_group.loc[:, prompt_side_cols]],
    ignore_index=True,
)
# Explicit copy: a 'bs_format' column is added to test_bs later, and assigning to a
# slice of test_group can raise SettingWithCopyWarning.
test_bs = test_group.loc[:, ['user', 'index_like_resp', 'index_dislike_resp', 'item']].copy()

In [50]:
def _interaction_records(row, index_col, rating):
    """Return one interaction dict per position listed in ``row[index_col]``.

    Each position is resolved to an item id through ``row['item']`` and tagged
    with the row's user and the given rating value.
    """
    return [
        {'user_id:token': row['user'], 'item_id:token': row['item'][position], 'rating:float': rating}
        for position in row[index_col]
    ]


# Per row: liked items (rating 1) followed by disliked items (rating 0).
train_bs['bs_format'] = (
    train_bs.apply(_interaction_records, axis=1, index_col='index_like_prompt', rating=1)
    + train_bs.apply(_interaction_records, axis=1, index_col='index_dislike_prompt', rating=0)
)

test_bs['bs_format'] = (
    test_bs.apply(_interaction_records, axis=1, index_col='index_like_resp', rating=1)
    + test_bs.apply(_interaction_records, axis=1, index_col='index_dislike_resp', rating=0)
)

In [51]:
# itertools is imported here because the only other import of it sits in a later
# cell; without this line the cell fails with NameError on a fresh kernel.
import itertools

# Every item id that occurs in any baseline interaction (test and train).
item_set_bs = {
    record['item_id:token']
    for record in itertools.chain.from_iterable(
        itertools.chain(test_bs['bs_format'], train_bs['bs_format'])
    )
}

In [52]:
# Keep only the property-graph triples whose item occurs in the baseline
# interactions and rename the columns to the typed "<name>:token" headers
# written to the .kg file.
property_graph_bs = property_graph[property_graph['item'].isin(item_set_bs)].rename(columns={
    'item': "head_id:token",
    'rel': "relation_id:token",
    'prop': "tail_id:token"
})

# Link table: every kept head id is written as both entity id and item id
# (identity mapping), one row per kept triple.
link_bs = property_graph_bs.loc[:, ['head_id:token']].rename(columns={
    'head_id:token': "entity_id:token",
})
link_bs['item_id:token'] = link_bs['entity_id:token']

# Create the output folder here: the only other makedirs call lives in a later
# cell, so on a fresh run the writes below would otherwise fail.
baseline_dir = os.path.join(datapath, 'baseline')
os.makedirs(baseline_dir, exist_ok=True)

property_graph_bs.loc[:, ['head_id:token', 'relation_id:token', 'tail_id:token']].to_csv(os.path.join(baseline_dir, f'{dataset}.kg'), sep='\t', index=False)
link_bs.to_csv(os.path.join(baseline_dir, f'{dataset}.link'), sep='\t', index=False)

In [62]:
import itertools

# Write the baseline interaction files; the folder is created first if missing.
baseline_dir = os.path.join(datapath, 'baseline')
os.makedirs(baseline_dir, exist_ok=True)

# Test interactions go to part3, train interactions to part1.
inter_exports = (
    (test_bs, f'{dataset}.part3.inter'),
    (train_bs, f'{dataset}.part1.inter'),
)
for frame, inter_name in inter_exports:
    # Flatten the per-row record lists into one flat list of interaction dicts.
    records = list(itertools.chain.from_iterable(frame['bs_format'].tolist()))
    pd.DataFrame(records).to_csv(os.path.join(baseline_dir, inter_name), sep='\t', index=False)

# EXTRA

In [264]:
# Split the values of the 'descriptions' column into 10 chunks so they can be
# tokenized chunk by chunk.
# NOTE(review): this scratch cell has a much higher execution count than the cell
# that currently builds 'descriptions' (which stores a list per row) — confirm the
# column still has the form this code expects.
splits = np.array_split(train_group['descriptions'].to_list(), 10)

In [265]:
# Tokenize each chunk and keep the per-entry lengths reported by the tokenizer.
lens = [
    tokenizer(chunk.tolist(), return_length=True)['length']
    for chunk in splits
]

In [266]:
# Summary statistics of the token lengths, flattened across all chunks.
# NOTE(review): `chain` is not imported in the visible cells (only `import itertools`
# is) — confirm it is brought into scope elsewhere.
display(pd.Series(list(chain(*lens))).describe())

count     1505.000000
mean      5111.520266
std       3790.858846
min          1.000000
25%       2230.000000
50%       4281.000000
75%       7250.000000
max      24073.000000
dtype: float64

In [243]:
# Token id(s) the tokenizer produces for the reserved special token, with
# add_special_tokens=False so only the given text is encoded.
tokenizer.encode("<|reserved_special_token_22|>", add_special_tokens=False)

[128027]

In [None]:
# The second and third columns of item-prop/train.tsv are named in a different
# order for dbbook / movielens than for the remaining dataset(s).
if dataset in ('dbbook', 'movielens'):
    item_prop_columns = ['item', 'prop', 'rel']
else:
    item_prop_columns = ['item', 'rel', 'prop']

item_prop_df = pd.read_csv(
    os.path.join(datapath, 'item-prop', 'train.tsv'),
    names=item_prop_columns,
    sep='\t',
)

In [None]:
# Display the loaded item-property table.
item_prop_df

In [None]:
# mapping_relations.tsv names its two columns in a different order for dbbook
# than for the other datasets.
rel_columns = ['id_rel', 'name_rel'] if dataset == 'dbbook' else ['name_rel', 'id_rel']
mapping_rel = pd.read_csv(
    os.path.join(datapath, 'mapping_relations.tsv'),
    names=rel_columns,
    sep='\t',
)
mapping_rel.head()

In [None]:
# Short relation name: the text after the last "/" of the full relation name.
mapping_rel['name_rel_red'] = mapping_rel['name_rel'].map(lambda full_name: full_name.rsplit("/", 1)[-1])

In [None]:
# Display the relation mapping, including the shortened 'name_rel_red' column.
mapping_rel

In [None]:
# Relations from the mapping whose id actually occurs in item_prop_df['rel'].
mapping_rel[mapping_rel['id_rel'].isin(item_prop_df['rel'].unique())]

In [None]:
# Item descriptions: rows are split on the ';;' separator into 'item_id' and
# 'description'; malformed lines are skipped.
# A separator longer than one character is only handled by the Python parser, so it
# is requested explicitly instead of relying on pandas' warning-emitting fallback
# from the default C engine.
item_desription = pd.read_csv(os.path.join(datapath, dataset+'.txt'), sep=';;', names=['item_id', 'description'],  on_bad_lines='skip', engine='python')

In [None]:
# Preview the first rows of the loaded description table.
item_desription.head()

In [None]:
# Number of distinct description values; a missing value counts as one value.
item_desription['description'].nunique(dropna=False)

In [None]:
# (rows, columns) of the description table, for comparison with the distinct count.
item_desription.shape

In [None]:
# Number of distinct descriptions among items that appear in user_item_df['item'].
# NOTE(review): `user_item_df` is not defined in the visible cells (only
# user_item_df_train / user_item_df_test / df are) — confirm which frame is meant.
len(item_desription[item_desription['item_id'].isin(user_item_df['item'].unique())]['description'].unique())