In [1]:
dataset = 'movielens'

domain = {
    'lastfm': 'music',
    'dbbook': 'book',
    'movielens': 'movie'
}[dataset]

items = {
    'lastfm': 'artist',
    'dbbook': 'book',
    'movielens': 'movie'
}[dataset]
import os
datapath = os.path.join('..', 'data', dataset)

special_token = '<|reserved_special_token_22|>'
k_prompt = 100
resp_frac = 0.5

# Set names

In [2]:
import pandas as pd
import os
import random
import numpy as np
import swifter  

In [3]:
user_item_df_train = pd.read_csv(os.path.join(datapath, 'user-item', 'train.tsv'), \
            names=['user', 'item', 'rating'], sep='\t')

user_item_df_test = pd.read_csv(os.path.join(datapath, 'user-item', 'test.tsv'), \
            names=['user', 'item', 'rating'], sep='\t')


df = pd.concat([user_item_df_train, user_item_df_test], ignore_index=True)
all_items = df['item'].unique()

In [4]:
def get_split_random(list_to_split, frac=0.2, margin=0):
    random.seed(22)
    sample_list = random.sample(list_to_split, int(frac*len(list_to_split))+margin)
    complementary_set = set(list_to_split) - set(sample_list)
    return list(set(complementary_set)), list(set(sample_list))

In [5]:
random.seed(22)
test_frac = 0.2
user_list = df['user'].unique().tolist()
user_train, user_test = get_split_random(user_list, frac=test_frac, margin=0)

In [6]:
user_item_df_train = df[df['user'].isin(user_train)]
user_item_df_test = df[df['user'].isin(user_test)]

In [7]:
len(set(user_item_df_train['user'].unique()) - set(user_item_df_test['user'].unique()))

4829

In [8]:
user_item_df_test['rating'].describe()

count    181586.000000
mean          0.574697
std           0.494390
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: rating, dtype: float64

In [9]:
user_item_df_train['rating'].describe()

count    764534.000000
mean          0.572116
std           0.494772
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: rating, dtype: float64

In [10]:
train_group = user_item_df_train.groupby('user').agg({
    'item': list,
    'rating': list
}).reset_index()
test_group = user_item_df_test.groupby('user').agg({
    'item': list,
    'rating': list
}).reset_index()
train_group['len'] = train_group['item'].apply(len)
test_group['len'] = test_group['item'].apply(len)

In [11]:
def split_prompt_resp(row, resp_frac):
    items = np.array(row['rating'])
    index_likes = np.where(items==1)[0].tolist()
    index_dilikes = np.where(items==0)[0].tolist()
    index_like_prompt = []
    index_like_resp = []    
    index_dislike_prompt = []
    index_dislike_resp = []
    resp_frac=resp_frac
    if len(index_likes) > 0:
        index_like_prompt, index_like_resp = get_split_random(index_likes, frac=resp_frac, margin=1)
    if len(index_dilikes) > 0:
        index_dislike_prompt, index_dislike_resp = get_split_random(index_dilikes, frac=resp_frac, margin=1)
    row['index_like_prompt'] = index_like_prompt
    row['index_like_resp'] = index_like_resp
    row['index_dislike_prompt'] = index_dislike_prompt
    row['index_dislike_resp'] = index_dislike_resp
    return row

In [12]:
test_group['len'].describe()

count    1207.000000
mean      150.444076
std       170.309605
min        17.000000
25%        39.000000
50%        86.000000
75%       193.000000
max      1276.000000
Name: len, dtype: float64

In [13]:
train_group['len'].describe()

count    4829.000000
mean      158.321392
std       185.232239
min        14.000000
25%        42.000000
50%        91.000000
75%       196.000000
max      2078.000000
Name: len, dtype: float64

In [14]:
test_group = test_group.swifter.apply(split_prompt_resp, axis=1, resp_frac=resp_frac)
train_group = train_group.swifter.apply(split_prompt_resp, axis=1, resp_frac=resp_frac)

Pandas Apply:   0%|          | 0/1207 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/4829 [00:00<?, ?it/s]

In [15]:
test_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp
0,5,"[8531, 8533, 9055, 6859, 6271, 7363, 6864, 744...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...",70,"[0, 64, 2, 67, 4, 14, 16, 21, 25, 31, 36, 37, ...","[1, 3, 5, 6, 9, 11, 13, 15, 17, 18, 20, 22, 23...","[35, 68, 38, 10, 51, 53, 62, 27, 29, 30]","[32, 33, 7, 8, 12, 45, 46, 48, 19, 24, 28]"
1,7,"[6856, 6042, 6270, 7444, 7889, 8122, 6052, 650...","[1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",129,"[0, 5, 7, 8, 11, 12, 18, 19, 26, 34, 41, 43, 4...","[2, 6, 9, 10, 14, 15, 17, 21, 23, 25, 27, 28, ...","[128, 1, 65, 4, 71, 72, 73, 74, 16, 83, 84, 87...","[3, 13, 20, 22, 24, 29, 31, 36, 39, 42, 50, 51..."


In [16]:
test_group.shape

(1207, 8)

In [17]:
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp
0,0,"[8213, 6267, 6857, 6858, 6916, 7258, 8270, 686...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...",49,"[0, 2, 3, 4, 6, 9, 15, 19, 23, 29, 30, 31, 32,...","[1, 5, 7, 8, 10, 12, 16, 17, 18, 20, 21, 22, 2...","[48, 42, 47]","[11, 13, 14, 24, 28]"
1,1,"[6493, 8733, 7257, 7887, 7444, 8122, 7061, 789...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...",122,"[0, 2, 6, 10, 14, 15, 20, 28, 31, 37, 42, 46, ...","[1, 5, 8, 13, 25, 26, 30, 34, 36, 38, 40, 44, ...","[66, 3, 67, 70, 7, 71, 73, 74, 11, 19, 21, 85,...","[4, 9, 12, 16, 17, 18, 22, 27, 29, 32, 33, 39,..."


In [18]:
train_group.shape

(4829, 8)

In [19]:
if dataset=='movielens':
    def fix_title(title):
        if ", The (" in title:
            name_film, _, year = title.rpartition(", The (")
            title = "The " + name_film + " (" + year
            return title
        if ", A (" in title:
            name_film, _, year = title.rpartition(", A (")
            title = "A " + name_film + " (" + year
        return title
    df_movies = pd.read_csv(os.path.join(datapath, r"movies.dat"), sep="::", names=["item_id", "name", "geners"], encoding='ISO-8859-1')
    df_movies['name'] = df_movies['name'].apply(lambda x: fix_title(x)[0])
    #df_movies['year'] = df_movies['name'].apply(lambda x: fix_title(x)[1])
    import re

    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_entities.tsv'), \
            names=['url', 'id_set'], sep='\t')
    dbpedia_mapping = pd.read_csv(os.path.join(datapath, 'MappingMovielens2DBpedia-1.2.tsv'), \
            names=['id_movie', 'name', 'dbpedia_url'], sep='\t')
    df_movies = dbpedia_mapping.set_index('dbpedia_url').join(df_relations.set_index('url')).reset_index()
    df_movies['name'] = df_movies['name'].apply(lambda x: re.sub("\s+\(\d+\)$", "", x))
    df_movies['name'] = df_movies['name'].apply(lambda x: re.sub(r"[^a-zA-Z0-9]+", ' ', x).strip())
    df_movies.dropna(inplace=True)
    df_movies['id_set'] = df_movies['id_set'].astype(int)
    mapping_dict = df_movies.loc[:, ['id_set', 'name']].set_index('id_set').to_dict()['name']
    property_mapping = pd.read_json(os.path.join(datapath, 'entities_names.json'), typ='series')
    mapping_relation = pd.read_csv(os.path.join(datapath, 'mapping_relations.tsv'), \
            names=['rel', 'id'], sep='\t')

  df_movies = pd.read_csv(os.path.join(datapath, r"movies.dat"), sep="::", names=["item_id", "name", "geners"], encoding='ISO-8859-1')


In [20]:
if dataset=='dbbook':
    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_entities.tsv'), sep='\t')
    df_relations['name'] = df_relations['uri'].apply(lambda x: x.split(";")[0])
    temp = df_relations[df_relations['id'].isin(df['item'].unique())]
    mapping_dict = temp.set_index('id').to_dict()['name']
    property_mapping = pd.read_json(os.path.join(datapath, 'entities_names.json'), typ='series')
    mapping_relation = pd.read_csv(os.path.join(datapath, 'mapping_relations.tsv'), \
            names=['id', 'rel'], sep='\t')

In [21]:
if dataset=='lastfm':
    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_items.tsv'), \
            names=['id', 'name'], sep='\t')
    mapping_dict = df_relations.set_index('id').to_dict()['name']
    property_mapping = pd.read_json(os.path.join(datapath, 'entities_names.json'), typ='series')
    mapping_relation = pd.read_csv(os.path.join(datapath, 'mapping_relations.tsv'), \
            names=['rel', 'id'], sep='\t')

In [22]:
import re 

def parse_name_from_url(url):
    name = url.split("/")[-1]
    name = name.replace("_", " ")
    name = name.split(':')[-1]
    name = re.sub(r"\([^()]*\)", "", name)
    pattern = r",.*$"
    name = re.sub(pattern, "", name)
    name = name[0].upper() + name[1:]
    name = " ".join(re.findall('[A-Z][a-z]*', name))
    return name.strip().capitalize()

mapping_relation['name_rel'] = mapping_relation['rel'].apply(parse_name_from_url)
like_dislike_ids = mapping_relation[mapping_relation['name_rel'].isin(['Like', 'Dislike'])]['id'].tolist()
files_prop = [x for x in os.listdir(os.path.join(datapath, 'item-prop')) if 'tsv' in x]
property_graph = pd.concat([pd.read_csv(os.path.join(datapath, 'item-prop', f), \
            names=['item','prop','rel'], sep='\t') for f in files_prop])
property_graph = property_graph[~property_graph['rel'].isin(like_dislike_ids)]
if dataset == 'lastfm':
    property_graph.rename(columns={
        'prop': 'rel', 'rel': 'prop'
    }, inplace=True)
property_graph = property_graph[property_graph['prop'].isin(property_mapping.index)]

In [23]:
lessicalized_prop = {}
if dataset=='dbbook':
    lessicalized_prop['Author'] = 'The book was written by [Author].'
    lessicalized_prop['Series'] =  'This book is part of a series called [Series].'
    lessicalized_prop['Genre'] =  'This is a [Genre] novel.'
    lessicalized_prop['Publisher'] = 'This book was published by [Publisher].'
    lessicalized_prop['Preceded by'] = 'This book is preceded by [Preceded by].'
    lessicalized_prop['Subject'] = 'The story is centered around the theme of [Subject].'
    lessicalized_prop['Illustrator'] = 'The illustrations were done by [Illustrator].'
    lessicalized_prop['Writers'] = 'The screenplay was written by [Writers].'
    lessicalized_prop['Artists'] = 'The artwork was created by [Artists].'
    lessicalized_prop['Letterers'] = 'The lettering was done by [Letterers].'
    lessicalized_prop['Director'] = 'The film adaptation was directed by [Director].'
    lessicalized_prop['Screenplay'] = 'The screenplay was written by [Screenplay].'
    lessicalized_prop['Starring'] = 'The film stars [Starring].'
    lessicalized_prop['Cinematography'] = 'The cinematography was handled by [Cinematography].'
    lessicalized_prop['Producer'] = 'The film was produced by [Producer].'
    lessicalized_prop['Distributor'] = 'The film was distributed by [Distributor].'
    lessicalized_prop['Editor'] = 'This book was edited by [Editor].'
    lessicalized_prop['Category'] = 'This book falls under the category of [Category].'
    lessicalized_prop['Company'] = 'This book was published by [Company].'
    lessicalized_prop['Writer'] = 'This book was written by [Writer].'
    lessicalized_prop['Place'] = 'The story takes place in [Place].'
    lessicalized_prop['Characters'] = 'The main characters include [Characters].'
    lessicalized_prop['Setting'] = 'The story is set in [Setting].'
    lessicalized_prop['Film adaption'] = 'This book has been adapted into a film titled [Film adaption].'
    lessicalized_prop['Political'] = 'This book explores themes of [Political] politics.'
    lessicalized_prop['Title'] = 'The title of the book is [Title].'
    lessicalized_prop['Editors'] = 'This book was edited by [Editors].'
    lessicalized_prop['Creators'] = 'This book was created by [Creators].'
    lessicalized_prop['Magazine'] = 'This story was originally serialized in [Magazine].'
    lessicalized_prop['Studio'] = 'The film adaptation was produced by [Studio].'
    lessicalized_prop['Network'] = 'This book has been adapted into a TV series on [Network].'
    lessicalized_prop['Narrated'] = 'The audiobook is narrated by [Narrated].'
    lessicalized_prop['Creator'] = 'This work was created by [Creator].'
    lessicalized_prop['Published'] = 'The book was published in [Published].'
    lessicalized_prop['Authors'] = 'This book was written by the following authors: [Authors].'
    lessicalized_prop['Co author'] = 'The author co-authored this book with [Co-author name].'

if dataset == 'lastfm':
    lessicalized_prop['Subject'] = 'The artist\'s work focuses primarily on themes of [Subject].'
    lessicalized_prop['Genre'] = 'The artist is known for their unique contribution to the [Genre] genre.'
    lessicalized_prop['Current members'] = 'Currently, the artist\'s lineup includes [Current members].'
    lessicalized_prop['Origin'] = 'The artist hails from [Origin], bringing a distinctive sound to their music.'
    lessicalized_prop['Past members'] = 'Over the years, the artist has seen several changes, with past members including [Past members].'
    lessicalized_prop['Occupation'] = 'In addition to being a musician, the artist also works as [Occupation].'
    lessicalized_prop['Instrument'] = 'The artist is proficient in playing the [Instrument].'
    lessicalized_prop['Genres'] = 'Their music spans across multiple genres, including [Genres].'
    lessicalized_prop['Home town'] = 'The artist grew up in [Home town], which has greatly influenced their music.'
    lessicalized_prop['Religion'] = 'The artist\'s work is often inspired by their religious beliefs in [Religion].'
    lessicalized_prop['Partner'] = 'In their personal life, the artist is partnered with [Partner].'
    lessicalized_prop['Alias'] = 'The artist is also known by the alias [Alias].'
    lessicalized_prop['Title'] = 'The artist holds the title of [Title] in the music industry.'
    lessicalized_prop['Author'] = 'They are also an accomplished author, having written [Author].'
    lessicalized_prop['Work'] = 'The artist\'s notable works include [Work].'
    lessicalized_prop['Notable instruments'] = 'They are renowned for their skill with notable instruments such as [Notable instruments].'
    lessicalized_prop['Manager'] = 'The artist is managed by [Manager].'
    lessicalized_prop['Former members'] = 'Former members of the artist\'s ensemble include [Former members].'
    lessicalized_prop['Voice type'] = 'The artist is recognized for their [Voice type] voice.'
    lessicalized_prop['Siblings'] = 'The artist has [Siblings] siblings who have also influenced their music career.'
    lessicalized_prop['Nationality'] = 'They proudly represent their [Nationality] heritage in their music.'
    lessicalized_prop['Instruments'] = 'The artist is adept at playing several instruments, including [Instruments].'
    lessicalized_prop['Television'] = 'In addition to music, the artist has appeared on television shows such as [Television].'
    lessicalized_prop['Influences'] = 'Their musical influences include [Influences].'
    lessicalized_prop['Agency'] = 'The artist is represented by the agency [Agency].'
    lessicalized_prop['Record labels'] = 'They have been signed with record labels such as [Record labels].'
    lessicalized_prop['Touring members'] = 'When touring, the artist is accompanied by members such as [Touring members].'
    lessicalized_prop['Former labels'] = 'The artist has previously been associated with former labels like [Former labels].'
    lessicalized_prop['Trainer'] = 'Their skills have been honed under the guidance of their trainer, [Trainer].'

if dataset=='movielens':
    lessicalized_prop['Starring'] = 'In this movie, [Starring]  take on a prominent role.'
    lessicalized_prop['Producer'] = 'Producing this movie is [Producer].'
    lessicalized_prop['Language'] = 'This movie is primarily in [Language].'
    lessicalized_prop['Editing'] = 'The editing in this movie is handled by [Editing].'
    lessicalized_prop['Country'] = 'This movie is set in [Country].'
    lessicalized_prop['Music composer'] = 'The musics of this movie is composed by [Music composer].'
    lessicalized_prop['Based on'] = 'This movie is based on [Based on].'
    lessicalized_prop['Subject'] = 'this movie explores [Subject] themes.'
    lessicalized_prop['Cinematography'] = 'The visuals of this movie are captured by [Cinematography].'
    lessicalized_prop['Writer'] = 'The narrative of this movie is penned by [Writer].'
    lessicalized_prop['Director'] = 'Guiding the vision of this movie is the director, [Director].'


In [24]:
mapping_name_rel =property_graph.set_index('rel').join(mapping_relation.loc[:, ['name_rel', 'id']].set_index('id')).reset_index()
goupd_mapping = mapping_name_rel.groupby(['item', 'name_rel']).agg({
    'prop':list
})

In [25]:
temp = goupd_mapping.reset_index()
kn_lex = {}
property_mapping_dict = property_mapping.to_dict()
for item in temp['item'].unique():
    item_df = temp[temp['item'] == item].sort_values('name_rel')
    item_name = mapping_dict.get(item, None)
    if item_name == None:
        continue
    prompt = f'{item_name} is a {items}.'
    for _, row in item_df.iterrows():
        names_property = [property_mapping_dict[x] for x in row['prop'] if property_mapping_dict.get(x, None)!=None]
        prompt += f" {lessicalized_prop[row['name_rel']]}"
        if len(names_property) > 1:
            names_property_str = ", ".join(names_property)
        else:
            names_property_str = names_property[0]
        prompt = prompt.replace(f"[{row['name_rel']}]", names_property_str)
    kn_lex[item] = prompt.strip()

In [26]:
kn_lex

{6040: "Jumanji is a movie. This movie is based on Thomas E. Ackerman. This movie is set in Robert Dalva. The editing in this movie is handled by Awards for David Alan Grier, List of awards and nominations received by Robin Williams, List of awards and nominations received by Kirsten Dunst, Awards for Bebe Neuwirth, Jonathan Hyde, Bonnie Hunt. This movie is primarily in Ted Field, Robert W. Cort. The musics of this movie is composed by Joe Johnston. Producing this movie is Greg Taylor, Jonathan Hensleigh. In this movie, Film scores by James Horner, Films directed by Joe Johnston, Films set in 1969, 1995 films, Films using computer-generated imagery, Time loop films, American adventure comedy films, Films set in 1995, Films shot in New Hampshire, Films set in the 1860s, Films based on children's books, Films shot in Maine, Interscope Communications films, American fantasy-comedy films, Fictional games, Films about orphans, English-language films, American children's fantasy films, Ameri

In [27]:
import json

with open(os.path.join(datapath, f'{dataset}_domain_kn_graph.jsonl'), 'w') as outfile:
    for new_id, desc in kn_lex.items():
        json.dump({'target_id':str(new_id), 'text':desc}, outfile)
        outfile.write('\n')

In [28]:
item_desc_df = pd.read_json(os.path.join(datapath, f'{dataset}_domain_kn_graph.jsonl'), lines=True) 
item_desc_df['target_id'] = item_desc_df['target_id'].astype(int)
item_desc_dict = item_desc_df.set_index('target_id').to_dict()['text']

## Prepare prompt

In [29]:
train_group.head()

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp
0,0,"[8213, 6267, 6857, 6858, 6916, 7258, 8270, 686...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...",49,"[0, 2, 3, 4, 6, 9, 15, 19, 23, 29, 30, 31, 32,...","[1, 5, 7, 8, 10, 12, 16, 17, 18, 20, 21, 22, 2...","[48, 42, 47]","[11, 13, 14, 24, 28]"
1,1,"[6493, 8733, 7257, 7887, 7444, 8122, 7061, 789...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...",122,"[0, 2, 6, 10, 14, 15, 20, 28, 31, 37, 42, 46, ...","[1, 5, 8, 13, 25, 26, 30, 34, 36, 38, 40, 44, ...","[66, 3, 67, 70, 7, 71, 73, 74, 11, 19, 21, 85,...","[4, 9, 12, 16, 17, 18, 22, 27, 29, 32, 33, 39,..."
2,2,"[7725, 6597, 7059, 6599, 8902, 6560, 7622, 717...","[1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, ...",50,"[0, 8, 11, 14, 18, 19, 20, 24, 27, 31, 36, 38,...","[2, 3, 6, 10, 12, 13, 15, 16, 22, 25, 28, 29, ...","[34, 35, 21, 39, 9, 26]","[1, 4, 5, 7, 46, 17, 23, 30]"
3,3,"[6459, 8213, 6267, 8430, 8897, 6916, 7139, 697...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, ...",20,"[3, 8, 10, 14, 15, 16, 17, 19]","[0, 1, 2, 5, 6, 9, 11, 12, 18]",[7],"[4, 13]"
4,4,"[8113, 7440, 6493, 6044, 6270, 8538, 7671, 812...","[1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, ...",188,"[0, 2, 130, 10, 11, 138, 141, 19, 147, 155, 28...","[129, 134, 7, 8, 139, 15, 16, 20, 149, 22, 150...","[1, 3, 6, 135, 9, 137, 140, 14, 17, 146, 148, ...","[128, 131, 4, 5, 133, 132, 136, 12, 13, 142, 1..."


In [30]:
mapping_dict[8670]

'Wayne s World'

In [31]:
def map_with_names(row, col, mapping_dict):
    return [mapping_dict.get(x, '') for x in np.array(row['item'])[row[col]] if len(mapping_dict.get(x, ''))>2]


train_group['liked_prompt'] = train_group.apply(map_with_names, axis=1, col='index_like_prompt', mapping_dict=mapping_dict)
train_group['liked_resp'] = train_group.apply(map_with_names, axis=1, col='index_like_resp', mapping_dict=mapping_dict)
train_group['disliked_prompt'] = train_group.apply(map_with_names, axis=1, col='index_dislike_prompt', mapping_dict=mapping_dict)
train_group['disliked_resp'] = train_group.apply(map_with_names, axis=1, col='index_dislike_resp', mapping_dict=mapping_dict)

test_group['liked_prompt'] = test_group.apply(map_with_names, axis=1, col='index_like_prompt', mapping_dict=mapping_dict)
test_group['liked_resp'] = test_group.apply(map_with_names, axis=1, col='index_like_resp', mapping_dict=mapping_dict)
test_group['disliked_prompt'] = test_group.apply(map_with_names, axis=1, col='index_dislike_prompt', mapping_dict=mapping_dict)
test_group['disliked_resp'] = test_group.apply(map_with_names, axis=1, col='index_dislike_resp', mapping_dict=mapping_dict)

In [32]:
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp
0,0,"[8213, 6267, 6857, 6858, 6916, 7258, 8270, 686...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...",49,"[0, 2, 3, 4, 6, 9, 15, 19, 23, 29, 30, 31, 32,...","[1, 5, 7, 8, 10, 12, 16, 17, 18, 20, 21, 22, 2...","[48, 42, 47]","[11, 13, 14, 24, 28]","[Run Lola Run Lola rennt, Mary Poppins, Dumbo,...","[Ponette, Sound of Music The, Aladdin, Hunchba...","[Tarzan, James and the Giant Peach, Princess B...","[Wallace Gromit The Best of Aardman Animation,..."
1,1,"[6493, 8733, 7257, 7887, 7444, 8122, 7061, 789...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...",122,"[0, 2, 6, 10, 14, 15, 20, 28, 31, 37, 42, 46, ...","[1, 5, 8, 13, 25, 26, 30, 34, 36, 38, 40, 44, ...","[66, 3, 67, 70, 7, 71, 73, 74, 11, 19, 21, 85,...","[4, 9, 12, 16, 17, 18, 22, 27, 29, 32, 33, 39,...","[Remains of the Day The, Lost World Jurassic P...","[Key Largo, Matrix The, Doctor Zhivago, Green ...","[Breakfast Club The, Children of a Lesser God,...","[Man in the Iron Mask The, Get Shorty, Fisher ..."


In [33]:
train = []
test = []
for _, row in test_group.iterrows():
    for item in row['liked_prompt']:
        train.append({
            'user_id': row['user'],
            'item_id': item,
            'score': 1
        })
    for item in row['disliked_prompt']:
        train.append({
            'user_id': row['user'],
            'item_id': item,
            'score': 0
        })
    for item in row['liked_resp']:
        test.append({
            'user_id': row['user'],
            'item_id': item,
            'score': 1
        })
    for item in row['disliked_resp']:
        test.append({
            'user_id': row['user'],
            'item_id': item,
            'score': 0
        })

pd.DataFrame(train).to_csv("train_adaptation.csv")
                    

In [34]:
import random
import itertools

def prepare_user_prompt(row, test=False):
    random.seed(22)
    user_prompt = ''
    liked_list = ', '
    disliked_list = ', '
    liked_part = """I like the following items:\n{liked_list}\n"""
    disliked_part = """I dislike the following items:\n{disliked_list}\n"""
    liked_items = sorted(row['liked_prompt'], key=lambda k: random.random())
    disliked_items = sorted(row['disliked_prompt'], key=lambda k: random.random())
    liked_list = liked_list.join(liked_items)
    disliked_list = disliked_list.join(disliked_items)
    if len(liked_items)>0:
        user_prompt = user_prompt + liked_part.format(liked_list=liked_list)
    if len(disliked_list)>0:
        user_prompt = user_prompt + disliked_part.format(disliked_list=disliked_list)
    
    to_rank_list_str = ', '
    to_rank_list = list(itertools.chain(*[row['liked_resp'], row['disliked_resp']]))
    if test:
        response_part = f"""\nSugges me {k_prompt} items"""
    else:
        response_part = f"""\nSugges me {len(to_rank_list)} items"""
    to_rank_list = sorted(to_rank_list, key=lambda k: random.random())
    to_rank_list = to_rank_list_str.join(to_rank_list)
    user_prompt = user_prompt + response_part#.format(to_rank_list=to_rank_list)
    return user_prompt
        
def get_assistant_prompt(row):
    ranked_list_str = ''
    rank_part = """Here a list:\n{ranked_list}\n"""
    for i, x in enumerate(row['liked_resp']):
        ranked_list_str = ranked_list_str + f"{i+1}. {x}\n"
    for i, x in enumerate(row['disliked_resp'], len(row['liked_resp'])):
        ranked_list_str = ranked_list_str + f"{i+1}. {x}\n"
    return rank_part.format(ranked_list=ranked_list_str)[:-1]


train_group['user_prompt'] = train_group.apply(prepare_user_prompt, axis=1)
train_group['assistant_prompt'] = train_group.apply(get_assistant_prompt, axis=1)

test_group['user_prompt'] = test_group.apply(prepare_user_prompt, axis=1, test=True)
test_group['assistant_prompt'] = test_group.apply(get_assistant_prompt, axis=1)

In [35]:
print(train_group.loc[150, 'user_prompt'])

I like the following items:
Maltese Falcon The, Monty Python and the Holy Grail, Magnum Force, Run Silent Run Deep, Who Framed Roger Rabbit, Bell Book and Candle, Bringing Out the Dead, From Here to Eternity, Gladiator, Ravenous, Taking of Pelham One Two Three The, Messenger The Story of Joan of Arc The
I dislike the following items:
Entrapment, Boiler Room, Braddock Missing in Action III, Clerks, Gone in 60 Seconds, Get Carter, Bowfinger, Star Trek The Motion Picture, Space Cowboys, Wild Wild West, Arachnophobia, Mystery Men, Teaching Mrs Tingle, Pitch Black, Sixth Sense The, Cell The, Civil Action A, Reindeer Games, Payback, Back to the Future

Sugges me 34 items


In [36]:
print(train_group.loc[150, 'assistant_prompt'])

Here a list:
1. Rushmore
2. Soapdish
3. Romeo Must Die
4. Dead Calm
5. Matrix The
6. Summer of Sam
7. Fight Club
8. Pacific Heights
9. Gattaca
10. Election
11. 13th Warrior The
12. Predator
13. Moonstruck
14. General s Daughter The
15. Battlefield Earth
16. Blair Witch Project The
17. Until the End of the World Bis ans Ende der Welt
18. Shaft
19. Pushing Tin
20. Art of War The
21. Quick and the Dead The
22. Tron
23. Carrie
24. Thomas Crown Affair The
25. Under Siege 2 Dark Territory
26. X Men
27. Godfather The
28. Poseidon Adventure The
29. Meteor
30. Hollow Man
31. Faculty The
32. Mission Impossible 2
33. eXistenZ
34. Powder



In [37]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("../llama3")
tokenizer.pad_token = tokenizer.eos_token
#tokenizer.chat_template =  "{% set ns = namespace(i=0) %}" \
#                            "{% for message in messages %}" \
#                                "{% if message['role'] == 'system' and ns.i == 0 %}" \
#                                       "{{ bos_token +' [INST] <<SYS>>\n' }}" \
#                                       "{{ message['content'] + ' <</SYS>>\n'}}" \
#                                "{% elif message['role'] == 'user' %}" \
#                                    "{{ message['content'] + ' [/INST]\n'}}" \
#                                "<|reserved_special_token_22|>\n"\
#                                "{% elif message['role'] == 'assistant' %}" \
#                                    "{{ message['content'] + ' ' + eos_token }}" \
#                                "{% endif %}" \
#                                "{% set ns.i = ns.i+1 %}" \
#                            "{% endfor %}"
#
tokenizer.chat_template =  "{% set ns = namespace(i=0) %}" \
                            "{% for message in messages %}" \
                                "{% if message['role'] == 'system' and ns.i == 0 %}" \
                                       "{{ bos_token +' [INST] <<SYS>>\n' }}" \
                                       "{{ message['content'] + ' <</SYS>>\n'}}" \
                                "{% elif message['role'] == 'user' %}" \
                                    "{{ message['content'] + ' [/INST]\n'}}" \
                                "{% elif message['role'] == 'assistant' %}" \
                                    "{{ message['content'] + '' + eos_token }}" \
                                "{% endif %}" \
                                "{% set ns.i = ns.i+1 %}" \
                            "{% endfor %}"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
def get_prompt(row, is_train=True):
    messages = [
    {"role": "system", "content": f"You're operating as a {domain} recommendation system. Your task involves receiving a user's list of liked and disliked items, based on that information, you need to return a ranked list of recommended items the user is most likely to enjoy"},
    {"role": "user", "content": row['user_prompt']}
    ]
    if is_train:
        messages.append({"role": "assistant", "content": row['assistant_prompt']})
    return tokenizer.apply_chat_template(messages, tokenize=False)

In [39]:
train_group['prompt'] = train_group.apply(get_prompt, axis=1, is_train=True)
test_group['prompt'] = test_group.apply(get_prompt, axis=1, is_train=False)

In [40]:
print(train_group['prompt'][0])

<|begin_of_text|> [INST] <<SYS>>
You're operating as a movie recommendation system. Your task involves receiving a user's list of liked and disliked items, based on that information, you need to return a ranked list of recommended items the user is most likely to enjoy <</SYS>>
I like the following items:
Dumbo, Antz, Schindler s List, Mary Poppins, Sixth Sense The, Toy Story 2, Last Days of Disco The, Miracle on 34th Street, Saving Private Ryan, Bug s Life A, Snow White and the Seven Dwarfs, Pocahontas, Beauty and the Beast, Airplane, Bambi, Girl Interrupted, Titanic, Run Lola Run Lola rennt, Gigi, E T the Extra Terrestrial
I dislike the following items:
James and the Giant Peach, Tarzan, Princess Bride The

Sugges me 24 items [/INST]
Here a list:
1. Ponette
2. Sound of Music The
3. Aladdin
4. Hunchback of Notre Dame The
5. Erin Brockovich
6. Secret Garden The
7. Apollo 13
8. Wizard of Oz The
9. Dead Poets Society
10. Hercules
11. Awakenings
12. Ferris Bueller s Day Off
13. One Flew O

In [41]:
print(test_group['prompt'][0])

<|begin_of_text|> [INST] <<SYS>>
You're operating as a movie recommendation system. Your task involves receiving a user's list of liked and disliked items, based on that information, you need to return a ranked list of recommended items the user is most likely to enjoy <</SYS>>
I like the following items:
Gone in 60 Seconds, Tequila Sunrise, To Gillian on Her 37th Birthday, Erin Brockovich, Legends of the Fall, Shanghai Noon, Pocahontas, Mary Poppins, Aladdin, Hercules, My Best Friend s Wedding, Top Gun, Straight Story The, Shakespeare in Love, Where the Heart Is, Murphy s Romance, Gladiator, Babe, Grease, Mask of Zorro The, Moonstruck, Umbrellas of Cherbourg The Parapluies de Cherbourg Les, Beauty and the Beast
I dislike the following items:
Pulp Fiction, Speed, Arthur, Peggy Sue Got Married, Dances with Wolves, Other Sister The, Splash, Apple Dumpling Gang The

Sugges me 100 items [/INST]



In [42]:
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp,user_prompt,assistant_prompt,prompt
0,0,"[8213, 6267, 6857, 6858, 6916, 7258, 8270, 686...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...",49,"[0, 2, 3, 4, 6, 9, 15, 19, 23, 29, 30, 31, 32,...","[1, 5, 7, 8, 10, 12, 16, 17, 18, 20, 21, 22, 2...","[48, 42, 47]","[11, 13, 14, 24, 28]","[Run Lola Run Lola rennt, Mary Poppins, Dumbo,...","[Ponette, Sound of Music The, Aladdin, Hunchba...","[Tarzan, James and the Giant Peach, Princess B...","[Wallace Gromit The Best of Aardman Animation,...","I like the following items:\nDumbo, Antz, Schi...",Here a list:\n1. Ponette\n2. Sound of Music Th...,<|begin_of_text|> [INST] <<SYS>>\nYou're opera...
1,1,"[6493, 8733, 7257, 7887, 7444, 8122, 7061, 789...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...",122,"[0, 2, 6, 10, 14, 15, 20, 28, 31, 37, 42, 46, ...","[1, 5, 8, 13, 25, 26, 30, 34, 36, 38, 40, 44, ...","[66, 3, 67, 70, 7, 71, 73, 74, 11, 19, 21, 85,...","[4, 9, 12, 16, 17, 18, 22, 27, 29, 32, 33, 39,...","[Remains of the Day The, Lost World Jurassic P...","[Key Largo, Matrix The, Doctor Zhivago, Green ...","[Breakfast Club The, Children of a Lesser God,...","[Man in the Iron Mask The, Get Shorty, Fisher ...","I like the following items:\nGandhi, Strictly ...",Here a list:\n1. Key Largo\n2. Matrix The\n3. ...,<|begin_of_text|> [INST] <<SYS>>\nYou're opera...


In [43]:
def get_description_liked(row):
    liked_item_index = row['index_like_resp']
    likex_items = np.array(row['item'])[liked_item_index]
    return '\n'.join([item_desc_dict.get(y, '') for y in likex_items])

In [44]:
train_group['descriptions'] = train_group['item'].apply(lambda x: [item_desc_dict.get(y, '') for y in x if len(item_desc_dict.get(y, ''))>2])
#train_group['descriptions'] = train_group.apply(get_description_liked, axis=1)

In [45]:
train_group.head(2)

Unnamed: 0,user,item,rating,len,index_like_prompt,index_like_resp,index_dislike_prompt,index_dislike_resp,liked_prompt,liked_resp,disliked_prompt,disliked_resp,user_prompt,assistant_prompt,prompt,descriptions
0,0,"[8213, 6267, 6857, 6858, 6916, 7258, 8270, 686...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...",49,"[0, 2, 3, 4, 6, 9, 15, 19, 23, 29, 30, 31, 32,...","[1, 5, 7, 8, 10, 12, 16, 17, 18, 20, 21, 22, 2...","[48, 42, 47]","[11, 13, 14, 24, 28]","[Run Lola Run Lola rennt, Mary Poppins, Dumbo,...","[Ponette, Sound of Music The, Aladdin, Hunchba...","[Tarzan, James and the Giant Peach, Princess B...","[Wallace Gromit The Best of Aardman Animation,...","I like the following items:\nDumbo, Antz, Schi...",Here a list:\n1. Ponette\n2. Sound of Music Th...,<|begin_of_text|> [INST] <<SYS>>\nYou're opera...,[Run Lola Run Lola rennt is a movie. This movi...
1,1,"[6493, 8733, 7257, 7887, 7444, 8122, 7061, 789...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...",122,"[0, 2, 6, 10, 14, 15, 20, 28, 31, 37, 42, 46, ...","[1, 5, 8, 13, 25, 26, 30, 34, 36, 38, 40, 44, ...","[66, 3, 67, 70, 7, 71, 73, 74, 11, 19, 21, 85,...","[4, 9, 12, 16, 17, 18, 22, 27, 29, 32, 33, 39,...","[Remains of the Day The, Lost World Jurassic P...","[Key Largo, Matrix The, Doctor Zhivago, Green ...","[Breakfast Club The, Children of a Lesser God,...","[Man in the Iron Mask The, Get Shorty, Fisher ...","I like the following items:\nGandhi, Strictly ...",Here a list:\n1. Key Largo\n2. Matrix The\n3. ...,<|begin_of_text|> [INST] <<SYS>>\nYou're opera...,[Remains of the Day The is a movie. This movie...


In [46]:
train_group.to_pickle(os.path.join(datapath, f'{dataset}_train_dataset_all_items.pkl'))
test_group.to_pickle(os.path.join(datapath, f'{dataset}_test_dataset_all_items.pkl'))

In [47]:
prompts = train_group['prompt'].tolist() * 3

In [48]:
descriptions = list(item_desc_dict.values())

In [49]:
descriptions = [f'{tokenizer.bos_token} ' + x + f' {tokenizer.eos_token}' for x in descriptions]

In [50]:
import random 

random.seed(22)
train_set = prompts + descriptions

random.shuffle(train_set)

In [51]:
len(train_set)

17325

In [52]:
import json

with open(os.path.join(datapath, f'{dataset}_train_set_no_kn_all_items.jsonl'), 'w') as outfile:
    for x in train_group['prompt'].tolist():
        json.dump({'text':x}, outfile)
        outfile.write('\n')

In [53]:
import json

with open(os.path.join(datapath, f'{dataset}_train_set_kgraph_all_items.jsonl'), 'w') as outfile:
    for x in train_set:
        json.dump({'text':x}, outfile)
        outfile.write('\n')

In [54]:
prompts = test_group['prompt'].tolist()

In [55]:
import json

with open(os.path.join(datapath, f'{dataset}_test_set_all_items.jsonl'), 'w') as outfile:
    for x in prompts:
        json.dump({'text':x}, outfile)
        outfile.write('\n')

# Baseline

In [56]:
"""train_bs = pd.concat([test_group.loc[:, ['user', 'index_like_prompt', 'index_dislike_prompt', 'item']],\
                      train_group.loc[:, ['user', 'index_like_prompt', 'index_dislike_prompt', 'item']]])
test_bs = test_group.loc[:, ['user', 'index_like_resp', 'index_dislike_resp', 'item']]"""

"train_bs = pd.concat([test_group.loc[:, ['user', 'index_like_prompt', 'index_dislike_prompt', 'item']],                      train_group.loc[:, ['user', 'index_like_prompt', 'index_dislike_prompt', 'item']]])\ntest_bs = test_group.loc[:, ['user', 'index_like_resp', 'index_dislike_resp', 'item']]"

In [57]:
"""train_bs['bs_format'] = train_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 1} for item in x['index_like_prompt']], axis=1)
train_bs['bs_format'] = train_bs['bs_format'] + train_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 0} for item in x['index_dislike_prompt']], axis=1)

test_bs['bs_format'] = test_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 1} for item in x['index_like_resp']], axis=1)
test_bs['bs_format'] = test_bs['bs_format'] + test_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 0} for item in x['index_dislike_resp']], axis=1)"""

"train_bs['bs_format'] = train_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 1} for item in x['index_like_prompt']], axis=1)\ntrain_bs['bs_format'] = train_bs['bs_format'] + train_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 0} for item in x['index_dislike_prompt']], axis=1)\n\ntest_bs['bs_format'] = test_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 1} for item in x['index_like_resp']], axis=1)\ntest_bs['bs_format'] = test_bs['bs_format'] + test_bs.apply(lambda x:[{'user_id:token': x['user'], 'item_id:token': x['item'][item], 'rating:float': 0} for item in x['index_dislike_resp']], axis=1)"

In [58]:
"""import itertools

os.makedirs(os.path.join(datapath, 'baseline'), exist_ok=True)
pd.DataFrame(list(itertools.chain(*test_bs['bs_format'].tolist()))).to_csv(os.path.join(datapath, 'baseline', f'{dataset}.part3.inter'), sep='\t', index=False)
pd.DataFrame(list(itertools.chain(*train_bs['bs_format'].tolist()))).to_csv(os.path.join(datapath, 'baseline', f'{dataset}.part1.inter'), sep='\t', index=False)"""

"import itertools\n\nos.makedirs(os.path.join(datapath, 'baseline'), exist_ok=True)\npd.DataFrame(list(itertools.chain(*test_bs['bs_format'].tolist()))).to_csv(os.path.join(datapath, 'baseline', f'{dataset}.part3.inter'), sep='\t', index=False)\npd.DataFrame(list(itertools.chain(*train_bs['bs_format'].tolist()))).to_csv(os.path.join(datapath, 'baseline', f'{dataset}.part1.inter'), sep='\t', index=False)"