In [1]:
import os

# --- Run configuration: edit these three values to select the experiment ---
dataset = 'movielens'
file_name = 'results_llama3_collaborative_2steps'
all_items = False

# Item domain for each supported dataset.
_DOMAIN_BY_DATASET = {
    'lastfm': 'music',
    'dbbook': 'book',
    'movielens': 'movie',
}

# Output-file suffix for each known results file, "all items" runs.
_SUFFIX_ALL_ITEMS = {
    'results_llama3_text_all_items': 'text_all_items',
    'results_llama3_no_kn_all_items': 'no_kn_all_items',
    'results_llama3_kgraph_all_items': 'graph_all_items',
    'results_llama3_chat_model_all_items': 'chat_all_items',
    'results_llama3_graph_text_all_items': 'graph_text_all_items',
    'results_llama3_collaborative_all_items': 'collaborative_all_items',
}

# Output-file suffix for each known results file, regular runs.
_SUFFIX_DEFAULT = {
    'results_llama3_adapted': 'Text',
    'results_llama3_adapted_no_kn': 'no_kn',
    'results_llama3_kgraph': 'graph',
    'results_llama3_chat_model': 'chat',
    'results_llama3_graph_text': 'graph_text',
    'results_llama3_collaborative': 'collaborative',
    'results_llama3_graph_text_double': 'graph_text_double',
    'results_llama3_collaborative_graph': 'collaborative_graph',
    'results_llama3_collaborative_graph_text': 'collaborative_graph_text',
    'results_llama3_collaborative_text': 'collaborative_text',
    'results_llama3_text_2steps': 'Text_2steps',
    'results_llama3_graph_2steps': 'graph_2steps',
    'results_llama3_collaborative_2steps': 'collaborative_2steps',
}


def _pick(table, key, what):
    """Return ``table[key]``; on a miss raise a KeyError listing the valid keys."""
    try:
        return table[key]
    except KeyError:
        raise KeyError(
            f"unknown {what} {key!r}; expected one of {sorted(table)}"
        ) from None


domain = _pick(_DOMAIN_BY_DATASET, dataset, 'dataset')
datapath = os.path.join('..', 'data', dataset)

# Name (without extension) of the TSV file the predictions are written to.
_suffix = _pick(_SUFFIX_ALL_ITEMS if all_items else _SUFFIX_DEFAULT, file_name, 'file_name')
out_file = f'predicted_score_LLaMA_{domain.capitalize()}_{_suffix}'


In [2]:
import pandas as pd
from tqdm import tqdm
import json

In [3]:
# Load the ground-truth test set from the directory named after `domain`
# (relative to the working directory). The "all items" variant lives in a
# separate pickle.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
gt_df = pd.read_pickle(
    os.path.join(
        domain,
        f'{dataset}_test_dataset_all_items.pkl' if all_items else f'{dataset}_test_dataset.pkl',
    )
)
# Expected items per row: the liked responses followed by the disliked ones
# (presumably both columns hold lists, so `+` concatenates them; verify).
gt_df['true_rank'] = gt_df['liked_resp'] + gt_df['disliked_resp']

In [None]:
# Read the raw model output: one JSON document per line.
with open(os.path.join(domain, file_name+'.jsonl'), 'r') as json_file:
    json_list = json_file.readlines()

# Decode every line into a Python object, showing a progress bar.
results = [json.loads(json_str) for json_str in tqdm(json_list)]

In [None]:
# One row per decoded JSON record; preview the first two rows.
res_df = pd.DataFrame(results)
res_df.head(2)

In [None]:
# Spot-check one raw response (the row labelled 2 in the index).
res_df['response'][2]

In [7]:
import numpy as np

def get_tsv_gt(row):
    """Expand one ground-truth row into per-item relevance records.

    Parameters
    ----------
    row : mapping
        Must provide ``'user'``, ``'item'`` (sequence of candidate items) and
        ``'index_like_resp'`` / ``'index_dislike_resp'`` (positions into
        ``'item'``).

    Returns
    -------
    list of dict
        ``{'user', 'item', 'score'}`` records: score 1 for every liked
        position, followed by score 0 for every disliked position.
    """
    items = np.array(row['item'])
    user = row['user']

    def pick(positions):
        # Index with a plain 1-D integer array so the result does not depend
        # on how the installed NumPy interprets a list-of-lists index.
        idx = np.atleast_1d(np.asarray(positions))
        if idx.size == 0:
            # An empty list defaults to float dtype, which cannot index.
            idx = idx.astype(np.intp)
        return items[idx].tolist()

    liked = [{'user': user, 'item': it, 'score': 1} for it in pick(row['index_like_resp'])]
    disliked = [{'user': user, 'item': it, 'score': 0} for it in pick(row['index_dislike_resp'])]
    return liked + disliked

# Build the per-item ground-truth records for every row (applied row-wise).
gt_df['tsv_gt'] = gt_df.apply(get_tsv_gt, axis=1)

In [8]:
import itertools


# Flatten the per-row record lists into one table and write it as a
# tab-separated file inside the `domain` directory.
pd.DataFrame(
    list(itertools.chain.from_iterable(gt_df['tsv_gt'].tolist()))
).to_csv(os.path.join(domain, 'ground_truth.tsv'), sep='\t', index=False)

In [9]:
# Keep a single row per distinct prompt (the first occurrence is retained).
res_df = res_df.drop_duplicates(subset=['prompt'])

In [None]:
# Display the de-duplicated results.
res_df

In [None]:
# Number of result rows vs. number of ground-truth rows.
len(res_df), len(gt_df)

In [12]:
# Split every raw response into a list of lines.
# NOTE: not re-runnable — after the first run the column holds lists, and
# calling x.split on a list raises AttributeError.
res_df['response'] = res_df['response'].apply(lambda x: x.split('\n'))

In [13]:
import re

def get_ranked_list(list_items):
    """Parse the lines of a model response into rank/name records.

    The first element of ``list_items`` is skipped. Every other line yields
    at most one record: the first run of digits together with the single
    character that follows it is taken as the rank marker, and the line with
    that marker removed (and stripped) is the item name.

    Parameters
    ----------
    list_items : list of str
        Lines of one response.

    Returns
    -------
    list of dict
        ``{'rank': int, 'name': str}`` records in line order. Lines with no
        marker, an empty name, or digits glued to a word character are
        skipped.
    """
    ranked_list = []
    for line in list_items[1:]:
        marker = re.search('[0-9]+.', line)
        if marker is None:
            continue
        # Remove the marker by position. Re-using the matched text as a regex
        # pattern breaks when the separator is a metacharacter such as ')'.
        name = (line[:marker.start()] + line[marker.end():]).strip()
        if not name:
            continue
        digits = re.findall(r'\b\d+\b', marker.group())
        if not digits:
            # Digits directly followed by a word character (e.g. "3rd").
            continue
        ranked_list.append({'rank': int(digits[0]), 'name': name})
    return ranked_list

# Parse every response (a list of lines) into its list of rank/name records.
res_df['ranked_list'] = res_df['response'].apply(get_ranked_list)

In [None]:
# Display the results, now including the parsed `ranked_list` column.
res_df

In [15]:
def get_missing(row):
    """Compare the names the model returned with the expected names.

    Parameters
    ----------
    row : mapping
        Must provide ``'ranked_list'`` (records with a ``'name'`` key) and
        ``'true_rank'`` (iterable of expected names).

    Returns
    -------
    dict
        ``'missing'``: expected names absent from the prediction,
        ``'added'``: predicted names that were not expected, and the sizes
        of both sets under ``'#missing'`` / ``'#added'``.
    """
    predicted_names = {entry['name'] for entry in row['ranked_list']}
    expected_names = set(row['true_rank'])
    not_returned = expected_names - predicted_names
    unexpected = predicted_names - expected_names
    return {
        'missing': not_returned,
        '#missing': len(not_returned),
        'added': unexpected,
        '#added': len(unexpected),
    }

In [16]:
# Attach `true_rank` and `user` from the ground truth to each result, matching
# rows on `prompt` (left join: results without a match get missing values).
# NOTE: reset_index(drop=True) discards the `prompt` column, so this cell
# cannot be re-run without rebuilding res_df first.
res_df = res_df.set_index('prompt').join(gt_df.set_index('prompt').loc[:, ['true_rank', 'user']]).reset_index(drop=True)

In [None]:
# Display the results after attaching the ground-truth columns.
res_df

In [18]:
# Drop, in place, every row that has a missing value in any column.
res_df.dropna(inplace=True)

In [19]:
# Per row: which expected names are missing from the prediction and which
# predicted names were not expected (see get_missing).
res_df['missing_details'] = res_df.loc[:, ["ranked_list","true_rank"]].apply(get_missing, axis=1)

In [20]:
# Lift the two counts out of the details dict into their own columns.
res_df['#missing'] = res_df['missing_details'].apply(lambda x: x['#missing'])
res_df['#added'] = res_df['missing_details'].apply(lambda x: x['#added'])

In [None]:
# Summary statistics of the number of expected-but-missing items per row.
res_df['#missing'].describe()

In [None]:
# Summary statistics of the number of unexpected items per row.
res_df['#added'].describe()

In [23]:
def get_score(ranked_list):
    """Turn ranks into linearly decreasing scores.

    The records are sorted by ``'rank'`` (ascending). With ``n`` records,
    the record at sorted position ``i`` gets ``score = 1 - i/n``, so the
    best-ranked record scores 1. An empty input returns an empty list.

    Parameters
    ----------
    ranked_list : list of dict
        Records with at least a ``'rank'`` key.

    Returns
    -------
    list of dict
        New records (inputs are not modified) with a ``'score'`` key added.
    """
    ordered = sorted(ranked_list, key=lambda entry: entry['rank'])
    step = 1/len(ordered) if ordered else 0
    scored = []
    for position, entry in enumerate(ordered):
        scored.append({**entry, 'score': 1-step*position})
    return scored

# Replace each ranked list with its rank-sorted, scored version.
res_df['ranked_list'] = res_df['ranked_list'].apply(get_score)

In [None]:
if dataset=='movielens':
    def fix_title(title):
        """Move a trailing ", The" / ", A" article to the front of a title.

        Example: "Matrix, The (1999)" -> "The Matrix (1999)". Titles
        containing neither ", The (" nor ", A (" are returned unchanged.
        """
        if ", The (" in title:
            name_film, _, year = title.rpartition(", The (")
            title = "The " + name_film + " (" + year
            return title
        if ", A (" in title:
            name_film, _, year = title.rpartition(", A (")
            title = "A " + name_film + " (" + year
        return title

    # NOTE(review): this frame (and the fix_title pass over it) is replaced by
    # the DBpedia-based frame built below before it is ever used. Confirm
    # whether the movies.dat titles were meant to feed the mapping.
    # A multi-character separator needs the Python parsing engine; requesting
    # it explicitly avoids pandas' ParserWarning fallback.
    df_movies = pd.read_csv(os.path.join(datapath, r"movies.dat"), sep="::", names=["item_id", "name", "geners"], encoding='ISO-8859-1', engine='python')
    df_movies['name'] = df_movies['name'].apply(fix_title)
    import re

    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_entities.tsv'),
            names=['url', 'id_set'], sep='\t')
    dbpedia_mapping = pd.read_csv(os.path.join(datapath, 'MappingMovielens2DBpedia-1.2.tsv'),
            names=['id_movie', 'name', 'dbpedia_url'], sep='\t')
    # Match each DBpedia-mapped movie with its entity id via the DBpedia URL.
    df_movies = dbpedia_mapping.set_index('dbpedia_url').join(df_relations.set_index('url')).reset_index()
    # Normalise names: drop a trailing "(digits)" group, then collapse every
    # run of non-alphanumeric characters into a single space.
    # Raw string: "\s", "\(", "\d" are invalid escapes in a normal literal.
    df_movies['name'] = df_movies['name'].apply(lambda x: re.sub(r"\s+\(\d+\)$", "", x))
    df_movies['name'] = df_movies['name'].apply(lambda x: re.sub(r"[^a-zA-Z0-9]+", ' ', x).strip())
    # Rows without a matching entity id have a missing id_set; drop them
    # before the integer cast.
    df_movies.dropna(inplace=True)
    df_movies['id_set'] = df_movies['id_set'].astype(int)
    mapping_dict = df_movies.loc[:, ['id_set', 'name']].set_index('id_set').to_dict()['name']
    # Invert to name -> id; if several ids share a name, the last one wins.
    mapping_dict = {v:k for k, v in mapping_dict.items()}

In [25]:
if dataset == 'dbbook':
    # The entity file is read with its own header row; the item name is the
    # part of `uri` before the first ';'.
    df_relations = pd.read_csv(os.path.join(datapath, 'mapping_entities.tsv'), sep='\t')
    df_relations['name'] = df_relations['uri'].map(lambda uri: uri.split(";")[0])
    # name -> id lookup; if several ids share a name, the last one wins.
    mapping_dict = {
        name: item_id
        for item_id, name in df_relations.set_index('id')['name'].to_dict().items()
    }

In [26]:
import os 

if dataset == 'lastfm':
    # Header-less two-column file: item id, item name.
    df_relations = pd.read_csv(
        os.path.join(datapath, 'mapping_items.tsv'),
        names=['id', 'name'],
        sep='\t',
    )
    # name -> id lookup; if several ids share a name, the last one wins.
    mapping_dict = {
        name: item_id
        for item_id, name in df_relations.set_index('id')['name'].to_dict().items()
    }

In [28]:
def get_baseline_format(row, mapping_dict):
    """Convert one row's scored ranked list into user/item/score records.

    Parameters
    ----------
    row : mapping
        Must provide ``'user'`` and ``'ranked_list'`` (records with
        ``'name'`` and ``'score'`` keys).
    mapping_dict : dict
        Item name -> item id. Names absent from it are skipped.

    Returns
    -------
    list of dict
        ``{'user', 'item', 'score'}`` records, in ranked-list order. The user
        is cast to ``int`` unless the global ``dataset`` is 'boardgamegeek'.
    """
    tsv_list = []
    user_id = row['user']
    for item in row['ranked_list']:
        name = item['name']
        # Membership test, not a comparison with False: a mapped id of 0 is a
        # valid item and must not be dropped (0 == False in Python).
        if name not in mapping_dict:
            continue
        tsv_list.append({
            'user': int(user_id) if dataset not in ['boardgamegeek'] else user_id,
            'item': mapping_dict[name],
            'score': item['score'],
        })
    return tsv_list

# Build the output records for every row; predicted names that are not keys
# of mapping_dict are left out.
res_df['tsv_list'] = res_df.apply(get_baseline_format, axis=1, mapping_dict=mapping_dict)

In [29]:
import itertools

# Flatten the per-row record lists into one table and write it as a
# tab-separated file named after `out_file` inside the `domain` directory.
pd.DataFrame(
    list(itertools.chain.from_iterable(res_df['tsv_list'].tolist()))
).to_csv(os.path.join(domain, f'{out_file}.tsv'), sep='\t', index=False)