In [211]:
import json
import os
import pickle
import random
from collections import Counter, defaultdict
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from tqdm import tqdm

In [212]:
# Seed the stdlib RNG so later random sampling is repeatable.
seed = 42
random.seed(seed)
sample_num = 150

# Directory that holds the ml-1m `.dat` files; both reads below share it.
data_dir = 'XXXXXXXXXX/ml-1m'

# Ratings: one row per (user, item) interaction with its rating and unix timestamp.
rating = pd.read_csv(
    f'{data_dir}/ratings.dat',
    sep='::',
    header=None,
    names=['user', 'item', 'rating', 'timestamp'],
    engine='python',
)
# Movie metadata: id, title string and genre string.
title = pd.read_csv(
    f'{data_dir}/movies.dat',
    sep='::',
    header=None,
    names=['movie_id', 'title', 'genre'],
    engine='python',
    encoding='ISO-8859-1',
)

In [228]:
# Inspect the ratings DataFrame.
rating

Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [229]:
# (distinct items, distinct users) in the ratings table.
tuple(len(rating[column].unique()) for column in ('item', 'user'))

(3706, 6040)

In [230]:
# Minimum number of rating rows an item needs in order to be kept.
filter_num = 5

# Per-item count of rating rows, stored under the column name `cnt_user`.
item_user_counts = (
    rating
    .groupby(['item'], as_index=False)['user']
    .count()
    .rename(columns={'user': 'cnt_user'})
)

# Attach the count, keep sufficiently frequent items, then drop the helper column.
rating = (
    rating
    .merge(item_user_counts, on=['item'])
    .query(f'cnt_user >= {filter_num}')
    .reset_index(drop=True)
    .drop(columns=['cnt_user'])
)

In [5]:
# (distinct items, distinct users, number of rating rows) in the current ratings table.
item_count = len(rating['item'].unique())
user_count = len(rating['user'].unique())
interaction_count = len(rating['user'])
item_count, user_count, interaction_count

(3416, 6040, 999611)

In [80]:
# Set of the item ids present in `rating` (ids as stored in the `item` column).
item_set = {item for item in rating['item']}
len(item_set)

3416

In [7]:
# Map each movie id to its raw title once, instead of scanning `title` with a
# boolean mask for every rating row. `setdefault` keeps the first row per id,
# which matches the original `.values[0]` lookup.
title_by_movie_id = {}
for movie_id, raw_title in zip(title['movie_id'], title['title']):
    title_by_movie_id.setdefault(movie_id, raw_title)

# user id -> three parallel lists describing that user's interactions,
# appended in the row order of `rating`.
interaction_dicts = dict()
for user_id, item_id, timestamp in zip(rating['user'], rating['item'], rating['timestamp']):
    user_id = int(user_id)
    item_id = int(item_id)
    timestamp = int(timestamp)

    if user_id not in interaction_dicts:
        interaction_dicts[user_id] = {
            'item_id':[],
            'timestamp':[],
            'item_title':[],
        }
    user_history = interaction_dicts[user_id]
    user_history['item_id'].append(item_id)
    user_history['timestamp'].append(timestamp)
    # Keep only the text before the first " (" (presumably strips the year suffix).
    # An item id missing from `title` raises KeyError here.
    item_title = title_by_movie_id[item_id].split(' (')[0]
    user_history['item_title'].append(item_title)

In [8]:
# Number of user ids that have an entry in interaction_dicts.
len(interaction_dicts)

6040

In [9]:
def last_index(lst, element):
    """Return the highest index at which `element` occurs in `lst`.

    Args:
        lst: Sequence to search.
        element: Value compared against each entry with `==`.

    Returns:
        The index of the last matching entry, or None when nothing matches.
    """
    # Walk from the tail; `offset` counts how far we are from the last position.
    for offset, value in enumerate(reversed(lst)):
        if value == element:
            return len(lst) - 1 - offset
    return None

In [10]:
def split_by_timestamp(timestamps):
    """Group unix timestamps by UTC calendar date and keep the multi-entry days.

    Args:
        timestamps: Iterable of unix timestamps in seconds.

    Returns:
        A list of lists. Each inner list holds the timestamps that fall on one
        UTC date, in input order. Dates with a single timestamp are dropped.
        Groups appear in the order their date is first seen in the input.
    """
    date_to_timestamps = {}
    for ts in timestamps:
        # Timezone-aware conversion; `datetime.utcfromtimestamp` is deprecated.
        date_str = datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%d')
        date_to_timestamps.setdefault(date_str, []).append(ts)
    return [same_day for same_day in date_to_timestamps.values() if len(same_day) > 1]

In [11]:
def custom_sort(key):
    """Sort key: the 'time' value of the session stored under `key`.

    Reads the module-level `sessions_id` dict, which must exist when this is called.
    """
    session_record = sessions_id[key]
    return session_record['time']

In [151]:
def reindex_data(datas, item_mapping):
    """Translate every session's item ids through `item_mapping`, shifted by +1.

    Args:
        datas: Iterable of dicts with a 'session' list of item ids and a 'time' value.
        item_mapping: Dict from original item id to an integer index.

    Returns:
        A new list of dicts with the same 'time' values and each session item
        replaced by `item_mapping[item] + 1`.
    """
    return [
        {
            'session': [item_mapping[raw_id] + 1 for raw_id in entry['session']],
            'time': entry['time'],
        }
        for entry in datas
    ]


def get_candiate_set(session, target, candidate_size, item_set):
    """Draw a candidate list holding `target` plus randomly sampled other items.

    Args:
        session: List of item ids already in the session.
        target: The ground-truth next item.
        candidate_size: Total length of the returned list.
        item_set: Set of all item ids to sample negatives from.

    Returns:
        A list of `candidate_size` items: `candidate_size - 1` items sampled from
        `item_set` excluding the session items and the target, with `target`
        placed at a random position.
    """
    seen_items = set(session + [target])
    negatives_pool = list(item_set - seen_items)
    negatives = random.sample(negatives_pool, candidate_size - 1)
    # randint is inclusive on both ends, so the target may also land at the very end.
    target_position = random.randint(0, candidate_size - 1)
    return negatives[:target_position] + [target] + negatives[target_position:]


def get_text(item_lst, title):
    """Look up a shortened title for each item id.

    Args:
        item_lst: Iterable of item ids to look up in the 'movie_id' column.
        title: DataFrame with a 'movie_id' column and the title string in its
            second column.

    Returns:
        A list with, for each item, the text of its title before the first " (".
    """
    def _short_title(item):
        # First row whose movie_id matches; position 1 of that row is the title.
        matching_rows = title[title['movie_id'] == item].values
        return matching_rows[0][1].split(' (')[0]

    return [_short_title(item) for item in item_lst]

def construct_train_val_text(title, dataset_id, item_set, reindex_train_id, candidate_size):
    """Build (history, target) pairs in reindexed-id space and in title-text space.

    For every session with at least two items, the last item is the target and
    the preceding items are the history. Shorter sessions, including empty
    ones, are skipped, so the id lists and the text lists stay aligned.

    Args:
        title: DataFrame with a 'movie_id' column and the title string in its
            second column.
        dataset_id: List of dicts whose 'session' holds original item ids.
        item_set: Unused; kept so existing callers keep working.
        reindex_train_id: List parallel to `dataset_id` whose 'session' holds
            the reindexed ids of the same sessions.
        candidate_size: Unused; kept so existing callers keep working.

    Returns:
        (train_ids, train_texts), each of the form [histories, targets].
        `train_ids` holds reindexed ids; `train_texts` holds titles cut at the
        first " (".

    Raises:
        KeyError: If a session item has no row in `title`.
    """
    # Build the id -> raw title lookup once rather than scanning the frame for
    # every item. setdefault keeps the first row per id (first-match semantics).
    raw_title_by_id = {}
    for movie_id, raw_title in zip(title['movie_id'], title.iloc[:, 1]):
        raw_title_by_id.setdefault(movie_id, raw_title)

    def _short_title(movie_id):
        return raw_title_by_id[movie_id].split(' (')[0]

    train_ids, train_texts = [[], []], [[], []]

    for index, data in enumerate(dataset_id):
        reindexed_session = reindex_train_id[index]['session']
        raw_session = data['session']
        # A pair needs at least one history item plus the target.
        if len(reindexed_session) < 2 or len(raw_session) < 2:
            continue
        train_ids[0].append(reindexed_session[:-1])
        train_ids[1].append(reindexed_session[-1])
        train_texts[0].append([_short_title(item) for item in raw_session[:-1]])
        train_texts[1].append(_short_title(raw_session[-1]))
    return train_ids, train_texts

In [181]:
# session number -> {'session': [item ids], 'time': timestamp of the session's last item}
sessions_id = {}
s_id = 0
# Maximum allowed span, in seconds, between a session's FIRST interaction and
# any later interaction that is still counted as part of it.
time_size = 5 * 60
for user_id in tqdm(interaction_dicts):
    user_history = interaction_dicts[user_id]
    # Sort the user's interactions by timestamp while keeping the three lists aligned.
    chronological = sorted(
        zip(user_history['item_id'], user_history['timestamp'], user_history['item_title']),
        key=lambda x: x[1],
    )
    item_id, timestamps, item_title = [list(column) for column in zip(*chronological)]

    # Only UTC dates with more than one interaction are returned, so only those
    # dates produce sessions.
    for day_timestamps in split_by_timestamp(timestamps):
        start_idx = timestamps.index(day_timestamps[0])
        end_idx = last_index(timestamps, day_timestamps[-1])
        session_idx_lst = [item_id[start_idx]]
        session_timestamp = [timestamps[start_idx]]
        for i in range(start_idx + 1, end_idx + 1):
            if timestamps[i] - session_timestamp[0] > time_size:
                # Window exceeded: store the finished session and start a new one here.
                sessions_id[s_id] = {'session': session_idx_lst, 'time': session_timestamp[-1]}
                s_id += 1
                session_idx_lst = [item_id[i]]
                session_timestamp = [timestamps[i]]
            else:
                session_idx_lst.append(item_id[i])
                session_timestamp.append(timestamps[i])
        # Store the last open session of the day (it always has at least one item).
        sessions_id[s_id] = {'session': session_idx_lst, 'time': session_timestamp[-1]}
        s_id += 1

  dates = [datetime.utcfromtimestamp(ts) for ts in timestamps]
100%|█████████████████| 6040/6040 [00:02<00:00, 2739.51it/s]


In [182]:
# Total number of sessions stored in sessions_id.
len(sessions_id)

77674

In [183]:
# Order the session numbers by each session's 'time' value (stable for ties),
# then rebuild the dict so that it iterates in that order.
sorted_keys = sorted(sessions_id, key=lambda session_key: sessions_id[session_key]['time'])
sorted_dict = dict((key, sessions_id[key]) for key in sorted_keys)
sorted_dict

{77650: {'session': [858,
   593,
   2384,
   1961,
   2019,
   573,
   1419,
   213,
   3111,
   3505,
   1734,
   912,
   919,
   2503,
   527],
  'time': 956704219},
 77651: {'session': [649,
   318,
   1252,
   3289,
   759,
   2858,
   608,
   2396,
   326,
   1649,
   2028],
  'time': 956704519},
 77652: {'session': [17,
   34,
   2762,
   246,
   2692,
   1617,
   1111,
   300,
   1392,
   150,
   549,
   562,
   265,
   1537,
   1554,
   448,
   866,
   1358,
   2324,
   235,
   247,
   446],
  'time': 956704854},
 77653: {'session': [1704,
   1094,
   50,
   45,
   162,
   348,
   508,
   589,
   1089,
   58,
   2580,
   1694,
   1834,
   2391,
   290,
   357,
   1641],
  'time': 956705185},
 77644: {'session': [282,
   111,
   2067,
   930,
   1230,
   3022,
   947,
   3088,
   3133,
   1294,
   3421,
   2804,
   1269,
   955,
   1244,
   1276,
   2622,
   2791,
   2300,
   2396,
   1028,
   2863,
   1197,
   3548,
   951,
   1211,
   1223],
  'time': 956705441},
 77645: {'se

In [184]:
# Session records in sorted_dict order, skipping any whose item list is empty.
final_sessions_id = [record for record in sorted_dict.values() if record['session'] != []]

In [185]:
# Inspect the list of session records.
final_sessions_id

[{'session': [858,
   593,
   2384,
   1961,
   2019,
   573,
   1419,
   213,
   3111,
   3505,
   1734,
   912,
   919,
   2503,
   527],
  'time': 956704219},
 {'session': [649, 318, 1252, 3289, 759, 2858, 608, 2396, 326, 1649, 2028],
  'time': 956704519},
 {'session': [17,
   34,
   2762,
   246,
   2692,
   1617,
   1111,
   300,
   1392,
   150,
   549,
   562,
   265,
   1537,
   1554,
   448,
   866,
   1358,
   2324,
   235,
   247,
   446],
  'time': 956704854},
 {'session': [1704,
   1094,
   50,
   45,
   162,
   348,
   508,
   589,
   1089,
   58,
   2580,
   1694,
   1834,
   2391,
   290,
   357,
   1641],
  'time': 956705185},
 {'session': [282,
   111,
   2067,
   930,
   1230,
   3022,
   947,
   3088,
   3133,
   1294,
   3421,
   2804,
   1269,
   955,
   1244,
   1276,
   2622,
   2791,
   2300,
   2396,
   1028,
   2863,
   1197,
   3548,
   951,
   1211,
   1223],
  'time': 956705441},
 {'session': [933,
   3072,
   1066,
   907,
   935,
   3175,
   2671,
   130

In [186]:
# Keep the original item id next to a group number computed per distinct item,
# then derive the original id -> group number dictionary.
rating_id = rating.assign(original_item=rating['item'])
rating_id = rating_id.assign(item=rating_id.groupby('item').ngroup())
item_mapping = rating_id.set_index('original_item')['item'].to_dict()

In [275]:
# Number of sessions and total number of items over all sessions
total_sessions = len(final_sessions_id)
total_elements = sum(len(entry['session']) for entry in final_sessions_id)

# Mean number of items per session
average_length = total_elements / total_sessions

# Report the result
print(f"The average length of all session lists is: {average_length}")

The average length of all session lists is: 12.792375827175116


In [187]:
# The first 90% of the session list is the training part, the remainder the test part.
split_point = int(len(final_sessions_id) * 0.9)
train_id, test_id = final_sessions_id[:split_point], final_sessions_id[split_point:]

In [188]:
# Number of sessions in the train and test parts.
len(train_id), len(test_id)

(69906, 7768)

In [189]:
# Reindex the training sessions, then build the (history, target) pairs as ids and as text.
reindex_train_id = reindex_data(datas=train_id, item_mapping=item_mapping)
train_ids, train_text = construct_train_val_text(
    title=title,
    dataset_id=train_id,
    item_set=item_set,
    reindex_train_id=reindex_train_id,
    candidate_size=20,
)

In [213]:
# Number of training targets in id form and in text form.
len(train_ids[1]), len(train_text[1])

(65664, 65664)

In [214]:
# Reindexed item lists of every training session (no length filtering here).
all_sequence = list(map(lambda entry: entry['session'], reindex_train_id))
len(all_sequence)

69906

In [192]:
# Reindex the test sessions, then build the (history, target) pairs as ids and as text.
reindex_test_id = reindex_data(datas=test_id, item_mapping=item_mapping)
test_ids, test_text = construct_train_val_text(
    title=title,
    dataset_id=test_id,
    item_set=item_set,
    reindex_train_id=reindex_test_id,
    candidate_size=20,
)

In [215]:
# Number of test targets in id form and in text form.
len(test_ids[1]), len(test_text[1])

(5979, 5979)

In [194]:
# Persist the id-space data. The files carry a .txt suffix but are written in
# binary pickle format, so they must be read back with pickle in 'rb' mode.
# (`pickle` and the `collections` names are imported in the notebook's import cell.)
for out_path, payload in (
    ('train_ML.txt', train_ids),
    ('test_ML.txt', test_ids),
    ('all_ML.txt', all_sequence),
):
    with open(out_path, 'wb') as f:
        pickle.dump(payload, f)

In [197]:
# Load the candidate item lists for the train and test sessions.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# Context managers make sure the file handles are closed after loading.
with open('XXXXXXXXXXXX/train_candidate_ml_items.txt', 'rb') as f:
    train_candidate_items = pickle.load(f)
with open('XXXXXXXXXXXXX/test_candidate_ml_items.txt', 'rb') as f:
    test_candidate_items = pickle.load(f)

In [220]:
# Invert item_mapping once (mapped index -> original item id) instead of scanning
# the whole mapping for every candidate. setdefault keeps the first original id
# per index, like the original `candidate_keys[0]`.
# NOTE(review): reindex_data shifts ids by +1, while this lookup uses the unshifted
# item_mapping values; confirm the candidate files use the unshifted ids.
train_original_id_by_index = {}
for original_id, mapped_index in item_mapping.items():
    train_original_id_by_index.setdefault(mapped_index, original_id)

# For every session: numbered candidate titles, one formatted string per candidate.
train_candidates = []
for items in train_candidate_items:
    # int() lets plain ints and integer-like scalar objects be used as dict keys.
    candidate_items_mapping = [train_original_id_by_index[int(item)] for item in items]
    items_text = get_text(candidate_items_mapping, title)
    items_text_order = [f"{idx}.{item} \n " for idx, item in enumerate(items_text)]
    train_candidates.append(items_text_order)

In [1]:
# One prompt per training session: the numbered history (title_id, numbering from 1)
# followed by that session's candidate strings.
train_prompts_with_candidate = []
for index, items in enumerate(train_text[0]):
    item_prompts = [
        f"{idx}.{item}_{train_ids[0][index][idx-1]} \n "
        for idx, item in enumerate(items, start=1)
    ]
    history_part = "The order in which users click on items is as follows: " + "".join(item_prompts)
    candidate_part = "Candidate item set:" + "".join(train_candidates[index])
    train_prompts_with_candidate.append({"prompt": history_part + candidate_part})
len(train_prompts_with_candidate)

NameError: name 'train_text' is not defined

In [246]:
# Invert item_mapping once (mapped index -> original item id) instead of scanning
# the whole mapping for every candidate. setdefault keeps the first original id
# per index, like the original `candidate_keys[0]`.
# NOTE(review): reindex_data shifts ids by +1, while this lookup uses the unshifted
# item_mapping values; confirm the candidate files use the unshifted ids.
test_original_id_by_index = {}
for original_id, mapped_index in item_mapping.items():
    test_original_id_by_index.setdefault(mapped_index, original_id)

# For every session: numbered candidate titles, one formatted string per candidate.
test_candidates = []
for items in test_candidate_items:
    # int() lets plain ints and integer-like scalar objects be used as dict keys.
    candidate_items_mapping = [test_original_id_by_index[int(item)] for item in items]
    items_text = get_text(candidate_items_mapping, title)
    items_text_order = [f"{idx}.{item} \n " for idx, item in enumerate(items_text)]
    test_candidates.append(items_text_order)

In [2]:
# One prompt per test session: the numbered history (title_id, numbering from 1)
# followed by that session's candidate strings.
test_prompts_with_candidate = []
for index, items in enumerate(test_text[0]):
    item_prompts = [
        f"{idx}.{item}_{test_ids[0][index][idx-1]} \n "
        for idx, item in enumerate(items, start=1)
    ]
    history_part = "The order in which users click on items is as follows: " + "".join(item_prompts)
    candidate_part = "Candidate item set:" + "".join(test_candidates[index])
    test_prompts_with_candidate.append({"prompt": history_part + candidate_part})
test_prompts_with_candidate

NameError: name 'test_text' is not defined

In [253]:
# Write both prompt lists as pretty-printed UTF-8 JSON (non-ASCII characters kept as-is).
for out_path, prompts in (
    ('XXXXXXXXXXXXX/train_prompts_with_candidate.json', train_prompts_with_candidate),
    ('XXXXXXXXXXXXX/test_prompts_with_candidate.json', test_prompts_with_candidate),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(prompts, f, ensure_ascii=False, indent=4)

In [258]:
# [train history texts, test history texts]; the cell shows the size of the test part.
all_text_items = [train_text[0], test_text[0]]
len(all_text_items[1])

5979

In [256]:
# Store the train/test history texts as a binary pickle (despite the .txt suffix).
with open('XXXXXXXXXXXXXX/all_text_items.txt', 'wb') as text_items_file:
    pickle.dump(all_text_items, text_items_file)