In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
data_path = os.path.expanduser("~/blob/raw_datasets/ml-10m/original/ml-10M100K")
output_path = os.path.expanduser("~/blob/raw_datasets/ml-10m/chatbot")

suffix = ".csv" # [.dat, .csv]

In [92]:
# movies = pd.read_csv(os.path.join(data_path, f'movies{suffix}'), sep="::", names=['id', 'titles', 'tags'], engine='python') # ml-10m
movies = pd.read_csv(os.path.join(data_path, f'movies{suffix}'), sep=",", engine='python') # ml-10m
movies.rename(columns={'title': 'titles', 'genres': 'tags', 'movieId': 'id'}, inplace=True)

pattern = r'^(.+)\((\d{4})\)'
movies[['title', 'release_date']] = movies['titles'].str.extract(pattern)  # ml-10m
movies['title'][movies['release_date'].isna()] = movies['titles'][movies['release_date'].isna()]
movies['title'] = movies['title'].apply(lambda x: x.strip())
movies['tags'] = movies['tags'].str.split('|')
movies['release_date'] = pd.to_datetime(movies['release_date'])
movies.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['title'][movies['release_date'].isna()] = movies['titles'][movies['release_date'].isna()]


Unnamed: 0,id,titles,tags,title,release_date
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",Toy Story,1995-01-01
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",Jumanji,1995-01-01
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",Grumpier Old Men,1995-01-01


In [93]:
# ratings = pd.read_csv(os.path.join(data_path, f'ratings{suffix}'), sep="::", names=['user_id', 'item_id', 'rating', 'timestamp'], engine='python')  # ml-10m
ratings = pd.read_csv(os.path.join(data_path, f'ratings{suffix}'), sep=",", engine='python')  # ml-latest
ratings.rename(columns = {'userId':'user_id', 'movieId':'item_id'}, inplace=True)

ratings.head(3)

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503


In [94]:
def get_valid_ids(df, col_name, k):
    frequency = df.groupby([col_name])[[col_name]].count()
    valid_id = frequency[frequency[col_name]>=k].index
    return valid_id


### leave-one-out split
def split_train_test_set_leave_one_out_seq(data: pd.DataFrame, user_col_name: str, time_colname:str, col_names_2_return: list):
    '''Leave the last one item for test set'''
    if time_colname in data:
        df_sorted = data.sort_values(by=[user_col_name, time_colname]).reset_index(drop=True)
    else:
        df_sorted = data.sort_values(by=user_col_name).reset_index(drop=True)

    df_test = df_sorted.groupby(by=user_col_name, as_index=False).nth(-1)
    df_train = df_sorted.iloc[df_sorted.index.difference(df_test.index)]
    if col_names_2_return is None:
        col_names_2_return = data.columns
    return df_train.reset_index(drop=True)[col_names_2_return], df_test.reset_index(drop=True)[col_names_2_return]



def k_core_filter(df: pd.DataFrame, user_k=10, item_k=10, user_col_name='user_id', item_col_name='item_id'):
    num_users_prev, num_items_prev = len(df[user_col_name].unique()), len(df[item_col_name].unique()) 
    delta = True
    iter, max_iter = 0, 5
    
    while delta and iter < max_iter: 
        valid_users = get_valid_ids(df, user_col_name, user_k)  
        df = df[df[user_col_name].isin(valid_users)]

        valid_items = get_valid_ids(df, item_col_name, item_k)  
        df = df[df[item_col_name].isin(valid_items)]

        num_users = len(valid_users)
        num_items = len(valid_items)  

        delta = (num_users != num_users_prev) or (num_items != num_items_prev)
        print('Ite: {0}, users: {1} / {2}, items: {3} / {4}'.format(iter, num_users, num_users_prev, num_items, num_items_prev))

        num_users_prev = num_users
        num_items_prev = num_items
        iter+=1
    return df 

In [95]:
header_line_cnt=0
# col_names = ['userId', 'movieId', 'rating', 'timestamp']
dtypes = {'user_id':int, 'item_id':int, 'rating':float, 'timestamp':int}
user_col_name = 'user_id'
item_col_name = 'item_id'

data = ratings.sort_values(by=['user_id', 'timestamp'], ignore_index=True)
print('original dataset size: {0}'.format(data.shape))
data = data[data['rating']>=3].reset_index(drop=True)
print('filter by rating>=3 dataset size: {0}'.format(data.shape))

data = data.drop_duplicates(subset=['user_id', 'item_id'], keep='last').reset_index(drop=True)
print('drop_duplicates dataset size: {0}'.format(data.shape))

data = k_core_filter(data, user_k=5, item_k=5, user_col_name=user_col_name, item_col_name=item_col_name)
data = data.reset_index(drop=True)
print('k-core filtered dataset size: {0}'.format(data.shape))

original dataset size: (33832162, 4)
filter by rating>=3 dataset size: (27782577, 4)
drop_duplicates dataset size: (27782577, 4)
Ite: 0, users: 298123 / 329127, items: 36257 / 72867
Ite: 1, users: 298073 / 298123, items: 36254 / 36257
Ite: 2, users: 298073 / 298073, items: 36254 / 36254
k-core filtered dataset size: (27638639, 4)


In [96]:
data.head(3)

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,158,4.0,1225733503
1,1,4896,4.0,1225733516
2,1,596,4.0,1225733524


In [97]:
users, items = data['user_id'].unique(), data['item_id'].unique()
num_users, num_items = len(users), len(items)
user_id_map, item_id_map = {id: i+1 for i, id in enumerate(users)}, {id: i+1 for i, id in enumerate(items)}
data['item_id'], data['user_id'] = data['item_id'].apply(lambda x: item_id_map[x]), data['user_id'].apply(lambda x: user_id_map[x])
print(num_users, num_items)

298073 36254


In [98]:
movies['new_id'] = movies['id'].apply(lambda x: item_id_map[x] if x in item_id_map else -1)
movies = movies[movies['new_id'] > 0]
print(movies.shape)
movies.head(3)

(36254, 6)


Unnamed: 0,id,titles,tags,title,release_date,new_id
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",Toy Story,1995-01-01,17
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",Jumanji,1995-01-01,105
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",Grumpier Old Men,1995-01-01,232


In [100]:
seed=2022
df_train0, df_test = split_train_test_set_leave_one_out_seq(data, user_col_name=user_col_name, time_colname="timestamp", col_names_2_return=None)
df_train, df_valid = split_train_test_set_leave_one_out_seq(df_train0, user_col_name=user_col_name, time_colname="timestamp", col_names_2_return=None)
print('size in Train/Valid/Test: {0} / {1} / {2}'.format(df_train.shape, df_valid.shape, df_test.shape))

size in Train/Valid/Test: (27042493, 4) / (298073, 4) / (298073, 4)


In [101]:
df_train = df_train[['user_id', 'item_id']]
df_valid = df_valid[['user_id', 'item_id']]
df_test = df_test[['user_id', 'item_id']]
user_hist = df_train0[['user_id', 'item_id']]

In [102]:
df_train.head(3)

Unnamed: 0,user_id,item_id
0,1,1
1,1,2
2,1,3


In [81]:
if not os.path.exists(output_path):
    print("Create output directory")
    os.makedirs(output_path)

In [82]:
df_train.to_csv(os.path.join(output_path, 'train.tsv'), index=None)
df_valid.to_csv(os.path.join(output_path, 'valid.tsv'), index=None)
df_test.to_csv(os.path.join(output_path, 'test.tsv'), index=None)
user_hist.to_csv(os.path.join(output_path, 'user_history.tsv'), index=None)

In [103]:
movies = movies[['new_id', 'title', 'release_date', 'tags']]

In [104]:
count = pd.value_counts(user_hist['item_id'])
movies['view_count'] = movies['new_id'].apply(lambda x: count[x])
movies.rename(columns={'new_id': 'id'}, inplace=True)

In [111]:
movies.head(3), movies.shape

(   index   id             title release_date  \
 0      0   17         Toy Story   1995-01-01   
 1      1  105           Jumanji   1995-01-01   
 2      2  232  Grumpier Old Men   1995-01-01   
 
                                                 tags  view_count  
 0  [Adventure, Animation, Children, Comedy, Fantasy]       69561  
 1                     [Adventure, Children, Fantasy]       23464  
 2                                  [Comedy, Romance]       12099  ,
 (36254, 6))

In [110]:
movies.reset_index(inplace=True)
movies.to_feather(os.path.join(output_path, 'movies.ftr'))

In [112]:
movies.to_csv(os.path.join(output_path, 'movies.csv'), index=None)


# Test Set

In [5]:
movies = pd.read_feather(os.path.expanduser("~/resources/movie/movies.ftr"))

In [6]:
test_inter = pd.read_csv(os.path.join(output_path, 'test.tsv'))
user_history = pd.read_csv(os.path.join(output_path, 'user_history.tsv'))
user_history = user_history.groupby(by='user_id').agg(list)

In [7]:
user_history

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
1,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
2,"[23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3..."
3,"[39, 40, 38, 41, 42, 43, 44, 45, 46, 47, 48, 4..."
4,"[30, 67, 68, 69, 70, 71, 72, 1, 3, 73, 5, 4, 7..."
5,"[91, 92, 93, 94, 95, 96, 26, 97, 98, 99, 100, ..."
...,...
69810,"[1122, 50, 51, 187, 731, 293, 485, 483, 1619, ..."
69811,"[899, 620, 955, 553, 334, 292, 182, 285, 626, ..."
69812,"[848, 205, 1117, 1264, 1344, 1388, 626, 964, 4..."
69813,"[68, 67, 988, 69, 30, 70, 72, 1, 1062, 76, 3, ..."


In [8]:
test_inter.head(3), user_history.head(3)

(   user_id  item_id
 0        1       22
 1        2       38
 2        3       66,
                                                    item_id
 user_id                                                   
 1        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
 2        [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3...
 3        [39, 40, 38, 41, 42, 43, 44, 45, 46, 47, 48, 4...)

In [15]:
N = 900
max_hist = 10
test_data = test_inter.sample(N, random_state=2024)

test_data['history'] = test_data['user_id'].apply(lambda x: user_history.loc[x])

In [16]:
test_data['history'] = test_data['history'].apply(lambda x: x[: max_hist])

In [17]:
test_data

Unnamed: 0,user_id,item_id,history
48671,48672,1752,"[7, 1860, 175, 701, 1204, 2418, 289, 194, 3572..."
34250,34251,4554,"[841, 2268, 1052, 966, 490, 594, 1114, 1374, 1..."
58422,58423,1253,"[73, 13, 84, 908, 730, 1258, 18, 1167, 781, 1062]"
44275,44276,1053,"[23, 3092, 18, 31, 2450, 242, 3526, 569, 1382,..."
9135,9136,1098,"[1706, 175, 986, 516, 633, 1715, 709, 2873, 23..."
...,...,...,...
52949,52950,32,"[8, 175, 724, 178, 1545, 669, 260, 176, 262, 558]"
53293,53294,139,"[67, 30, 988, 69, 555, 70, 71, 72, 179, 3]"
36320,36321,2286,"[148, 7, 1286, 1986, 4924, 1400, 1175, 3602, 9..."
48101,48102,2847,"[1062, 895, 908, 1159, 92, 93, 28, 94, 1325, 783]"


In [18]:
movies.set_index('id', inplace=True)

In [19]:
test_data['history'] = test_data['history'].apply(lambda x: ', '.join([movies.loc[_].title for _ in x]))

In [20]:
test_data['target'] = test_data['item_id'].apply(lambda x: movies.loc[x].title)

In [21]:
test_data_jsonl = test_data[['history', 'target']].to_dict("records")

In [None]:
from typing import *
import json
import pickle

def write_jsonl(obj: List[Dict], fpath: str) -> None:
    try:
        with open(fpath, 'w') as outfile:
            for entry in obj:
                json.dump(entry, outfile)
                outfile.write('\n')
        print("Sucessfully saved into {}.".format(fpath))
    except Exception as e:
        print(f"Error {e} raised. The temp file would be saved in {fpath}.pkl")
        with open(f"{fpath}.pkl", 'wb') as tempfile:
            pickle.dump(obj, tempfile)
    return

write_jsonl(test_data_jsonl, os.path.expanduser(f"~/movie/simulator_test_data_{N}.jsonl"))

# One-Turn Train Data

In [4]:
movies = pd.read_feather(os.path.expanduser("~/resources/movie/movies.ftr"))
movies.set_index('id', inplace=True)
user_history = pd.read_csv(os.path.join(output_path, 'user_history.tsv'))
user_history = user_history.groupby(by='user_id').agg(list)

In [27]:
n_user = 500
max_hist_len = 10
max_title_len = 50

In [28]:
sampled_user_id = np.random.choice(user_history.index, n_user, replace=False)

In [29]:
train_data = user_history.loc[sampled_user_id]

In [30]:
train_data['history'] = train_data['item_id'].apply(lambda x: x[:-1])
train_data['history'] = train_data['item_id'].apply(lambda x: x[-max_hist_len:])
train_data['target'] = train_data['item_id'].apply(lambda x: x[-1])
train_data['history'] = train_data['history'].apply(lambda x: '; '.join([movies.loc[_].title[:max_title_len] for _ in x]))
train_data['target'] = train_data['target'].apply(lambda x: movies.loc[x].title[:max_title_len])

In [31]:
train_data

Unnamed: 0_level_0,item_id,history,target
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
41683,"[205, 2227, 3460, 2131, 1318, 236, 42, 1774, 2...",Underworld; Igby Goes Down; Brokedown Palace; ...,Winged Migration (Le Peuple migrateur)
60445,"[67, 988, 69, 71, 179, 70, 3, 72, 7, 5, 4, 8, ...",Only You; I Like It Like That; Something to Ta...,"Craft, The"
69041,"[1062, 895, 91, 93, 29, 783, 659, 23, 96, 24, ...",Sleepers; Mars Attacks!; Eye for an Eye; Kids ...,Batman & Robin
12673,"[620, 177, 424, 127, 39, 778, 551, 990, 197, 5...",Bio-Dome; Not Another Teen Movie; Mothman Prop...,Velvet Goldmine
25017,"[1162, 1872, 3328, 333, 775, 2685, 1173, 175, ...",Diva; Reds; Sophie's Choice; Full Metal Jacket...,Alien
...,...,...,...
5844,"[68, 69, 30, 67, 70, 555, 1, 3, 4, 73, 1062, 2...",Outbreak; Braveheart; Addams Family Values; Ge...,Grumpier Old Men
542,"[67, 988, 30, 70, 71, 72, 1, 1158, 1062, 1201,...","Taxi Driver; Fugitive, The; Silence of the Lam...",Clerks
3572,"[205, 1161, 271, 3120, 2220, 2395, 1027, 1130,...",Who's Harry Crumb?; Willy Wonka & the Chocolat...,"Red Violin, The (Violon rouge, Le)"
66159,"[1027, 31, 136, 27, 606, 23, 176, 106, 170, 12...",Superbad; Charlie Wilson's War; Indiana Jones ...,Slumdog Millionaire


In [None]:
from typing import *
import json

def write_jsonl(obj: List[Dict], fpath: str) -> None:
    try:
        with open(fpath, 'w') as outfile:
            for entry in obj:
                json.dump(entry, outfile)
                outfile.write('\n')
        print("Sucessfully saved into {}.".format(fpath))
    except Exception as e:
        print(f"Error {e} raised. The temp file would be saved in {fpath}.pkl")
        with open(f"{fpath}.pkl", 'wb') as tempfile:
            pickle.dump(obj, tempfile)
    return

train_data_jsonl = train_data[['history', 'target']].to_dict("records")

write_jsonl(train_data_jsonl, os.path.expanduser(f"~/one-turn-generate/movie/{n_user}-history-data.jsonl"))