In [1]:
import os
import re
import gzip
import json
import numpy as np
import pandas as pd


In [2]:
DATA_DIR = os.path.expanduser("~/blob/raw_datasets/steam")

review_file = os.path.join(DATA_DIR, "steam_reviews.json.gz")
meta_file = os.path.join(DATA_DIR, "steam_games.json.gz")

all_meta_file = os.path.join(DATA_DIR, "steam_games.csv")

processed_raw_folder = os.path.join(DATA_DIR, "processed")
raw_review_path = os.path.join(processed_raw_folder, 'reviews.csv')
raw_meta_path = os.path.join(processed_raw_folder, 'meta.csv')

chatbot_data_folder = os.path.join(DATA_DIR, 'chatbot')

if not os.path.exists(processed_raw_folder):
    os.makedirs(processed_raw_folder)

if not os.path.exists(chatbot_data_folder):
    os.makedirs(chatbot_data_folder)


In [121]:
# with open(os.path.join(DATA_DIR, "steam_games.json"), 'w') as f:
#     games = json.load(f)

def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)


In [122]:
if not os.path.exists(raw_review_path):
    raw_review_df = pd.DataFrame(list(parse(review_file)))
    raw_review_df.to_csv(raw_review_path, index=None, sep='|')
else:
    raw_review_df = pd.read_csv(raw_review_path, sep='|')

In [123]:
if not os.path.exists(raw_meta_path):
    raw_meta_df = pd.DataFrame(list(parse(meta_file)))
    raw_meta_df.to_csv(raw_meta_path, index=None, sep='|')
else:
    raw_meta_df = pd.read_csv(raw_meta_path, sep='|')

In [124]:
all_meta_df = pd.read_csv(all_meta_file)

In [125]:
print(raw_review_df.columns)
print(raw_review_df.sample(3))

Index(['username', 'hours', 'products', 'product_id', 'page_order', 'date',
       'text', 'early_access', 'page', 'found_funny', 'compensation',
       'user_id'],
      dtype='object')
               username  hours  products  product_id  page_order        date  \
3431749  Indifferential   15.3      93.0      282140           5  2016-11-24   
3789391          Kadesh  804.5      15.0      433850           7  2017-01-08   
5883488         UniDoge   28.3      35.0      202530           2  2015-05-05   

                                                      text  early_access  \
3431749  SOMA is a story-driven sci-fi horror game.Ther...         False   
3789391                              Vicia mais que crack.          True   
5883488  This game is iffy for me.\nI've got pros, most...         False   

         page  found_funny compensation  user_id  
3431749   310          NaN          NaN      NaN  
3789391  2801          NaN          NaN      NaN  
5883488    45          NaN        

In [126]:
print(raw_meta_df.columns)
print(raw_meta_df.sample(3))

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'discount_price', 'reviews_url', 'specs', 'price',
       'early_access', 'id', 'developer', 'sentiment', 'metascore'],
      dtype='object')
         publisher      genres                     app_name  \
1357   Deep Silver  ['Action']  Sacred Citadel: Jungle Hunt   
3851     DYA Games   ['Indie']   Super Star Path Soundtrack   
31377     2K Games  ['Action']   Mafia II DLC: Greaser Pack   

                             title  \
1357   Sacred Citadel: Jungle Hunt   
3851    Super Star Path Soundtrack   
31377   Mafia II DLC: Greaser Pack   

                                                     url release_date  \
1357   http://store.steampowered.com/app/222380/Sacre...   2013-04-22   
3851   http://store.steampowered.com/app/378910/Super...   2015-06-22   
31377  http://store.steampowered.com/app/67330/Mafia_...   2010-09-28   

                                                    tags  discount_pri

In [127]:
print(all_meta_df.columns)
print(all_meta_df.sample(3))

Index(['url', 'types', 'name', 'desc_snippet', 'recent_reviews', 'all_reviews',
       'release_date', 'developer', 'publisher', 'popular_tags',
       'game_details', 'languages', 'achievements', 'genre',
       'game_description', 'mature_content', 'minimum_requirements',
       'recommended_requirements', 'original_price', 'discount_price'],
      dtype='object')
                                                     url types  \
35969  https://store.steampowered.com/app/596160/Tric...   app   
40199  https://store.steampowered.com/app/773806/The_...   app   
38265  https://store.steampowered.com/app/1033080/Let...   app   

                                                    name  \
35969                            TrickStyle - Soundtrack   
40199  The Legend of Heroes: Trails of Cold Steel II ...   
38265                      Letters - a written adventure   

                                            desc_snippet recent_reviews  \
35969                                             

In [128]:
all_meta_df['name']

0                                                     DOOM
1                            PLAYERUNKNOWN'S BATTLEGROUNDS
2                                               BATTLETECH
3                                                     DayZ
4                                               EVE Online
                               ...                        
40828    Rocksmith® 2014 Edition – Remastered – Sabaton...
40829    Rocksmith® 2014 Edition – Remastered – Stone T...
40830    Fantasy Grounds - Quests of Doom 4: A Midnight...
40831                         Mega Man X5 Sound Collection
40832                                     Stories In Stone
Name: name, Length: 40833, dtype: object

In [129]:
raw_game_names = raw_meta_df['app_name'].unique()
all_game_names = all_meta_df['name'].unique()

print("Coverage: ", len([g for g in raw_game_names if g in all_game_names]) / len(raw_game_names))

Coverage:  0.6236797008879887


## Text Clean Functions

In [130]:
import re  
from bs4 import BeautifulSoup  
  
def remove_html_tags(text):  
    """Remove HTML tags from text"""  
    soup = BeautifulSoup(text, "html.parser")  
    return soup.get_text()  
  
def remove_emojis(text):  
    emoji_pattern = re.compile(  
        "["  
        u"\U0001F600-\U0001F64F"  # emoticons  
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs  
        u"\U0001F680-\U0001F6FF"  # transport & map symbols  
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)  
        u"\U00002702-\U000027B0"  
        u"\U000024C2-\U0001F251"  
        "]+",  
        flags=re.UNICODE,  
    )  
    return emoji_pattern.sub(r"", text)  
  
def remove_special_characters(text, remove_digits=False):  
    pattern = r"[^a-zA-Z0-9\s\u4e00-\u9fa5\,\.\!]" if not remove_digits else r"[^a-zA-Z\s\u4e00-\u9fa5\,\.\!]"  
    return re.sub(pattern, "", text)  
  
def clean_text(text, remove_digits=False):  
    text = remove_html_tags(text)  
    text = remove_emojis(text)  
    text = remove_special_characters(text, remove_digits=remove_digits)  
    return text  
  
# example
text = "<html><head></head><body><p>This is a text,.: including emoji👍and special symbols!@#</p></body></html>"  
cleaned_text = clean_text(text)  
print(cleaned_text)  

This is a text,. including emojiand special symbols!


## Clean Text in Meta DF

In [131]:
raw_meta_df['tags'][0]

"['Strategy', 'Action', 'Indie', 'Casual', 'Simulation']"

In [134]:
import re
def process_price(x: str):
    if isinstance(x, str):
        # try:
        price = re.findall(r"\d+\.?\d*", x)
        if price:
            return eval(price[0])
        else:
            return 0
    elif isinstance(x, float):
        return x
    else:
        return 0

In [135]:
raw_meta_df['price'] = raw_meta_df['price'].apply(process_price)
avg_price = raw_meta_df['price'][~raw_meta_df['price'].isna()].mean()
raw_meta_df['price'] = raw_meta_df['price'].fillna(avg_price)

In [136]:
print("Title NAN: ", (raw_meta_df['title'].isna().sum()))
print("App name NAN: ", raw_meta_df['app_name'].isna().sum())
print((raw_meta_df['app_name'] == raw_meta_df['title']).sum() / len(raw_meta_df))
raw_meta_df[raw_meta_df['app_name'] != raw_meta_df['title']][['title', 'app_name']]

Title NAN:  2050
App name NAN:  2
0.9189357398475183


Unnamed: 0,title,app_name
4,,Log Challenge
11,,Icarus Six Sixty Six
19,,After Life VR
20,,Kitty Hawk
22,,Mortars VR
...,...,...
32073,,Tank of War-VR
32076,,Flappy Arms
32077,,SpaceWalker
32085,,LIV Client


In [137]:
col2types = {
    "publisher": str,
    "app_name": str,
    "price": float,
    "id": int,
    "developer": str,
}

In [138]:
raw_meta_df_0 = raw_meta_df[~raw_meta_df['id'].isna()]
raw_meta_df_0 = raw_meta_df_0[~raw_meta_df_0['app_name'].isna()]
raw_meta_df_0.reset_index(inplace=True, drop=True)

In [139]:
raw_meta_df_1 = raw_meta_df_0.astype(col2types)

In [140]:
raw_meta_df_1['app_name'] = raw_meta_df_1['app_name'].apply(lambda x: clean_text(x) if x is not None else "")
raw_meta_df_1['developer'] = raw_meta_df_1['developer'].apply(lambda x: clean_text(x) if x is not None else "")

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


In [141]:
all_meta_df_0 = all_meta_df[~all_meta_df['name'].isna()].reset_index(drop=True)
all_meta_df_0 = all_meta_df_0.astype({'name': 'str', 'game_details': 'str', 'game_description': 'str'})
all_meta_df_0['name'] = all_meta_df_0['name'].apply(lambda x: clean_text(x) if x is not None else "")
all_meta_df_0['game_details'] = all_meta_df_0['game_details'].apply(lambda x: clean_text(x) if x is not None else "")
all_meta_df_0['game_description'] = all_meta_df_0['game_description'].apply(lambda x: clean_text(x) if x is not None else "")

  soup = BeautifulSoup(text, "html.parser")


  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


In [142]:
all_meta_df_0["_lower_name"] = all_meta_df_0['name'].apply(lambda x: x.lower())
raw_meta_df_1["_lower_name"] = raw_meta_df_1['app_name'].apply(lambda x: x.lower())

In [143]:
all_meta_df_0["_lower_name"] = all_meta_df_0["_lower_name"].astype(str)
raw_meta_df_1["_lower_name"] = raw_meta_df_1["_lower_name"].astype(str)
raw_meta_df_1 = raw_meta_df_1.drop_duplicates(keep='first').reset_index(drop=True)
raw_meta_df_1 = raw_meta_df_1.set_index('_lower_name')
all_meta_df_0 = all_meta_df_0.set_index('_lower_name')

In [144]:
cat_df = raw_meta_df_1.join(all_meta_df_0, on='_lower_name', rsuffix='_all')

In [145]:
print(cat_df['game_description'].isna().sum(), cat_df['game_details'].isna().sum())
print(cat_df['genres'].isna().sum(), cat_df['tags'].isna().sum())

11991 11991
3628 162


In [146]:
cat_df.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'discount_price', 'reviews_url', 'specs', 'price',
       'early_access', 'id', 'developer', 'sentiment', 'metascore', 'url_all',
       'types', 'name', 'desc_snippet', 'recent_reviews', 'all_reviews',
       'release_date_all', 'developer_all', 'publisher_all', 'popular_tags',
       'game_details', 'languages', 'achievements', 'genre',
       'game_description', 'mature_content', 'minimum_requirements',
       'recommended_requirements', 'original_price', 'discount_price_all'],
      dtype='object')

In [147]:
cat_df['release_date_all'] = pd.to_datetime(cat_df['release_date_all'], format="%b %d, %Y", errors="coerce")
cat_df['release_date'] = pd.to_datetime(cat_df['release_date'], format="%Y-%m-%d", errors="coerce")

In [148]:
cat_df['release_date'][cat_df['release_date'].isna()] = cat_df['release_date_all'][cat_df['release_date'].isna()]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df['release_date'][cat_df['release_date'].isna()] = cat_df['release_date_all'][cat_df['release_date'].isna()]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df['release_date'][cat_df['release_date'].isna()] = cat_df['release_date_all'][cat_df['release_date'].isna()]


In [149]:
used_cols = ["id", "app_name", "release_date", "tags", "price", "game_description"]
final_meta_df = cat_df[used_cols]
final_meta_df = final_meta_df.reset_index(drop=True)
final_meta_df['game_description'] = final_meta_df['game_description'].fillna("No description")
final_meta_df.rename(columns={'app_name': 'title', 'game_description': "description"}, inplace=True)

In [150]:
final_meta_df.sample(3), final_meta_df.shape

(           id                           title release_date  \
 5917   427250                     8Bit Armies   2016-04-22   
 33235   23400   Imperium Romanum Gold Edition   2008-10-30   
 28975  345860  Super Galaxy Squadron EX Turbo   2017-03-06   
 
                                                     tags  price  \
 5917   ['Strategy', 'RTS', 'Pixel Graphics', 'Simulat...  14.99   
 33235  ['Strategy', 'Simulation', 'City Builder', 'Ro...   9.99   
 28975  ['Indie', "Shoot 'Em Up", 'Retro', 'Action', '...  11.99   
 
                                              description  
 5917    About This Game  A Blast From the Past! 8Bit ...  
 33235   About This Game The city builder strategy gam...  
 28975                                     No description  ,
 (33595, 6))

In [151]:
(raw_review_df['product_id'].isin(final_meta_df['id'])).sum() / raw_review_df.shape[0]

1.0

In [152]:
review_df_0 = raw_review_df[['username', 'product_id', 'date']]
review_df_0 = review_df_0[~review_df_0['username'].isna()].reset_index(drop=True)

In [153]:
def get_valid_ids(df, col_name, k):
        frequency = df.groupby([col_name])[[col_name]].count()
        valid_id = frequency[frequency[col_name]>=k].index
        return valid_id

def keepFirstFilter(df: pd.DataFrame, user_col: str='user_id', item_col: str='item_id', time_col: str='timestamp') -> pd.DataFrame:
    print(f"*** Only keep the first interaction for duplicated review ***")
    print(f"Begin: {df.shape}")
    df = df.sort_values(by=[user_col, time_col], ).reset_index(drop=True)
    df = df.drop_duplicates(subset=[user_col, item_col], keep='first').reset_index(drop=True)
    print(f"After: {df.shape}")
    return df


def kCoreFilter(df: pd.DataFrame, user_k: int=10, item_k: int=10, user_col: str='user_id', item_col: str='item_id', max_iter: int=20) -> pd.DataFrame:
    print(f"*** Users whose interaction < {user_k} and items whose interaction < {item_k}  would be filtered out ***")
    print(f"Begin: {df.shape}")
    
    num_users_prev, num_items_prev = len(df[user_col].unique()), len(df[item_col].unique()) 
    delta = True
    iter = 0
    
    while delta and iter < max_iter: 
        valid_users = get_valid_ids(df, user_col, user_k)  
        df = df[df[user_col].isin(valid_users)]

        valid_items = get_valid_ids(df, item_col, item_k)  
        df = df[df[item_col].isin(valid_items)]

        num_users = len(valid_users)
        num_items = len(valid_items)  

        delta = (num_users != num_users_prev) or (num_items != num_items_prev)
        print('Iter: {0}, users: {1} / {2}, items: {3} / {4}'.format(iter, num_users, num_users_prev, num_items, num_items_prev))

        num_users_prev = num_users
        num_items_prev = num_items
        iter+=1
    print(f"After: {df.shape}")
    return df 


def lowRatingFilter(df: pd.DataFrame, rating_thres: float=3.0, rating_col: str='rating') -> pd.DataFrame:
    print(f"*** Rating < {rating_thres} would be filtered out ***")
    print(f"Begin: {df.shape}")
    df = df[df[rating_col] >= rating_thres].reset_index(drop=True)
    print(f"After: {df.shape}")
    return df

In [154]:
review_df_1 = keepFirstFilter(review_df_0, user_col='username', item_col='product_id', time_col='date')
# review_df1 = lowRatingFilter(review_df_1, 3.0)
review_df_1 = kCoreFilter(review_df_1, 5, 5, user_col='username', item_col='product_id')
data_df = review_df_1.reset_index(drop=True)

*** Only keep the first interaction for duplicated review ***
Begin: (7792828, 3)


After: (6889528, 3)
*** Users whose interaction < 5 and items whose interaction < 5  would be filtered out ***
Begin: (6889528, 3)
Iter: 0, users: 281639 / 2567530, items: 11978 / 15474
Iter: 1, users: 281208 / 281639, items: 11961 / 11978
Iter: 2, users: 281204 / 281208, items: 11961 / 11961
Iter: 3, users: 281204 / 281204, items: 11961 / 11961
After: (3484497, 3)


In [155]:
data_df = data_df.rename(columns={'username': 'user_id', 'product_id': 'item_id'})

In [156]:
def map_id(df: pd.DataFrame, user_colname: str='user_id', item_colname: str='item_id', group_colname: str=None, return_map: bool=False, price_df: pd.DataFrame=None, group_df: pd.DataFrame=None) -> pd.DataFrame:
    '''map user and item ids'''
    users = df[user_colname].unique()
    items = df[item_colname].unique()
    print("**Map users and items**")
    n_users, n_items = len(users), len(items)
    user_map = {u: k+1 for k, u in enumerate(users)}
    item_map = {i: k+1 for k, i in enumerate(items)}
    df[user_colname] = df[user_colname].apply(lambda x: user_map[x])
    df[item_colname] = df[item_colname].apply(lambda x: item_map[x])

    if (group_colname is not None) and (group_colname in df):
        groups = df[group_colname].unique()
        n_groups = len(groups)
        group_map = {g: k+1 for k,g in enumerate(groups)}
        df[group_colname] = df[group_colname].apply(lambda x: group_map[x])
    else:
        group_map = {}

    if price_df is not None:
        if user_colname in price_df:
            price_df = price_df[price_df[user_colname].isin(users)].reset_index(drop=True)
            price_df[user_colname] = price_df.apply(lambda x: user_map[x])
        if item_colname in price_df:
            price_df = price_df[price_df[item_colname].isin(items)].reset_index(drop=True)
            price_df[item_colname] = price_df[item_colname].apply(lambda x: item_map[x])

    if group_df is not None:
        if user_colname in group_df:
            group_df = group_df[group_df[user_colname].isin(users)].reset_index(drop=True)
            group_df[user_colname] = group_df.apply(lambda x: user_map[x])
        if item_colname in group_df:
            group_df = group_df[group_df[item_colname].isin(items)].reset_index(drop=True)
            group_df[item_colname] = group_df[item_colname].apply(lambda x: item_map[x])
        if group_colname is not None and group_colname in group_df:
            groups = group_df[group_colname].unique()
            n_groups = len(groups)
            group_map = {g: k+1 for k,g in enumerate(groups)}
            group_df[group_colname] = group_df[group_colname].apply(lambda x: group_map[x])

    if return_map:
        return (df, price_df, group_df), (user_map, item_map, group_map)
    else:
        return df, price_df, group_df

In [157]:
(df, price_df, group_df),(user_map, item_map, group_map) = map_id(data_df, return_map=True)

**Map users and items**


In [158]:
item_map_json = {str(k): v for k,v in item_map.items()}
all_map = {'item': item_map_json, 'user': user_map}

with open(os.path.join(processed_raw_folder, "map.json"), 'w') as f:
    json.dump(all_map, f)

In [159]:
# split
def split_train_test_set_leave_one_out_seq(data: pd.DataFrame, col_name: str, time_colname:str, col_names_2_return: list, seed: int=42):
    '''Leave the last one item for test set'''
    if time_colname in data:
        df_sorted = data.sort_values(by=[col_name, time_colname]).reset_index(drop=True)
    else:
        df_sorted = data.sort_values(by=col_name).reset_index(drop=True)

    df_test = df_sorted.groupby(by=col_name, as_index=False).nth(-1)
    df_train = df_sorted.iloc[df_sorted.index.difference(df_test.index)]
    return df_train.reset_index(drop=True)[col_names_2_return], df_test.reset_index(drop=True)[col_names_2_return]

def split_train_test_set_leave_one_out(data: pd.DataFrame, col_name: str, col_names_2_return: list, seed: int = 0):
    if col_names_2_return is None:
        col_names_2_return = data.columns #.to_list()
    df_groupby = data.groupby(by=col_name, as_index=False) 
    df_test = df_groupby.sample(n=1, random_state=seed)[col_names_2_return]
    df_train = data.iloc[data.index.difference(df_test.index)][col_names_2_return] 
    return df_train.reset_index(drop=True), df_test.reset_index(drop=True) 

def split_train_test_set_by_ratio(data: pd.DataFrame, ratio: list, col_name: str, col_names_2_return: list, seed: int = 0):
    if col_names_2_return is None:
        col_names_2_return = data.columns #.to_list()
    assert len(ratio) == 2, 'ratio is for train/test.'
    frac = ratio[1] / sum(ratio)
    df_groupby = data.groupby(by=col_name, as_index=False) 
    df_test = df_groupby.sample(frac=frac, random_state=seed)[col_names_2_return]
    df_train = data.iloc[data.index.difference(df_test.index)] 
    return df_train.reset_index(drop=True), df_test.reset_index(drop=True) 

In [166]:
df_train_0, df_test = split_train_test_set_leave_one_out_seq(df, 'user_id', 'date', ['user_id', 'item_id'])
df_train, df_valid = split_train_test_set_leave_one_out_seq(df_train_0, 'user_id', 'date', ['user_id', 'item_id',])

df_train.to_csv(os.path.join(chatbot_data_folder, "train.tsv"), index=None)
df_valid.to_csv(os.path.join(chatbot_data_folder, "valid.tsv"), index=None)
df_test.to_csv(os.path.join(chatbot_data_folder, "test.tsv"), index=None)
df_train_0.to_csv(os.path.join(chatbot_data_folder, "user_history.tsv"), index=None)

In [161]:
saved_meta_df = final_meta_df[final_meta_df['id'].isin(item_map.keys())]
saved_meta_df = saved_meta_df.drop_duplicates(subset=['id'], keep='first')
saved_meta_df.reset_index(inplace=True, drop=True)

In [162]:
saved_meta_df['id'] = saved_meta_df['id'].apply(lambda x: item_map[x])

In [165]:
saved_meta_df.shape[0] == len(item_map)

True

In [30]:
saved_meta_df['tags'] = saved_meta_df['tags'].apply(lambda x: eval(x) if isinstance(x, str) else [])
saved_meta_df.to_feather(os.path.join(chatbot_data_folder, 'games.ftr'))

In [20]:
saved_meta_df.columns

Index(['title', 'release_date', 'tags', 'price', 'description', 'visited_num'], dtype='object')

### Test Data for RecBot

In [4]:
saved_meta_df = pd.read_feather(os.path.join(chatbot_data_folder, 'games.ftr'))

In [5]:
df_train = pd.read_csv(os.path.join(chatbot_data_folder, "train.tsv"))
df_valid = pd.read_csv(os.path.join(chatbot_data_folder, "valid.tsv"))
df_test = pd.read_csv(os.path.join(chatbot_data_folder, "test.tsv"))
df_train_0 = pd.read_csv(os.path.join(chatbot_data_folder, "user_history.tsv"))

In [6]:
item_count = pd.value_counts(df_train_0['item_id'])
saved_meta_df['visited_num'] = saved_meta_df['id'].apply(lambda x: item_count.loc[x] if x in item_count else 0)

In [7]:
# user_history = df_train_0.groupby('user_id').agg(list)
saved_meta_df.to_feather(os.path.join(chatbot_data_folder, 'games.ftr'))
saved_meta_df = saved_meta_df.set_index('id')

In [8]:
max_title_len = 50
id2title = {id: saved_meta_df.loc[id].title[: max_title_len] for id in saved_meta_df.index}

In [11]:
N = 900
test_data = df_test.sample(900)
user_history = df_train_0[df_train_0['user_id'].isin(test_data['user_id'])].reset_index(drop=True).groupby('user_id').agg(list)
max_len = 10
test_data['history'] = test_data['user_id'].apply(lambda x: '; '.join([id2title[_] for _ in user_history.loc[x]['item_id'][-max_len:]]))
test_data['target'] = test_data['item_id'].apply(lambda x: saved_meta_df.loc[x].title)
test_data.reset_index(inplace=True, drop=True)

In [12]:
test_data

Unnamed: 0,user_id,item_id,history,target
0,201042,597,Loadout; Sins of a Solar Empire Rebellion; Rus...,HalfLife 2
1,118030,1622,S.K.I.L.L. Special Force 2 Shooter; Contagion...,TOXIKK
2,13555,4101,Woodle Tree Adventures; Knightmare Tower; Shov...,Angry Video Game Nerd II ASSimilation
3,5916,551,Streets of Rogue; b; Unforgiving Trials The Sp...,Soda Dungeon
4,164165,40,Imperial Glory; Mount Blade Warband; Age of E...,Dino DDay
...,...,...,...,...
895,84755,2026,South Park The Stick of Truth; Antichamber; Ca...,Reigns
896,206543,70,The Repopulation; RimWorld; Cloudbuilt; Osiris...,No Mans Sky
897,143863,4106,Alan Wake; Shadow Warrior Classic Redux; Ace o...,Feel The Snow
898,68368,39,F.E.A.R.; The Witcher 2 Assassins of Kings Enh...,Deus Ex Human Revolution Directors Cut


In [None]:
from typing import *
import json, pickle

def write_jsonl(obj: List[Dict], fpath: str) -> None:
    try:
        with open(fpath, 'w') as outfile:
            for entry in obj:
                json.dump(entry, outfile)
                outfile.write('\n')
        print("Sucessfully saved into {}.".format(fpath))
    except Exception as e:
        print(f"Error {e} raised. The temp file would be saved in {fpath}.pkl")
        with open(f"{fpath}.pkl", 'wb') as tempfile:
            pickle.dump(obj, tempfile)
    return

test_data_jsonl = test_data[['history', 'target']].to_dict("records")
write_jsonl(test_data_jsonl, os.path.join(chatbot_data_folder, f"simulator_test_data_{N}.jsonl"))

In [14]:
test_data['history'].apply(lambda x: len(x.split("; "))).max()

10

Bad pipe message: %s [b'S\x97\xa0w\x95\xa3\xd0\xad:\x1b\x02\xea5\xa0\x81X\xbd\xd3 \x0b3\xc7\xb71\xe1\x1d\xe3*\x1c\xc2@\x9a0\xbb\x82\x0f\x13\xec\x01\x84\xa2\xcfM\xda\x930\x1ax\xa6']
Bad pipe message: %s [b'\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00', b'\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05']
Bad pipe message: %s [b'\x03\x08']
Bad pipe message: %s [b"3\xd6{\xc1\xfb\x80\r\x1cg\xb0\xb0\xcf\x94\xbc]\xafd\xb2 \xe7n$\x80x'\xe3!", b'g\x1ev\xd2S\xb3\xab\xc0\xf5lfr\xe7@\x81Sj\xe09F\x8b\x1f\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16']
Bad pipe message: %s [b'\x08\x08\t\x08\n\x08']
Bad pipe message: %s [b'g[\x8b]\xa4}\xfb^}P\xaby\xaf)b\xbd\xf1\xe9\x00\