## Training File Process
- [ ] extract user history data from conversational data

In [1]:
import os
import re
import json
import pickle
import numpy as np
import pandas as pd
from typing import *
from rapidfuzz import fuzz
from copy import deepcopy

In [2]:
train_conv_file = os.path.expanduser("~/work/UniCRS/data/redial/train_data_dbpedia_raw.jsonl")
valid_conv_file = os.path.expanduser("~/work/UniCRS/data/redial/valid_data_dbpedia_raw.jsonl")
test_conv_file = os.path.expanduser("~/work/UniCRS/data/redial/test_data_dbpedia_raw.jsonl")
movie_file = os.path.expanduser("~/blob/raw_datasets/ml-latest/chatbot/movies.ftr")
save_path = os.path.expanduser("~/work/data/redial")

In [3]:
def read_jsonl(fpath: str) -> List[Dict]:
    res = []
    with open(fpath, 'r') as f:
        for line in f:
            data = json.loads(line)
            res.append(data)
    return res


def write_jsonl(obj: List[Dict], fpath: str) -> None:
    try:
        with open(fpath, 'w') as outfile:
            for entry in obj:
                json.dump(entry, outfile)
                outfile.write('\n')
        print("Sucessfully saved into {}.".format(fpath))
    except Exception as e:
        print(f"Error {e} raised. The temp file would be saved in {fpath}.pkl")
        with open(f"{fpath}.pkl", 'wb') as tempfile:
            pickle.dump(obj, tempfile)
    return

In [4]:
# load files
test_conv_data = read_jsonl(test_conv_file)
conv_data = read_jsonl(train_conv_file) + read_jsonl(valid_conv_file) + test_conv_data
movies_data = pd.read_feather(movie_file)
available_movies = movies_data['title'].unique()

In [5]:
def extract_numbers(data_list):  
    numbers = []  
    pattern = r'@(\d+)'  
      
    for item in data_list:  
        text = item.get('text', '')  
        matches = re.finditer(pattern, text)  
        for match in matches:  
            num_str = match.group(1)  
            if num_str not in numbers:  
                numbers.append(num_str)  
    return numbers

def separate_movie_and_year(movie_string):  
    pattern = r"^(.+)\s\((\d{4})\)$"  
    match = re.match(pattern, movie_string)  
      
    if match:  
        movie_name = match.group(1)  
        movie_year = int(match.group(2))  
        return movie_name, movie_year  
    else:  
        return movie_string, None

def attach_labels(conv, movie_ids):
    res = []
    for id in movie_ids:
        labels = conv['initiatorQuestions']
        mentions = conv['movieMentions']
        if id not in labels:
            continue

        if id not in mentions:
            continue

        title_date = conv['movieMentions'][id]
        title, date = separate_movie_and_year(title_date)
        info = {"id": id, 'title': title, 'date': date}
        info.update(labels[id])
        res.append(info)
    return res

In [6]:
# extract items from conversations
train_item = []
for conv in conv_data:
    items = extract_numbers(conv['messages'])
    train_item.append(attach_labels(conv, items))

print(len(train_item))
print(len(conv_data))
print(train_item[0])
print("Avg Len: ", sum([len(_) for _ in train_item]) / len(train_item))

train_pos_item = []
for data in train_item:
    train_pos_item.append([i for i in data if i['liked']==1])

print("Avg Pos Item: ", sum([len(_) for _ in train_pos_item]) / len(train_pos_item))
print("Total positive interactions: ", sum([len(_) for _ in train_pos_item]))

11348
11348
[{'id': '169487', 'title': 'Borat', 'date': 2006, 'suggested': 1, 'seen': 1, 'liked': 1}, {'id': '177112', 'title': 'Wedding Crashers', 'date': 2005, 'suggested': 0, 'seen': 1, 'liked': 1}, {'id': '178502', 'title': 'Ted ', 'date': 2012, 'suggested': 1, 'seen': 1, 'liked': 1}, {'id': '155645', 'title': 'Ted 2', 'date': 2015, 'suggested': 1, 'seen': 1, 'liked': 1}, {'id': '78340', 'title': 'Deadpool ', 'date': 2016, 'suggested': 1, 'seen': 1, 'liked': 1}, {'id': '135571', 'title': 'The Hangover', 'date': 2009, 'suggested': 0, 'seen': 1, 'liked': 1}]
Avg Len:  5.219950652097286
Avg Pos Item:  4.236693690518153
Total positive interactions:  48078


In [7]:
# 
def approx_equal(msg: str, target: str, thres: float=80):
    msg = re.sub(r"[^a-zA-Z0-9\s]", "", msg.lower())
    target = re.sub(r"[^a-zA-Z0-9\s]", "", target.lower())
    if fuzz.ratio(msg, target) > thres:
        return True
    else:
        return False
    

def movie_map(movie, movie_df) -> bool:
    # TODO: 
    map_dict = {}
    approx_map = {}
    title = movie['title']
    date = movie['date']
    movie_df = deepcopy(movie_df)
    # movie_df = movie_df.set_index('title')
    movie_title_set = movie_df['title'].unique()
    if title.lower() in movie_title_set:
        res = movie_df[movie_df['title']==title.lower()]
        if len(res) > 1:
            if date is not None:
                res.loc[:, 'date_diff'] = res['release_date'] - pd.to_datetime(str(date))
                res = res.loc[res['date_diff'].idxmin()]
            else:
                res = res.iloc[0]
        else:
            res = res.iloc[0]
        id = res['id']
        map_dict[(title, date)] = id
    else:
        scores = []
        for m in movie_title_set:
            scores.append(fuzz.ratio(title.lower(), m))
        max_idx = np.argmax(scores)
        max_scores = scores[max_idx]
        if max_scores >= 90:
            map_dict[(title, date)] = movie_df[movie_df['title']==movie_title_set[max_idx]].iloc[0]['id']
            approx_map[(title, date)] = movie_df.loc[movie_df['title']==movie_title_set[max_idx]].iloc[0]['id']
        else:
            map_dict[(title, date)] = -1
            
    return map_dict, approx_map


movies_from_conversation = set()

for data in train_pos_item:
    for i in data:
        if (i['title'], i['date']) not in movies_from_conversation:
            movies_from_conversation.add((i['title'], i['date']))

movies_from_conversation = [{'title': d[0].strip(), 'date': d[1]} for d in movies_from_conversation]

map_dict = {}
approx_map = {}
movie_df = deepcopy(movies_data)
movie_df['title'] = movies_data['title'].apply(lambda x: x.lower())
for movie in movies_from_conversation:
    _res, _approx = movie_map(movie, movie_df)
    map_dict.update(_res)
    approx_map.update(_approx)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.loc[:, 'date_diff'] = res['release_date'] - pd.to_datetime(str(date))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.loc[:, 'date_diff'] = res['release_date'] - pd.to_datetime(str(date))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res.loc[:, 'date_diff'] = res['release_date'] - pd.to_dat

In [8]:
sum([v>0 for v in map_dict.values()]) / len(map_dict), len(approx_map) / len(map_dict)

(0.7282400139542997, 0.04099075527646956)

In [9]:
len(train_pos_item) == len(conv_data)

True

In [10]:
no_map = 0
ukn_map = 0
all_cnt = 0
train_pos_item_mapped = []
test_pos_item_mapped = []
test_data_idx = []
for n, data in enumerate(train_pos_item):
    new_data = []
    for i in data:
        if (i['title'].strip(), i['date']) not in map_dict:
            no_map += 1
        else:
            if map_dict[(i['title'].strip(), i['date'])] < 0:
                ukn_map += 1
            else:
                new_data.append(map_dict[(i['title'].strip(), i['date'])])
        all_cnt += 1
    if (n < (len(train_pos_item)-len(test_conv_data))):
        if len(new_data) >= 2:
            train_pos_item_mapped.append(new_data)
    else:
        if len(new_data) >= 2:
            test_pos_item_mapped.append(new_data)
            test_data_idx.append(n)

print(f"No: {no_map/all_cnt}; Unknown: {ukn_map/all_cnt}")
print(f"Mapped train data:  #conv- {len(train_pos_item_mapped)}, #interactions- {sum([len(d) for d in train_pos_item_mapped])}")
print(f"Mapped train data:  #conv- {len(test_pos_item_mapped)}, #interactions- {sum([len(d) for d in test_pos_item_mapped])}")

No: 0.0; Unknown: 0.18696701193893256
Mapped train data:  #conv- 8920, #interactions- 33837
Mapped train data:  #conv- 1196, #interactions- 4427


In [12]:
selected_test_conv = [conv for i, conv in enumerate(test_conv_data) if i+len(train_pos_item)-len(test_conv_data) in test_data_idx]

In [15]:
selected_test_conv[:2]

[{'movieMentions': {'111776': 'Super Troopers (2001)',
   '91481': 'Beverly Hills Cop (1984)',
   '151656': 'Police Academy  (1984)',
   '134643': 'American Pie  (1999)',
   '192131': 'American Pie ',
   '124771': '48 Hrs. (1982)',
   '94688': 'Police Academy 2: Their First Assignment (1985)',
   '101794': 'Lethal Weapon (1987)'},
  'respondentQuestions': {'111776': {'suggested': 0, 'seen': 1, 'liked': 1},
   '91481': {'suggested': 1, 'seen': 2, 'liked': 2},
   '151656': {'suggested': 1, 'seen': 0, 'liked': 1},
   '134643': {'suggested': 0, 'seen': 1, 'liked': 1},
   '192131': {'suggested': 0, 'seen': 1, 'liked': 1},
   '124771': {'suggested': 1, 'seen': 2, 'liked': 2},
   '94688': {'suggested': 1, 'seen': 0, 'liked': 1},
   '101794': {'suggested': 1, 'seen': 0, 'liked': 2}},
  'messages': [{'timeOffset': 0,
    'text': 'Hi I am looking for a movie like @111776',
    'senderWorkerId': 956,
    'messageId': 204171,
    'entity': ['<http://dbpedia.org/resource/Hi_Records>'],
    'entity_

In [16]:
import re
def get_first_time_appear(messages):
    movies = {}
    movie_pattern = re.compile(r'@\d+')
    for i, text in enumerate(messages):
        match = re.findall(movie_pattern, text['text'])
        if match:
            for m in match:
                if m[1:] not in movies:
                    movies[m[1:]] = i
                else:
                    pass
        else:
            pass
    return movies

def merge_conversation(conv, turn):
    movieid2name = conv['movieMentions']
    def convert(match):
        movieid = match.group(0)[1:]
        if movieid in movieid2name:
            movie_name = movieid2name[movieid]
            return movie_name
        else:
            return match.group(0)
    movie_pattern = re.compile(r'@\d+')
    seeker_id = conv['initiatorWorkerId']
    reco_id = conv['respondentWorkerId']
    messages = conv['messages']
    res = ""
    for m in messages[: turn]:
        if m['senderWorkerId'] == seeker_id:
            role = "User"
        else:
            role = "Assistent"
        text = re.sub(movie_pattern, convert, m['text'])
        res += f"{role}: {text}\n"
    return res

def process_conv(conv):
    seeker_ques = conv['initiatorQuestions']
    first_appear_time = get_first_time_appear(conv['messages'])
    try:
        pos_items = {i for i, q in seeker_ques.items() if q['liked']==1 and q['suggested']==1}
    except:
        return None
    if len(pos_items) == 0:
        return None
    turn = -1
    for item in pos_items:
        if first_appear_time[item] > turn:
            target = item
            turn = first_appear_time[item]
    
    if turn <= 0:
        return None
    context = merge_conversation(conv, turn)

    return {'context': context, 'target': conv['movieMentions'][target]}

In [None]:
selected_processed_test = []
for conv in selected_test_conv[:50]:
    c = process_conv(conv)
    if c:
        selected_processed_test.append(c)
    else:
        continue

write_jsonl(selected_processed_test, os.path.join(save_path, "../test_data_50.jsonl"))

In [18]:
selected_processed_test

[{'context': 'User: Hi I am looking for a movie like Super Troopers (2001)\nAssistent: You should watch Police Academy  (1984)\nUser: Is that a great one? I have never seen it. I have seen American Pie \nUser: I mean American Pie  (1999)\n',
  'target': 'Police Academy 2: Their First Assignment (1985)'},
 {'context': "Assistent: Tell me what would you like to watch?\nUser: Easy A (2010)\nAssistent: Did you watch Avengers: Infinity War (2018) ?\nAssistent: Or The Black Panthers: Vanguard of the Revolution (2015) ?\nAssistent: i really liked it\nUser: No , I think I would like to watch Easy A (2010)\nUser: It's funny\nAssistent: But You already know what you want to watch\nAssistent: You should tell me what kind of movies you like\nAssistent: Did you watch Click  (2006) ?\nAssistent: Do you like scary movies?\n",
  'target': 'It  (2017)'},
 {'context': "User: i would like to watch any movie\nUser: Tell me any movie\nUser: Like Avengers: Infinity War (2018)\nAssistent: Have you seen The A

In [146]:
train_data_mapped = train_pos_item_mapped[: -1000]
valid_data_mapped = train_pos_item_mapped[1000:]
test_data_mapped = test_pos_item_mapped
n_user_train = len(train_data_mapped)
n_user_valid = len(valid_data_mapped)
n_user_test = len(test_data_mapped)
train_df = pd.DataFrame({'user_id':list(range(1, n_user_train+1)), 'item_id': train_data_mapped})
valid_df = pd.DataFrame({'user_id':list(range(n_user_train+1, n_user_train+1+n_user_valid)), 'item_id': valid_data_mapped})
test_df = pd.DataFrame({'user_id':list(range(n_user_train+1+n_user_valid, n_user_train+1+n_user_valid+n_user_test)), 'item_id': test_data_mapped})

In [147]:
valid_history = deepcopy(valid_df)
valid_history['item_id'] = valid_history['item_id'].apply(lambda x: x[:-1])
valid_df['item_id'] = valid_df['item_id'].apply(lambda x: x[-1])

In [148]:
test_history = deepcopy(test_df)
test_history['item_id'] = test_history['item_id'].apply(lambda x: x[:-1])
test_df['item_id'] = test_df['item_id'].apply(lambda x: x[-1])

In [149]:
user_history = pd.concat([train_df, valid_history, test_history])
user_history = user_history.explode("item_id")
user_history

Unnamed: 0,user_id,item_id
0,1,491
0,1,849
0,1,4674
0,1,512
1,2,28300
...,...,...
1193,17034,4237
1194,17035,1036
1194,17035,28
1195,17036,160


In [150]:
new_train_df = pd.concat([train_df, valid_history])
new_train_df = new_train_df.explode('item_id')
new_train_df

Unnamed: 0,user_id,item_id
0,1,491
0,1,849
0,1,4674
0,1,512
1,2,28300
...,...,...
7918,15839,27621
7918,15839,5697
7919,15840,4247
7919,15840,7155


In [152]:
new_train_df.to_csv(os.path.join(save_path, 'train.csv'), index=None)
valid_df.to_csv(os.path.join(save_path, 'valid.csv'), index=None)
test_df.to_csv(os.path.join(save_path, 'test.csv'), index=None)
user_history.to_csv(os.path.join(save_path, 'user_history.csv'), index=None)

Bad pipe message: %s [b'\x86\xbf\x91-/\x8f\\\r8\x82\xa4Y\xf8\xb8\xabjwU\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0']
Bad pipe message: %s [b"+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00"]
Bad pipe message: %s [b'\xae\xb5.\xd3\n\xe1\x03"\x05xU+;,\x1a\x0f\xfd\x8a\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0s\xc0w\x00\xc4\x00\xc3\xc0#\xc0\'\x00g\x00@\xc0r\xc0v\x00\xbe\x00\xbd\xc0\n\xc0\x14\x009\x008']
Bad pipe message: %s [b'Vz\xd3\x9e#\x8c\x7f\xacMk\xa4e7\xb3j\xa1i\xab\x00\x00\xa2\xc0\x14\xc0\n\x009\x008\x007\x006\x00\x88\x00\x87\x00', b'\x85\xc0\x

In [154]:
new_train_df.reset_index(inplace=True)
valid_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)
user_history.reset_index(inplace=True)

new_train_df.to_feather(os.path.join(save_path, 'train_redial.ftr'))
valid_df.to_feather(os.path.join(save_path, 'valid_redial.ftr'))
test_df.to_feather(os.path.join(save_path, 'test_redial.ftr'))
user_history.to_feather(os.path.join(save_path, 'user_history_redial.ftr'))

In [158]:
import random
sampled_test_idx = random.choices(range(0, len(test_conv_data)), k=100)

In [159]:
sampled_test_samples = [test_conv_data[i] for i in sampled_test_idx]

In [162]:
train_pos_item

[[{'id': '169487',
   'title': 'Borat',
   'date': 2006,
   'suggested': 1,
   'seen': 1,
   'liked': 1},
  {'id': '177112',
   'title': 'Wedding Crashers',
   'date': 2005,
   'suggested': 0,
   'seen': 1,
   'liked': 1},
  {'id': '178502',
   'title': 'Ted ',
   'date': 2012,
   'suggested': 1,
   'seen': 1,
   'liked': 1},
  {'id': '155645',
   'title': 'Ted 2',
   'date': 2015,
   'suggested': 1,
   'seen': 1,
   'liked': 1},
  {'id': '78340',
   'title': 'Deadpool ',
   'date': 2016,
   'suggested': 1,
   'seen': 1,
   'liked': 1},
  {'id': '135571',
   'title': 'The Hangover',
   'date': 2009,
   'suggested': 0,
   'seen': 1,
   'liked': 1}],
 [{'id': '184418',
   'title': 'Get Out',
   'date': 2017,
   'suggested': 1,
   'seen': 1,
   'liked': 1},
  {'id': '201085',
   'title': 'The Signal ',
   'date': 2014,
   'suggested': 1,
   'seen': 1,
   'liked': 1},
  {'id': '194237',
   'title': 'Shutter Island ',
   'date': 2010,
   'suggested': 1,
   'seen': 1,
   'liked': 1},
  {'id'

Bad pipe message: %s [b"%\xc2\x95\x83\x00k\x88\x9d:\xa0\xb2(\xef\xe1e\xf1~_\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x000\x00.\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b", b'\x08\x05\x08\x06\x04\x01\x05']
Bad pipe message: %s [b'\x8a\xd4\xc1c\xf2\x8a\xd9\x9e\xc13\xa5\xd8\x16\x94V~\xbb\xff\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/

In [160]:
sampled_test_samples

[{'movieMentions': {'84273': 'Pineapple Express  (2008)',
   '193168': 'Rush Hour  (1998)',
   '148477': 'The Blues Brothers  (1980)',
   '151102': 'Men in Black  (1997)',
   '78340': 'Deadpool  (2016)'},
  'respondentQuestions': [],
  'messages': [{'timeOffset': 0,
    'text': 'Hello! How are you?',
    'senderWorkerId': 975,
    'messageId': 204871,
    'entity': ['<http://dbpedia.org/resource/Hello>',
     '<http://dbpedia.org/resource/How_(TV_series)>'],
    'entity_name': ['Hello!', 'How'],
    'movie': [],
    'movie_name': [],
    'word_name': []},
   {'timeOffset': 5,
    'text': 'Hi I am good',
    'senderWorkerId': 976,
    'messageId': 204872,
    'entity': ['<http://dbpedia.org/resource/Hi_Records>',
     '<http://dbpedia.org/resource/Goods>'],
    'entity_name': ['Hi', 'good'],
    'movie': [],
    'movie_name': [],
    'word_name': []},
   {'timeOffset': 8,
    'text': 'How are you',
    'senderWorkerId': 976,
    'messageId': 204873,
    'entity': ['<http://dbpedia.org/r