# 1. Import libraries

In [1]:
import pandas as pd

# 2. Define utilities

In [2]:
def split_impressions(impression_str):
    """Split an impression string into clicked and non-clicked news ids.

    Each whitespace-separated token is expected to look like
    ``<news_id>-<label>`` (e.g. ``"N55689-1 N35729-0"``), where label ``1``
    marks a clicked item and label ``0`` a non-clicked one.

    Parameters
    ----------
    impression_str : str
        Raw value of the ``impressions`` column. A missing value (NaN/None)
        is treated as an impression with no items.

    Returns
    -------
    pd.Series
        Two-element Series ``[true_list, false_list]``: ids labelled ``1``
        and ids labelled ``0``, each in their original order. Tokens without
        a ``-`` separator or with any other label are ignored.
    """
    true_list = []
    false_list = []
    # Missing impressions yield two empty lists instead of raising on .split().
    if pd.isna(impression_str):
        return pd.Series([true_list, false_list])
    for item in impression_str.split():
        # Split on the LAST hyphen so the label is compared exactly and an id
        # that itself contains '-' is neither truncated nor misclassified.
        news_id, sep, label = item.rpartition('-')
        if not sep:
            continue
        if label == '1':
            true_list.append(news_id)
        elif label == '0':
            false_list.append(news_id)
    return pd.Series([true_list, false_list])

In [3]:
def parse_history(history_str):
    """Turn a whitespace-separated history string into a list of tokens.

    Returns an empty list when the value is missing (NaN/None) or contains
    only whitespace; otherwise returns the tokens in their original order.
    """
    # Missing history: nothing to split.
    if pd.isna(history_str):
        return []
    cleaned = history_str.strip()
    # Blank history after trimming is also reported as an empty list.
    if cleaned == '':
        return []
    return cleaned.split()

# 3. Parse Dataset

In [4]:
# Column names to assign, since the file is read with header=None.
column_names = ['impression_id', 'user_id', 'time', 'history', 'impressions']
file_path = './dataset/MIND-small/train/behaviors.tsv'
# Read the tab-separated training behaviors into a DataFrame.
train_beh_df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=column_names,
)

In [5]:
# Preview the first rows of the training behaviors to sanity-check the parse.
train_beh_df.head()

Unnamed: 0,impression_id,user_id,time,history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [6]:
# Column names to assign, since the file is read with header=None.
column_names = ['impression_id', 'user_id', 'time', 'history', 'impressions']
file_path = './dataset/MIND-small/dev/behaviors.tsv'
# Read the tab-separated dev behaviors into a DataFrame.
dev_beh_df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=column_names,
)

In [7]:
# Preview the first rows of the dev behaviors to sanity-check the parse.
dev_beh_df.head()

Unnamed: 0,impression_id,user_id,time,history,impressions
0,1,U80234,11/15/2019 12:37:50 PM,N55189 N46039 N51741 N53234 N11276 N264 N40716...,N28682-0 N48740-0 N31958-1 N34130-0 N6916-0 N5...
1,2,U60458,11/15/2019 7:11:50 AM,N58715 N32109 N51180 N33438 N54827 N28488 N611...,N20036-0 N23513-1 N32536-0 N46976-0 N35216-0 N...
2,3,U44190,11/15/2019 9:55:12 AM,N56253 N1150 N55189 N16233 N61704 N51706 N5303...,N36779-0 N62365-0 N58098-0 N5472-0 N13408-0 N5...
3,4,U87380,11/15/2019 3:12:46 PM,N63554 N49153 N28678 N23232 N43369 N58518 N444...,N6950-0 N60215-0 N6074-0 N11930-0 N6916-0 N248...
4,5,U9444,11/15/2019 8:25:46 AM,N51692 N18285 N26015 N22679 N55556,N5940-1 N23513-0 N49285-0 N23355-0 N19990-0 N3...


# 4. Combine train & dev set, sort by impression time

In [8]:
# Stack train and dev behaviors into one frame with a fresh 0..n-1 index.
# NOTE: impression_id restarts at 1 in each split (see the two previews above),
# so it is not a unique key in the combined frame.
all_beh_df = pd.concat([train_beh_df, dev_beh_df], ignore_index=True)
# Parse timestamps with an explicit format (e.g. "11/11/2019 9:05:58 AM") so the
# month/day order is unambiguous and pandas does not have to infer it.
# Values that do not match the format become NaT because of errors='coerce'.
all_beh_df['time'] = pd.to_datetime(
    all_beh_df['time'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce',
)
# Order impressions chronologically and rebuild the index, so row labels used
# in later cells refer to positions in this time-sorted frame.
all_beh_df = all_beh_df.sort_values(by='time').reset_index(drop=True)
# Convert the raw history string into a list of news ids.
all_beh_df['history'] = all_beh_df['history'].apply(parse_history)
# Split each impression string into clicked / non-clicked id lists.
all_beh_df[['true_imp', 'false_imp']] = all_beh_df['impressions'].apply(split_impressions)
# The raw impressions string is no longer needed once it has been split.
all_beh_df = all_beh_df.drop(columns=['impressions'])

In [9]:
# Preview the combined, time-sorted frame with the parsed list columns.
all_beh_df.head()

Unnamed: 0,impression_id,user_id,time,history,true_imp,false_imp
0,20112,U65916,2019-11-09 00:00:19,"[N51706, N40767, N12096, N9798, N38802, N54827...","[N46057, N55582, N41858]","[N54300, N57005, N52154, N57099, N31002, N3710..."
1,13807,U49985,2019-11-09 00:01:13,"[N5056, N29975, N53234, N39603, N50032, N8422,...","[N57768, N50135, N16560, N25785, N3491]","[N20602, N50059, N15134, N61880, N64536, N3710..."
2,27660,U25550,2019-11-09 00:02:44,"[N17260, N38298, N33976, N47719, N14888, N1887...",[N52433],"[N50135, N15134, N20602, N64536]"
3,152217,U19710,2019-11-09 00:02:50,"[N3530, N48284, N43019, N62546, N138, N13138, ...",[N43083],"[N57099, N30295, N21086, N5379, N57005, N31002..."
4,42166,U38106,2019-11-09 00:03:09,"[N16874, N264, N48697, N51366]",[N48925],"[N3491, N20602, N25785, N23575, N38783, N18708..."


# 5. Print U36050's impressions (history, clicked impressions)

In [10]:
# Keep only the rows logged for user U36050; original row labels are retained.
u36050_df = all_beh_df.loc[all_beh_df['user_id'].eq('U36050')]

In [13]:
# Clicked items for each of this user's impressions.
print(u36050_df['true_imp'])
# Print every impression's history and its length. Iterating over the rows
# avoids hard-coded row labels (which change if the sort order changes) and
# labels each line with the row it actually describes.
for imp_idx, history in u36050_df['history'].items():
    print(history)
    print(f"length of imp{imp_idx}'s history: ", len(history))

7563      [N18708, N39267, N33981]
144150     [N11269, N6390, N38442]
156968             [N5981, N16120]
Name: true_imp, dtype: object
['N39556', 'N459', 'N57300', 'N4643', 'N16545', 'N4501', 'N10059', 'N719', 'N51483', 'N56446', 'N32868', 'N19620', 'N61084', 'N10865', 'N16625', 'N19638', 'N31193', 'N25577', 'N12096', 'N48216', 'N20110', 'N18275', 'N28115', 'N25525', 'N29453', 'N16715', 'N24298', 'N18355', 'N4985', 'N6506', 'N14761', 'N1864', 'N8148', 'N46811']
length of imp7563's history:  34
['N39556', 'N459', 'N57300', 'N4643', 'N16545', 'N4501', 'N10059', 'N719', 'N51483', 'N56446', 'N32868', 'N19620', 'N61084', 'N10865', 'N16625', 'N19638', 'N31193', 'N25577', 'N12096', 'N48216', 'N20110', 'N18275', 'N28115', 'N25525', 'N29453', 'N16715', 'N24298', 'N18355', 'N4985', 'N6506', 'N14761', 'N1864', 'N8148', 'N46811']
length of imp7563's history:  34
['N39556', 'N459', 'N57300', 'N4643', 'N16545', 'N4501', 'N10059', 'N719', 'N51483', 'N56446', 'N32868', 'N19620', 'N61084', 'N10865', 'N

# 6. Print U65916's impressions (history, clicked impressions)

In [19]:
# Keep only the rows logged for user U65916; original row labels are retained.
u65916_df = all_beh_df.loc[all_beh_df['user_id'].eq('U65916')]

In [21]:
# Clicked items for each of this user's impressions.
print(u65916_df['true_imp'])
# Print every impression's history and its length. Iterating over the rows
# avoids hard-coded row labels (which change if the sort order changes) and
# labels each line with the row it actually describes.
for imp_idx, history in u65916_df['history'].items():
    print(history)
    print(f"length of imp{imp_idx}'s history: ", len(history))

0         [N46057, N55582, N41858]
28403                     [N40495]
94576              [N7821, N17289]
126677                    [N41612]
Name: true_imp, dtype: object
['N51706', 'N40767', 'N12096', 'N9798', 'N38802', 'N54827', 'N57801', 'N5184', 'N23653', 'N44007', 'N36443', 'N43142', 'N11143', 'N53652', 'N2678']
length of imp7563's history:  15
['N51706', 'N40767', 'N12096', 'N9798', 'N38802', 'N54827', 'N57801', 'N5184', 'N23653', 'N44007', 'N36443', 'N43142', 'N11143', 'N53652', 'N2678']
length of imp7563's history:  15
['N51706', 'N40767', 'N12096', 'N9798', 'N38802', 'N54827', 'N57801', 'N5184', 'N23653', 'N44007', 'N36443', 'N43142', 'N11143', 'N53652', 'N2678']
length of imp7563's history:  15
['N51706', 'N40767', 'N12096', 'N9798', 'N38802', 'N54827', 'N57801', 'N5184', 'N23653', 'N44007', 'N36443', 'N43142', 'N11143', 'N53652', 'N2678']
length of imp7563's history:  15
