# 1. Import libraries

In [1]:
import pandas as pd

# 2. Define utility functions

In [2]:
def split_impressions(impression_str):
    """Split an impression string into label-1 and label-0 news IDs.

    Each whitespace-separated token is expected to have the form
    ``<news_id>-<label>``. Tokens labelled ``1`` go to the first list and
    tokens labelled ``0`` go to the second (presumably clicked vs. not
    clicked; confirm against the dataset description).

    Args:
        impression_str: Raw impression field, e.g. ``"N1-1 N2-0"``. A missing
            value (NaN/None) or a blank string yields two empty lists.

    Returns:
        pd.Series of length 2: ``[label_1_ids, label_0_ids]``, each a list of
        news-ID strings in their original order. Tokens without a hyphen or
        with any other label are ignored.
    """
    true_list = []
    false_list = []

    # pandas reads an empty field as NaN (a float), which has no .split().
    if pd.isna(impression_str):
        return pd.Series([true_list, false_list])

    for item in impression_str.split():
        # Split on the LAST hyphen and compare the label exactly, so an ID
        # that itself contains '-1' or '-0' is neither misclassified nor cut.
        news_id, sep, label = item.rpartition('-')
        if not sep:
            continue
        if label == '1':
            true_list.append(news_id)
        elif label == '0':
            false_list.append(news_id)
    return pd.Series([true_list, false_list])

In [3]:
def parse_history(history_str):
    """Turn a whitespace-separated history string into a list of tokens.

    Args:
        history_str: Raw history field; may be a missing value (NaN/None).

    Returns:
        List of the whitespace-separated tokens, or an empty list when the
        value is missing or contains only whitespace.
    """
    if pd.isna(history_str):
        return []
    # Splitting a blank string yields [], so no separate emptiness check is needed.
    tokens = history_str.strip().split()
    return tokens

# 3. Parse Dataset

In [4]:
# Load the training behaviors file, parse timestamps, order rows by time,
# and expand the raw history / impressions fields into Python lists.
file_path = './dataset/MIND-small/train/behaviors.tsv'
column_names = ['impression_id', 'user_id', 'time', 'history', 'impressions']
train_beh_df = (
    pd.read_csv(file_path, sep='\t', names=column_names, header=None)
    # Unparseable timestamps become NaT rather than raising.
    .assign(time=lambda df: pd.to_datetime(df['time'], errors='coerce'))
    .sort_values(by='time')
    .reset_index(drop=True)
    .assign(history=lambda df: df['history'].apply(parse_history))
)
# split_impressions returns a 2-element Series per row -> two new columns.
train_beh_df[['true_imp', 'false_imp']] = train_beh_df['impressions'].apply(split_impressions)
train_beh_df = train_beh_df.drop(columns='impressions')

In [5]:
# Preview the five earliest training rows (the frame is sorted by time).
train_beh_df.head(n=5)

Unnamed: 0,impression_id,user_id,time,history,true_imp,false_imp
0,20112,U65916,2019-11-09 00:00:19,"[N51706, N40767, N12096, N9798, N38802, N54827...","[N46057, N55582, N41858]","[N54300, N57005, N52154, N57099, N31002, N3710..."
1,13807,U49985,2019-11-09 00:01:13,"[N5056, N29975, N53234, N39603, N50032, N8422,...","[N57768, N50135, N16560, N25785, N3491]","[N20602, N50059, N15134, N61880, N64536, N3710..."
2,27660,U25550,2019-11-09 00:02:44,"[N17260, N38298, N33976, N47719, N14888, N1887...",[N52433],"[N50135, N15134, N20602, N64536]"
3,152217,U19710,2019-11-09 00:02:50,"[N3530, N48284, N43019, N62546, N138, N13138, ...",[N43083],"[N57099, N30295, N21086, N5379, N57005, N31002..."
4,42166,U38106,2019-11-09 00:03:09,"[N16874, N264, N48697, N51366]",[N48925],"[N3491, N20602, N25785, N23575, N38783, N18708..."


In [6]:
# Preview the five latest training rows (the frame is sorted by time).
train_beh_df.tail(n=5)

Unnamed: 0,impression_id,user_id,time,history,true_imp,false_imp
156960,66234,U717,2019-11-14 23:58:46,"[N54822, N46392, N27863, N13138, N40448, N14006]",[N61233],"[N7494, N46917, N62197, N2960, N22978, N57081,..."
156961,36004,U44395,2019-11-14 23:58:47,"[N38488, N11231, N14761, N21164, N42128, N7328...",[N30290],"[N48487, N41934, N64037, N63913, N55322, N1447..."
156962,105363,U41595,2019-11-14 23:58:51,[],[N23513],"[N14478, N7342, N48487, N29490, N27737, N47781..."
156963,108433,U75895,2019-11-14 23:59:06,"[N1300, N9803, N14114, N31996]",[N61233],"[N29490, N22975, N27737, N6837, N47652, N14478..."
156964,90252,U82996,2019-11-14 23:59:13,"[N39556, N22279, N56461, N33393, N6233, N33617...","[N51253, N42767]","[N3678, N41934, N36786, N23535, N29490, N50055..."


In [7]:
# Load the dev behaviors file, parse timestamps, order rows by time,
# and expand the raw history / impressions fields into Python lists.
file_path = './dataset/MIND-small/dev/behaviors.tsv'
column_names = ['impression_id', 'user_id', 'time', 'history', 'impressions']
dev_beh_df = (
    pd.read_csv(file_path, sep='\t', names=column_names, header=None)
    # Unparseable timestamps become NaT rather than raising.
    .assign(time=lambda df: pd.to_datetime(df['time'], errors='coerce'))
    .sort_values(by='time')
    .reset_index(drop=True)
    .assign(history=lambda df: df['history'].apply(parse_history))
)
# split_impressions returns a 2-element Series per row -> two new columns.
dev_beh_df[['true_imp', 'false_imp']] = dev_beh_df['impressions'].apply(split_impressions)
dev_beh_df = dev_beh_df.drop(columns='impressions')

In [8]:
# Preview the five earliest dev rows (the frame is sorted by time).
dev_beh_df.head(n=5)

Unnamed: 0,impression_id,user_id,time,history,true_imp,false_imp
0,23333,U90541,2019-11-15 00:00:01,[],"[N12627, N36931, N29490]","[N32567, N6645, N35693, N35576, N21914, N37204..."
1,12622,U15098,2019-11-15 00:00:05,"[N41241, N52026, N36360, N31550]",[N26125],"[N39149, N49633, N10552, N35773, N59814, N1897..."
2,29761,U83813,2019-11-15 00:00:06,[N59027],"[N54803, N38215, N29490, N53346, N60592]","[N38915, N41717, N54792, N3174, N57403, N57081..."
3,11912,U36050,2019-11-15 00:00:13,"[N39556, N459, N57300, N4643, N16545, N4501, N...","[N5981, N16120]","[N45868, N62197, N54803, N23535, N57403, N5695..."
4,28191,U73479,2019-11-15 00:00:15,"[N38963, N55743, N17953, N23593, N10629, N4804...",[N38311],"[N7245, N32567, N47612, N48487, N50484, N7419,..."


In [9]:
# Preview the five latest dev rows (the frame is sorted by time).
dev_beh_df.tail(n=5)

Unnamed: 0,impression_id,user_id,time,history,true_imp,false_imp
73147,68029,U85340,2019-11-15 23:56:23,"[N61864, N61864, N61864, N61864, N61864, N6254...",[N47020],"[N21681, N30598]"
73148,32519,U74555,2019-11-15 23:56:34,"[N22839, N41219, N18942, N54276, N24421, N1621...","[N62949, N42180]","[N54562, N4734, N60215, N40575, N29862, N23195..."
73149,18025,U69320,2019-11-15 23:56:44,"[N39074, N51076, N31057, N1362, N6922, N62634,...",[N56434],"[N42634, N58848, N11390, N37204, N58188, N6104..."
73150,28994,U36875,2019-11-15 23:57:48,"[N52632, N10059, N46520, N62006, N65123, N5003...",[N63919],"[N18151, N41069, N26953, N54988, N21681, N1155..."
73151,465,U36875,2019-11-15 23:58:03,"[N52632, N10059, N46520, N62006, N65123, N5003...",[N35340],"[N12409, N47825, N54562, N30598, N56080, N60085]"


# 4. Combine train & dev set, sort by impression time

In [10]:
# Stack train and dev rows into one frame and re-sort the result by time.
all_beh_df = (
    pd.concat([train_beh_df, dev_beh_df], ignore_index=True)
    .assign(time=lambda df: pd.to_datetime(df['time'], errors='coerce'))
    .sort_values(by='time')
    .reset_index(drop=True)
)

In [11]:
# Preview the five earliest rows of the combined frame.
all_beh_df.head(n=5)

Unnamed: 0,impression_id,user_id,time,history,true_imp,false_imp
0,20112,U65916,2019-11-09 00:00:19,"[N51706, N40767, N12096, N9798, N38802, N54827...","[N46057, N55582, N41858]","[N54300, N57005, N52154, N57099, N31002, N3710..."
1,13807,U49985,2019-11-09 00:01:13,"[N5056, N29975, N53234, N39603, N50032, N8422,...","[N57768, N50135, N16560, N25785, N3491]","[N20602, N50059, N15134, N61880, N64536, N3710..."
2,27660,U25550,2019-11-09 00:02:44,"[N17260, N38298, N33976, N47719, N14888, N1887...",[N52433],"[N50135, N15134, N20602, N64536]"
3,152217,U19710,2019-11-09 00:02:50,"[N3530, N48284, N43019, N62546, N138, N13138, ...",[N43083],"[N57099, N30295, N21086, N5379, N57005, N31002..."
4,42166,U38106,2019-11-09 00:03:09,"[N16874, N264, N48697, N51366]",[N48925],"[N3491, N20602, N25785, N23575, N38783, N18708..."


In [12]:
# Preview the five latest rows of the combined frame.
all_beh_df.tail(n=5)

Unnamed: 0,impression_id,user_id,time,history,true_imp,false_imp
230112,68029,U85340,2019-11-15 23:56:23,"[N61864, N61864, N61864, N61864, N61864, N6254...",[N47020],"[N21681, N30598]"
230113,32519,U74555,2019-11-15 23:56:34,"[N22839, N41219, N18942, N54276, N24421, N1621...","[N62949, N42180]","[N54562, N4734, N60215, N40575, N29862, N23195..."
230114,18025,U69320,2019-11-15 23:56:44,"[N39074, N51076, N31057, N1362, N6922, N62634,...",[N56434],"[N42634, N58848, N11390, N37204, N58188, N6104..."
230115,28994,U36875,2019-11-15 23:57:48,"[N52632, N10059, N46520, N62006, N65123, N5003...",[N63919],"[N18151, N41069, N26953, N54988, N21681, N1155..."
230116,465,U36875,2019-11-15 23:58:03,"[N52632, N10059, N46520, N62006, N65123, N5003...",[N35340],"[N12409, N47825, N54562, N30598, N56080, N60085]"


# 4. history에 등장하는 뉴스 개수

In [32]:
# Unique news IDs that occur in any row's history list.
history_news = {
    news_id
    for history in all_beh_df['history']
    for news_id in history
}
print("history에 있는 뉴스 ID 개수: ", len(history_news))

history에 있는 뉴스 ID 개수:  44908


# 5. impression에 등장하는 뉴스 개수

In [33]:
# Unique news IDs that occur in any impression, whichever label they carry.
true_news = {news_id for imp in all_beh_df['true_imp'] for news_id in imp}
false_news = {news_id for imp in all_beh_df['false_imp'] for news_id in imp}
imp_news = true_news | false_news
print("impression에 있는 뉴스 ID 개수: ", len(imp_news))

impression에 있는 뉴스 ID 개수:  22771


# 6. 교집합 및 차집합 뉴스 개수

In [34]:
# imp_news ∩ history_news: news IDs that appear in both history and impressions
intersection = history_news & imp_news

# imp_news - history_news: news IDs seen in impressions but in no history
imp_not_in_hist = imp_news.difference(history_news)

# history_news - imp_news: news IDs seen in history but in no impression
hist_not_in_imp = history_news.difference(imp_news)

# Report the size of each group
print(f"겹치는 뉴스 ID 개수: {len(intersection)}")
print(f"impressions에만 있는 뉴스 ID 개수: {len(imp_not_in_hist)}")
print(f"history에만 있는 뉴스 ID 개수: {len(hist_not_in_imp)}")

겹치는 뉴스 ID 개수: 2441
impressions에만 있는 뉴스 ID 개수: 20330
history에만 있는 뉴스 ID 개수: 42467


# 5. Print U36050's impressions on combined set (clicked behaviors, history)

In [16]:
# All rows of the combined frame that belong to user U36050.
u36050_all_df = all_beh_df.loc[all_beh_df['user_id'].eq('U36050')]

In [17]:
# Show U36050's true_imp lists, then each row's history and its length.
# Rows are iterated directly instead of being looked up by hard-coded index
# labels, so the cell does not raise KeyError when the data or the sort
# order (and therefore the labels) change.
print(u36050_all_df['true_imp'])
for imp_idx, imp_history in u36050_all_df['history'].items():
    print(imp_history)
    print(f"length of imp{imp_idx}'s history: ", len(imp_history))

7562      [N18708, N39267, N33981]
144150     [N11269, N6390, N38442]
156968             [N5981, N16120]
Name: true_imp, dtype: object
['N39556', 'N459', 'N57300', 'N4643', 'N16545', 'N4501', 'N10059', 'N719', 'N51483', 'N56446', 'N32868', 'N19620', 'N61084', 'N10865', 'N16625', 'N19638', 'N31193', 'N25577', 'N12096', 'N48216', 'N20110', 'N18275', 'N28115', 'N25525', 'N29453', 'N16715', 'N24298', 'N18355', 'N4985', 'N6506', 'N14761', 'N1864', 'N8148', 'N46811']
length of imp7562's history:  34
['N39556', 'N459', 'N57300', 'N4643', 'N16545', 'N4501', 'N10059', 'N719', 'N51483', 'N56446', 'N32868', 'N19620', 'N61084', 'N10865', 'N16625', 'N19638', 'N31193', 'N25577', 'N12096', 'N48216', 'N20110', 'N18275', 'N28115', 'N25525', 'N29453', 'N16715', 'N24298', 'N18355', 'N4985', 'N6506', 'N14761', 'N1864', 'N8148', 'N46811']
length of imp144150's history:  34
['N39556', 'N459', 'N57300', 'N4643', 'N16545', 'N4501', 'N10059', 'N719', 'N51483', 'N56446', 'N32868', 'N19620', 'N61084', 'N10865', 

# 6. Print U36050's impressions on training set (clicked behaviors, history)

In [18]:
# All training rows that belong to user U36050.
u36050_train_df = train_beh_df.loc[train_beh_df['user_id'].eq('U36050')]

In [19]:
# Show U36050's true_imp lists in the training set, then each row's history
# and its length. Iterating the filtered rows avoids hard-coded index labels
# that raise KeyError when the data or the sort order changes.
print(u36050_train_df['true_imp'])
for imp_idx, imp_history in u36050_train_df['history'].items():
    print(imp_history)
    print(f"length of imp{imp_idx}'s history: ", len(imp_history))

7562      [N18708, N39267, N33981]
144150     [N11269, N6390, N38442]
Name: true_imp, dtype: object
['N39556', 'N459', 'N57300', 'N4643', 'N16545', 'N4501', 'N10059', 'N719', 'N51483', 'N56446', 'N32868', 'N19620', 'N61084', 'N10865', 'N16625', 'N19638', 'N31193', 'N25577', 'N12096', 'N48216', 'N20110', 'N18275', 'N28115', 'N25525', 'N29453', 'N16715', 'N24298', 'N18355', 'N4985', 'N6506', 'N14761', 'N1864', 'N8148', 'N46811']
length of imp7562's history:  34
['N39556', 'N459', 'N57300', 'N4643', 'N16545', 'N4501', 'N10059', 'N719', 'N51483', 'N56446', 'N32868', 'N19620', 'N61084', 'N10865', 'N16625', 'N19638', 'N31193', 'N25577', 'N12096', 'N48216', 'N20110', 'N18275', 'N28115', 'N25525', 'N29453', 'N16715', 'N24298', 'N18355', 'N4985', 'N6506', 'N14761', 'N1864', 'N8148', 'N46811']
length of imp144150's history:  34


# 7. Print U36050's impressions on validation set (clicked behaviors, history)

In [20]:
# All dev rows that belong to user U36050.
u36050_dev_df = dev_beh_df.loc[dev_beh_df['user_id'].eq('U36050')]

In [21]:
# Show U36050's true_imp lists in the dev set.
print(u36050_dev_df.loc[:, 'true_imp'])

3    [N5981, N16120]
Name: true_imp, dtype: object


In [22]:
# Print each of U36050's dev-set histories with its length. Iterating the
# filtered rows avoids the hard-coded index label, which raises KeyError when
# the data or the sort order changes.
for imp_idx, imp_history in u36050_dev_df['history'].items():
    print(imp_history)
    print(f"length of imp{imp_idx}'s history: ", len(imp_history))

['N39556', 'N459', 'N57300', 'N4643', 'N16545', 'N4501', 'N10059', 'N719', 'N51483', 'N56446', 'N32868', 'N19620', 'N61084', 'N10865', 'N16625', 'N19638', 'N31193', 'N25577', 'N12096', 'N48216', 'N20110', 'N18275', 'N28115', 'N25525', 'N29453', 'N16715', 'N24298', 'N18355', 'N4985', 'N6506', 'N14761', 'N1864', 'N8148', 'N46811']
length of imp3's history:  34


# 8. Print U65916's impressions on combined set (clicked behaviors, history)

In [23]:
# All rows of the combined frame that belong to user U65916.
u65916_all_df = all_beh_df.loc[all_beh_df['user_id'].eq('U65916')]

In [24]:
# Show U65916's true_imp lists, then each row's history and its length.
# Rows are iterated directly instead of being looked up by hard-coded index
# labels, so the cell does not raise KeyError when the data or the sort
# order (and therefore the labels) change.
print(u65916_all_df['true_imp'])
for imp_idx, imp_history in u65916_all_df['history'].items():
    print(imp_history)
    print(f"length of imp{imp_idx}'s history: ", len(imp_history))

0         [N46057, N55582, N41858]
28403                     [N40495]
94576              [N7821, N17289]
126677                    [N41612]
Name: true_imp, dtype: object
['N51706', 'N40767', 'N12096', 'N9798', 'N38802', 'N54827', 'N57801', 'N5184', 'N23653', 'N44007', 'N36443', 'N43142', 'N11143', 'N53652', 'N2678']
length of imp0's history:  15
['N51706', 'N40767', 'N12096', 'N9798', 'N38802', 'N54827', 'N57801', 'N5184', 'N23653', 'N44007', 'N36443', 'N43142', 'N11143', 'N53652', 'N2678']
length of imp28403's history:  15
['N51706', 'N40767', 'N12096', 'N9798', 'N38802', 'N54827', 'N57801', 'N5184', 'N23653', 'N44007', 'N36443', 'N43142', 'N11143', 'N53652', 'N2678']
length of imp94576's history:  15
['N51706', 'N40767', 'N12096', 'N9798', 'N38802', 'N54827', 'N57801', 'N5184', 'N23653', 'N44007', 'N36443', 'N43142', 'N11143', 'N53652', 'N2678']
length of imp126677's history:  15


# 9. Print U65916's impressions on training set (clicked behaviors, history)

In [25]:
# All training rows that belong to user U65916.
u65916_train_df = train_beh_df.loc[train_beh_df['user_id'].eq('U65916')]

In [26]:
# Show U65916's true_imp lists in the training set.
print(u65916_train_df.loc[:, 'true_imp'])

0         [N46057, N55582, N41858]
28403                     [N40495]
94576              [N7821, N17289]
126677                    [N41612]
Name: true_imp, dtype: object


In [27]:
# Print each of U65916's training-set histories with its length. Iterating
# the filtered rows avoids hard-coded index labels that raise KeyError when
# the data or the sort order changes.
for imp_idx, imp_history in u65916_train_df['history'].items():
    print(imp_history)
    print(f"length of imp{imp_idx}'s history: ", len(imp_history))

['N51706', 'N40767', 'N12096', 'N9798', 'N38802', 'N54827', 'N57801', 'N5184', 'N23653', 'N44007', 'N36443', 'N43142', 'N11143', 'N53652', 'N2678']
length of imp0's history:  15
['N51706', 'N40767', 'N12096', 'N9798', 'N38802', 'N54827', 'N57801', 'N5184', 'N23653', 'N44007', 'N36443', 'N43142', 'N11143', 'N53652', 'N2678']
length of imp28403's history:  15
['N51706', 'N40767', 'N12096', 'N9798', 'N38802', 'N54827', 'N57801', 'N5184', 'N23653', 'N44007', 'N36443', 'N43142', 'N11143', 'N53652', 'N2678']
length of imp94576's history:  15
['N51706', 'N40767', 'N12096', 'N9798', 'N38802', 'N54827', 'N57801', 'N5184', 'N23653', 'N44007', 'N36443', 'N43142', 'N11143', 'N53652', 'N2678']
length of imp126677's history:  15


# 10. Print U65916's impressions on validation set (clicked behaviors, history)

In [28]:
# All dev rows that belong to user U65916 (may be empty).
u65916_dev_df = dev_beh_df.loc[dev_beh_df['user_id'].eq('U65916')]

In [29]:
# Show U65916's true_imp lists in the dev set.
print(u65916_dev_df.loc[:, 'true_imp'])

Series([], Name: true_imp, dtype: object)
