# Preperation of MIND dataset for Neural Collaborative Filtering

In [1]:
import csv
import numpy as np
import pandas as pd
import scipy.sparse as sp

In [2]:
behaviors = pd.read_csv("../../../data/mind_small_train/behaviors_processed.csv")
news = pd.read_csv("../../../data/mind_small_train/news_processed.csv")

In [3]:
behaviors.drop_duplicates(subset="user_id", inplace=True)
behaviors.drop(['impression_id', 'time'], axis=1, inplace=True)

In [4]:
behaviors['history_list'] = behaviors.history.str.split()

In [5]:
behav = behaviors[behaviors['history_list'].map(len) >= 5].copy()

In [6]:
behav.reset_index(inplace=True, drop=True)

In [7]:
behav

Unnamed: 0,user_id,history,labels,history_list
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,"[N55189, N42782, N34694, N45794, N18445, N6330..."
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,"[N31739, N6072, N63045, N23979, N35656, N43353..."
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,"[N10732, N25792, N7563, N21087, N41087, N5445,..."
3,U34670,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,"[N45729, N2203, N871, N53880, N41375, N43142, ..."
4,U19739,N39074 N14343 N32607 N32320 N22007 N442 N19001...,N21119-1 N53696-0 N33619-1 N25722-0 N2869-0,"[N39074, N14343, N32607, N32320, N22007, N442,..."
...,...,...,...,...
40326,U36425,N56253 N55189 N6233 N11894 N5183 N10414 N64467...,N38783-0 N57097-1 N63478-0 N11830-0,"[N56253, N55189, N6233, N11894, N5183, N10414,..."
40327,U6794,N20059 N27448 N42458 N2203 N3595 N22058 N54416...,N20079-0 N52773-0 N26142-1 N31978-0,"[N20059, N27448, N42458, N2203, N3595, N22058,..."
40328,U23127,N51591 N18073 N3653 N49640 N56253 N41049 N2203...,N28248-0 N58660-0 N5652-0 N16589-0 N58814-0 N1...,"[N51591, N18073, N3653, N49640, N56253, N41049..."
40329,U43157,N62285 N43086 N17254 N64775 N24721 N12988 N304...,N62688-1 N38960-0 N7821-0 N29952-0 N43368-0 N2...,"[N62285, N43086, N17254, N64775, N24721, N1298..."


## Create User Article Interaction Table

In [8]:
uai = behav.set_index('user_id').history.str.split(' ', expand=True)
uai = uai.stack().reset_index(1, drop=True).reset_index(name='article')

In [9]:
uai.head(10)

Unnamed: 0,user_id,article
0,U13740,N55189
1,U13740,N42782
2,U13740,N34694
3,U13740,N45794
4,U13740,N18445
5,U13740,N63302
6,U13740,N10414
7,U13740,N19347
8,U13740,N31801
9,U91836,N31739


In [10]:
uai_array = uai.to_numpy()

In [11]:
uai_train = []
uai_test = []
user_init = uai_array[0][0]

for row in uai_array:
    row = list(row)
    user = row[0]
    if user == user_init:
        uai_train.append(row)
        user_init = user
        last_row = row
    elif user != user_init:
        uai_train.pop()
        uai_test.append(last_row)
        user_init = user
        
uai_test.append(uai_train.pop())

In [12]:
uai_test[:5]

[['U13740', 'N31801'],
 ['U91836', 'N25785'],
 ['U73700', 'N18870'],
 ['U34670', 'N51891'],
 ['U19739', 'N52121']]

In [13]:
train_articles = [elem[1] for elem in uai_train]
test_articles = [elem[1] for elem in uai_test]

In [14]:
articles_to_drop = set(test_articles)-set(train_articles)

In [15]:
uai_test_red = [ele for ele in uai_test if ele[1] not in articles_to_drop]
uai_test_red_articles = [ele[1] for ele in uai_test_red]

In [16]:
uai_train_df = pd.DataFrame(uai_train, columns=uai.columns)
uai_test_df = pd.DataFrame(uai_test_red, columns=uai.columns)

In [17]:
uai_train_df['user_id_code'] = uai_train_df.user_id.astype('category').cat.codes

In [18]:
uai_train_df['article_id_code'] = uai_train_df.article.astype('category').cat.codes

In [19]:
uai_train_df.head()

Unnamed: 0,user_id,article,user_id_code,article_id_code
0,U13740,N55189,1810,24230
1,U13740,N42782,1810,17587
2,U13740,N34694,1810,13230
3,U13740,N45794,1810,19224
4,U13740,N18445,1810,4500


In [20]:
uai_train_df.article.nunique()

31415

In [21]:
user_code_dict = pd.Series(uai_train_df.user_id_code.values,
                           index=uai_train_df.user_id).to_dict()

In [22]:
article_code_dict = pd.Series(uai_train_df.article_id_code.values,
                              index=uai_train_df.article).to_dict()

In [23]:
uai_test_df['user_id_code'] = [user_code_dict[user] for user in uai_test_df.user_id]

In [24]:
uai_test_df['article_id_code'] = [article_code_dict[art] for art in uai_test_df.article]

In [25]:
uai_test_df.head()

Unnamed: 0,user_id,article,user_id_code,article_id_code
0,U13740,N31801,1810,11677
1,U91836,N25785,39005,8499
2,U73700,N18870,30277,4749
3,U34670,N51891,11801,22468
4,U19739,N52121,4695,22601


In [26]:
uai_train_df.to_csv("small_train.csv", index=False)

In [27]:
uai_test_df.to_csv("small_test.csv", index=False)

## Create Interaction Matrix

In [28]:
train_filename = "small_train.csv"

In [29]:
num_users, num_articles = 0, 0
with open(train_filename, "r") as f:
    header = f.readline()
    line = f.readline()
    while line != None and line != "":
        line_list = line.split(",")
        u, i = int(line_list[2]), int(line_list[3])
        num_users = max(num_users, u)
        num_articles = max(num_articles, i)
        line = f.readline()

In [30]:
num_users, num_articles

(40330, 31414)

In [31]:
mat = sp.dok_matrix((num_users+1, num_articles+1), dtype=np.float32)

with open(train_filename, "r") as f:
    header = f.readline()
    line = f.readline()
    while line != None and line != "":
        line_list = line.split(",")
        user, article = int(line_list[2]), int(line_list[3])
        mat[user, article] = 1.0
        line = f.readline()

## Create Test Negative File

In [32]:
uai_test_df

Unnamed: 0,user_id,article,user_id_code,article_id_code
0,U13740,N31801,1810,11677
1,U91836,N25785,39005,8499
2,U73700,N18870,30277,4749
3,U34670,N51891,11801,22468
4,U19739,N52121,4695,22601
...,...,...,...,...
39841,U36425,N38783,12622,15460
39842,U6794,N47847,27522,20329
39843,U23127,N13429,6303,1810
39844,U43157,N14006,15721,2107


In [33]:
ua_tuples = list(zip(uai_test_df.user_id_code, uai_test_df.article_id_code))
ua_tuples[:5]

[(1810, 11677), (39005, 8499), (30277, 4749), (11801, 22468), (4695, 22601)]

In [34]:
num_negatives = 99
num_articles = uai_train_df.article.nunique()

In [35]:
complete_list = []
for u, i in ua_tuples:
    negatives = []
    for t in range(num_negatives):
        j = np.random.randint(num_articles)
        while (u, j) in mat.keys():
            j = np.random.randint(num_articles)
        negatives.append(j)
    complete_list.append(negatives)

In [36]:
len(ua_tuples), len(complete_list), len(complete_list[0])

(39846, 39846, 99)

In [37]:
output = complete_list[:]

In [38]:
for i in range(len(ua_tuples)):
    output[i].insert(0, ua_tuples[i])

In [39]:
with open('small_test_negatives.tsv', 'w') as f:
    for line in output:
        line_str = '\t'.join(str(ele) for ele in line) + "\n"
        f.write(line_str)