# Preprocessing and Cleaning of the Data

In [2]:
import pandas as pd
import numpy as np

from progressbar import ProgressBar

In [4]:
# Both files are tab-separated and are read without a header row, so the
# columns get integer labels (0, 1, 2, ...) until they are renamed.
behaviors = pd.read_csv(
    '../../data/mind_small_train/behaviors.tsv',
    sep="\t",
    header=None,
)
news = pd.read_csv(
    '../../data/mind_small_train/news.tsv',
    sep="\t",
    header=None,
)

In [45]:
# Give the positional (integer-labelled) news columns descriptive names
# in a single rename call.
news = news.rename(
    columns={
        0: 'article_id',
        1: 'category',
        2: 'subcategory',
        3: 'title',
        4: 'abstract',
        5: 'url',
        6: 'title_entities',
        7: 'abstract_entities',
    }
)

The news dataset stores the information of all the news articles (id, header, abstract, ...). It looks like this:

In [46]:
# Preview the first three rows of the renamed news dataframe.
news.head(3)

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [47]:
# (number of rows, number of columns) of the news dataframe.
news.shape

(51282, 8)

It looks like we have more than 50,000 news articles in our dataset. Let's check whether these are all unique articles or if we also have some duplicates:

In [48]:
# Count distinct titles once; every row beyond that count shares its title
# with some other row.
unique_titles = news["title"].nunique()
print("Number of unique news articles: ", unique_titles)
print("Number of duplicates:             ", len(news) - unique_titles)

Number of unique news articles:  50434
Number of duplicates:              848


Apparently there are news articles with multiple IDs. We don't just want to drop them, because this would result in a loss of useful information concerning the click behaviors and reading histories in the behaviors dataset, which looks like this:

In [49]:
# Give the positional (integer-labelled) behaviors columns descriptive names
# in a single rename call.
behaviors = behaviors.rename(
    columns={
        0: 'impression_id',
        1: 'user_id',
        2: 'time',
        3: 'history',
        4: 'impressions',
    }
)

In [50]:
# (number of rows, number of columns) of the behaviors dataframe.
behaviors.shape

(156965, 5)

In [51]:
# Count the missing values in each behaviors column.
behaviors.isna().sum()

impression_id       0
user_id             0
time                0
history          3238
impressions         0
dtype: int64

In [52]:
# Discard every row that has a missing value in any column.
behaviors = behaviors.dropna()

In [53]:
# Preview the first three rows of the behaviors dataframe.
behaviors.head(3)

Unnamed: 0,impression_id,user_id,time,history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...


With different IDs for the de facto same articles we would not be able to track similarities among users sufficiently. In the following, we will replace every redundant ID with the first ID for the respective article. In order to do this, we create a subset with all the duplicates in it (duplis_title).

In [54]:
# keep=False flags every row whose title occurs more than once,
# including the first occurrence of that title.
shares_title = news.duplicated(subset="title", keep=False)
duplis_title = news.loc[shares_title]

In [55]:
# Sort by title so that rows sharing a title end up adjacent; show the first two.
duplis_title.sort_values(by="title").head(2)

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
35832,N34049,sports,football_nfl,'A game-changer': Titans' expansion project wi...,"The Titans have begun construction on a 60,000...",https://assets.msn.com/labs/mind/BBWHHH8.html,"[{""Label"": ""Tennessee Titans"", ""Type"": ""O"", ""W...","[{""Label"": ""Tennessee Titans"", ""Type"": ""O"", ""W..."
33999,N56680,sports,football_nfl,'A game-changer': Titans' expansion project wi...,"The Titans have begun construction on a 60,000...",https://assets.msn.com/labs/mind/BBWH8ZY.html,"[{""Label"": ""Tennessee Titans"", ""Type"": ""O"", ""W...","[{""Label"": ""Tennessee Titans"", ""Type"": ""O"", ""W..."


In [56]:
# Distinct titles among the rows that share a title with another row.
title_set = duplis_title['title'].unique()

In [57]:
# Collect, for every duplicated title, the list of article IDs carrying it.
# A single groupby replaces one boolean scan of the frame per title.
# sort=False keeps the titles in order of first appearance, and rows keep
# their original order within each group, so the first ID in every list is
# the first occurrence of that article in the news dataframe.
# groupby skips missing titles, so no empty ID list can be produced.
article_list = (
    duplis_title
    .groupby("title", sort=False)["article_id"]
    .apply(list)
    .tolist()
)

In [58]:
# Show the first five groups of article IDs.
article_list[:5]

[['N61864', 'N47020'],
 ['N59709', 'N13882', 'N57732', 'N56582'],
 ['N6632', 'N39995'],
 ['N14042', 'N21933'],
 ['N37736', 'N22941', 'N60979']]

With this article_list we have a list which contains article IDs for every article which has multiple IDs in our original dataset. 

Now we want to make a dictionary called articleID_dict, which maps all the redundant IDs (keys) to a single ID (value):

In [59]:
# Map every redundant ID (key) to the first ID of its group (value) and
# keep a flat list of all redundant IDs alongside the mapping.
articleID_dict = {}
articles_to_change = []
for id_group in article_list:
    kept_id, redundant_ids = id_group[0], id_group[1:]
    articles_to_change.extend(redundant_ids)
    articleID_dict.update(dict.fromkeys(redundant_ids, kept_id))

Let's make a copy of the original behaviors dataframe so that we can compare it to the one we are constructing.

In [60]:
# Work on an independent (deep) copy so the original frame stays available
# for the comparison further below.
behav = behaviors.copy(deep=True)

In the cell below we loop over all the reader sessions in the behaviors dataset and replace the redundant article IDs with the ones specified in our article ID dictionary. The loop takes some time, but we will only have to run it once and can save the resulting dataframe to a new CSV file, that we can work with later on.

In [61]:
def _remap_ids(text, id_map, has_label=False):
    """Replace redundant article IDs in a whitespace-separated token string.

    Each token is looked up as a whole in ``id_map``. This avoids the
    substring behaviour of ``str.replace``, where replacing e.g. 'N1920'
    would also rewrite the unrelated ID 'N19205'.

    Parameters
    ----------
    text : str
        Space-separated tokens (article IDs, or '<article_id>-<label>' pairs).
    id_map : dict
        Maps a redundant article ID to the ID that should replace it.
    has_label : bool, default False
        If True, a token is split at its last '-' and only the part before it
        is looked up; the '-<label>' suffix is carried over unchanged.
        Tokens without a '-' are looked up as they are.

    Returns
    -------
    tuple of (str, int)
        The remapped string and the number of tokens that were replaced.
        When nothing was replaced, ``text`` is returned unchanged.
    """
    tokens = text.split()
    n_replaced = 0
    for pos, token in enumerate(tokens):
        article_id, sep, label = token, "", ""
        if has_label:
            head, dash, tail = token.rpartition("-")
            if dash:
                article_id, sep, label = head, dash, tail
        if article_id in id_map:
            tokens[pos] = id_map[article_id] + sep + label
            n_replaced += 1
    if n_replaced == 0:
        return text, 0
    return " ".join(tokens), n_replaced


pbar = ProgressBar()

# One user ID is recorded per replaced token, so a user appears several
# times if several IDs in the same row were replaced.
userIDs_hist_changes = []
userIDs_impr_changes = []

for idx in pbar(behav.index):
    user_id = behav.loc[idx, "user_id"]

    # Reading history: plain article IDs.
    new_history, n_hist = _remap_ids(behav.loc[idx, "history"], articleID_dict)
    if n_hist:
        behav.loc[idx, "history"] = new_history
        userIDs_hist_changes.extend([user_id] * n_hist)

    # Impressions: '<article_id>-<label>' tokens; the label is kept as is.
    new_impressions, n_impr = _remap_ids(
        behav.loc[idx, "impressions"], articleID_dict, has_label=True
    )
    if n_impr:
        behav.loc[idx, "impressions"] = new_impressions
        userIDs_impr_changes.extend([user_id] * n_impr)
    


100% |########################################################################|


Let's do some consistency checks, before we save the dataframe to a CSV file. To do this we compare the article IDs in the original user histories with those in the newly generated ones:

In [67]:
# First five user IDs recorded while replacing IDs in the histories.
userIDs_hist_changes[:5]

['U91836', 'U91836', 'U19739', 'U19739', 'U89744']

In [62]:
# Compare the article IDs in the first history row of user 'U19739' before
# (behaviors) and after (behav) the replacement; the symmetric difference
# contains the IDs that appear in only one of the two versions.
hist1 = behaviors.loc[behaviors.user_id == 'U19739']
hist2 = behav.loc[behav.user_id == 'U19739']
set1 = set(hist1.history.iloc[0].split())
set2 = set(hist2.history.iloc[0].split())
set1.symmetric_difference(set2)

{'N47020', 'N61864'}

In [69]:
# Look up the ID that the redundant ID 'N47020' is mapped to.
articleID_dict["N47020"]

'N61864'

In [63]:
# First six user IDs recorded while replacing IDs in the impressions.
userIDs_impr_changes[:6]

['U8355', 'U89744', 'U89744', 'U89744', 'U89744', 'U11306']

In [70]:
# Compare the impression tokens in the first row of user 'U11306' before
# (behaviors) and after (behav) the replacement; the symmetric difference
# contains the tokens that appear in only one of the two versions.
impr1 = behaviors.loc[behaviors.user_id == 'U11306']
impr2 = behav.loc[behav.user_id == 'U11306']
set1 = set(impr1.impressions.iloc[0].split())
set2 = set(impr2.impressions.iloc[0].split())
set1.symmetric_difference(set2)

{'N1920-0', 'N20423-0', 'N30268-0', 'N4212-0', 'N42614-0', 'N5497-0'}

In [72]:
# Look up the ID that the redundant ID 'N1920' is mapped to.
articleID_dict["N1920"]

'N4212'

As you can see our method worked! Now we just need to write the new dataframes to CSV files (and of course drop the duplicates in the news dataframe):

In [61]:
# Persist the remapped behaviors table; the dataframe index is not written.
behav.to_csv(
    path_or_buf="../../data/mind_small_train/behaviors_processed.csv",
    index=False,
)

In [11]:
# Keep only the first row for every title; later rows repeating a title
# are flagged by duplicated(keep='first') and filtered out.
repeats_earlier_title = news.duplicated(subset="title", keep='first')
news_dropped = news.loc[~repeats_earlier_title]

In [66]:
# Persist the de-duplicated news table; the dataframe index is not written.
news_dropped.to_csv(
    path_or_buf="../../data/mind_small_train/news_processed.csv",
    index=False,
)