In [1]:
import pandas as pd
import numpy as np

from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from nltk.stem import WordNetLemmatizer, SnowballStemmer

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Data loading and preprocessing

In [2]:
# The file is read with header=None, so the column names are supplied here.
news_columns = [
    'news_id',
    'category',
    'sub_category',
    'title',
    'abstract',
    'url',
    'title_entities',
    'abstract_entities',
]
news = pd.read_csv(
    'mind_dataset/train/news.tsv',
    sep='\t',
    header=None,
    names=news_columns,
)
news.head()

Unnamed: 0,news_id,category,sub_category,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [3]:
# Keep only the id, category and text columns. The explicit .copy() makes
# `news` an independent frame, so the column assignments in later cells do not
# operate on a slice of the full table (which raises SettingWithCopyWarning).
news = news[['news_id', 'category', 'sub_category', 'title', 'abstract']].copy()

In [4]:
# The file is read with header=None, so the column names are supplied here.
users_columns = [
    'impression_id',
    'user_id',
    'time',
    'history',
    'impression',
]
users = pd.read_csv(
    'mind_dataset/train/behaviors.tsv',
    sep='\t',
    header=None,
    names=users_columns,
)
users.head()

Unnamed: 0,impression_id,user_id,time,history,impression
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [5]:
# Drop the timestamp column. The explicit .copy() makes `users` an independent
# frame, so the column assignments in later cells do not operate on a slice of
# the full table (which raises SettingWithCopyWarning).
users = users[['impression_id', 'user_id', 'history', 'impression']].copy()

In [6]:
def _split_history(raw_history):
    """Turn a whitespace-separated history string into a list of news ids.

    A float value (how pandas represents a missing entry) becomes an empty list.
    """
    if isinstance(raw_history, float):
        return []
    return raw_history.split()


users['history'] = users['history'].apply(_split_history)

In [7]:
def get_clicked(impression):
    """Return the ids of the news items that were clicked in an impression.

    Parameters
    ----------
    impression : str
        Whitespace-separated ``<news_id>-<state>`` tokens, e.g.
        ``'N55689-1 N35729-0'``.

    Returns
    -------
    list of str
        The ids whose state is ``'1'``, in their original order.

    Raises
    ------
    ValueError
        If a token contains no ``'-'`` separator.
    """
    # rsplit on the last hyphen only, so an id that itself contains '-' still
    # unpacks into exactly (news_id, state).
    pairs = (token.rsplit('-', 1) for token in impression.split())
    return [news_id for news_id, state in pairs if state == '1']

def get_unclicked(impression):
    """Return the ids of the news items that were shown but not clicked.

    Parameters
    ----------
    impression : str
        Whitespace-separated ``<news_id>-<state>`` tokens, e.g.
        ``'N55689-1 N35729-0'``.

    Returns
    -------
    list of str
        The ids whose state is ``'0'``, in their original order.

    Raises
    ------
    ValueError
        If a token contains no ``'-'`` separator.
    """
    # rsplit on the last hyphen only, so an id that itself contains '-' still
    # unpacks into exactly (news_id, state).
    pairs = (token.rsplit('-', 1) for token in impression.split())
    return [news_id for news_id, state in pairs if state == '0']
        
# Derive one list-valued column per click state from the raw impression string.
for column_name, extractor in (('clicked', get_clicked), ('unclicked', get_unclicked)):
    users[column_name] = users['impression'].apply(extractor)
users.head()

Unnamed: 0,impression_id,user_id,history,impression,clicked,unclicked
0,1,U13740,"[N55189, N42782, N34694, N45794, N18445, N6330...",N55689-1 N35729-0,[N55689],[N35729]
1,2,U91836,"[N31739, N6072, N63045, N23979, N35656, N43353...",N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,[N17059],"[N20678, N39317, N58114, N20495, N42977, N2240..."
2,3,U73700,"[N10732, N25792, N7563, N21087, N41087, N5445,...",N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,[N23814],"[N50014, N23877, N35389, N49712, N16844, N5968..."
3,4,U34670,"[N45729, N2203, N871, N53880, N41375, N43142, ...",N35729-0 N33632-0 N49685-1 N27581-0,[N49685],"[N35729, N33632, N27581]"
4,5,U8125,"[N10078, N56514, N14904, N33740]",N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,[N8400],"[N39985, N36050, N16096, N22407, N60408, N6149..."


In [8]:
# The raw impression string is no longer needed once it has been split into
# the clicked / unclicked columns.
kept_user_columns = ['impression_id', 'user_id', 'history', 'clicked', 'unclicked']
users = users[kept_user_columns]
users.head(n=5)

Unnamed: 0,impression_id,user_id,history,clicked,unclicked
0,1,U13740,"[N55189, N42782, N34694, N45794, N18445, N6330...",[N55689],[N35729]
1,2,U91836,"[N31739, N6072, N63045, N23979, N35656, N43353...",[N17059],"[N20678, N39317, N58114, N20495, N42977, N2240..."
2,3,U73700,"[N10732, N25792, N7563, N21087, N41087, N5445,...",[N23814],"[N50014, N23877, N35389, N49712, N16844, N5968..."
3,4,U34670,"[N45729, N2203, N871, N53880, N41375, N43142, ...",[N49685],"[N35729, N33632, N27581]"
4,5,U8125,"[N10078, N56514, N14904, N33740]",[N8400],"[N39985, N36050, N16096, N22407, N60408, N6149..."


In [9]:
# Missing-value count per column, first for users and then for news.
tuple(frame.isna().sum() for frame in (users, news))

(impression_id    0
 user_id          0
 history          0
 clicked          0
 unclicked        0
 dtype: int64,
 news_id            0
 category           0
 sub_category       0
 title              0
 abstract        2666
 dtype: int64)

In [10]:
# Show the news rows that have no abstract.
has_no_abstract = news['abstract'].isna()
news.loc[has_no_abstract]

Unnamed: 0,news_id,category,sub_category,title,abstract
38,N22028,lifestyle,lifestylebuzz,"Mom with schizophrenia, 6-year-old daughter mi...",
41,N41835,news,newsworld,Today in History: November 2,
133,N25174,weather,weathertopstories,"Winter Storm Warning For Metro Denver, Boulder...",
137,N45191,lifestyle,shop-all,Amazon Says These Are Its Funniest Customer Re...,
175,N31161,sports,baseball_mlb,Astros have a lot of baseball left despite two...,
...,...,...,...,...,...
51244,N56193,sports,football_nfl,Cowboys have questionable red zone calls in lo...,
51246,N10529,sports,football_nfl,Week 10 Game Balls: Few Bright Spots in Colts ...,
51259,N55199,news,newsus,Winter homeless shelter remains unopened after...,
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,


In [11]:
# Replace missing abstracts with an empty string so they can be tokenized like
# any other text. The original `news.abstract.fillna('', inplace=True)` is a
# chained in-place call on a column Series; pandas deprecates that form and,
# with Copy-on-Write enabled, it no longer updates the DataFrame. Filling
# through the DataFrame and rebinding `news` works in both modes.
news = news.fillna({'abstract': ''})
news.isna().sum()

news_id         0
category        0
sub_category    0
title           0
abstract        0
dtype: int64

In [12]:
# Number of distinct news items, categories, sub-categories and users.
dataset_summary = f"""
News: {news['news_id'].nunique()}
Categories: {news['category'].nunique()}
Sub-categories: {news['sub_category'].nunique()}
Users: {users['user_id'].nunique()}
"""
print(dataset_summary)


News: 51282
Categories: 17
Sub-categories: 264
Users: 50000



In [13]:
# Non-null entry counts per column for every category, largest category first.
(
    news
    .groupby('category')
    .count()
    .sort_values('news_id', ascending=False)
)

Unnamed: 0_level_0,news_id,sub_category,title,abstract
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
news,15774,15774,15774,15774
sports,14510,14510,14510,14510
finance,3107,3107,3107,3107
foodanddrink,2551,2551,2551,2551
lifestyle,2479,2479,2479,2479
travel,2350,2350,2350,2350
video,2068,2068,2068,2068
weather,2048,2048,2048,2048
health,1885,1885,1885,1885
autos,1639,1639,1639,1639


In [14]:
# Holds the (stemmer, lemmatizer) pair once it has been created.
_NORMALIZERS = None


def _get_normalizers():
    """Create the stemmer and lemmatizer on first use and reuse them afterwards."""
    global _NORMALIZERS
    if _NORMALIZERS is None:
        _NORMALIZERS = (SnowballStemmer('english'), WordNetLemmatizer())
    return _NORMALIZERS


def preprocess(text):
    """Tokenize ``text`` and normalize every remaining token.

    The text is tokenized with gensim's ``simple_preprocess``, tokens listed in
    gensim's ``STOPWORDS`` are dropped, and each remaining token is stemmed with
    the English Snowball stemmer and then lemmatized as a verb (``pos='v'``).

    Parameters
    ----------
    text : str
        Raw text, e.g. a news title or abstract.

    Returns
    -------
    list of str
        The normalized tokens in their original order.
    """
    # This function is applied once per row, so the stemmer and lemmatizer are
    # shared across calls instead of being rebuilt for every piece of text.
    stemmer, lemmatizer = _get_normalizers()
    return [
        lemmatizer.lemmatize(stemmer.stem(word), pos='v')
        for word in simple_preprocess(text)
        if word not in STOPWORDS
    ]

In [None]:
# Token list for every title, produced by preprocess().
title_text = news['title']
news['headline'] = title_text.apply(preprocess)

In [None]:
# Token list for every abstract, produced by preprocess().
abstract_text = news['abstract']
news['highlight'] = abstract_text.apply(preprocess)

In [None]:
# Preview the first five rows, now including the token columns.
news.head(n=5)

In [None]:
# Concatenate the title tokens and abstract tokens of every news item.
# Adding the two list-valued columns applies `list + list` element by element,
# so there is no need for DataFrame.apply(axis=1), which builds a row Series
# for every record and is much slower on a table of this size.
news['title-abstract'] = news['headline'] + news['highlight']
news.head()

In [None]:
# Preview the first five rows of the behaviors table.
users.head(n=5)

# Encoding

In [None]:
# Separate integer encoders for user ids and news ids.
user_encoder, news_encoder = LabelEncoder(), LabelEncoder()

In [None]:
# Learn the id -> integer mapping from the ids present in each table.
news_encoder.fit(news['news_id'])
user_encoder.fit(users['user_id'])

In [None]:
# Work on a copy so the original `news` frame keeps its string ids.
encoded_news_ids = news_encoder.transform(news['news_id'])
news_encoded = news.copy()
news_encoded['news_id'] = encoded_news_ids

In [None]:
def _make_dense_one_hot_encoder():
    """Create a OneHotEncoder whose transform() returns dense arrays.

    scikit-learn 1.2 renamed the ``sparse`` constructor argument to
    ``sparse_output`` and later releases removed the old name, so the new
    spelling is tried first and the old one is kept as a fallback for older
    installations.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn: `sparse_output` is not a known argument.
        return OneHotEncoder(sparse=False)


category_encoder = _make_dense_one_hot_encoder()
subcategory_encoder = _make_dense_one_hot_encoder()

In [None]:
# Each encoder is fitted on a single-feature table: one row per distinct value.
distinct_categories = news['category'].unique()
distinct_subcategories = news['sub_category'].unique()
category_encoder.fit([[name] for name in distinct_categories])
subcategory_encoder.fit([[name] for name in distinct_subcategories])

In [None]:
def transform(category, ohe=None):
    """Encode a single value with a fitted encoder.

    Parameters
    ----------
    category : object
        The value to encode.
    ohe : object
        Encoder exposing ``transform``; it is called with a one-row,
        one-column table containing ``category``. Must be provided.

    Returns
    -------
    The first (and only) row of the encoder's output.
    """
    single_sample = [[category]]
    encoded_rows = ohe.transform(single_sample)
    return encoded_rows[0]

# Replace every category name with its one-hot vector.
news_encoded['category'] = news_encoded['category'].apply(
    lambda name: transform(name, ohe=category_encoder)
)

In [None]:
# Replace every sub-category name with its one-hot vector.
news_encoded['sub_category'] = news_encoded['sub_category'].apply(
    lambda name: transform(name, ohe=subcategory_encoder)
)

In [None]:
# Preview the first five rows of the encoded news table.
news_encoded.head(n=5)

In [None]:
# Work on a copy so the original `users` frame keeps its string ids.
encoded_user_ids = user_encoder.transform(users['user_id'])
users_encoded = users.copy()
users_encoded['user_id'] = encoded_user_ids

In [None]:
# Encode every history list with a lookup table built once from the fitted
# encoder: a label's code is its position in `news_encoder.classes_`, so the
# codes match news_encoder.transform(). Calling transform() separately for each
# row pays the encoder's per-call overhead every time, which is very slow on a
# table of this size.
# NOTE: an id the encoder has not seen raises KeyError here (transform() raises
# ValueError in that case).
news_code_by_id = {news_id: code for code, news_id in enumerate(news_encoder.classes_)}
users_encoded['history'] = users_encoded['history'].apply(
    lambda news_ids: np.asarray([news_code_by_id[news_id] for news_id in news_ids])
)

In [None]:
# Preview the first five rows after encoding the history column.
users_encoded.head(n=5)

In [None]:
# Encode every clicked list with a lookup table built once from the fitted
# encoder: a label's code is its position in `news_encoder.classes_`, so the
# codes match news_encoder.transform(). Calling transform() separately for each
# row pays the encoder's per-call overhead every time, which is very slow on a
# table of this size.
# NOTE: an id the encoder has not seen raises KeyError here (transform() raises
# ValueError in that case).
news_code_by_id = {news_id: code for code, news_id in enumerate(news_encoder.classes_)}
users_encoded['clicked'] = users_encoded['clicked'].apply(
    lambda news_ids: np.asarray([news_code_by_id[news_id] for news_id in news_ids])
)

In [None]:
# Preview the first five rows after encoding the clicked column.
users_encoded.head(n=5)

In [None]:
# Encode every unclicked list with a lookup table built once from the fitted
# encoder: a label's code is its position in `news_encoder.classes_`, so the
# codes match news_encoder.transform(). Calling transform() separately for each
# row pays the encoder's per-call overhead every time, which is very slow on a
# table of this size.
# NOTE: an id the encoder has not seen raises KeyError here (transform() raises
# ValueError in that case).
news_code_by_id = {news_id: code for code, news_id in enumerate(news_encoder.classes_)}
users_encoded['unclicked'] = users_encoded['unclicked'].apply(
    lambda news_ids: np.asarray([news_code_by_id[news_id] for news_id in news_ids])
)

In [None]:
# Preview the first five rows after encoding the unclicked column.
users_encoded.head(n=5)