In [1]:
import importlib

In [35]:
import numpy as np
import math
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Input, Dense, Reshape, Flatten, Dropout
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import pandas as pd
import random

from RNNHelper import (import_behaviors, dataframe_to_numpy, encode_articles,
                       create_pos_neg, rnn_train_val_split)

In [5]:
tf.__version__

'2.3.1'

In [6]:
behaviors = import_behaviors("../../data/mind_small_train/behaviors_processed.csv")

In [7]:
behaviors.head(3)

Unnamed: 0,impression_id,user_id,time,history,impressions,length_history,history_split,impressions_split
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,9,"[N55189, N42782, N34694, N45794, N18445, N6330...","[N55689-1, N35729-0]"
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,82,"[N31739, N6072, N63045, N23979, N35656, N43353...","[N20678-0, N39317-0, N58114-0, N20495-0, N4297..."
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,16,"[N10732, N25792, N7563, N21087, N41087, N5445,...","[N50014-0, N23877-0, N35389-0, N49712-0, N1684..."


In [8]:
behaviors.shape

(136047, 8)

In [9]:
behaviors, unique_articles, num_articles, article2idx = encode_articles(behaviors)

Creating list of all articles in behaviors...
Creating unique articles set
Encoding articles in dataframe with integers...


In [10]:
n_hist = 5
complete_list_1s, complete_list_0s = create_pos_neg(behaviors, n_hist_articles=n_hist)

Create complete list of 'positive' articles...
Create complete list of 'negative' articles...


In [11]:
train_array, valid_array, train_targets_array, valid_targets_array, train_idx, test_idx = rnn_train_val_split(complete_list_1s, complete_list_0s)

# Build the RNN Model

## RNN with LSTM cells

### Initialize Parameters

In [12]:
EMB_DIM = 8
LR = 1e-4
METRICS = ['AUC']
EPOCHS = 1
BATCH_SIZE = 256

### Build, Compiel and Fit

In [13]:
lstm = keras.Sequential()
lstm.add(layers.Embedding(input_dim=num_articles, output_dim=EMB_DIM))
lstm.add(layers.LSTM(128))
lstm.add(layers.Dense(10))
lstm.add(layers.Dense(1,activation='sigmoid'))

lstm.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(LR),
              metrics=METRICS)

lstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 8)           398336    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               70144     
_________________________________________________________________
dense (Dense)                (None, 10)                1290      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 469,781
Trainable params: 469,781
Non-trainable params: 0
_________________________________________________________________


In [14]:
lstm_hist = lstm.fit(train_array, train_targets_array, epochs=EPOCHS, batch_size=BATCH_SIZE,
                    validation_data=(valid_array, valid_targets_array), 
                    verbose=1)

Train on 217674 samples, validate on 54420 samples


## RNN with GRU cells

In [88]:
gru = keras.Sequential()
gru.add(layers.Embedding(input_dim=num_articles, output_dim=EMB_DIM))
gru.add(layers.GRU(64))
gru.add(layers.Dense(32))
gru.add(layers.Dense(16))
gru.add(layers.Dense(1, activation='sigmoid'))

gru.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(LR),
              metrics=METRICS)

gru.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, None, 8)           398336    
_________________________________________________________________
gru (GRU)                    (None, 128)               52992     
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_11 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 33        
Total params: 461,697
Trainable params: 461,697
Non-trainable params: 0
_________________________________________________________________


In [90]:
gru_hist = gru.fit(train_array, train_targets_array, epochs=EPOCHS, batch_size=BATCH_SIZE,
                    validation_data=(valid_array, valid_targets_array), 
                    verbose=1)

Train on 217674 samples, validate on 54420 samples


# Evaluation

In [12]:
behav, columndict = dataframe_to_numpy(behaviors)

In [13]:
columndict

{'impression_id': 0,
 'user_id': 1,
 'time': 2,
 'history': 3,
 'impressions': 4,
 'length_history': 5,
 'history_split': 6,
 'impressions_split': 7,
 'history_int': 8,
 'impressions_int_1': 9,
 'impressions_int_0': 10}

In [14]:
random.seed(420)
test_indexes = random.sample(test_idx, 5000)

In [15]:
behav_test = behav[test_indexes, :]

In [16]:
unique_articles_int = [article2idx[art] for art in unique_articles]

In [17]:
num_test_negs = 99
test_trajectories = []
test_targets = []
number_iters = len(behav_test)
iteration = 0

for session in behav_test:
    iteration += 1
    progress = round(iteration/number_iters*100, 1)
    print(f"{progress} %", end="\r")
    history_int = session[columndict["history_int"]]
    short_hist = history_int[-n_hist:]
    impression_int_1 = session[columndict["impressions_int_1"]]
    test_trajectories += short_hist
    test_trajectories += [impression_int_1[0]]
    test_targets.append(1)
    
    negative_articles = list(set(unique_articles_int) - set(history_int))
    len_negative_articles = len(negative_articles)
    for t in range(num_test_negs):
        j = np.random.randint(len_negative_articles)
        neg = short_hist + [negative_articles[j]]
        test_trajectories += neg
        test_targets.append(0)

100.0 %

In [18]:
test_input = np.array(test_trajectories)
n_test = len(test_trajectories)//6
test_input = test_input.reshape(n_test, 6,)

In [19]:
len(test_input)

500000

In [25]:
K = 10

In [28]:
def eval_one_rating(test_test, model):
    predictions = model.predict(test_test)
    sorted_pred = sorted(predictions, reverse=True)
    get_item = predictions[0]
    rank = sorted_pred.index(get_item)

    if rank < K:
        hr = 1
        ndcg = math.log(2) / math.log(rank+2)
        rr = 1/(rank+1)
    else:
        hr = 0
        ndcg = 0
        rr = 0
    
    return (hr, ndcg, rr)

In [29]:
hits, ndcgs, rrs = [], [], []
number_iters = len(test_input)

for i in range(0, len(test_input), 100):
    progress = round(i/number_iters*100, 1)
    print(f"{progress} %", end="\r")
    test_test = test_input[i:i+100]
    hr, ndcg, rr = eval_one_rating(test_test, lstm)
    hits.append(hr)
    ndcgs.append(ndcg)
    rrs.append(rr)

99.98 %

In [30]:
hr = np.array(hits).mean()
mrr = np.array(rrs).mean()
ndcg = np.array(ndcgs).mean()

print("Hit ratio:            ", hr)
print("Mean reciprocal rank: ", mrr)
print(f"NDCG@{K}:         ", ndcg)

Hit ratio:             0.2096
Mean reciprocal rank:  0.10394746031746031
NDCG@10:          0.1286773984962387


# RNN with title embedding (word vectorization)

In [20]:
news = pd.read_csv("../../data/mind_small_train/news_processed.csv")

In [21]:
news.head(3)

Unnamed: 0,article_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [22]:
num_articles

49792

In [23]:
unique_news = news[news.article_id.isin(unique_articles)]

In [24]:
unique_news.shape, news.shape

((49792, 8), (50434, 8))

In [25]:
unique_article_ids = unique_news.iloc[:, 0].to_list()

In [26]:
unique_article_ids[:3]

['N55528', 'N19639', 'N61837']

In [27]:
unique_article_titles = unique_news.iloc[:, 3].to_list()

In [28]:
unique_article_titles[:3]

['The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By',
 '50 Worst Habits For Belly Fat',
 "The Cost of Trump's Aid Freeze in the Trenches of Ukraine's War"]

In [31]:
max_features = 60000
title_length = 20
vectorize_layer = TextVectorization(
   # standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=title_length)

In [32]:
vectorize_layer.adapt(unique_article_titles)

In [167]:
vectorizer = vectorize_layer(unique_article_titles)

In [178]:
vectorizer_array = K.eval(vectorizer)

In [36]:
example = ""
for int_word in K.eval(vectorize_layer(unique_article_titles)[1]):
    example += vectorize_layer.get_vocabulary()[int_word] + " "

In [37]:
print(example.strip())

50 worst habits for belly fat


In [38]:
unique_article_ids[1]

'N19639'

In [39]:
news[news.article_id=="N19639"][["article_id", "title"]]

Unnamed: 0,article_id,title
1,N19639,50 Worst Habits For Belly Fat


In [41]:
articleid2title = {art_id: title for art_id, title in zip(unique_article_ids,
                                                          unique_article_titles
                                                         )
                  }

In [42]:
articleid2title["N19639"]

'50 Worst Habits For Belly Fat'

In [102]:
len(train_array)

217674

In [44]:
train_array[:2]

array([[ 2564, 32867, 40363, 32732, 29054, 14681],
       [ 2564, 32867, 40363, 32732, 29054, 45556]])

In [48]:
idx2article = {i: a for a, i in article2idx.items()}

In [185]:
newsarticle2idx = {art: i for i, art in enumerate(unique_article_ids)}

In [186]:
train_vec_idx = []
total_iters = len(train_array)
for i, traj in enumerate(train_array):
    progress = round(i/total_iters*100, 1)
    print(f'{progress}%', end='\r')
    for j, int_id in enumerate(traj):
        #print(j, int_id)
        article_id = idx2article[int_id]
        #print(article_id)
        text = articleid2title[article_id]
        #print(text)
        vec_idx = newsarticle2idx[article_id]
        train_vec_idx.append(vec_idx)
   

100.0%

In [189]:
train_vec_array = vectorizer_array[train_vec_idx, :]

In [194]:
train_vec_array[:5]

array([[ 5752,     2,    96,     9,  3121,   150,  1300,    34,    50,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [ 7588,  3692,  3327,   923,    32,  1504,   824,  6301,   134,
           70,  2991,  5392,   436,     0,     0,     0,     0,     0,
            0,     0],
       [   32,  1716,    49,     4,   857,   184,     4,  8135,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  981,  1795,    70,  3178,  2012,   695,     3,     7,  4271,
           82,    70,    53,  5214,   673,     0,     0,     0,     0,
            0,     0],
       [  214,  3616,     2,   974,    22,  1197,     7,  5261, 22887,
            2,  2472,   531,     3, 10343, 34621,     0,     0,     0,
            0,     0]])

In [191]:
example = ""
for int_word in train_vec_array[0]:
    example += vectorize_layer.get_vocabulary()[int_word] + " "

In [192]:
example

'renovations to make and skip before selling your home            '

In [193]:
title_emb_dim = 32
max_len = 20
len_trajectory = 6

In [199]:
title_seq = Input(shape=(len_trajectory, max_len,), name='title_input')

word_emb = tf.keras.layers.TimeDistributed(Embedding(input_dim=max_features+1, 
                                                output_dim=title_emb_dim,
                                                input_length=max_len, mask_zero=True,
                                                input_shape=(max_len, )))(title_seq)
# maybe bidirectional?Bidirectional(LSTM(32))
title_embedded = tf.keras.layers.TimeDistributed(layers.Bidirectional(layers.LSTM(32)))(word_emb)      
x = layers.LSTM(32)(title_embedded)
prediction = layers.Dense(1, activation='sigmoid')(x)
lstm_titlevec = tf.keras.Model(inputs=title_seq , outputs=prediction)

lstm_titlevec.summary()

Model: "functional_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
title_input (InputLayer)     [(None, 6, 20)]           0         
_________________________________________________________________
time_distributed_32 (TimeDis (None, 6, 20, 32)         1920032   
_________________________________________________________________
time_distributed_33 (TimeDis (None, 6, 64)             16640     
_________________________________________________________________
lstm_26 (LSTM)               (None, 32)                12416     
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 33        
Total params: 1,949,121
Trainable params: 1,949,121
Non-trainable params: 0
_________________________________________________________________


In [205]:
train_vec_array = train_vec_array.reshape(len(train_targets_array), 6, 20)

In [206]:
lstm_titlevec.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                      optimizer=tf.keras.optimizers.Adam(1e-4), metrics=['AUC'])

In [207]:
titlevec_hist = lstm_titlevec.fit(train_vec_array, train_targets_array, 
                                  epochs=1, 
                                  batch_size=512,
                    #validation_data=(valid_array, valid_targets_array), 
                                  verbose=1)



In [29]:
hits, ndcgs, rrs = [], [], []
number_iters = len(test_input)

for i in range(0, len(test_input), 100):
    progress = round(i/number_iters*100, 1)
    print(f"{progress} %", end="\r")
    test_test = test_input[i:i+100]
    hr, ndcg, rr = eval_one_rating(test_test, lstm)
    hits.append(hr)
    ndcgs.append(ndcg)
    rrs.append(rr)

99.98 %

In [30]:
hr = np.array(hits).mean()
mrr = np.array(rrs).mean()
ndcg = np.array(ndcgs).mean()

print("Hit ratio:            ", hr)
print("Mean reciprocal rank: ", mrr)
print(f"NDCG@{K}:         ", ndcg)

Hit ratio:             0.2096
Mean reciprocal rank:  0.10394746031746031
NDCG@10:          0.1286773984962387
