<a href="https://colab.research.google.com/github/moinudeen/neural-collaborative-filtering-news-recommendations/blob/main/news_recommendations_ncf_transormers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Package Installation and Import

In [1]:
!pip install -U sentence-transformers pytorch-lightning scikit-plot wordcloud

Collecting sentence-transformers
  Downloading sentence-transformers-2.0.0.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 4.1 MB/s 
[?25hCollecting pytorch-lightning
  Downloading pytorch_lightning-1.3.8-py3-none-any.whl (813 kB)
[K     |████████████████████████████████| 813 kB 33.7 MB/s 
[?25hCollecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Collecting wordcloud
  Downloading wordcloud-1.8.1-cp37-cp37m-manylinux1_x86_64.whl (366 kB)
[K     |████████████████████████████████| 366 kB 42.8 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 19.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 42.3 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.0.14-py3-none-any.whl (43 kB)
[K 

In [None]:

import string

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

import scikitplot as skplt

SEED = 123
# Seed Python's `random`, NumPy and PyTorch in one call. NumPy alone is not
# enough here: the NCF weights are initialised by torch, so the original
# np.random.seed(...) left training unseeded. The trailing `;` hides the
# returned seed value from the cell output.
pl.seed_everything(SEED);



## Download the data files

In [None]:
! wget https://inspire-data-challenge.s3.amazonaws.com/user_news_clicks.csv 

In [None]:
! wget https://inspire-data-challenge.s3.amazonaws.com/news_text.csv

## Data Exploration


In [None]:
def _ensure_terminal_punctuation(title):
    """Append a full stop to a non-empty title that does not already end in punctuation."""
    if title and title[-1] not in string.punctuation:
        return title + "."
    return title


news_df = pd.read_csv("news_text.csv", sep="\t")

# Normalise the two text columns: missing -> "", then lower-case.
news_df['title'] = news_df['title'].fillna("").str.lower().map(_ensure_terminal_punctuation)
news_df['abstract'] = news_df['abstract'].fillna("").str.lower()

# Text fed to the sentence encoder: "<title> <abstract>".
news_df['text'] = news_df['title'] + " " + news_df['abstract']
news_df.head()

In [None]:
news_df.info()

In [None]:
print("unique items: ", len(news_df.news_id.unique()))
print("unique categories: ", len(news_df.category.unique()))


In [None]:
news_df.category.value_counts().plot(kind='bar', title='distribution of category values', figsize=(20, 10))

In [None]:
clicks_df = pd.read_csv("user_news_clicks.csv")
clicks_df

In [None]:
print("unique users: ", len(clicks_df.user_id.unique()))
print("unique items: ", len(clicks_df.item.unique()))
print("unique interactions: ",len(clicks_df.click.unique()))

In [None]:
clicks_df.shape, clicks_df.drop_duplicates(subset=["user_id", "item", "click"]).shape

In [None]:
clicks_df.click.value_counts().plot(kind='bar', title='distribution of non-clicks vs clicks')

## Encode news articles text with embeddings from SentenceTransformers

In [None]:
# Alternative encoder: SentenceTransformer('paraphrase-MiniLM-L6-v2')
# The encoder gets its own name so that re-running this cell can never
# overwrite `model`, which later holds the trained NCF network.
text_encoder = SentenceTransformer('average_word_embeddings_komninos')

# One sentence per news article ("<title> <abstract>")
sentences = news_df['text'].tolist()

# Run the encoder; the result has one row per sentence
embeddings = text_encoder.encode(sentences)

# Store each embedding as a plain Python list next to its article
news_df['text_embedding'] = embeddings.tolist()

print(embeddings.shape)

news_df.head()

## Encode news category column

In [None]:
# Map each category string to an integer id usable as an nn.Embedding index.
news_category_encoder = LabelEncoder()
news_df['category_encoded'] = news_category_encoder.fit_transform(news_df["category"])

# Lookup table news_id -> full row (title, text_embedding, category_encoded, ...).
# "records" is the documented orient for a list of per-row dicts; the original
# "rows" was never a valid orient name and only worked through prefix guessing
# in old pandas releases.
news_dict = {r['news_id']: r for r in news_df.to_dict("records")}
news_df.head()

## Encode user_id and item_id

In [None]:
# One encoder per id column, kept in a dict so they can be looked up by
# source column name later.
label_encoders = {"user_id": LabelEncoder(), "item": LabelEncoder()}

for source_col, encoded_col in (("user_id", "user_id_encoded"), ("item", "item_id_encoded")):
    clicks_df[encoded_col] = label_encoders[source_col].fit_transform(clicks_df[source_col])

clicks_df.head()

## Data Preparation

- Random Sampling data for faster training with limited hardware resources
- Many user-item pairs appear in more than one event. To keep things simple, these duplicates are dropped, since the interactions are treated as implicit feedback. This also prevents the same interaction from landing in both the train and test sets (data leakage).
- Train and Test split will be done randomly as we don't have any timestamp values for the user-item interactions to split based on chronology of events.
- Not going for negative sampling to keep it simple here. The distribution of clicks vs non-clicks looks balanced already.

In [None]:
SPLIT_SEED = 123     # explicit seed: re-running this cell reproduces the same split
SAMPLE_FRAC = 0.4    # share of the de-duplicated interactions kept for the experiment
TEST_FRAC = 0.025    # share of the sampled interactions held out for evaluation

# Keep one row per (user, item, click) triple, then down-sample.
interactions_dedup = clicks_df.drop_duplicates(subset=["user_id", "item", "click"])
interactions_sampled = interactions_dedup.sample(frac=SAMPLE_FRAC, random_state=SPLIT_SEED)

# .copy() because prediction columns are added to testdf later on.
testdf = interactions_sampled.sample(frac=TEST_FRAC, random_state=SPLIT_SEED + 1).copy()
# Everything that is not a test row is a training row (index labels are
# inherited from clicks_df, so drop-by-index removes exactly the test rows).
traindf = interactions_sampled.drop(testdf.index)
traindf.shape, testdf.shape

In [None]:
traindf.click.value_counts()

In [None]:
testdf.click.value_counts()

# Modelling
- Using a neural collaborative filtering (NCF) approach

In [None]:
class MINDTrainDataset(Dataset):
    """
    PyTorch Dataset for training on the MIND click data.

    Every sample is a tuple ``(user, item_text_embedding, item_category, label)``
    where ``user`` and ``item_category`` are integer ids, ``item_text_embedding``
    is the article's text embedding and ``label`` is the click value.
    """

    def __init__(self, interactions, all_news_ids):
        """
        Args:
        interactions (pd.DataFrame): Dataframe containing the interactions; the
            columns 'user_id_encoded', 'item' and 'click' are read
        all_news_ids (dict): maps a news id to its metadata; the keys
            'text_embedding' and 'category_encoded' are read
        """
        self.users, self.items, self.item_cats, self.labels = self.get_dataset(interactions, all_news_ids)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.item_cats[idx], self.labels[idx]

    def get_dataset(self, interactions, all_news_ids):
        """
        Build the four aligned tensors backing the dataset.

        Duplicate (user, item, click) triples are collapsed to one sample and the
        first-seen order of `interactions` is kept.

        Returns:
        tuple of tensors: (users, item text embeddings, item categories, labels)

        Raises:
        KeyError: if an item in `interactions` is missing from `all_news_ids`
        """
        # dict.fromkeys de-duplicates like set() but preserves insertion order.
        # Iteration order of a set of tuples containing strings depends on hash
        # randomisation, which made the sample order differ between kernel runs.
        unique_triples = dict.fromkeys(
            zip(interactions['user_id_encoded'], interactions['item'], interactions['click'])
        )

        users, items, item_cats, labels = [], [], [], []
        for user, news_id, label in unique_triples:
            news_meta = all_news_ids[news_id]
            users.append(user)
            items.append(news_meta['text_embedding'])
            item_cats.append(news_meta['category_encoded'])
            labels.append(label)

        return torch.tensor(users), torch.tensor(items), torch.tensor(item_cats), torch.tensor(labels)

In [None]:
class NCF(pl.LightningModule):
    """
    Neural Collaborative Filtering (NCF) click model.

    A learned user embedding, a learned item-category embedding and the
    pre-computed item text embedding are concatenated and passed through a
    small MLP that ends in a sigmoid, i.e. the output is a click probability.
    """

    def __init__(self, num_users, num_item_cats, text_embedding_dim, interactions, all_news_ids,
                 embedding_hidden_dim=16, batch_size=512, num_workers=4):
        """
         Args:
            num_users (int): Number of unique users
            num_item_cats (int): Number of unique item cats
            text_embedding_dim (int): dimensions of the text embedding
            interactions (pd.DataFrame): Dataframe containing the news clicks
            all_news_ids (dict): dict containing all news ids and its metadata
            embedding_hidden_dim (int): size of the user and item-category embeddings
            batch_size (int): batch size of the training DataLoader
            num_workers (int): worker processes of the training DataLoader
        """
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_hidden_dim)
        self.item_cat_embedding = nn.Embedding(num_embeddings=num_item_cats, embedding_dim=embedding_hidden_dim)
        # Width of the concatenated vector: text embedding + the two learned embeddings.
        self.input_feature_shape = text_embedding_dim + 2 * embedding_hidden_dim
        self.fc1 = nn.Linear(in_features=self.input_feature_shape, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.interactions = interactions
        self.all_news_ids = all_news_ids
        self.batch_size = batch_size
        self.num_workers = num_workers

    def forward(self, user_input, item_cat_input, item_embedding_input):
        """
        Predict click probabilities.

        Args:
            user_input: integer tensor of encoded user ids
            item_cat_input: integer tensor of encoded item categories
            item_embedding_input: float tensor of item text embeddings; its last
                dimension must equal `text_embedding_dim`

        Returns:
            Tensor of probabilities in [0, 1] with a trailing dimension of size 1.
        """
        # Compute embeddings
        user_embedded = self.user_embedding(user_input)
        item_cat_embedded = self.item_cat_embedding(item_cat_input)

        # Concat the embeddings along the feature axis
        vector = torch.cat([user_embedded, item_cat_embedded, item_embedding_input], dim=-1)

        # Pass through fully connected layers (functional activations: no
        # module objects are re-created on every call)
        out = torch.relu(self.fc1(vector))
        out = torch.relu(self.fc2(out))
        out = torch.relu(self.fc3(out))

        # Output layer
        return torch.sigmoid(self.output(out))

    def training_step(self, batch, batch_idx):
        """Compute (and log) the binary cross-entropy loss for one batch."""
        # Unpacking order matches MINDTrainDataset.__getitem__.
        user_input, item_embedding, item_cat_input, labels = batch
        predicted_labels = self(user_input, item_cat_input, item_embedding)
        # Labels are reshaped to a float column to match the model output.
        loss = nn.functional.binary_cross_entropy(predicted_labels, labels.view(-1, 1).float())
        # Send the loss to the attached logger so the training curve is visible.
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        """Adam with its default hyper-parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build the training DataLoader from the stored interactions."""
        # shuffle=True: batches should not follow the dataset's construction
        # order, and should differ between epochs.
        return DataLoader(MINDTrainDataset(self.interactions, self.all_news_ids),
                          batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

# Training the model

In [None]:
# Encoded ids are used as embedding indices, so each table needs (largest id + 1) rows.
largest_user_id = max(traindf['user_id_encoded'].max(), testdf['user_id_encoded'].max())
num_users = largest_user_id + 1
num_items_cat = 1 + news_df['category_encoded'].max()
# Second axis of the encoder output is the text-embedding width.
text_embedding_dim = embeddings.shape[1]

model = NCF(
    num_users=num_users,
    num_item_cats=num_items_cat,
    text_embedding_dim=text_embedding_dim,
    interactions=traindf,
    all_news_ids=news_dict,
)

In [None]:
text_embedding_dim, num_users, num_items_cat

In [None]:
from pytorch_lightning.loggers import TensorBoardLogger
logger = TensorBoardLogger("tb_logs", name="NCF_SBERT")


In [None]:
# Single-epoch training run, logging to the TensorBoard logger created above;
# checkpoint saving is switched off.
# NOTE(review): `reload_dataloaders_every_epoch`, `progress_bar_refresh_rate` and
# `checkpoint_callback` are keyword names of the pytorch-lightning 1.3.x line shown
# in the install output; later releases are believed to have renamed or removed
# them - confirm against the installed version before upgrading.
trainer = pl.Trainer(max_epochs=1, reload_dataloaders_every_epoch=True, progress_bar_refresh_rate=50, logger=logger, checkpoint_callback=False)

# No dataloader is passed here; the model defines its own train_dataloader().
trainer.fit(model)

In [None]:
# Look every test item up once, then pull the two features the model needs.
test_news_rows = [news_dict.get(news_id) for news_id in testdf['item'].values]
test_item_embeddings = [row.get("text_embedding") for row in test_news_rows]
test_itemcat_embeddings = [row.get("category_encoded") for row in test_news_rows]


# Trace the network with the first test example so TensorBoard can draw its graph.
example_input = (
    torch.tensor(testdf['user_id_encoded'].values[0]),
    torch.tensor(test_itemcat_embeddings[0]),
    torch.tensor(test_item_embeddings[0]),
)
trainer.logger.experiment.add_graph(model, input_to_model=example_input)

In [None]:
%reload_ext tensorboard

In [None]:
%tensorboard --logdir tb_logs/

# Evaluating the model



*   Calculate the classification metrics precision, recall and f1 score with a default threshold of 0.5
*   Plot confusion matrix, PR Curves and ROC curves.
*   From the metrics, we can observe that the precision is high overall but the recall is lower for clicks==1.




In [None]:

# Inference only: no_grad() skips autograd bookkeeping, so no detach() is needed.
with torch.no_grad():
    click_probabilities = model(
        torch.tensor(testdf['user_id_encoded'].values),
        torch.tensor(test_itemcat_embeddings),
        torch.tensor(test_item_embeddings),
    )

# The model returns one column per row; flatten it to a 1-D array.
predicted_labels = click_probabilities.numpy().reshape(-1)

# assign() returns a new frame instead of mutating a frame that was sampled
# from another one, so the cell can be re-run safely.
testdf = testdf.assign(
    prediction_conf1=predicted_labels.tolist(),
    prediction_conf0=(1 - predicted_labels).tolist(),
    # Default decision threshold of 0.5 on the click probability.
    prediction=lambda df: (df['prediction_conf1'] > 0.5).astype(int),
)
testdf.head()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report


def plot_confusion_matrix(y_true, y_pred, labels):
    """
    Plot the confusion matrix of `y_pred` against `y_true`.

    Args:
        y_true: ground-truth class labels
        y_pred: predicted class labels
        labels (list): class labels in the order they should appear on the axes
    """
    # `labels` must be given to confusion_matrix as well: without it the matrix
    # is built in sorted label order while the ticks follow `labels`, so a call
    # with [1, 0] would show every cell under the wrong class.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cmd = ConfusionMatrixDisplay(cm, display_labels=labels)
    cmd.plot()


In [None]:
plot_confusion_matrix(testdf['click'], testdf['prediction'], [1, 0])

In [None]:
# `labels` is passed by keyword: recent scikit-learn releases accept only
# y_true and y_pred positionally.
print(classification_report(testdf['click'], testdf['prediction'], labels=[1, 0]))

In [None]:

skplt.metrics.plot_precision_recall(testdf['click'].values, testdf[['prediction_conf0', 'prediction_conf1']].values, classes_to_plot=[0, 1], figsize=(10,10))


In [None]:
# plot_roc is the supported replacement for the deprecated plot_roc_curve;
# with default arguments it draws the per-class, micro and macro curves.
skplt.metrics.plot_roc(testdf['click'].values, testdf[['prediction_conf0', 'prediction_conf1']].values, figsize=(10,10))

## Visualizing outputs
- Doing some sanity checks to see whether the model outputs make sense
   - Picking a user id and comparing the titles of their clicks in the training data with the titles the model predicts as clicks

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

def get_wordcloud_for_user(text_list):
    """
    Draw a word cloud from a list of news titles.

    Args:
        text_list (list): titles to visualise; entries that are not non-blank
            strings are ignored

    Returns:
        None. The figure is shown with matplotlib; if there is nothing to plot
        a short message is printed instead.
    """
    # Word-level stopwords. The original also added np.nan, which can never
    # match a text token, and then handed WordCloud the unextended STOPWORDS.
    stopwords = set(STOPWORDS).union(['NaN', 'S'])

    # Keep real, non-blank titles only (guards against NaN / None entries).
    clean_titles = [title for title in text_list if isinstance(title, str) and title.strip()]
    if not clean_titles:
        # WordCloud.generate raises on text without words, e.g. for a user
        # with no matching articles.
        print("No titles to build a word cloud from.")
        return

    wordcloud = WordCloud(
                   max_words=50000,
                   min_font_size =12,
                   max_font_size=50,
                   relative_scaling = 0.9,
                   stopwords=stopwords,
                   normalize_plurals= True
    )

    title_wordcloud = wordcloud.generate(' '.join(clean_titles))

    plt.figure(figsize = (10,10))
    plt.imshow(title_wordcloud, interpolation='bilinear',)
    plt.axis("off")
    plt.show()

In [None]:
# traindf.user_id.value_counts()[44300:44305]
ii = traindf[(traindf['user_id']=="U84756") & (traindf['click']==1)]['item'].unique()
tlist = [news_dict[i]['title'] for i in ii]
# tlist
news_df[news_df['news_id'].isin(ii)]['category'].value_counts()

In [None]:
get_wordcloud_for_user(tlist)

In [None]:
ii = testdf[(testdf['user_id']=="U84756") & (testdf['prediction']==1)]['item'].unique()
news_df[news_df['news_id'].isin(ii)]['category'].value_counts()

In [None]:
# testdf[testdf['user_id']=="U84756"]
tlist1 = [news_dict[i]['title'] for i in ii]

In [None]:
get_wordcloud_for_user(tlist1)

# Further Improvements

- More feature engineering: user-category affinity score, chronological based train-test split and evaluation, etc
- Hyperparameter tuning: finding the best values for different hyperparameters such as the text embedding method, neural net layers and hidden sizes, batch size, epochs, optimizers, etc.
- Fine-tuning the prediction probability threshold to find the right balance, i.e. the precision-recall tradeoff.