In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import sys
import csv
import seaborn as sb
from collections import OrderedDict
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score
from transformers import BertTokenizer
import torch
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from bertopic import BERTopic
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from tqdm import tqdm
import torch
import transformers

2023-01-08 08:00:11.716651: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-08 08:00:11.800481: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-08 08:00:11.800497: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-08 08:00:12.269283: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [2]:
test_data = pd.read_csv('newest_test_data', engine='python', encoding='utf-8', on_bad_lines='skip').fillna('')

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [4]:
def tokenize_data(data):
    labels = []
    topics = []
    input_ids = []
    attention_masks = []
    sentiment_score = []
    for index, line in data.iterrows():
        labels.append(int(line.label))
        topics.append(int(line.topic))
        sentiment_score.append(float(line.text_sentiment_score))
        encoded_dict = tokenizer.encode_plus(
                            line.idf_words,
                            add_special_tokens = True, # [CLS] & [SEP]
                            truncation = 'longest_first', # Control truncation
                            max_length = 100, # Max length about texts
                            pad_to_max_length = True, # Pad and truncate about sentences
                            return_attention_mask = True, # Attention masks
                            return_tensors = 'pt') # Return to pytorch tensors
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    try:
        labels = torch.tensor(labels)
    except:
        print(labels)

    topics = torch.tensor(topics)
    sentiment_score = torch.tensor(sentiment_score)
    return input_ids, attention_masks, labels, topics, sentiment_score

In [5]:
class BertPlusModel(torch.nn.Module):
    def __init__(self, bert_model):
        super(BertPlusModel, self).__init__()
        self.bert_model = bert_model
        self.linear = torch.nn.Linear(769, 1)  # The output of BERT has size 768, so the input size to the linear layer should be 768
        self.sigmoid = torch.nn.Sigmoid()  # The sigmoid function will map the output to the range [0, 1]

    def forward(self, input_ids, attention_mask, additional_input):
        _, pooled_output = self.bert_model(input_ids=input_ids, attention_mask=attention_mask).to_tuple()
        concatenated = torch.cat((pooled_output, additional_input.unsqueeze(dim=1)), dim=1)
        return self.linear(concatenated)


In [6]:
# Get the pretrained BERT model
bert_model = transformers.BertModel.from_pretrained('bert-base-uncased')

# Initialize the model with the pretrained BERT model
model = BertPlusModel(bert_model)

# Load model CPU
model.load_state_dict(torch.load('model_10epoch_8batch.pt', map_location='cpu'))

# Set the model to evaluation mode
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertPlusModel(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [7]:
def get_topic_id(article):
     for index, line in article.iterrows():
        return int(line.topic)

In [8]:
def filter_artciles_with_same_topic(article, dataset):
   topic = get_topic_id(article)
   filtered_articles = dataset.loc[dataset['topic'] == topic] 
   return filtered_articles

In [9]:
def filter_articles_based_on_predictions(articles, predictions, RECOMMENDATION_SIZE):
    result = []
    recommendations_found = 0
    for article_title, article_text, prediction in zip(articles['title'], articles['text'], predictions):
        if(prediction == 1):
            result.append([article_title, article_text])
            recommendations_found += 1
            if(recommendations_found >= RECOMMENDATION_SIZE):
                return result
    return result

In [10]:
def predict_label(query):
    query_input_ids, query_attention_masks, query_labels, query_topics, query_sentiment_score = tokenize_data(query)
    
    # Move tensors to GPU
#     query_input_ids = query_input_ids.to(device)
#     query_attention_masks = query_attention_masks.to(device)
#     query_labels = query_labels.to(device)
#     query_topics = query_topics.to(device)
#     query_sentiment_score = query_sentiment_score.to(device)
    
    output = model(query_input_ids, query_attention_masks, query_topics)
    query_labels = query_labels.type(torch.float).unsqueeze(1)
    output = torch.round(torch.sigmoid(output))  
    return output

In [11]:
RECOMMENDATION_SIZE = 5

query = test_data.sample(n=1).reset_index(drop=True)

print(get_topic_id(query))

articles_with_same_topic = filter_artciles_with_same_topic(query, test_data)
print(len(articles_with_same_topic))
predictions = predict_label(articles_with_same_topic).squeeze(1)
print(predictions)

recommended_articles = filter_articles_based_on_predictions(articles_with_same_topic, predictions, RECOMMENDATION_SIZE)

print(recommended_articles)

146
16
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       grad_fn=<SqueezeBackward1>)
