In [None]:
# Use the %pip magic (rather than !pip) so the packages are installed into the
# environment of the running kernel.
# TODO(review): pin the package versions once the versions used for this run are known.
%pip install transformers tensorflow_text sentence-transformers

## **Downloading Data**
### Data from Kaggle: a collection of articles and their abstracts

In [None]:
# Download the zipped papers CSV and save it as papers.zip in the current working directory.
# NOTE(review): the URL carries signed-request parameters (X-Goog-Date=20230411T104147Z,
# X-Goog-Expires=259200), so it presumably stops working once that window has passed —
# regenerate the link (or use another download method) before re-running this cell.
!wget -O papers.zip "https://storage.googleapis.com/kaggle-data-sets/491/9097/compressed/papers.csv.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20230411%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20230411T104147Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=2492eb055dbd527ba639bfa4dc87cfe21daf40270c424a2a0893a10806ed7aa19e34406a541d4f5fcdb6870c85034a2e2056ab8eb47a1a6b5a4737fe680a956baafd20f536714d095d81723553b5156227ed2ad05124537d7982d8deb0e6c0cb0256c4170854813ca1ab34d8feaa78e22b2b00a39aa1d4fd9521a68b0fe057f11a6236792d06ee2bf4f9234c97e500aaa3c0403b5e92087cfc0dc2d24217005e5498155be5be637e78f15f75d919c9b0163ca823ba87ba767777eeb1e5da6f1a1702210855f08237ebc41bae297d6dd7516da9acd80e675ca5388decaedf24508d018bc52ae4f8bf93dbff89bf4f16c8b548b209f643771b825565f069e8696d"

--2023-04-11 23:04:08--  https://storage.googleapis.com/kaggle-data-sets/491/9097/compressed/papers.csv.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20230411%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20230411T104147Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=2492eb055dbd527ba639bfa4dc87cfe21daf40270c424a2a0893a10806ed7aa19e34406a541d4f5fcdb6870c85034a2e2056ab8eb47a1a6b5a4737fe680a956baafd20f536714d095d81723553b5156227ed2ad05124537d7982d8deb0e6c0cb0256c4170854813ca1ab34d8feaa78e22b2b00a39aa1d4fd9521a68b0fe057f11a6236792d06ee2bf4f9234c97e500aaa3c0403b5e92087cfc0dc2d24217005e5498155be5be637e78f15f75d919c9b0163ca823ba87ba767777eeb1e5da6f1a1702210855f08237ebc41bae297d6dd7516da9acd80e675ca5388decaedf24508d018bc52ae4f8bf93dbff89bf4f16c8b548b209f643771b825565f069e8696d
Resolving storage.googleapis.com (storage.googleapis.com)... 172.253.62.128, 172.253.115.128, 172.253.122.128, ...
Connecting to stora

In [None]:
# Extract the archive into /content/data.
# -o overwrites already-extracted files without prompting, so the cell can be re-run.
!unzip -o "/content/papers.zip" -d "/content/data"

Archive:  /content/papers.zip
  inflating: /content/data/papers.csv  


## **Import Libraries**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
import tensorflow_text as text
from transformers import AutoTokenizer, TFAutoModel
from tqdm.notebook import tqdm
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import os
import gensim
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pprint

from sklearn import metrics

import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import warnings
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Fetch the NLTK resources used below: the stop-word list, WordNet (for the
# lemmatizer) and the punkt tokenizer models.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# English stop words as a set, plus a shared lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## **Reading the data:** 
### The data is a set of papers saved in a CSV file; the feature we care about most is the abstract
### We focus on the abstract as a proof of concept

Selecting only the articles that have abstracts

In [None]:
# Load the papers table from the extracted CSV.
data_path = "/content/data/papers.csv"
df = pd.read_csv(data_path)

# Keep only rows that carry a real abstract: drop the 'Abstract Missing'
# placeholder and also any NaN cells. A NaN compares as "not equal" to the
# placeholder, so without the notna() check it would slip through and break
# the text encoding / string joining done in later cells.
has_abstract = df['abstract'].notna() & (df['abstract'] != 'Abstract Missing')
atrticles_with_abstract = df[has_abstract]

# Plain Python list of abstract strings, in the same row order as
# atrticles_with_abstract (positional indices into one are valid for the other).
articles = atrticles_with_abstract['abstract'].tolist()

## Using the pretrained ***Transformer***: "***all-MiniLM-L6-v2***" *in preprocessing and producing the output*

We define a set of keywords and use a pre-trained semantic search model to encode them. We then encode the list of article abstracts with the same model and compute the cosine similarity between each encoded keyword and the encoded abstracts. The similarity scores are held in a tensor, from which the highest-scoring abstracts are selected for each keyword.

The "all-MiniLM-L6-v2" is based on the MiniLM architecture, which is a small and efficient language model designed to achieve state-of-the-art performance on various natural language processing (NLP) tasks with a small number of parameters.

all-MiniLM-L6-v2 is a sentence-embedding model fine-tuned from a 6-layer MiniLM checkpoint; it has roughly 22 million parameters and maps each text to a 384-dimensional vector. Despite its small size, it has achieved impressive results on various benchmark datasets and tasks in the NLP field.

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch


# Load the pre-trained sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encoding keywords and abstracts
articles_encoded = model.encode(articles, convert_to_tensor=True)
keywords = ['technology', 'business', 'science', 'entertainment', 'politics']

# Find the closest abstracts of the articles for each query keyword based on
# cosine similarity (at most 5, fewer if the corpus is smaller).
top_k = min(5, len(articles))

# Results for every keyword, keyed by the keyword itself, so later cells do not
# have to rely on whatever the loop variable happened to hold last.
top_results_by_query = {}

for query in keywords:
    query_embedding = model.encode(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest top_k scores
    cos_scores = util.cos_sim(query_embedding, articles_encoded)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    top_results_by_query[query] = top_results

    print("\n\n======================\n\n")
    print("Query:", query)
    # Report the real number of results instead of a hard-coded "5".
    print("\nTop {} most similar articles in corpus:".format(top_k))

    # Convert to plain Python numbers so list indexing and formatting do not
    # depend on tensor conversions.
    for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
        print(articles[idx], "(Score: {:.4f})".format(score))

# NOTE: after the loop, `top_results` still holds the result for the last
# keyword in `keywords`; the keyword-extraction cell further down reads it.







Query: technology

Top 5 most similar articles in corpus:
Communication between a speaker and hearer will be most efficient when both parties make accurate inferences about the other. We study inference and communication in a television game called Password, where speakers must convey secret words to hearers by providing one-word clues. Our working hypothesis is that human communication is relatively efficient, and we use game show data to examine three predictions. First, we predict that speakers and hearers are both considerate, and that both take the other?s perspective into account. Second, we predict that speakers and hearers are calibrated, and that both make accurate assumptions about the strategy used by the other. Finally, we predict that speakers and hearers are collaborative, and that they tend to share the cognitive burden of communication equally. We find evidence in support of all three predictions, and demonstrate in addition that efficient communication tends to bre

# **Extracting Hot-keywords from articles**

### Classical processing to prepare the data for extracting the hot keywords

In [None]:
# Custom stop words: terms that are common and repeated in almost all papers.
# "six" is included so the run of number words one..nine has no gap.
new_words = ["fig", "figure", "image", "sample", "using",
             "show", "result", "large",
             "also", "one", "two", "three",
             "four", "five", "six", "seven", "eight", "nine"]

# Merge with the existing stop words. Wrapping in set(...) keeps this cell
# re-runnable: after the first run stop_words is a list, which has no .union().
stop_words = list(set(stop_words).union(new_words))

def pre_process(text):
    """
    Turn raw text into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase; replace HTML-escaped tags; collapse every run
    of digits / non-word characters into a single space; split on whitespace;
    drop stop words and words shorter than three letters; lemmatize; drop any
    lemma that is itself a stop word.

    The final stop-word pass matters because the filter before lemmatization
    only sees surface forms: e.g. "figures" is not in the stop list, but its
    lemma "figure" is.

    Args: text: str, the input text to be preprocessed.

    Returns: tokens: list of str, the preprocessed tokens.
    """
    # The module-level stop_words is a list at this point; a set gives
    # constant-time membership tests instead of a linear scan per word.
    stop_set = set(stop_words)

    # lowercase
    text = text.lower()

    # remove tags (the pattern targets the HTML-escaped form "&lt;...&gt;")
    text = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", text)

    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)

    # split into words, dropping stop words and words of fewer than three letters
    words = [word for word in text.split()
             if word not in stop_set and len(word) >= 3]

    # lemmatize, then re-apply the stop-word filter to the lemmas
    lmtzr = WordNetLemmatizer()
    lemmas = (lmtzr.lemmatize(word) for word in words)
    tokens = [lemma for lemma in lemmas if lemma not in stop_set]

    return tokens

In [None]:
# Number of top-ranked articles whose abstracts feed the keyword count.
n_articles = 5

# NOTE(review): `top_results` is whatever the similarity loop in the earlier
# cell assigned last — presumably the result for the final keyword; confirm
# this is the intended query.
top_indices = top_results.indices.tolist()[:n_articles]

# The indices are positional, hence .iloc rather than label-based lookup.
top_articles = atrticles_with_abstract.iloc[top_indices]

# Concatenate the selected abstracts into one space-separated string.
article_text = ' '.join(top_articles['abstract'].tolist())

# Clean and lemmatize, then count how often each token occurs.
tokens = pre_process(article_text)
fdist = nltk.FreqDist(tokens)

# The 10 most frequent tokens as (token, count) pairs.
top_tokens = fdist.most_common(10)


## Here are the 10 most common keywords in the top-ranked articles

In [None]:
# Display the (token, count) pairs returned by fdist.most_common(10).
top_tokens

[('model', 14),
 ('agent', 13),
 ('matrix', 8),
 ('analysis', 5),
 ('document', 5),
 ('topic', 5),
 ('voting', 5),
 ('team', 4),
 ('vote', 3),
 ('legislation', 3)]