In [3]:
%%file requirements.txt

arxiv
langchain
openai
sentence-transformers
transformers
scikit-learn


Writing requirements.txt


In [75]:
# Load the stylesheet and the script that get inlined into the generated HTML.
# Context managers close the file handles as soon as the contents are read.
with open('assets/style.css', 'r') as css_file:
    css = css_file.read()
with open('assets/script.js', 'r') as js_file:
    js = js_file.read()

## Perform the following:
1. `[x]` Retrieve recent arxiv papers according to a query
    * using the `arxiv` library
2. `[x]` Find the topics of the retrieved papers
    * using e.g., NMF + TF-IDF
    * make intuitive subsets of the words describing the topics using further thresholding.
3. `[x]` Compute the relevance of all the papers to the keywords (using embeddings)
    * some heuristics with thresholding etc.
4. `[x]` Discard the topics with low mean relevance, together with their corresponding papers
5. `[x]` Find the final relevant papers
6. `[ ]` Create a summary of the abstracts of these papers

In [None]:
import arxiv 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

In [124]:
## HTML scaffolding used to pretty-print results in the notebook.
# Opening part of the page: the stylesheet held in `css` is inlined in <head>.
s_pre = ''.join([
    '<html>',
    ' <head>',
    '  <style>' + css + '</style>',
    ' </head>',
    ' <body>',
])

# Closing part of the page, appended after the generated content.
s_post = '</body>' + '</html>'

def collapsible_button_html(title, content):
    """
    Builds the HTML fragment for one collapsible section.

    Args:
      title : text shown on the button that toggles the section
      content : text placed in a paragraph inside the collapsible <div>

    Returns:
      the fragment as a single string. `title` and `content` are inserted
      verbatim (no HTML escaping is performed).
    """
    fragments = (
        '   <button class="collapsible">%s</button> ' % title,
        '   <div class="content">',
        '   <p>' + content + '</p>',
        '   </div>',
    )
    return ''.join(fragments)

def _make_collapsible(title, content):
    """
    Builds a complete HTML page containing a single collapsible section.

    Args:
      title : text shown on the collapsible button
      content : text shown when the section is expanded

    Returns:
      the page as a string: `s_pre`, the collapsible fragment, the inlined
      `js` script and `s_post`, in that order.
    """
    # The accumulator has to be initialised (not augmented) with the prefix.
    s = s_pre
    s += collapsible_button_html(title, content)
    s += '<script>' + js + '</script>'
    s += s_post
    return s

def _make_collapsibles(titles, content_list):
    """
    Builds a complete HTML page with one collapsible section per
    (title, content) pair.

    Args:
      titles : iterable of button texts
      content_list : iterable of section texts, paired with `titles` by
                     position (pairing stops at the shorter of the two)

    Returns:
      the page as a string: `s_pre`, all collapsible fragments, the inlined
      `js` script and `s_post`, in that order.
    """
    sections = [
        collapsible_button_html(heading, body)
        for heading, body in zip(titles, content_list)
    ]
    return s_pre + ''.join(sections) + '<script>' + js + '</script>' + s_post

In [125]:
def _paper_quality_assignment(papers):
    """
    Performs some analytics on the papers and tries to figure 
    out a paper quality score.
    """
    # TODO

def _get_arxiv_papers_for_query(query, num_papers = 200):
    """
    Builds an arxiv search for `query`, most recently submitted papers first.

    Args:
      query : the search string handed to `arxiv.Search`
      num_papers : maximum number of results to request

    Returns:
      the `arxiv.Search` object; callers iterate it through `.results()`.
    """
    # Use the `query` argument itself rather than any notebook-level variable,
    # so the function searches for what the caller asked for.
    return arxiv.Search(
        query=query,
        id_list=[],
        max_results=num_papers,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending,
    )

def _get_arxiv_paper_list_text_data(arxiv_res):
    """
    Returns a list that contains the texts, given a list of arxiv results.
    """
    res_text_dat = [];
    query_results = []
    for r in arxiv_res.results():
        text_dat =  r.title +':\n\n' + r.summary
        res_text_dat.append(text_dat)
        query_results.append(r)
    return res_text_dat, query_results


def _topic_indices_from_topic_matrix(topic_matrix):
    inds = []
    for row in topic_matrix.T:
        inds.append(np.where(row)[0])
    return inds

def _get_topics_nmf_tfidf(
        texts,
        ntopics = 15,
        topic_accept_nmf_thresh = 0.1,
        topic_rel_q_thresh = 0.75
    ):
    """
    Extracts topics (keyword sets) from texts using TF-IDF features followed
    by a non-negative matrix factorization (NMF).

    Args:
      texts  : a list of texts
      ntopics : number of NMF components, i.e. number of topics
      topic_accept_nmf_thresh : a word is listed as a keyword of a topic when
                      its NMF weight for that topic is above this value (the
                      matrix is usually sparse already, but this helps making
                      more intuitive sets of keywords)
      topic_rel_q_thresh : quantile computed per text over its topic scores; a
                      topic is assigned to a text when the text's score for
                      that topic is above this quantile

    Returns:
      a tuple `(tfidf_matrix, topic_data)` where `tfidf_matrix` is the output
      of `TfidfVectorizer.fit_transform(texts)` and `topic_data` is a dict with
        'topic_indices' : per text, the indices of its assigned topics
        'topic_words' : per topic, the array of its keywords
        'thresholded_topic_matrix' : boolean (topics x texts) assignment matrix
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    doc_term = vectorizer.fit_transform(texts)
    # NMF is fitted on the transposed (term x text) matrix, so every column of
    # `word_weights` holds the per-word weights of one topic.
    word_weights = NMF(n_components=ntopics).fit_transform(doc_term.T)
    vocabulary = vectorizer.get_feature_names_out()

    # keywords of each topic: the words whose weight passes the threshold
    topic_words = [
        vocabulary[weights > topic_accept_nmf_thresh]
        for weights in word_weights.T
    ]

    # score of every text against every topic, thresholded per text
    doc_topic_scores = doc_term @ word_weights
    per_doc_cutoff = np.quantile(doc_topic_scores, topic_rel_q_thresh, axis=1)
    topic_matrix = doc_topic_scores.T > per_doc_cutoff
    topic_indices = _topic_indices_from_topic_matrix(topic_matrix)

    topic_data = {
        'topic_indices' : topic_indices,
        'topic_words' : topic_words,
        'thresholded_topic_matrix' : topic_matrix,
    }
    return doc_term, topic_data


def _second_level_topic_selection(paper_query_rel_scores, topic_data, q_thresh_val = 0.80):
    """Further filtering of topics based on paper-query relevance

    This creates a dictionary of topics that are relevant to the papers in the list.

    The paper_query_rel_scores can be computed (for instance) as follows: 
    
      `paper_query_rel_scores = emb_res @ enc_quer`


    """
    
    all_topic_inds = np.unique(np.stack(topic_data['topic_indices']))
    top_rel_scores = {}
    for topic_list, paper_score in zip(topic_data['topic_indices'], paper_query_rel_scores):
        for t in topic_list:
            if t not in top_rel_scores:
                top_rel_scores[t] = [paper_score, 1]
            else:
                top_rel_scores[t][0] += paper_score
                top_rel_scores[t][1] += 1
                
    for k in top_rel_scores.keys():
        avg = top_rel_scores[k][0] / top_rel_scores[k][1]
        top_rel_scores[k].append(avg)
    
    q_thresh = np.quantile([v[2] for k, v in top_rel_scores.items()], q_thresh_val)
    topic_filter = {k : v[2] >= q_thresh for k, v in top_rel_scores.items()}
    
    #contains "true" when this is a topic to be kept.
    kept_topics = {'indices' : [] , 'keywords' : []}
    discarded_topics = {'indices' : [] , 'keywords' : []}
    
    for t, b in topic_filter.items():
        if b:
            kept_topics['indices'].append(t)
            kept_topics['keywords'].append(topic_data['topic_words'][t])
        else:
            discarded_topics['indices'].append(t)
            discarded_topics['keywords'].append(topic_data['topic_words'][t])

    discarded_paper_inds, kept_paper_inds = [], []
    for k,p in enumerate(topic_data['topic_indices']):
        is_in_kept = False
        for i in p:
            if i in kept_topics['indices']:
                kept_paper_inds.append(k)
                is_in_kept = True
                break
                
        if not is_in_kept:
            discarded_paper_inds.append(k)
        
    return (kept_paper_inds, discarded_paper_inds), (kept_topics, discarded_topics)

In [126]:
from scipy.sparse._csr import csr_matrix
from typing import List, Dict, Tuple
from abc import ABC

class TopicModeler(ABC):
    """
    Base class for topic modelers: callables that take a list of texts and
    return topic information. This base implementation returns None.
    """
    def __init__(self):
        pass
    def __call__(self, v : List[str]):
        pass

class TFIDFNMFTopicModeler(TopicModeler):
    """
    Topic modeler based on TF-IDF features and NMF; a configurable wrapper
    around `_get_topics_nmf_tfidf`.
    """
    def __init__(
            self,
            ntopics = 15,
            topic_accept_nmf_thresh = 0.1,
            topic_rel_q_thresh = 0.75
        ):
        """
        Args:
          ntopics : number of topics
          topic_accept_nmf_thresh : threshold on NMF weights for listing a
                          word as a topic keyword
          topic_rel_q_thresh : per-text quantile used to assign topics to texts
        """
        self.ntopics = ntopics
        # store the values that were passed in, not the defaults
        self.topic_accept_nmf_thresh = topic_accept_nmf_thresh
        self.topic_rel_q_thresh = topic_rel_q_thresh

    def __call__(self, vals : List[str]) -> Tuple[csr_matrix, Dict]:
        """
        Runs `_get_topics_nmf_tfidf` on `vals` with the configured settings
        and returns its `(tfidf_matrix, topic_data)` result.
        """
        return _get_topics_nmf_tfidf(
            vals,
            ntopics=self.ntopics,
            topic_accept_nmf_thresh=self.topic_accept_nmf_thresh,
            topic_rel_q_thresh=self.topic_rel_q_thresh
        )
        
class ArxivCustomRetrieval:
    """
    A hand-engineered retrieval engine that performs simple topic modeling and
    simple inner-product, embeddings-based topic and paper relevance
    determination (for filtering the most relevant papers).
    """
    def __init__(
            self, 
            topic_modeler : TopicModeler, 
            q_topic_thresh_val = 0.8,
            embeding_model_str = 'thenlper/gte-base', **kwargs
        ):
        """
        A simple stateful wrapper around the utility functions.

        Args:

            topic_modeler: a callable that, given a list of texts, returns a
                           pair stored as `(self.topic_matrix, self.topic_data)`
            q_topic_thresh_val : after the preliminary identification of the
                           topics, they are re-evaluated for matching the
                           initial query using the embeddings of the retrieved
                           texts. This value is the quantile of the
                           topic-matching values (mean relevance score of the
                           papers assigned to a topic) used as cut-off.
            embeding_model_str : model name handed to `SentenceTransformer`
            **kwargs : accepted but not used
        """
        
        self.topic_modeler = topic_modeler
        self.embedding_model = SentenceTransformer(embeding_model_str)
        self.q_topic_thresh_val = q_topic_thresh_val
        self.docs_embedded = False
        self.arxiv_papers_retrieved = False

    def get_full_state(self):
        """
        Returns the query, embedding and topic state as a dict of tuples.

        The attributes read here are only created by `execute_query`,
        `embed_docs` and `get_topic_data` (or `set_full_state`), so calling
        this earlier raises AttributeError.
        """
        query_state = (self.text_res, self.query_res)
        embedding_state = (self._enc_quer, self._emb_res ,self._paper_query_rel_scores, self.docs_embedded)
        topic_state = (self.topic_matrix, self.topic_data)
        return {'topic_state' : topic_state, 'embedding_state' : embedding_state, 'query_state' : query_state} 
        
    def set_full_state(self, state):
        """Restores a state dict with the layout produced by `get_full_state`."""
        self.topic_matrix, self.topic_data = state['topic_state']
        self._enc_quer, self._emb_res, self._paper_query_rel_scores, self.docs_embedded  = state['embedding_state']
        self.text_res, self.query_res = state['query_state']
        
        
    def run(self, query):
        """
        Full run:
         1. running the retrieval from arxiv
         2. performing topic modeling with TFIDF and NMF 
         3. reducing the topics to the most discriminative (through thresholding)
         4. embedding the retrieved documents (using a transformer model)
         5. computing the relevance of the retrieved documents with the provided query
         6. finding the mean topic relevance (by summing per-
           topic instance how relevant the documents that were assigned that topic are)
         7. setting (through thresholding) the most relevant topics
         8. discarding/keeping according to topic relevance the corresponding papers

        The outcome is stored in `kept_paper_inds`, `discarded_paper_inds`,
        `kept_topics` and `discarded_topics`.
        """
        # 1. 
        self.execute_query(query)
        # 2. + 3.
        self.get_topic_data(self.text_res)
        # 4. + 5. 
        self.embed_docs(self.text_res, query)
        # 6. 7. 8. 
        (kept_paper_inds, discarded_paper_inds), (kept_topics, discarded_topics) = _second_level_topic_selection(
            self._paper_query_rel_scores, 
            self.topic_data,
            q_thresh_val = self.q_topic_thresh_val
        )
        
        self.kept_paper_inds, self.discarded_paper_inds = kept_paper_inds, discarded_paper_inds
        self.kept_topics, self.discarded_topics = kept_topics, discarded_topics

    def get_kept_paper_arxiv_query_res(self):
        """Returns the arxiv results of the papers kept by the last `run`."""
        return [self.query_res[k] for k in self.kept_paper_inds]
        
    def embed_docs(self, text_res, interests_query):
        """
        Embeds the texts and the query and stores their inner products as the
        per-paper relevance scores (`self._paper_query_rel_scores`).

        The work is skipped when embeddings are already available for the same
        query. A restored state carries no record of its query, so it is
        trusted as long as `docs_embedded` is set.
        """
        cached_query = getattr(self, '_embedded_query', interests_query)
        if self.docs_embedded and cached_query == interests_query:
            return

        # use the model owned by this object, not a notebook-level variable
        emb_res = self.embedding_model.encode(text_res)
        enc_quer = self.embedding_model.encode(interests_query)
        self._enc_quer = enc_quer
        self._emb_res = emb_res
        self._paper_query_rel_scores = emb_res @ enc_quer
        self._embedded_query = interests_query
        self.docs_embedded = True

    def get_topic_data(self, text_list : List[str]):
        """
        Runs the topic modeler on the texts and stores its two return values
        as `self.topic_matrix` and `self.topic_data`.

        See also:
          `TFIDFNMFTopicModeler`
        """
        self.topic_matrix, self.topic_data = self.topic_modeler(text_list)
        
    def execute_query(self, query):
        """
        Executes the query and stores the results in the object.
        """
        previous_texts = getattr(self, 'text_res', None)
        res = _get_arxiv_papers_for_query(query)
        text_res, query_res = _get_arxiv_paper_list_text_data(res)
        self.text_res, self.query_res = text_res, query_res
        self.arxiv_papers_retrieved = True
        # embeddings computed for a different set of texts must not be reused
        if text_res != previous_texts:
            self.docs_embedded = False

    def _repr_html_(self):
        """returns HTML ready to render or add as a component, that contains title and abstract from the arxiv papers.
        """
        titles, contents = [], []
        for q in self.query_res:
            titles.append(q.title)
            contents.append(q.summary)
            
        s = _make_collapsibles(titles, contents)
        return s
        
# Query string used further down as the argument of `ArxivCustomRetrieval.run`,
# which passes it both to the arxiv search and to the embedding model.
interests_query = "Search query: llm chatgpt quantization qlora dataset distilation low rank adapter"

In [127]:
# Build the retrieval engine with the default TF-IDF + NMF topic modeler.
# The constructor also instantiates the SentenceTransformer embedding model.
t = ArxivCustomRetrieval(topic_modeler=TFIDFNMFTopicModeler())

In [128]:
# Full pipeline: arxiv retrieval, topic modeling, embedding, topic/paper filtering.
t.run(interests_query)

In [129]:
# Last expression of the cell: Jupyter renders it through `t._repr_html_()`.
t

In [101]:
class ArxivQueryResult:
    """
    Wraps a single arxiv result so that it can be rendered in the notebook as
    one collapsible section (title on the button, abstract inside).
    """
    def __init__(self, query_result : arxiv.Result):
        self.query_result = query_result

    def _repr_html_(self):
        """Returns the HTML page built by `_make_collapsible` for the wrapped result."""
        paper = self.query_result
        return _make_collapsible(paper.title, paper.summary)
        

# Leave the wrapper as the last expression of the cell: Jupyter then renders it
# through its `_repr_html_`, so no `HTML` helper (which this notebook never
# imports) is required.
ArxivQueryResult(t.query_res[0])

Check the MTEB leaderboard for the best sentence-transformers retrieval model you can afford:
[https://huggingface.co/spaces/mteb/leaderboard](https://huggingface.co/spaces/mteb/leaderboard)

# TODO: 
* Use a GPT-model for summarization/extraction of meaningful info from the papers.
* Use the topics (somehow) more