In [1]:
import time
import json
import pickle
import gradio as gr
from annoy import AnnoyIndex
from transformers import pipeline

from database import Database, ADSEntry
from text_to_vec import spacy_tokenizer

# load database (settings are read from settings.json; the file is closed right away)
with open("settings.json", 'r') as settings_file:
    settings = json.load(settings_file)
ADSDatabase = Database( settings=settings['database'], dtype=ADSEntry )

# ranks words by importance/occurrence
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files
with open("vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# reduce dimensionality
with open("pca.pkl", "rb") as pca_file:
    pca = pickle.load(pca_file)

def process_input(x):
    """Turn one piece of raw text into its reduced feature vector.

    The text is tokenized with ``spacy_tokenizer``, vectorized by the pickled
    ``vectorizer``, densified, and projected with the pickled ``pca``.

    Args:
        x: the input text.

    Returns:
        The first (and only) row of the PCA-transformed batch.
    """
    tokens = spacy_tokenizer(x)
    # both transformers work on batches, so wrap the single document in a list
    features = vectorizer.transform([tokens]).toarray()
    return pca.transform(features)[0]

class TextAI:
    """Suggest text continuations and related papers for a prompt.

    Wraps a text-generation pipeline (for continuations) together with an
    Annoy index and a database handle (for looking up similar abstracts).
    """

    def __init__(self, db, model_checkpoint='./models/checkpoint-90000', neighbor_file='test_514.ann'):
        """Load the generation model and the nearest-neighbour index.

        Args:
            db: database handle; must provide ``query(...)`` and ``dtype``.
            model_checkpoint: model location handed to ``pipeline``.
            neighbor_file: Annoy index file. The vector dimension is read from
                the text between the last ``_`` and the next ``.`` in this
                name (``test_514.ann`` -> 514).

        Raises:
            ValueError: if no dimension can be parsed from ``neighbor_file``.
        """
        self.db = db
        # NOTE(review): `config` receives a plain dict here -- confirm the installed
        # transformers version actually honours it.
        self.model = pipeline('text-generation',model=model_checkpoint, tokenizer='gpt2', config={'max_length':2000})
        ndim = self._infer_ndim(neighbor_file)
        self.neighbors = AnnoyIndex(ndim, 'angular')
        self.neighbors.load(neighbor_file) # super fast, will just mmap the file

    @staticmethod
    def _infer_ndim(neighbor_file):
        """Parse the vector dimension encoded in an index file name."""
        token = neighbor_file.split('_')[-1].split('.')[0]
        try:
            return int(token)
        except ValueError:
            raise ValueError(
                f"cannot infer vector dimension from {neighbor_file!r}; "
                "expected a name like 'test_514.ann'"
            ) from None

    def __call__(self, x, n_suggestions=4, n_neighbors=10):
        """Generate continuations of ``x`` and look up similar papers.

        Args:
            x: the prompt text.
            n_suggestions: how many independent continuations to generate.
            n_neighbors: how many nearest neighbours to request from the index.

        Returns:
            ``(text_suggestions, paper_recs)`` where ``text_suggestions`` is a
            list of generated strings with the leading prompt removed, and
            ``paper_recs`` is a list of ``[title, bibcode, abstract]`` lists
            ordered like the neighbour ids returned by the index.
        """
        # generate text
        text_suggestions = []
        for _ in range(n_suggestions):
            generated = self.model(x)[0]['generated_text']
            # strip only the echoed prompt at the start; a blanket replace()
            # would also delete the prompt if the model repeats it later on
            if generated.startswith(x):
                generated = generated[len(x):]
            text_suggestions.append(generated.strip())

        # get similar abstracts
        nids = self.neighbors.get_nns_by_vector(process_input(x), n_neighbors, search_k=-1, include_distances=False)
        entrys = self.db.query(self.db.dtype.id.in_(nids)) # thread issue with gradio (could cache instead)

        # an IN filter does not promise any row order, so restore the order of
        # `nids`; rows without a matching id keep their relative order at the end
        rank = {nid: position for position, nid in enumerate(nids)}
        ordered = sorted(entrys, key=lambda entry: rank.get(getattr(entry, 'id', None), len(rank)))

        paper_recs = [[entry.title,entry.bibcode,entry.abstract] for entry in ordered]
        return text_suggestions, paper_recs

# single shared TextAI instance backed by the ADS database (default model and index file)
textai = TextAI(db=ADSDatabase)

In [2]:
# example prompt; also reused as the default value of the demo text box
custom_text = "We acknowledge funding support from the National Aeronautics and Space Administration (NASA) Mars Data Analysis Program (MDAP) Grant Number"
text, papers = textai(custom_text)
print(text)

# one line per recommended paper: title (bibcode) abstract
for paper in papers:
    print(f"{paper[0]} ({paper[1]}) {paper[2]} \n")

['97-57102, with participation from the Arizona university system and NASA. The flight model of the SPICAM', '70264266 and from the National Science Foundation (NF) administered by NASA.\nA major result of Mars', 'ing System (MADS). We are in the early process of distributing these grants through the NASA Science Data Center', '08501 to the College of Charleston, South Carolina.\nThe primary goal of the Virtual Planetary Laboratory (V']
The Harvard Science Research Mentoring Program (2018arXiv180908078G) The last decade has seen a proliferation of mentoring programs that provide high-school students authentic research experiences. Such programs expose students to front-line research, equip them with basic research skills (including coding skills), and introduce them to scientist role models. Mentors in such programs range from undergraduate students to faculty members. Here, I describe the founding and first two years of operation of the Harvard Science Research Mentoring Program (SRMP

In [None]:
def generate(input_text, radio):
    """Return the generated suggestions for ``input_text``, one per line.

    ``radio`` is accepted because the interface supplies two inputs, but it
    is not used; the paper recommendations returned by ``textai`` are
    likewise discarded here.
    """
    suggestions, _papers = textai(input_text)
    joined = '\n'.join(suggestions)
    return joined

# Web UI: a two-line prompt box (pre-filled with the example prompt) and a
# radio selector are passed to `generate`; its string result is shown as text.
demo = gr.Interface(
    fn=generate,
    inputs=[gr.Textbox(lines=2, value=custom_text), gr.Radio(["1","2","3"])],
    outputs="text",
    live=True,  # update the output as the inputs change
)
demo.launch()