# API for AstRoBERTa

# Environment imports

In [9]:
%%capture
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]

In [2]:
%%capture
!pip install fastapi

In [3]:
%%capture
!pip install colabcode

In [44]:
import os
from subprocess import Popen, PIPE, STDOUT
from haystack.nodes import TfidfRetriever, TransformersReader, Seq2SeqGenerator
from haystack.pipelines import ExtractiveQAPipeline, GenerativeQAPipeline
from haystack.utils import convert_files_to_docs, print_answers, clean_wiki_text
from haystack.document_stores import InMemoryDocumentStore
import sys
sys.path.append('/content/gdrive/MyDrive/Dissertation/code')
from types_m import *
from ff_m import _format_filters

# Connect to google drive

In [4]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


# Set up elasticsearch as document store

In [37]:
document_store = InMemoryDocumentStore()

In [None]:
def clean_text(txt):
    txt = clean_wiki_text(txt)
    txt = txt.replace("===", "")
    txt = txt.replace("\n\n\n", "\n")
    return txt 

The doc dir has to be redirected to a foler with cleaned txt files.

In [38]:
doc_dir = "/content/gdrive/MyDrive/Dissertation/wiki-data"
docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_text, split_paragraphs=True)

document_store.write_documents(docs)
document_store.get_document_count()

22

# Test pipeline

In [None]:
retriever = TfidfRetriever(document_store=document_store)
reader = TransformersReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=-1)
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)
question = "What is space debris?"

In [43]:
print_answers(pipeline.run(
    query=question, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
) , details="minimum")


Query: What is space debris?
Answers:
[   {   'answer': 'rocket upper stages',
        'context': 'e problem of space debris, it was learned that much debris '
                   'was due to rocket upper stages (e.g. the Inertial Upper '
                   'Stage) which end up in orbit, and break up d'},
    {   'answer': 'five centimeters or larger',
        'context': 'ite weapon test (ASAT), spreading nearly 2800 objects of '
                   'space debris five centimeters or larger into LEO. An '
                   'analysis concluded that about eighty percent of the debr'},
    {   'answer': 'defunct human-made objects in space',
        'context': 'junk, space pollution, space waste, space trash, or space '
                   'garbage) is defunct human-made objects in '
                   'space—principally in Earth orbit—which no longer serve a '
                   'useful function. T'},
    {   'answer': 'orbit around the Earth',
        'context': 'ntal forum whose aim is t

# Choose between extractive and abstractive QA

In [None]:
# Run this cell for extractive
reader = TransformersReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=-1)
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [45]:
# Run this cell for abstractive
generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")
pipeline = GenerativeQAPipeline(generator, retriever)

Downloading:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

# Create API

In [17]:
from fastapi import FastAPI, Response, status
from fastapi.testclient import TestClient
import pickle
import haystack
import time
import json
from numpy import ndarray

In [46]:
app = FastAPI()
client = TestClient(app)

@app.get('/ping')
def index():
    return {'pong'}

@app.get('/initialized', status_code=200)
def index(response: Response):
  response.status_code = status.HTTP_201_CREATED
  return "hola", 200

@app.post("/fakequery/", response_model=QueryResponse)
def create_item(query: QueryResponse):
    print("hello1")
    print(query)
    return query

@app.post("/query/", response_model=QueryResponse, response_model_exclude_none=True)
def query(request: QueryRequest):
    start_time = time.time()
    params = request.params or {}
    if "filters" in params.keys():
        params["filters"] = _format_filters(params["filters"])
    for key in params.keys():
        if "filters" in params[key].keys():
            params[key]["filters"] = _format_filters(params[key]["filters"])
    # Pipeline is used here
    result = pipeline.run(query=request.query, params=params, debug=request.debug)

    if not "documents" in result:
        result["documents"] = []
    if not "answers" in result:
        result["answers"] = []

    for document in result["documents"]:
        if isinstance(document.embedding, ndarray):
            document.embedding = document.embedding.tolist()

    return result

# Test API

In [19]:
response = client.get("/ping")
print(response.status_code)
print(response.json())

200
['pong']


In [20]:
response = client.post(
    "/fakequery/",
    # headers={"X-Token": "coneofsilence"},
    json={"query": "how are you?"},
)
print(response.status_code, response.json())

hello1
query='how are you?' answers=[] documents=[] debug=None
200 {'query': 'how are you?', 'answers': [], 'documents': [], '_debug': None}


In [48]:
response = client.post(
    "/query/",
    json={'query': 'What is space debris?', 'params': {'filters': {}, 'Retriever': {'top_k': 3}, 'Generator': {'top_k': 3}}},
)
print(response.json)
# print(response.status_code, response.json())

<bound method Response.json of <Response [200]>>


In [None]:
response.json()

# Add more API functions

In [23]:
from haystack.schema import Label
from typing import Any, Dict, List, Union, Optional

@app.post("/feedback/")
def post_feedback(feedback: Union[LabelSerialized, CreateLabelSerialized]):
    if feedback.origin is None:
        feedback.origin = "user-feedback"
    label = Label(**feedback.dict())
    document_store.write_labels([label])

@app.get("/feedback/", response_model=List[LabelSerialized])
def get_feedback():
    labels = document_store.get_all_labels()
    return labels

@app.delete("/feedback/")
def delete_feedback():
    all_labels = document_store.get_all_labels()
    user_label_ids = [label.id for label in all_labels if label.origin == "user-feedback"]
    document_store.delete_labels(ids=user_label_ids)

@app.post("/eval-feedback/")
def get_feedback_metrics(filters: FilterRequest = None):
    if filters:
        filters_content = filters.filters or {}
        filters_content["origin"] = ["user-feedback"]
    else:
        filters_content = {"origin": ["user-feedback"]}
    labels = document_store.get_all_labels(filters=filters_content)
    res: Dict[str, Optional[Union[float, int]]]
    if len(labels) > 0:
        answer_feedback = [1 if l.is_correct_answer else 0 for l in labels]
        doc_feedback = [1 if l.is_correct_document else 0 for l in labels]
        answer_accuracy = sum(answer_feedback) / len(answer_feedback)
        doc_accuracy = sum(doc_feedback) / len(doc_feedback)
        res = {"answer_accuracy": answer_accuracy, "document_accuracy": doc_accuracy, "n_feedback": len(labels)}
    else:
        res = {"answer_accuracy": None, "document_accuracy": None, "n_feedback": 0}
    return res

@app.get("/export-feedback/")
def export_feedback(context_size: int = 100_000, full_document_context: bool = True, only_positive_labels: bool = False):
    """
    This endpoint returns JSON output in the SQuAD format for question/answer pairs
    that were marked as "relevant" by user feedback through the `POST /feedback` endpoint.
    The context_size param can be used to limit response size for large documents.
    """
    if only_positive_labels:
        labels = document_store.get_all_labels(filters={"is_correct_answer": [True], "origin": ["user-feedback"]})
    else:
        labels = document_store.get_all_labels(filters={"origin": ["user-feedback"]})
        labels = [l for l in labels if not (l.is_correct_document is True and l.is_correct_answer is False)]
    export_data = []
    for label in labels:
        answer_text = label.answer.answer if label and label.answer else ""
        offset_start_in_document = 0
        if label.answer and label.answer.offsets_in_document:
            offset_start_in_document = label.answer.offsets_in_document[0].start
        if full_document_context:
            context = label.document.content
            answer_start = offset_start_in_document
        else:
            text = label.document.content
            context_to_add = int((context_size - len(answer_text)) / 2)
            start_pos = max(offset_start_in_document - context_to_add, 0)
            additional_context_at_end = max(context_to_add - offset_start_in_document, 0)
            end_pos = min(offset_start_in_document + len(answer_text) + context_to_add, len(text) - 1)
            additional_context_at_start = max(offset_start_in_document + len(answer_text) + context_to_add - len(text), 0)
            start_pos = max(0, start_pos - additional_context_at_start)
            end_pos = min(len(text) - 1, end_pos + additional_context_at_end)
            context = text[start_pos:end_pos]
            answer_start = offset_start_in_document - start_pos
        squad_label: Dict[str, Any]
        if label.is_correct_answer is False and label.is_correct_document is False:  # No answer
            squad_label = {"paragraphs": [{
                        "context": context,
                        "id": label.document.id,
                        "qas": [{"question": label.query, "id": label.id, "is_impossible": True, "answers": []}],}]}
        else:
            squad_label = {"paragraphs": [{
                        "context": context,
                        "id": label.document.id,
                        "qas": [{
                                "question": label.query,
                                "id": label.id,
                                "is_impossible": False,
                                "answers": [{"text": answer_text, "answer_start": answer_start}],}],}]}
            start = squad_label["paragraphs"][0]["qas"][0]["answers"][0]["answer_start"]
            answer = squad_label["paragraphs"][0]["qas"][0]["answers"][0]["text"]
            context = squad_label["paragraphs"][0]["context"]
            if not context[start : start + len(answer)] == answer:
                print(f"Skipping invalid squad label as string via offsets ",f"('{context[start:start + len(answer)]}') does not match answer string ('{answer}') ")
        export_data.append(squad_label)
    export = {"data": export_data}
    with open("feedback_squad_direct.json", "w", encoding="utf8") as f:
        json.dump(export_data, f, ensure_ascii=False, sort_keys=True, indent=4)
    return export

# Test new API functions

In [30]:
response = client.post("/feedback/",
    json={
      'query': 'How many sessions were planned at the Fourth European Conference on Space Debris?',
      'document': {
        'content': 'The 5th IAASS Conference was held  17 19 October in Versailles, France.  The 3 day  conference consisted of  forty 2 hour sessions  devoted to a number of  general space safety  issues including three sessions on spacecraft  reentry, four sessions on space traffic, two  sessions on space debris, one session on debris ',
        'content_type': 'text',
        'id': '9c91054b96e9267e1f3476ea6467980a',
        'meta': {
          'article_title': 'nasa_odqn_paper_295_m.txt'
        }
      },
      'is_correct_answer': True,
      'is_correct_document': True,
      'origin': 'user-feedback',
      'answer': {
        'answer': 'two',
        'type': 'extractive',
        'score': 0.4920458346605301,
        'context': ' safety  issues including three sessions on spacecraft  reentry, four sessions on space traffic, two  sessions on space debris, one session on debris ',
        'offsets_in_document': [{
          'start': 266,
          'end': 269
        }],
        'offsets_in_context': [{
          'start': 97,
          'end': 100
        }],
        'document_id': '9c91054b96e9267e1f3476ea6467980a',
        'meta': {
          'article_title': 'nasa_odqn_paper_295_m.txt'
        }
      }
    },
)
response.status_code, response.json()


(200, None)

In [25]:
response = client.get("/feedback/")
print(response.status_code)
response.json()

200


[{'id': 'd07fa428-636e-4adf-808d-c54c5d163afc',
  'query': 'How many sessions were planned at the Fourth European Conference on Space Debris?',
  'document': {'content': 'The 5th IAASS Conference was held  17 19 October in Versailles, France.  The 3 day  conference consisted of  forty 2 hour sessions  devoted to a number of  general space safety  issues including three sessions on spacecraft  reentry, four sessions on space traffic, two  sessions on space debris, one session on debris ',
   'content_type': 'text',
   'id': '9c91054b96e9267e1f3476ea6467980a',
   'meta': {'article_title': 'nasa_odqn_paper_295_m.txt'},
   'score': None,
   'embedding': None},
  'is_correct_answer': True,
  'is_correct_document': True,
  'origin': 'user-feedback',
  'answer': {'answer': 'two',
   'type': 'extractive',
   'score': 0.4920458346605301,
   'context': ' safety  issues including three sessions on spacecraft  reentry, four sessions on space traffic, two  sessions on space debris, one session on d

In [26]:
response = client.delete("/feedback/")
print(response.status_code)
response.json()

200


In [29]:
response = client.post("/eval-feedback/")
print(response.status_code)
response.json()

200


{'answer_accuracy': None, 'document_accuracy': None, 'n_feedback': 0}

In [31]:
response = client.get("/export-feedback/")
print(response.status_code)
response.json()

200


{'data': [{'paragraphs': [{'context': 'The 5th IAASS Conference was held  17 19 October in Versailles, France.  The 3 day  conference consisted of  forty 2 hour sessions  devoted to a number of  general space safety  issues including three sessions on spacecraft  reentry, four sessions on space traffic, two  sessions on space debris, one session on debris ',
     'id': '9c91054b96e9267e1f3476ea6467980a',
     'qas': [{'question': 'How many sessions were planned at the Fourth European Conference on Space Debris?',
       'id': '140f032d-968d-4556-ad51-5a2903b556b9',
       'is_impossible': False,
       'answers': [{'text': 'two', 'answer_start': 266}]}]}]}]}

# Start server

In [32]:
from pyngrok import ngrok
ngrok.set_auth_token("2Cic2NFLJHneI0vmcTJfWs46ZQU_53MiGMPf8CJkdtvYw5JAJ")



In [33]:
from colabcode import ColabCode
server = ColabCode(port=10000, code=False)

Copy the URL `Public URL: NgrokTunnel: "https://9f64-34-91-131-253.ngrok.io"` and paste it in the Heroku app as a config variable for the UI to be able to connect to this API

In [51]:
server.run_app(app=app)

Public URL: NgrokTunnel: "https://d838-34-91-131-253.ngrok.io" -> "http://localhost:10000"


INFO:     Started server process [70]
INFO:uvicorn.error:Started server process [70]
INFO:     Waiting for application startup.
INFO:uvicorn.error:Waiting for application startup.
INFO:     Application startup complete.
INFO:uvicorn.error:Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:10000 (Press CTRL+C to quit)
INFO:uvicorn.error:Uvicorn running on http://127.0.0.1:10000 (Press CTRL+C to quit)


INFO:     34.243.39.159:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     34.243.39.159:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     34.243.39.159:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     34.243.39.159:0 - "POST /query HTTP/1.1" 307 Temporary Redirect
INFO:     34.243.39.159:0 - "POST /query/ HTTP/1.1" 200 OK
INFO:     34.243.39.159:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     34.243.39.159:0 - "POST /query HTTP/1.1" 307 Temporary Redirect
INFO:     34.243.39.159:0 - "POST /query/ HTTP/1.1" 200 OK
INFO:     52.31.214.214:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     52.31.214.214:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     52.31.214.214:0 - "POST /query HTTP/1.1" 307 Temporary Redirect




INFO:     52.31.214.214:0 - "POST /query/ HTTP/1.1" 200 OK
INFO:     54.246.245.52:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     54.246.245.52:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     54.246.245.52:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     54.246.245.52:0 - "POST /query HTTP/1.1" 307 Temporary Redirect




INFO:     54.246.245.52:0 - "POST /query/ HTTP/1.1" 200 OK
INFO:     54.246.245.52:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     54.74.169.90:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     54.74.169.90:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     54.74.169.90:0 - "POST /query HTTP/1.1" 307 Temporary Redirect
INFO:     54.74.169.90:0 - "POST /query/ HTTP/1.1" 200 OK
INFO:     54.74.169.90:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     54.74.169.90:0 - "GET /initialized HTTP/1.1" 201 Created
INFO:     54.74.169.90:0 - "POST /query HTTP/1.1" 307 Temporary Redirect
INFO:     54.74.169.90:0 - "POST /query/ HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:uvicorn.error:Shutting down
INFO:     Finished server process [70]
INFO:uvicorn.error:Finished server process [70]
