In [None]:
!nvidia-smi
!pip install git+https://github.com/LIAAD/yake
!pip install grpcio-tools==1.34.1
!pip install --upgrade git+https://github.com/deepset-ai/haystack.git

In [None]:
# ---------------------------------------------------------------------------
# Imports (grouped in one place; same modules and names as before)
# ---------------------------------------------------------------------------
from pprint import pprint

from haystack.utils import print_answers
from haystack.generator.transformers import Seq2SeqGenerator
from haystack.pipeline import (
  ExtractiveQAPipeline,
  SearchSummarizationPipeline,
  GenerativeQAPipeline,
)
from haystack.reader.farm import FARMReader
from haystack.document_store import FAISSDocumentStore
from haystack.document_classifier import TransformersDocumentClassifier
from haystack import Pipeline
from haystack.summarizer import TransformersSummarizer
from haystack.pipelines import (
  QuestionGenerationPipeline,
  RetrieverQuestionGenerationPipeline,
  QuestionAnswerGenerationPipeline,
)
from haystack.question_generator import QuestionGenerator

import yake

# Currently disabled imports, kept for reference:
#from haystack.preprocessor.cleaning import clean_wiki_text
#from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
#from haystack.reader.transformers import TransformersReader

# ---------------------------------------------------------------------------
# Components shared by the cells below (created in the original order)
# ---------------------------------------------------------------------------

# Base reader used by the extractive QA and question/answer generation pipelines
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

# FAISS-backed document store that holds the reviews
document_store = FAISSDocumentStore(
  faiss_index_factory_str="Flat",
  return_embedding=True,
)

# Document classifier (sentiment model trained on IMDB, per the model name)
doc_classifier_model = 'textattack/bert-base-uncased-imdb'
doc_classifier = TransformersDocumentClassifier(
  model_name_or_path=doc_classifier_model,
)

# Question generator
question_generator = QuestionGenerator(use_gpu=True)

# Long-form QA (LFQA) generator
generator = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5")

In [None]:
import json
import pandas as pd

# Read the review dump. JSON text is UTF-8, so the encoding is stated explicitly
# instead of relying on the platform default.
with open('B092YHJGMN.json', encoding='utf-8') as json_file:
  data = json.load(json_file)

# One document dict per review: the review text is the content and the
# reviewer's name is stored under meta["name"].
doc_list = [
  {"content": review['reviewText'], "meta": {"name": review['reviewerName']}}
  for review in data['reviews']
]

n = len(doc_list)
print("Total Reviews: ", n)

# Single text blob of all reviews, used later for keyword extraction.
# Reviews are joined with a newline so that the last word of one review is not
# glued to the first word of the next one; double quotes are removed afterwards.
reviews_text = "\n".join(str(review['reviewText']) for review in data['reviews'])
reviews_text = reviews_text.replace('"', '')

document_store.write_documents(doc_list)

#Load DPR
from haystack.retriever.dense import DensePassageRetriever
# NOTE(review): embed_title=True presumably embeds meta["name"] as the passage
# title, which here is the reviewer's name - confirm that this is intended.
retriever = DensePassageRetriever(document_store=document_store,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                  max_seq_len_query=64,
                                  max_seq_len_passage=256,
                                  batch_size=16,
                                  use_gpu=True,
                                  embed_title=True,
                                  use_fast_tokenizers=True)
# Important:
# Now that the DPR retriever is initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
document_store.update_embeddings(retriever)

In [None]:
# Run YAKE over the concatenated review text (extractor settings unchanged).
kw_extractor = yake.KeywordExtractor(
  lan="en",
  n=3,
  dedupLim=0.2,
  dedupFunc='seqm',
  windowsSize=1,
  top=30,
)
keywords = kw_extractor.extract_keywords(reviews_text)

# Keep only the first element of every extracted item (presumably the keyword
# phrase of a (phrase, score) pair). The loop variable stays named `keyword`
# at module level because notebook cells share one namespace.
kw_list = []
for keyword in keywords:
  kw_list += [keyword[0]]

print(kw_list)

# Question / keyword pair for the remaining cells.
# Alternatives tried: "Can i play wow?" / "play wow", "Can i upgrade up to 32gb of ram?" / "upgrade ram"
ques = "How long is the battery life?"
kw_old = "battery life"

In [None]:
#QA Generator

# The most basic version of a question generator pipeline takes a document as input and outputs generated
# questions which the document can answer.
qg_pipeline = QuestionGenerationPipeline(question_generator)
for document in document_store:
  result_qg = qg_pipeline.run(documents=[document])
  print(result_qg['generated_questions'][0]['questions'])

# Parallel lists: output_kw[k] is the keyword that produced output_QAgen[k].
output_kw = []
output_QAgen = []
# This pipeline takes a query as input. It retrieves relevant documents and then generates questions based on these.
rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)
for kw in kw_list:
  print(kw)
  result_rqg = rqg_pipeline.run(query=kw)
  # Only the questions of the first entry in 'generated_questions' are collected.
  for generated_question in result_rqg['generated_questions'][0]['questions']:
    pprint(generated_question)
    output_kw.append(kw)
    output_QAgen.append(generated_question)

# This pipeline takes a document as input, generates questions on it, and attempts to answer these questions using
# a Reader model
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)
for document in document_store:
  result_qag = qag_pipeline.run(documents=[document])
  # Keys observed on result_qag in an earlier run:
  # 'results', 'documents', 'query_doc_list', 'root_node', 'params', 'node_id'
  # (this used to be pasted in as a bare dict_keys(...) call, which raised NameError).
  pprint(result_qag['results'])

# LFQA - GenerativeQAPipeline combines a retriever and a reader/generator to answer our questions.
gqa_pipeline = GenerativeQAPipeline(generator, retriever)
result_gqa = gqa_pipeline.run(query=ques, params={"Retriever": {"top_k": 50}})
pprint(result_gqa['answers'][0])

In [None]:
# Write the keyword / generated-question pairs to QA_Gen.csv, one pair per row
# and without the index column.
df = pd.DataFrame({
  'Keywords Generated': output_kw,
  'Questions Generated': output_QAgen,
})
df.to_csv("QA_Gen.csv", index=False)

In [None]:
#classifier
# Retrieve the documents closest to the question and let the classifier label each one.
pipeline = Pipeline()
pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipeline.add_node(component=doc_classifier, name='Classifier', inputs=['Retriever'])
# `ques` is the question string defined next to the keyword list; no variable
# called `question` exists in this notebook.
result_classifier = pipeline.run(query=ques, params={"Retriever": {"top_k": 50}})
documents = result_classifier['documents']
dictsagain = []


def _texts_with_label(docs, label):
  """Return the text of every document in `docs` whose classification label equals `label`.

  NOTE(review): reads `doc.text`, although the documents were written with a
  "content" key - confirm the attribute name for the installed Haystack version.
  """
  return [doc.text for doc in docs if doc.meta['classification']['label'] == label]


def _print_numbered(texts):
  """Print each text on its own line as '1) ...', '2) ...', and so on."""
  for position, text in enumerate(texts, start=1):
    print(str(position) + ") " + text)


# 'LABEL_1' for a positive sentiment, 'LABEL_0' for a negative one
classifier_positive = _texts_with_label(documents, 'LABEL_1')
_print_numbered(classifier_positive)

classifier_negative = _texts_with_label(documents, 'LABEL_0')
_print_numbered(classifier_negative)

In [None]:
#summarizer
# Retrieve documents for the keyword and condense them into a single summary
# (generate_single_summary=True).
summarizer = TransformersSummarizer(model_name_or_path='t5-large', min_length=10, max_length=300,
                                    generate_single_summary=True)
pipeline = SearchSummarizationPipeline(retriever=retriever, summarizer=summarizer)
# `kw_old` is the plain keyword string that belongs to the current question.
# The YAKE loop variable `keyword` must not be used as the query: after that
# loop it holds the last extracted item, not a query string.
result_summarizer = pipeline.run(query=kw_old, params={"Retriever": {"top_k": 50}})
# Single-element lists so they can become columns of the result table later on.
sumarizer_result = [result_summarizer['documents'][0].text]
sumarizer_result_context = [result_summarizer['documents'][0].meta['context']]

In [None]:
# Extractive QA: the retriever selects candidate documents and the reader
# extracts answer spans from them.
pipe = ExtractiveQAPipeline(reader, retriever)

# You can configure how many candidates the reader and retriever shall return
# The higher top_k for retriever, the better (but also the slower) your answers.
# `ques` is the question string defined next to the keyword list; no variable
# called `question` exists in this notebook.
prediction = pipe.run(query=ques, params={"Retriever": {"top_k": 50}, "Reader": {"top_k": 50}})


In [None]:
# Collect every extracted answer together with the context it was found in.
reader_results = []
reader_results_context = []
for answer in prediction['answers']:
  print(answer['answer'])
  reader_results.append(answer['answer'])
  print(answer['context'])
  reader_results_context.append(answer['context'])

# Empty row filling: the result lists have different lengths, but a DataFrame
# needs equally long columns. Padding happens once, after all answers have been
# collected, so real answers are never separated by filler rows.
result_lists = [classifier_positive, classifier_negative, sumarizer_result,
                sumarizer_result_context, reader_results, reader_results_context]
maxlength = max(len(values) for values in result_lists)


def fillarray(array):
  """Pad `array` in place with empty strings until it has `maxlength` items.

  `maxlength` is read from module scope. Lists that are already long enough
  are left untouched.
  """
  array.extend([""] * (maxlength - len(array)))


for values in result_lists:
  fillarray(values)

# One column per result source; the scalar question is repeated on every row.
# `ques` is the question string defined next to the keyword list.
result_columns = {'question' : ques,
                  'answer_classifier_positive' : classifier_positive, 'answer_classifier_negative' : classifier_negative,
                  'answer_summarizer': sumarizer_result, 'answer_sumarizer_context': sumarizer_result_context,
                  'answer_reader' : reader_results, 'answer_reader_context': reader_results_context}
df = pd.DataFrame(result_columns)
df.to_csv('Haystack_result.csv')