## NLP Ask Docs Challenge
We work towards organizing and developing better AI/ML data infrastructure and documentation. In this challenge, our goal is to create an NLP tool and process that takes a recorded meeting, extracts the text, summarizes it, and provides keyword tagging.

We then want to be able to ask questions directly to a model that has information about the text.


First, we install the libraries we will need to run the code.

In [10]:
!pip install pdftotext python-docx==0.8.10 regex==2020.11.13 tokenizers==0.9.4 torch==1.11.0 tqdm==4.54.1 transformers==4.0.1


Collecting pdftotext
  Downloading pdftotext-2.2.2.tar.gz (113 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.9/113.9 kB 6.7 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting python-docx==0.8.10
  Using cached python_docx-0.8.10-py3-none-any.whl
Collecting regex==2020.11.13
  Using cached regex-2020.11.13-cp310-cp310-linux_x86_64.whl
Collecting tokenizers==0.9.4
  Using cached tokenizers-0.9.4.tar.gz (184 kB)
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting torch==1.11.0
  Using cached torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl (750.6 MB)
Collecting tqdm==4.54.1
  Using cached tqdm-4.54.1-py2.py3-none-any.whl (69 kB)
Collecting transformers==4.0.1
  Using c

In [11]:
import os
import argparse
import re
import docx
import math
import pdftotext
from itertools import zip_longest
from transformers import pipeline

ModuleNotFoundError: ignored

Next, we set up the NLP method:

In [None]:

class NLPBot3000:
    '''Summarize the text of a PDF or VTT file, tag entities, and answer questions.

    On construction the input file is read into ``self.text``. The text can
    then be summarized in batches (``do_nlp``), written to a Word document
    together with the extracted tags (``create_text_section``), or queried
    with a question-answering pipeline (``answer_question``).
    '''

    def __init__(self, infile_path, outfile_path="output.docx", nlp="summarization", batch_size=2700, nlp_kwargs=None):
        '''Build the pipelines and extract the text of ``infile_path``.

        Args:
            infile_path: Path of the input file; the last three characters
                select the extractor ('pdf' or 'vtt', case-insensitive).
            outfile_path: Path of the Word document written by
                ``create_text_section``.
            nlp: Task name passed to ``transformers.pipeline``.
            batch_size: Upper bound, in characters, used to size the sections
                that are fed to the pipelines in ``do_nlp``.
            nlp_kwargs: Accepted for interface compatibility; currently unused.
        '''
        print(f"Initializing {nlp} pipeline...")
        self.nlp = pipeline(nlp)
        self.infile_path = infile_path
        self.text = ''

        print("Extacting text...")
        # Compare case-insensitively so 'REPORT.PDF' is handled like 'report.pdf'.
        suffix = self.infile_path[-3:].lower()
        if suffix == 'pdf':
            self.pdf_get_text()
        elif suffix == 'vtt':
            self.vtt_get_text()
        else:
            # Previously a silent no-op; self.text stays empty in this case.
            print(f"Unsupported input file type: {self.infile_path!r}; no text extracted.")

        self.ner = pipeline('ner', grouped_entities=True)
        self.outfile_path = outfile_path
        self.batch_size = batch_size

        self.summaries = []
        self.tags = set()

    def pdf_get_text(self):
        '''Read every page of the PDF and store the joined text in ``self.text``.'''
        with open(self.infile_path, 'rb') as f:
            self.pages = pdftotext.PDF(f)
        # Pages are separated by a blank line.
        self.text = '\n\n'.join(self.pages)

    def vtt_get_text(self):
        '''Read a VTT transcript (MS Stream Transcripts) into ``self.text``.

        The first line is skipped, as are empty lines, lines containing
        'NOTE', and lines containing '-'. The remaining lines are joined
        with single spaces.
        '''
        # WebVTT files are UTF-8; do not depend on the platform default encoding.
        with open(self.infile_path, 'r', encoding='utf-8') as f:
            transcript = f.read()

        # NOTE(review): the '-' filter presumably targets timestamp ('-->') and
        # cue-id lines, but it also drops spoken lines that contain a hyphen.
        keepers = [
            line
            for line in transcript.split('\n')[1:]
            if line != '' and 'NOTE' not in line and '-' not in line
        ]
        self.text = " ".join(keepers)

    def do_nlp(self):
        '''Summarize ``self.text`` section by section and collect entity tags.

        The text is cut into roughly equal character sections no larger than
        ``self.batch_size``. Each section of at least 50 characters is
        summarized and run through the NER pipeline; failures on a section are
        printed and that section is skipped.

        Returns:
            ``self.summaries``, with one summary string appended per
            successfully processed section.
        '''
        N = len(self.text)
        if N == 0:
            # Nothing to process; the original code crashed here with a
            # zero step passed to range().
            print("No text to summarize.")
            return self.summaries

        # The +1 makes an exact multiple of batch_size spill into one more,
        # smaller, batch.
        n_batches = math.ceil((N + 1) / self.batch_size)
        # Guard against a zero step when the text is shorter than n_batches.
        batch = max(1, N // n_batches)

        for i in range(0, N, batch):
            print(i, batch+i)
            section = self.text[i:(i+batch)]
            if len(section) < 50:
                print("section too short")
                continue

            try:
                summary = self.nlp(section, min_length=90, max_length=200)
                self.summaries.append(summary[0]['summary_text'])
                self.tags.update(x['word'] for x in self.ner(section))
            except Exception as e:
                # Best effort: report the failure and move on to the next section.
                print(f"\nFAILURE: {e}")
                continue
        return self.summaries

    def clean_summaries(self):
        '''Join the summaries into one text with capitalized sentences.

        The summaries are joined with newlines, split on " . ", each piece has
        its first character upper-cased, and the pieces are re-joined with
        ". ". Empty pieces (including the no-summaries case) are kept as-is
        instead of raising ``IndexError``.

        Returns:
            The cleaned text, also stored in ``self.final_text``.
        '''
        sentences = "\n".join(self.summaries).split(" . ")
        # Slicing with [:1] is safe on empty strings, unlike indexing with [0].
        self.final_text = ". ".join(s[:1].upper() + s[1:] for s in sentences)
        return self.final_text

    def create_text_section(self, title="Test Title"):
        '''Write the summary and extracted tags to the Word document.

        Opens ``self.outfile_path`` if it can be loaded, otherwise starts a new
        document. Adds ``title`` as a level-1 heading, the cleaned summary, an
        "Extracted Tags" level-2 heading, and the comma-separated tags, then
        saves the document. Errors while summarizing or saving are printed,
        not raised.
        '''
        print("Creating document.")
        try:
            self.doc = docx.Document(self.outfile_path)
        except Exception:
            # Missing or unreadable file: fall back to a fresh document
            # without also swallowing KeyboardInterrupt/SystemExit.
            self.doc = docx.Document()

        self.doc.add_heading(title, 1)

        try:
            self.summaries = self.do_nlp()
            self.final_text = self.clean_summaries()
            self.doc.add_paragraph(self.final_text)
            self.doc.add_heading("Extracted Tags", 2)
            # Sort so the tag list is deterministic across runs.
            self.doc.add_paragraph(", ".join(sorted(self.tags)))
            self.doc.save(self.outfile_path)

        except Exception as e:
            print(f"\n\nEXCEPTION: {e}\n\n")

    def answer_question(self, question):
        '''Answer ``question`` using ``self.text`` as the context.

        The question-answering pipeline is created on first use and reused on
        later calls.

        Returns:
            The pipeline's result, or ``None`` if building or running the
            pipeline raised (the error is printed).
        '''
        # Bound up front so a failure below cannot cause UnboundLocalError.
        answer = None
        try:
            nlp_qa = getattr(self, '_qa_pipeline', None)
            if nlp_qa is None:
                nlp_qa = pipeline('question-answering')
                self._qa_pipeline = nlp_qa
            answer = nlp_qa(context=self.text, question=question)

        except Exception as e:
            print(f"\n\nEXCEPTION: {e}\n\n")

        return answer

if __name__ == '__main__':
    # Command-line entry point: summarize the input file and write the
    # summary plus extracted tags to a Word document.
    parser = argparse.ArgumentParser(
        description="Summarize a PDF or VTT transcript into a Word document with extracted tags."
    )
    parser.add_argument('infile_path', type=str, help="Full path of the input .pdf or .vtt file")

    # Optional so the command line matches the class default for outfile_path.
    parser.add_argument(
        'outfile_path',
        type=str,
        nargs='?',
        default="output.docx",
        help="Full path of the Word document to write (default: output.docx)",
    )

    args = parser.parse_args()
    wp = NLPBot3000(args.infile_path, args.outfile_path)

    wp.create_text_section()
