**Extracting Business Insights Using Generative AI**
----
This project uses several large language models (LLMs) to extract business insights from scholarly articles.

The focus of this project is to build a hybrid pipeline that handles a variety of tasks by combining large language models with different performance characteristics. In addition, prompt engineering is used to craft the prompts that get the best performance out of these models.

----

In [None]:
# Importing libraries

import replicate
import re

In [None]:
# Function defined for dividing articles into sections

def is_correct_title(title):
    """Return True when `title` contains at least one ASCII letter (a-z or A-Z).

    Used to reject heading candidates that consist only of digits,
    punctuation and whitespace.
    """
    return any(
        ('a' <= ch <= 'z') or ('A' <= ch <= 'Z')
        for ch in title
    )

def divide_article_into_sections(article):
    """Split the plain text of an article into its numbered sections.

    A line is accepted as a section heading when it
      * matches "<number>.<text>" and ends with a newline,
      * does not contain '%' (e.g. "2.5% ..." is not a heading),
      * starts at the beginning of a line (or of the article),
      * has a main number equal to, or one greater than, the previously
        accepted heading's number (the first one must be 0 or 1), and
      * contains at least one letter (see `is_correct_title`).

    Args:
        article: Full article text as a single string.

    Returns:
        dict mapping a cleaned heading (digits and punctuation removed,
        lower-cased, stripped) to the stripped text of that section. If one
        of the abstract markers occurs in the article, the first entry is
        'abstract', holding the text between the marker and the first
        accepted heading (or the end of the article when there is none).
        Headings that clean to the same key overwrite each other; the last
        one wins.
    """
    # Candidate headings: "<number>.<text>" up to and including the newline.
    heading_pattern = re.compile(r'\d+\..+?\n')

    # Accepted headings as (title, title_start, body_start), in document order.
    # Positions come from the regex match itself; searching the article again
    # for the title text would return its FIRST occurrence, which may be a
    # citation or a table-of-contents entry rather than this heading.
    headings = []
    previous_number = 0  # sentinel: the first heading must be numbered 0 or 1
    for candidate in heading_pattern.finditer(article):
        title = candidate.group()
        title_start = candidate.start()

        if '%' in title:
            continue
        # Must start a line; offset 0 counts as a line start.
        if title_start > 0 and article[title_start - 1] != '\n':
            continue
        # Main section number (the digits before the first dot).
        number = int(re.match(r'\d+', title).group())
        if number != previous_number and number != previous_number + 1:
            continue
        if not is_correct_title(title):
            continue

        headings.append((title, title_start, candidate.end()))
        previous_number = number

    # Each section runs from the end of its heading to the start of the next
    # heading; the last section runs to the end of the article.
    section_ends = [start for _, start, _ in headings[1:]] + [len(article)]
    sections = {}
    for (title, _, body_start), body_end in zip(headings, section_ends):
        sections[title.strip()] = article[body_start:body_end].strip()

    # Prepend the abstract, using the first marker (in this order) that occurs.
    for marker in ("abstract", "Abstract", "a b s t r a c t", "A B S T R A C T"):
        marker_pos = article.find(marker)
        if marker_pos == -1:
            continue
        abstract_end = headings[0][1] if headings else len(article)
        abstract_text = article[marker_pos + len(marker):abstract_end]
        sections = {'abstract': abstract_text, **sections}
        break

    # Normalise the keys: drop digits and punctuation, lower-case, strip.
    key_noise = re.compile(r'[0-9.,!?:;\-—()]')
    cleaned_sections_dict = {}
    for key, value in sections.items():
        cleaned_key = key_noise.sub('', key).lower().strip()
        cleaned_sections_dict[cleaned_key] = value

    return cleaned_sections_dict

In [None]:
# Functions defined for extracting pdf documents and converting them into clean text

from pdfminer.high_level import extract_pages 
from pdfminer.layout import LTTextContainer, LTChar, LTAnno
import fitz 

def clean_text(text):
    """
    Return the text of one extracted block with page-number lines removed.

    `text` is indexed, not treated as a string: element 4 is taken as the
    block's text content. Lines consisting only of digits are dropped; all
    other lines are kept and re-joined with newlines.
    This function may need to be modified based on the specific 'noises' you encounter in your PDFs.
    """
    block_lines = text[4].split('\n')
    # A line made up solely of digits is treated as a page number.
    return '\n'.join(line for line in block_lines if not line.isdigit())

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file and clean it.

    Every page is read block by block (`page.get_text("blocks")`), each block
    is passed through `clean_text`, and the cleaned pieces are concatenated
    in page/block order.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The cleaned text of the whole document as one string.
    """
    doc = fitz.open(pdf_path)
    try:
        cleaned_blocks = []
        for page in doc:
            for block in page.get_text("blocks"):
                cleaned_blocks.append(clean_text(block))
    finally:
        # Release the file handle even if extraction or cleaning fails.
        doc.close()
    return "".join(cleaned_blocks)

def extract_pdf_and_divide_sections(path):
    """Extract the text of the PDF at `path` and return it split into sections.

    Chains `extract_text_from_pdf` and `divide_article_into_sections`.
    """
    return divide_article_into_sections(extract_text_from_pdf(path))

In [None]:
# Function defined for sending prompt to LLaMA 2 70B model using Replicate's API Service

def send_prompt(prompt, sys_prompt):
    """Send a prompt to the LLaMA 2 70B chat model on Replicate and return its reply.

    The API token is read from the REPLICATE_API_TOKEN environment variable
    instead of being embedded in the source. A token that has been committed
    to a notebook should be treated as leaked and revoked.

    Args:
        prompt: User prompt text.
        sys_prompt: System prompt describing the task.

    Returns:
        The model's response, assembled from the pieces yielded by the API.

    Raises:
        RuntimeError: If REPLICATE_API_TOKEN is not set.
    """
    import os  # local import so this cell does not depend on other cells' imports

    api_token = os.environ.get("REPLICATE_API_TOKEN")
    if not api_token:
        raise RuntimeError(
            "REPLICATE_API_TOKEN is not set; export your Replicate API token "
            "before calling send_prompt()."
        )

    rp_client = replicate.Client(api_token=api_token)
    output = rp_client.run(
        "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
        input={
            "debug": False,
            "top_k": 50,
            "top_p": 1,
            "prompt": prompt,
            "temperature": 0.75,
            "system_prompt": sys_prompt,
            "max_new_tokens": 1000,
            "min_new_tokens": -1,
        },
    )
    # The output is iterated piece by piece and concatenated; see
    # https://replicate.com/meta/llama-2-70b-chat/versions/02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3/api#output-schema
    return "".join(output)

In [None]:
# Functions defined for processes within our pipeline that send prompt to LLaMA 2 70B model

def summarize(section_name, section_text):
    """Ask the model for a concise summary of one article section.

    The prompt is "<section_name>: <section_text>"; returns whatever
    `send_prompt` returns.
    """
    system_prompt = 'You are a tool that summarizes the given text. The given text is a section of an article. Give a concise summary of the section text to include only the most important information.'
    return send_prompt(section_name + ": " + section_text, system_prompt)

def extract_insights(input):
    """Ask the model for bullet-point insights about the article.

    `input` is sent unchanged as the prompt; returns whatever `send_prompt`
    returns.
    """
    system_prompt = 'You are a tool that extracts key insights from an article. You will be provided with article sections. As an output, you should provide concise insights about the given article in bulletpoints.'
    return send_prompt(input, system_prompt)

def generate_title(insights):
    """Ask the model for a title that fits the extracted insights.

    Args:
        insights: Insight text produced for the article.

    Returns:
        The model's reply, as returned by `send_prompt`.
    """
    find_title_sys_prompt = "From the given insights, provide a title."
    # Put the "Title: " cue on its own line; previously it was glued directly
    # onto the last character of the insights.
    prompt = "Extracted insights: " + insights + "\nTitle: "
    return send_prompt(prompt, find_title_sys_prompt)

def choose_images(insights, image_titles):
    """Ask the model to pick the three most important figures/tables.

    Args:
        insights: Insight text produced for the article.
        image_titles: Figure/table titles, one per line.

    Returns:
        The model's reply, as returned by `send_prompt`.
    """
    choose_images_sys_prompt = "Given the image title, choose the most important 3 images of the article based on the insights extracted from the article."
    # Each field starts on its own line so the insights do not run into the
    # "Image titles:" label, and the final cue names images (the task given in
    # the system prompt) rather than sections.
    prompt = (
        "Extracted insights: " + insights
        + "\nImage titles: " + image_titles
        + "\nImportant images: "
    )
    return send_prompt(prompt, choose_images_sys_prompt)

In [None]:
# Getting and preprocessing PDF input

# Absolute, machine-specific path to the source PDF; point it at a local copy before running.
business_pdf1_path = "/Users/selinceydeli/Desktop/AIResearch/business-article-inputs/1-s2.0-S0148296323004216-main.pdf"
# Extract the PDF text and split it into a {cleaned section title: section text} dict.
# NOTE: the clean-text input cell that follows also assigns `sections_dict`,
# so whichever of the two cells ran last determines the pipeline input.
sections_dict = extract_pdf_and_divide_sections(business_pdf1_path)

In [None]:
# Getting and preprocessing clean text input

# Absolute, machine-specific path to a pre-cleaned text version of the article.
business_txt_path = "/Users/selinceydeli/Desktop/AIResearch/llm_dev/summarization_pipeline/bus_article1.txt"
# Read with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open(business_txt_path, 'r', encoding='utf-8') as file:
    article = file.read()
sections_dict = divide_article_into_sections(article)

In [None]:
# Extracting section texts of important sections 

abstract = sections_dict.get('abstract', "")

critical_sections = ["introduction", "conclusion", "discussion", "methodology"]

# Text of each critical section; "" when the article has no section by that name.
critical_section_information = {
    section_name: sections_dict.get(section_name, "")
    for section_name in critical_sections
}

# Fallback: if at least two of "conclusion", "discussion" and "methodology"
# (the last three critical sections) are missing, take the last four non-empty
# sections of the article instead (each subsection is kept separately in
# sections_dict), excluding the keywords, acknowledgments and references
# sections.
missing_count = sum(
    1
    for section_name in critical_sections[-3:]
    if critical_section_information[section_name] == ""
)

if missing_count >= 2:
    accepted = 0
    unwanted_sections = {"keywords", "acknowledgments", "acknowledgements", "references"}
    # Walk the sections from the end of the article backwards. The keys of
    # sections_dict are already lower-cased by divide_article_into_sections,
    # so they can be compared and reused as they are.
    for section_name, section_text in reversed(list(sections_dict.items())):
        if section_name in unwanted_sections or section_text == "":
            continue
        critical_section_information[section_name] = section_text
        accepted += 1
        if accepted >= 4:
            break

In [None]:
# Summarizing important sections
    
# Sections that are never sent for summarization; they are recorded as None.
sections_without_summary = ("introduction", "managerial implications")

summarized_sections = {}
for section_name, section_text in critical_section_information.items():
    # Missing (empty) sections are recorded as None as well.
    if section_text == "" or section_name in sections_without_summary:
        summarized_sections[section_name] = None
        continue
    summary = summarize(section_name, section_text)
    summarized_sections[section_name] = summary
    print("Summary of " + section_name + ": \n" + summary)

In [None]:
# Extracting insights from the article

def create_section_input(summarized_sections):
    """Format section summaries as one "name: summary" line per section.

    Entries whose value is None (sections that were not summarized) are left
    out, so the literal text "None" never reaches the model prompt.

    Args:
        summarized_sections: dict mapping section name to its summary, or to
            None when there is no summary for that section.

    Returns:
        A string with one "<name>: <summary> \\n" line per summarized section,
        in dict order; "" when nothing was summarized.
    """
    return "".join(
        f"{key}: {value} \n"
        for key, value in summarized_sections.items()
        if value is not None
    )

# Flatten the per-section summaries into a single prompt string, then ask the
# model for bullet-point insights about the article.
section_input = create_section_input(summarized_sections)
insights = extract_insights(section_input)
print("Extracted insights:\n" + insights)

In [None]:
# Generating a meaningful title to be presented as the chat title in the interface

# `insights` is the model output produced by the insight-extraction cell.
title = generate_title(insights)
print(title)

In [None]:
# Choosing the most important figures/tables

def capture_image_titles(extracted_text):
    """Collect figure and table titles from the extracted article text.

    Recognised forms:
      * figures: "Fig." or "Figure", a number, a dot, then the title up to
        the next dot;
      * tables: "Table", a number, optional whitespace/newline, then the
        title up to the next dot.
    Only titles whose first character is an uppercase ASCII letter are kept;
    empty titles are skipped.

    Args:
        extracted_text: Article text to search.

    Returns:
        List of "<type> <number>. <title>" strings, all figures first and
        then all tables, each group in order of appearance.
    """
    # Raw strings so the regex escapes (\s, \d, \.) are not read as string escapes.
    figure_title_pattern = r"(Fig\.|Figure)\s*(\d+)\.\s*(.*?)\."
    table_title_pattern = r"(Table)\s*(\d+)\s*\n?\s*(.*?)\."

    titles = []
    for title_pattern in (figure_title_pattern, table_title_pattern):
        for title_type, title_number, title_text in re.findall(title_pattern, extracted_text):
            # Slicing instead of indexing: an empty title yields "" and is
            # rejected by the comparison instead of raising IndexError.
            if 'A' <= title_text[:1] <= 'Z':
                titles.append(f"{title_type} {title_number}. {title_text}")

    return titles

def extract_pdf(path):
    """Return the cleaned text of the PDF at `path` (delegates to `extract_text_from_pdf`)."""
    return extract_text_from_pdf(path)

extracted_pdf = extract_pdf(business_pdf1_path)
titles = capture_image_titles(extracted_pdf)
# One title per line. Built with join rather than a `for title in titles`
# loop so the module-level `title` (the generated article title) is not
# overwritten by a loop variable.
image_titles = "".join(image_title + "\n" for image_title in titles)
important_images = choose_images(insights, image_titles)
print(important_images)

In [None]:
import fitz
from PIL import Image
import os
import re

def extract_titles_from_page(page):
    """Collect figure and table titles from the text blocks of one PDF page.

    Each block returned by `page.get_text("blocks")` is searched separately;
    element 4 of a block is used as its text. Recognised forms:
      * figures: "Fig." or "Figure", a number, a dot, then the title up to
        the next dot;
      * tables: "Table", a number, optional whitespace/newline, then the
        title up to the next dot.
    A title is kept only when it is non-empty and starts with an uppercase
    ASCII letter or a digit.

    Args:
        page: Page object providing `get_text("blocks")`.

    Returns:
        List of "<type> <number>. <title>" strings in block order; within a
        block, figures come before tables.
    """
    # Raw strings so the regex escapes (\s, \d, \.) are not read as string escapes.
    figure_title_pattern = r"(Fig\.|Figure)\s*(\d+)\.\s*(.*?)\."
    table_title_pattern = r"(Table)\s*(\d+)\s*\n?\s*(.*?)\."
    # Compiled once per call instead of once per block.
    caption_patterns = (
        re.compile(figure_title_pattern),
        re.compile(table_title_pattern),
    )

    titles = []
    for block in page.get_text("blocks"):
        block_text = block[4]
        for caption_pattern in caption_patterns:
            for title_type, title_number, title_text in caption_pattern.findall(block_text):
                # "" for an empty title, which fails both range checks below.
                first_char = title_text[:1]
                if 'A' <= first_char <= 'Z' or '0' <= first_char <= '9':
                    titles.append(f"{title_type} {title_number}. {title_text}")

    return titles

# Open the file
pdf_file = fitz.open(business_pdf1_path)

titles = []

# Collect the figure/table titles of every page, tagged with the 1-based page
# number. The document is closed even if title extraction fails.
try:
    for page_number, page in enumerate(pdf_file, start=1):
        for image_title in extract_titles_from_page(page):
            titles.append(f"{image_title} (Page:{page_number})")
finally:
    pdf_file.close()

# One title per line. Built with join rather than a `for title in titles`
# loop so the module-level `title` (the generated article title) is not
# overwritten by a loop variable.
image_titles = "".join(image_title + "\n" for image_title in titles)
important_images = choose_images(insights, image_titles)
print(important_images)