In [1]:
from PyPDF2 import PdfReader
from langchain.vectorstores import FAISS,faiss
from langchain.embeddings import SentenceTransformerEmbeddings,HuggingFaceInstructEmbeddings,HuggingFaceHubEmbeddings,OpenAIEmbeddings,HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain.chains import QAWithSourcesChain,RetrievalQA
from langchain.docstore.document import Document
from pydantic import BaseModel
from fastapi import FastAPI, File, UploadFile
from tqdm import tqdm
from openai import OpenAI
from PDF_Read1 import create_index1,uploaded_docs,load_model,new_model,prepare_data
from summary_functions import prepare_data_summarize,get_page_summary
import pandas as pd
import numpy as np
import time
import os
import openai
import json
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Read the OpenAI API key from the environment; this is None when the variable is unset.
OPEN_API_KEY = os.environ.get("OPENAI_API_KEY")

  embedding = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2", model_kwargs = {'device': 'cpu'})
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# System message sent with every chat-completion request in this notebook.
sys_prompt = (
    "You are a research analyst and you are expert in reading the text of the "
    "document and summarize it in to 300 words."
)

# OpenAI client authenticated with the key read from the environment.
client = OpenAI(api_key=OPEN_API_KEY)

### User provides a PDF file, which is summarized page by page

In [3]:
# Load the PDF. The second return value gets its own name instead of `len`:
# rebinding `len` would shadow the builtin for the rest of the kernel session,
# and a later cell passes `len` as `length_function` to the text splitter.
# NOTE(review): the second value is presumably a length / page count — confirm
# against uploaded_docs in PDF_Read1.
my_pdf, my_pdf_len = uploaded_docs("./LLM_Fine_Tuned_News.pdf")

In [4]:
# Inspect what uploaded_docs returned (the output below starts as a list of tuples holding page text).
my_pdf

[(' \nEnhancing LLM with Evolutionary Fine -Tuning for News \nSummary Generation  \nLe Xiao, Xiaolin Chen  \nCollege of Information Science and Engineering, Henan University of Technology, \nZhengzhou, China  \nxiaole@haut.edu.cn chenxiaolin @stu.haut.edu.cn  \nAbstract  \nNew s summary generation is an \nimportant task in the field of intelligence \nanalysis, which can provide accurate and \ncomprehensive information to help \npeople better understand and respond to \ncomplex real -world events. However, \ntraditional news summary generatio n \nmethods face some challenges, which are \nlimited by the model itself and the \namount of training data, as well as the \ninfluence of text noise, making it difficult \nto generate reliable information \naccurately. In this paper, we propose a \nnew paradigm for new s summary \ngeneration using LLM  with powerful \nnatural language understanding and \ngenerative capabilities. We use LLM  to \nextract multiple structured event patterns \nfrom th

In [5]:
# Turn the extracted PDF content into Document objects.
# NOTE(review): appears to yield one Document per page, with the page number as 'source' — confirm.
docs = prepare_data_summarize(my_pdf)

100%|██████████| 12/12 [00:00<00:00, 11876.27it/s]


In [6]:
# Show the Document list produced by prepare_data_summarize.
docs

[Document(metadata={'source': '1'}, page_content=' \nEnhancing LLM with Evolutionary Fine -Tuning for News \nSummary Generation  \nLe Xiao, Xiaolin Chen  \nCollege of Information Science and Engineering, Henan University of Technology, \nZhengzhou, China  \nxiaole@haut.edu.cn chenxiaolin @stu.haut.edu.cn  \nAbstract  \nNew s summary generation is an \nimportant task in the field of intelligence \nanalysis, which can provide accurate and \ncomprehensive information to help \npeople better understand and respond to \ncomplex real -world events. However, \ntraditional news summary generatio n \nmethods face some challenges, which are \nlimited by the model itself and the \namount of training data, as well as the \ninfluence of text noise, making it difficult \nto generate reliable information \naccurately. In this paper, we propose a \nnew paradigm for new s summary \ngeneration using LLM  with powerful \nnatural language understanding and \ngenerative capabilities. We use LLM  to \nextra

In [7]:
# Map each document's 'source' metadata value to that document's text.
data_dict = {doc.metadata['source']: doc.page_content for doc in docs}
data_dict


{'1': ' \nEnhancing LLM with Evolutionary Fine -Tuning for News \nSummary Generation  \nLe Xiao, Xiaolin Chen  \nCollege of Information Science and Engineering, Henan University of Technology, \nZhengzhou, China  \nxiaole@haut.edu.cn chenxiaolin @stu.haut.edu.cn  \nAbstract  \nNew s summary generation is an \nimportant task in the field of intelligence \nanalysis, which can provide accurate and \ncomprehensive information to help \npeople better understand and respond to \ncomplex real -world events. However, \ntraditional news summary generatio n \nmethods face some challenges, which are \nlimited by the model itself and the \namount of training data, as well as the \ninfluence of text noise, making it difficult \nto generate reliable information \naccurately. In this paper, we propose a \nnew paradigm for new s summary \ngeneration using LLM  with powerful \nnatural language understanding and \ngenerative capabilities. We use LLM  to \nextract multiple structured event patterns \nfro

In [18]:
# Summarize every entry of data_dict with its own chat-completion request;
# the results are stored under the same keys as data_dict.
page_summaries = {}
for page_key, page_text in data_dict.items():
    # The leading spaces and the trailing space around the interpolated text
    # are part of the prompt and are kept exactly as they were.
    user_message = (
        "You are provided with the PDF text:\n"
        f"     content: {page_text} \n"
        "     Your task is to summarize the provided document in 300 words"
    )
    chat_messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_message},
    ]
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
    )
    # Keep only the text of the first returned choice.
    page_summaries[page_key] = response.choices[0].message.content

In [19]:
# Show the per-page summaries, keyed by the same keys as data_dict.
page_summaries

{'1': 'The document discusses the enhancement of news summary generation through the utilization of Large Language Models (LLM) with evolutionary fine-tuning. The traditional methods of news summary generation face challenges due to model limitations, training data constraints, and text noise. The proposed approach involves using LLM to extract structured event patterns from news paragraphs, evolving them with a genetic algorithm, and selecting the most suitable event pattern to generate news summaries. A News Summary Generator (NSG) is designed for this purpose to select and evolve event pattern populations, resulting in accurate and reliable news summaries with some level of generalization ability.\n\nPre-trained language models, particularly those based on the Transformer architecture, have seen significant advancements, enabling them to process natural language and learn statistical patterns from large-scale textual data. LLMs have shown improvements in various natural language pro

In [20]:
# Ask the model to merge the per-page summaries into one overall summary.
# The dict is interpolated via its string form; prompt whitespace is kept exactly.
collation_prompt = (
    "You are provided with the dictionary with key as the page numbers and values are the summaries of the respective page:\n"
    f"     content: {page_summaries} \n"
    "     Your task is to combine all the summaries and provide a collated final summary of the document in 300 words"
)
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": collation_prompt},
    ],
)
# Text of the first returned choice is the final summary.
Final_summary = response.choices[0].message.content

In [21]:
# Show the collated summary text returned by the model.
Final_summary

'The document explores the utilization of Large Language Models (LLMs) with evolutionary fine-tuning to enhance news summary generation. Traditional methods face challenges such as model limitations and text noise. LLMs, particularly based on Transformer architecture, have advanced in natural language processing tasks, excelling in information extraction. News summary generation is crucial for conveying essential event topics concisely, aiding in intelligence analysis and decision-making. The proposed approach involves extracting event patterns from news paragraphs using LLMs and genetic algorithms to generate accurate and reliable news summaries.\n\nEvent extraction involves trigger identification and event argument identification/classification, supporting applications like information extraction and knowledge graph construction. Genetic algorithms are introduced for optimizing event patterns to create news headlines and summarizations efficiently. Evaluations include ROUGE and BLEU 

### User provides text manually, which is then summarized

In [3]:
data="""Enhancing LLM with Evolutionary Fine -Tuning for News \nSummary Generation  \nLe Xiao, Xiaolin Chen  \nCollege of Information Science and Engineering, Henan University of Technology, \nZhengzhou, China  \nxiaole@haut.edu.cn chenxiaolin @stu.haut.edu.cn  \nAbstract  \nNew s summary generation is an \nimportant task in the field of intelligence \nanalysis, which can provide accurate and \ncomprehensive information to help \npeople better understand and respond to \ncomplex real -world events. However, \ntraditional news summary generatio n \nmethods face some challenges, which are \nlimited by the model itself and the \namount of training data, as well as the \ninfluence of text noise, making it difficult \nto generate reliable information \naccurately. In this paper, we propose a \nnew paradigm for new s summary \ngeneration using LLM  with powerful \nnatural language understanding and \ngenerative capabilities. We use LLM  to \nextract multiple structured event patterns \nfrom the events contained in news \nparagraphs, evolve the event pattern \npopulation with genetic  algorithm, and \nselect the most adaptive event pattern to \ninput into the LLM  to generate news \nsummaries. A News Summary Generator  \n(NSG ) is designed to select and evolve \nthe event pattern populations and \ngenerate news summaries. The \nexperimental results sho w that the news \nsummary generator is able to generate \naccurate and reliable news summaries \nwith some generalization ability.  \n 1. Introduction  \nIn recent years, pre -trained language models \nhave undergone rapid development [1, 2, 3, 4, \n5, 6], especially models b ased on the \nTransformer [7] architecture, which has the \nability to process natural language. 
These \nmodels are able to automatically learn \nstatistical patterns and patterns in language by \ntraining on large -scale textual data , which \nmakes pre -trained languag e models widely \nadaptable and can be applied to a variety of \ndomains and tasks.  \nLarge language m odels  (LLM)  have \nimproved the experimental results of many \nnatural language processing tasks, exceeding \nthe previous state -of-the-art of deep learning \nmodels in  tasks such as information extraction \n[8] and causal inference [9], and therefore, how \nto enhance the performance of LLMs in \nspecific tasks has attracted extensive research.  \nNews summary generation is a type of \ndocument summary generation [10], which \naims to generate concise and important event \ntopics in a paragraph of text to better \ncommunicate intelligent  content. It plays a key \nrole in areas such as information processing, \nintelligence analysis, research , and decision -\nmaking . Automatic news summary gener ation \ncan provide accurate and comprehensive \ninformation to help people better understand \nand respond to complex real -world events.  \nTraditional news summary generation is \nmainly used to generate headlines , a task that \nrequires the model to be able to under stand the data.
"""
# Split the raw text on newlines (chunk_size=1000, chunk_overlap=100, measured with len),
# then wrap each chunk in a Document whose 'source' is the chunk's position in the list.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n'],
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
splits = text_splitter.split_text(data)
processed_data = [
    Document(page_content=chunk, metadata={"source": position})
    for position, chunk in enumerate(splits)
]


In [4]:
# Show the Document chunks built from the manually supplied text.
processed_data

[Document(metadata={'source': 0}, page_content='Enhancing LLM with Evolutionary Fine -Tuning for News \nSummary Generation  \nLe Xiao, Xiaolin Chen  \nCollege of Information Science and Engineering, Henan University of Technology, \nZhengzhou, China  \nxiaole@haut.edu.cn chenxiaolin @stu.haut.edu.cn  \nAbstract  \nNew s summary generation is an \nimportant task in the field of intelligence \nanalysis, which can provide accurate and \ncomprehensive information to help \npeople better understand and respond to \ncomplex real -world events. However, \ntraditional news summary generatio n \nmethods face some challenges, which are \nlimited by the model itself and the \namount of training data, as well as the \ninfluence of text noise, making it difficult \nto generate reliable information \naccurately. In this paper, we propose a \nnew paradigm for new s summary \ngeneration using LLM  with powerful \nnatural language understanding and \ngenerative capabilities. We use LLM  to \nextract mu

In [5]:
# Map each chunk's 'source' index to the chunk text.
data_dict1 = {doc.metadata['source']: doc.page_content for doc in processed_data}
data_dict1

{0: 'Enhancing LLM with Evolutionary Fine -Tuning for News \nSummary Generation  \nLe Xiao, Xiaolin Chen  \nCollege of Information Science and Engineering, Henan University of Technology, \nZhengzhou, China  \nxiaole@haut.edu.cn chenxiaolin @stu.haut.edu.cn  \nAbstract  \nNew s summary generation is an \nimportant task in the field of intelligence \nanalysis, which can provide accurate and \ncomprehensive information to help \npeople better understand and respond to \ncomplex real -world events. However, \ntraditional news summary generatio n \nmethods face some challenges, which are \nlimited by the model itself and the \namount of training data, as well as the \ninfluence of text noise, making it difficult \nto generate reliable information \naccurately. In this paper, we propose a \nnew paradigm for new s summary \ngeneration using LLM  with powerful \nnatural language understanding and \ngenerative capabilities. We use LLM  to \nextract multiple structured event patterns \nfrom the

In [7]:
# Summarize every chunk of the manually supplied text with its own request;
# results are stored under the same keys as data_dict1.
page_summaries = {}
for chunk_key, chunk_text in data_dict1.items():
    # Prompt whitespace (leading spaces, trailing space) is kept exactly.
    user_message = (
        "You are provided with the PDF text:\n"
        f"     content: {chunk_text} \n"
        "     Your task is to summarize the provided document in 300 words"
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_message},
        ],
    )
    # Keep only the text of the first returned choice.
    page_summaries[chunk_key] = response.choices[0].message.content
page_summaries


{0: 'The document discusses enhancing news summary generation using Large Language Models (LLM) with Evolutionary Fine-Tuning. News summary generation plays a crucial role in intelligence analysis by providing accurate information to help individuals better grasp and react to real-world events. Traditional methods face challenges due to model limitations, insufficient training data, and text noise impact, hindering the generation of reliable information effectively. The proposed approach leverages LLM with strong natural language understanding and generative capabilities, aiding in extracting structured event patterns from news content. By implementing Evolutionary Fine-Tuning, the model can adapt and improve its performance over time. This methodology aims to overcome the limitations of traditional approaches and enhance the accuracy and comprehensiveness of news summaries. The study is conducted by Le Xiao and Xiaolin Chen from the College of Information Science and Engineering at He

In [8]:
# Ask the model to merge the per-chunk summaries into one overall summary.
# The dict is interpolated via its string form; prompt whitespace is kept exactly.
collation_prompt = (
    "You are provided with the dictionary with key as the page numbers and values are the summaries of the respective page:\n"
    f"     content: {page_summaries} \n"
    "     Your task is to combine all the summaries and provide a collated final summary of the document in 300 words"
)
system_message = {"role": "system", "content": sys_prompt}
user_message = {"role": "user", "content": collation_prompt}
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[system_message, user_message],
)
# Text of the first returned choice is the final summary.
Final_summary = response.choices[0].message.content

In [9]:
# Show the collated summary of the manually supplied text.
Final_summary

"The document focuses on enhancing news summary generation through Large Language Models (LLM) with Evolutionary Fine-Tuning. It addresses challenges faced by traditional methods such as model limitations and text noise impact. The approach leverages LLM's natural language understanding and generative capabilities to extract structured event patterns from news content and improve performance over time with Evolutionary Fine-Tuning. The research by Le Xiao and Xiaolin Chen from Henan University of Technology suggests shifting towards advanced techniques for more accurate news summaries.\n\nThe development of a News Summary Generator (NSG) is discussed, combining genetic algorithms and LLM to extract and refine event patterns for accurate summaries. The importance of pre-trained language models, particularly Transformer-based models, in enhancing news summary efficiency and accuracy is emphasized. The NSG successfully generates informative summaries with a degree of generalization.\n\nTr