In [None]:
pip install pypdf langchain transformers tiktoken -q

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import PromptTemplate,  LLMChain
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain import HuggingFacePipeline
import transformers
import torch
import time



In [None]:
# PDF read by every summarizer in this notebook; later cells call
# `load_and_split()` on this loader.
HARRY_POTTER_PDF = "/kaggle/input/harry-porter/harry-potter-book-collection-1-4.pdf"
loader = PyPDFLoader(HARRY_POTTER_PDF)

In [None]:
# Falcon-7B-Instruct checkpoint shared by both Falcon generation pipelines.
model_id = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# bfloat16 weights; device placement is delegated to `device_map="auto"`.
falcon_load_options = {
    "torch_dtype": torch.bfloat16,
    "trust_remote_code": True,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_id, **falcon_load_options)

In [None]:
# Short-output Falcon generator, wrapped for LangChain as `llm1`.
pipeline1 = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    device_map="auto",
    # Only `max_new_tokens` is passed: when `max_length` is given as well,
    # transformers lets `max_new_tokens` take precedence and emits a warning.
    max_new_tokens=70,
    do_sample=True,
    top_k=10,
    # Pad with the tokenizer's own EOS id rather than the hard-coded 50256
    # (GPT-2's end-of-text id), which is not tied to this model's vocabulary.
    pad_token_id=tokenizer.eos_token_id,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)
# NOTE(review): `temperature: 0` sits next to `do_sample=True`; the two
# settings contradict each other, so confirm which decoding mode is intended.
llm1 = HuggingFacePipeline(pipeline = pipeline1, model_kwargs = {'temperature':0})

In [None]:
# Long-output Falcon generator, wrapped for LangChain as `llm2`.
pipeline2 = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    device_map="auto",
    # Only `max_new_tokens` is passed: when `max_length` is given as well,
    # transformers lets `max_new_tokens` take precedence and emits a warning.
    max_new_tokens=300,
    do_sample=True,
    top_k=10,
    # Pad with the tokenizer's own EOS id rather than the hard-coded 50256
    # (GPT-2's end-of-text id), which is not tied to this model's vocabulary.
    pad_token_id=tokenizer.eos_token_id,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)
# NOTE(review): `temperature: 0` sits next to `do_sample=True`; the two
# settings contradict each other, so confirm which decoding mode is intended.
llm2 = HuggingFacePipeline(pipeline = pipeline2, model_kwargs = {'temperature':0})

In [None]:
# Both Falcon chains use the same prompt text; they differ only in the LLM
# wrapper (`llm1` vs `llm2`) they are bound to.
template1 = """Generate a meaningful summary of the following text delimited by triple backquotes.
              Return your response in short and crisp meaningful summary.
              ```{text}```
              Summary:
           """
template2 = template1

prompt1, prompt2 = (
    PromptTemplate(template=template_text, input_variables=["text"])
    for template_text in (template1, template2)
)

llm_chain1 = LLMChain(prompt=prompt1, llm=llm1)
llm_chain2 = LLMChain(prompt=prompt2, llm=llm2)

In [None]:
def summarize_story(story, short_chain=None, long_chain=None):
    """Summarize a document page by page, then condense the page summaries.

    Args:
        story: Object exposing ``load_and_split()`` that returns pages with a
            ``page_content`` string (the notebook passes a ``PyPDFLoader``).
        short_chain: Object whose ``run(text)`` returns a short summary.
            Defaults to the notebook-level ``llm_chain1``.
        long_chain: Object whose ``run(text)`` returns the longer final
            summary. Defaults to the notebook-level ``llm_chain2``.

    Returns:
        ``[story_summary, article, model_generated]``. ``article[k]`` is the
        page text that produced ``model_generated[k]``; ``story_summary`` is
        an empty string when no page could be summarized.

    Failures while loading or generating are reported as ``RuntimeWarning``
    and whatever was produced so far is returned (best effort).
    """
    import warnings  # local import keeps this cell runnable on its own

    def _words(text):
        # Space-separated word count, used as a cheap proxy for prompt size.
        return len(text.split(' '))

    if short_chain is None:
        short_chain = llm_chain1
    if long_chain is None:
        long_chain = llm_chain2

    article = []
    model_generated = []
    # Bound before the try block so a loader failure cannot leave it undefined.
    short_summary = ''
    try:
        pages = story.load_and_split()
        # Documents with more than 50 pages are sampled every 5th page.
        step = 5 if len(pages) > 50 else 1
        # Page 0 is never summarized (the range starts at 1) — presumably a
        # cover page; confirm.
        for i in range(1, len(pages), step):
            text = pages[i].page_content
            if _words(text) <= 20:
                continue  # too little text to be worth summarizing
            response = short_chain.run(text)
            # Record the page only once its summary exists, so both lists
            # stay index-aligned for the evaluation table.
            article.append(text)
            model_generated.append(response)
            short_summary += " " + response
    except Exception as exc:
        warnings.warn(f"Page summarization stopped early: {exc!r}", RuntimeWarning)

    if not short_summary.strip():
        # Nothing to condense; do not ask the model to summarize empty text.
        return ['', article, model_generated]

    remaining_text = short_summary
    try:
        n_words = _words(remaining_text)
        if n_words <= 1000:
            remaining_text = short_chain.run(remaining_text)
        elif n_words <= 3500:
            remaining_text = long_chain.run(remaining_text)
        else:
            # Too long for one prompt: summarize 800-token chunks repeatedly
            # until the concatenated result fits.
            text_splitter = TokenTextSplitter(chunk_size=800, chunk_overlap=0)
            while True:
                final_summary = ''
                for text_chunk in text_splitter.split_text(remaining_text):
                    final_summary += ". " + short_chain.run(text_chunk)
                previous_words = _words(remaining_text)
                reduced_words = _words(final_summary)
                remaining_text = final_summary
                if reduced_words <= 1000:
                    break
                if reduced_words <= 3500:
                    remaining_text = long_chain.run(final_summary)
                    break
                if reduced_words >= previous_words:
                    # The round did not shrink the text; looping again would
                    # never terminate.
                    warnings.warn("Summary stopped shrinking; returning the last reduction.", RuntimeWarning)
                    break
    except Exception as exc:
        warnings.warn(f"Final summarization failed: {exc!r}", RuntimeWarning)

    return [remaining_text, article, model_generated]

In [None]:
%%time
# Runs the Falcon summarizer over the whole PDF. Despite its name,
# `data_frame` is the list [story_summary, article, model_generated]
# returned by `summarize_story`, not a pandas DataFrame.
data_frame = summarize_story(loader)

**Model evaluation**

In [None]:
# !pip install rouge
# !pip install bert_score
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge
import pandas as pd
from bert_score import score

In [None]:
# Evaluation table built from the first 25 page texts and Falcon summaries.
# NOTE(review): `gpt_summary1` is not defined anywhere in the visible
# notebook; it must already hold the reference summaries when this cell runs.
page_texts = data_frame[1][0:25]
falcon_summaries = data_frame[2][0:25]
data1 = pd.DataFrame({
    'article': page_texts,
    'model_generated': falcon_summaries,
    'gpt-3.5': gpt_summary1,
})

In [None]:
# Score the Falcon summaries against the GPT-3.5 references from `data1`,
# the table built in the previous cell. `data2` is only created further down
# the notebook, so it is not available here on a fresh run.
generated_summaries = data1['model_generated'].tolist()
reference_summaries = data1['gpt-3.5'].tolist()

# Calculate ROUGE scores for the selected samples.
# The scorer has to be created before it is used.
rouge = Rouge()
rouge_scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)
print("ROUGE Scores:", rouge_scores)

In [None]:
# corpus_bleu works on tokens: it expects one list of reference token lists
# per sample and one hypothesis token list per sample. Raw strings would be
# iterated character by character, which gives a meaningless score.
bleu_references = [[reference.split()] for reference in reference_summaries]
bleu_hypotheses = [hypothesis.split() for hypothesis in generated_summaries]
bleu_score = corpus_bleu(bleu_references, bleu_hypotheses)
print("BLEU Score for 25 Summaries:", bleu_score)

In [None]:
# BERTScore precision/recall/F1 tensors for the Falcon summaries; the mean of
# each tensor is reported.
P, R, F1 = score(generated_summaries, reference_summaries, lang="en", verbose=True)
for metric_label, metric_values in (
    ("BERT Precision:", P),
    ("BERT Recall:", R),
    ("BERT F1 Score:", F1),
):
    print(metric_label, metric_values.mean().item())

**Mistral 7B**

In [None]:
# Mistral-7B checkpoint shared by both Mistral generation pipelines.
mistral_model_id = "mistralai/Mistral-7B-v0.1"
mistral_tokenizer = AutoTokenizer.from_pretrained(mistral_model_id)

# bfloat16 weights; device placement is delegated to `device_map="auto"`.
mistral_load_options = {
    "torch_dtype": torch.bfloat16,
    "trust_remote_code": True,
    "device_map": "auto",
}
mistral_model = AutoModelForCausalLM.from_pretrained(mistral_model_id, **mistral_load_options)

In [None]:
# Short-output Mistral generator, wrapped for LangChain as `mistral_llm1`.
mistral_pipeline1 = pipeline(
    "text-generation",
    model=mistral_model,
    tokenizer=mistral_tokenizer,
    trust_remote_code=True,
    device_map="auto",
    # Only `max_new_tokens` is passed: when `max_length` is given as well,
    # transformers lets `max_new_tokens` take precedence and emits a warning.
    max_new_tokens=70,
    do_sample=True,
    top_k=10,
    # Pad with the tokenizer's own EOS id rather than the hard-coded 50256
    # (GPT-2's end-of-text id), which is not tied to this model's vocabulary.
    pad_token_id=mistral_tokenizer.eos_token_id,
    num_return_sequences=1,
    eos_token_id=mistral_tokenizer.eos_token_id
)
# NOTE(review): `temperature: 0` sits next to `do_sample=True`; the two
# settings contradict each other, so confirm which decoding mode is intended.
mistral_llm1 = HuggingFacePipeline(pipeline = mistral_pipeline1, model_kwargs = {'temperature':0})

In [None]:
# Long-output Mistral generator, wrapped for LangChain as `mistral_llm2`.
mistral_pipeline2 = pipeline(
    "text-generation",
    model=mistral_model,
    tokenizer=mistral_tokenizer,
    trust_remote_code=True,
    device_map="auto",
    # Only `max_new_tokens` is passed: when `max_length` is given as well,
    # transformers lets `max_new_tokens` take precedence and emits a warning.
    max_new_tokens=290,
    do_sample=True,
    top_k=10,
    # Pad with the tokenizer's own EOS id rather than the hard-coded 50256
    # (GPT-2's end-of-text id), which is not tied to this model's vocabulary.
    pad_token_id=mistral_tokenizer.eos_token_id,
    num_return_sequences=1,
    eos_token_id=mistral_tokenizer.eos_token_id
)
# NOTE(review): `temperature: 0` sits next to `do_sample=True`; the two
# settings contradict each other, so confirm which decoding mode is intended.
mistral_llm2 = HuggingFacePipeline(pipeline = mistral_pipeline2, model_kwargs = {'temperature':0})

In [None]:
# Both Mistral chains use the same prompt text; they differ only in the LLM
# wrapper they are bound to.
# NOTE(review): unlike the Falcon prompt, this one has no trailing
# "Summary:" cue — confirm that the difference is intentional.
mistral_template1 = """Generate a meaningful summary of the following text delimited by triple backquotes.
              Return your response in short and crisp meaningful summary.
              ```{text}```
           """
mistral_template2 = mistral_template1

mistral_prompt1, mistral_prompt2 = (
    PromptTemplate(template=template_text, input_variables=["text"])
    for template_text in (mistral_template1, mistral_template2)
)

mistral_llm_chain1 = LLMChain(prompt=mistral_prompt1, llm=mistral_llm1)
mistral_llm_chain2 = LLMChain(prompt=mistral_prompt2, llm=mistral_llm2)

In [None]:
def mistral_summarize_story(story, short_chain=None, long_chain=None):
    """Summarize a document page by page with Mistral, then condense the result.

    Args:
        story: Object exposing ``load_and_split()`` that returns pages with a
            ``page_content`` string (the notebook passes a ``PyPDFLoader``).
        short_chain: Object whose ``run(text)`` returns a short summary.
            Defaults to the notebook-level ``mistral_llm_chain1``.
        long_chain: Object whose ``run(text)`` returns the longer final
            summary. Defaults to the notebook-level ``mistral_llm_chain2``.

    Returns:
        ``[story_summary, model_generated]`` where ``model_generated`` holds
        the per-page summaries in page order. ``story_summary`` is an empty
        string when no page could be summarized.

    Failures while loading or generating are reported as ``RuntimeWarning``
    and whatever was produced so far is returned (best effort).
    """
    import warnings  # local import keeps this cell runnable on its own

    def _words(text):
        # Space-separated word count, used as a cheap proxy for prompt size.
        return len(text.split(' '))

    short_chain = mistral_llm_chain1 if short_chain is None else short_chain
    long_chain = mistral_llm_chain2 if long_chain is None else long_chain

    model_generated = []
    # Bound before the try block so a loader failure cannot leave it undefined.
    short_summary = ''
    try:
        pages = story.load_and_split()
        # Documents with more than 50 pages are sampled every 5th page.
        step = 5 if len(pages) > 50 else 1
        # Page 0 is never summarized (the range starts at 1) — presumably a
        # cover page; confirm.
        for i in range(1, len(pages), step):
            print(i)  # progress indicator for this long-running loop
            text = pages[i].page_content
            if _words(text) > 20:
                response = short_chain.run(text)
                model_generated.append(response)
                short_summary += " " + response
    except Exception as exc:
        warnings.warn(f"Page summarization stopped early: {exc!r}", RuntimeWarning)

    if not short_summary.strip():
        # Nothing to condense; do not ask the model to summarize empty text.
        return ['', model_generated]

    remaining_text = short_summary
    try:
        n_words = _words(remaining_text)
        if n_words <= 1000:
            remaining_text = short_chain.run(remaining_text)
        elif n_words <= 3500:
            remaining_text = long_chain.run(remaining_text)
        else:
            # Too long for one prompt: summarize 800-token chunks repeatedly
            # until the concatenated result fits.
            text_splitter = TokenTextSplitter(chunk_size=800, chunk_overlap=0)
            while True:
                final_summary = ''
                for text_chunk in text_splitter.split_text(remaining_text):
                    final_summary += ". " + short_chain.run(text_chunk)
                previous_words = _words(remaining_text)
                reduced_words = _words(final_summary)
                remaining_text = final_summary
                if reduced_words <= 1000:
                    break
                if reduced_words <= 3500:
                    remaining_text = long_chain.run(final_summary)
                    break
                if reduced_words >= previous_words:
                    # The round did not shrink the text; looping again would
                    # never terminate.
                    warnings.warn("Summary stopped shrinking; returning the last reduction.", RuntimeWarning)
                    break
    except Exception as exc:
        warnings.warn(f"Final summarization failed: {exc!r}", RuntimeWarning)

    return [remaining_text, model_generated]

In [None]:
%%time
# Runs the Mistral summarizer over the whole PDF. Despite its name,
# `mistral_data_frame` is the list [story_summary, model_generated]
# returned by `mistral_summarize_story`, not a pandas DataFrame.
mistral_data_frame = mistral_summarize_story(loader)

**Model evaluation of the Mistral 7B model**

In [None]:
# Reload the saved evaluation table and attach the first 25 Mistral page
# summaries as a new column.
evaluation_csv = '/kaggle/input/sample/model_generated.csv'
data2 = pd.read_csv(evaluation_csv).assign(mistral_7b=mistral_data_frame[1][0:25])

In [None]:
# ROUGE for the Mistral summaries against the GPT-3.5 references.
rouge = Rouge()
generated_summaries = data2['mistral_7b'].to_list()
reference_summaries = data2['gpt-3.5'].to_list()

# Averaged over the selected samples.
rouge_scores = rouge.get_scores(
    generated_summaries,
    reference_summaries,
    avg=True,
)
print("ROUGE Scores:", rouge_scores)

In [None]:
# corpus_bleu works on tokens: it expects one list of reference token lists
# per sample and one hypothesis token list per sample. Raw strings would be
# iterated character by character, which gives a meaningless score.
bleu_references = [[reference.split()] for reference in reference_summaries]
bleu_hypotheses = [hypothesis.split() for hypothesis in generated_summaries]
bleu_score = corpus_bleu(bleu_references, bleu_hypotheses)
print("BLEU Score for 25 Summaries:", bleu_score)

In [None]:
# BERTScore precision/recall/F1 tensors for the Mistral summaries; the mean
# of each tensor is reported.
P, R, F1 = score(generated_summaries, reference_summaries, lang="en", verbose=True)
for metric_label, metric_values in (
    ("BERT Precision:", P),
    ("BERT Recall:", R),
    ("BERT F1 Score:", F1),
):
    print(metric_label, metric_values.mean().item())

**Gemini pro**

In [2]:
import getpass
import os

import google.generativeai as genai

# Keep the key out of the notebook: read it from the environment and fall
# back to an interactive prompt when the variable is not set.
Gemini_pro_API_Key = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Gemini API key: ")
genai.configure(api_key=Gemini_pro_API_Key)
gemini_model = genai.GenerativeModel('gemini-pro')

In [None]:
# Print the name of every model that supports the `generateContent` method.
content_models = (
    m for m in genai.list_models()
    if 'generateContent' in m.supported_generation_methods
)
for m in content_models:
    print(m.name)

In [None]:
# Plain (non-f) string: `{text}` must stay a placeholder so callers can fill
# it with `prompt_template.format(text=...)`. As an f-string it was
# interpolated when this cell ran, which needs a global `text` to exist and
# turns every later `.format` call into a no-op.
prompt_template = """
    Generate a meaningful summary of the following text delimited by triple backquotes.
    Return your response in short and crisp meaningful summary consist of atleast 500.

    ```{text}```
    """

In [None]:
# Generation settings passed to every Gemini call in this notebook.
gemini_generation_settings = {
    "candidate_count": 1,
    "max_output_tokens": 500,
    "top_p": 0.7,
    "top_k": 4,
    "temperature": 0.2,
}
hyper_parameter = genai.types.GenerationConfig(**gemini_generation_settings)

In [None]:
def gemini_pro_summary(story, prompt_template, hyper_parameter, model=None, text_splitter=None):
    """Summarize a whole document with Gemini, chunking it when it is too long.

    Args:
        story: Object exposing ``load_and_split()`` that returns pages with a
            ``page_content`` string.
        prompt_template: String containing a ``{text}`` placeholder; it is
            filled with ``str.format`` for every request.
        hyper_parameter: Passed through as ``generation_config``.
        model: Object with ``generate_content(prompt, generation_config=...)``
            whose result exposes ``.text``. Defaults to the notebook-level
            ``gemini_model``.
        text_splitter: Object with ``split_text(text)``. Defaults to a
            ``TokenTextSplitter`` with 20000-token chunks and no overlap.

    Returns:
        The summary text, or an empty string when the document has no text.
    """
    if model is None:
        model = gemini_model

    def _summarize(text):
        # One request: fill the template and return the generated text.
        prompt = prompt_template.format(text=text)
        return model.generate_content(prompt, generation_config=hyper_parameter).text

    pages = story.load_and_split()
    # Every page is prefixed with a single space, built in one pass.
    short_pages = ''.join(' ' + page.page_content for page in pages)
    if not short_pages.strip():
        return ''

    # Documents under 28000 space-separated words go out in a single request.
    if len(short_pages.split(' ')) < 28000:
        return _summarize(short_pages)

    if text_splitter is None:
        text_splitter = TokenTextSplitter(chunk_size=20000, chunk_overlap=0)

    while True:
        final_summary = ''
        # Every chunk is summarized, including the first one.
        for chunk in text_splitter.split_text(short_pages):
            chunk_summary = _summarize(chunk)
            print(chunk_summary)
            # Accumulate the generated text itself, not the repr of the
            # response parts.
            final_summary += ". " + chunk_summary

        if len(final_summary.split(' ')) <= 28000:
            return _summarize(final_summary)
        # Still too long for one request: reduce the summaries again.
        short_pages = final_summary


In [1]:
%%time
# Summarize the whole PDF with Gemini; the returned text is the last
# expression of this cell.
gemini_pro_summary(loader,prompt_template,hyper_parameter)

**Gemini Pro model evaluation**

In [None]:
pages = loader.load_and_split()
gemini_output1 = []
# NOTE(review): only pages 1-9 are summarized here, while the next cell
# slices up to 25 entries — confirm the intended sample size.
for i in range(1,10):
    text = pages[i].page_content
    # Fill the template with this page. Sending the bare template submits the
    # same prompt on every iteration and ignores `text` entirely.
    page_prompt = prompt_template.format(text=text)
    response = gemini_model.generate_content(page_prompt,generation_config=hyper_parameter)
    gemini_output1.append(response.text)

In [None]:
# Reload the saved evaluation table and attach up to 25 Gemini page
# summaries as a new column.
evaluation_csv = '/kaggle/input/sample/model_generated.csv'
data2 = pd.read_csv(evaluation_csv).assign(gemini_pro_advance=gemini_output1[0:25])

In [None]:
# ROUGE for the Gemini summaries against the GPT-3.5 references.
rouge = Rouge()
generated_summaries = data2['gemini_pro_advance'].to_list()
reference_summaries = data2['gpt-3.5'].to_list()

# Averaged over the selected samples.
rouge_scores = rouge.get_scores(
    generated_summaries,
    reference_summaries,
    avg=True,
)
print("ROUGE Scores:", rouge_scores)

In [None]:
# corpus_bleu works on tokens: it expects one list of reference token lists
# per sample and one hypothesis token list per sample. Raw strings would be
# iterated character by character, which gives a meaningless score.
bleu_references = [[reference.split()] for reference in reference_summaries]
bleu_hypotheses = [hypothesis.split() for hypothesis in generated_summaries]
bleu_score = corpus_bleu(bleu_references, bleu_hypotheses)
print("BLEU Score for 25 Summaries:", bleu_score)

In [None]:
# BERTScore precision/recall/F1 tensors for the Gemini summaries; the mean of
# each tensor is reported.
P, R, F1 = score(generated_summaries, reference_summaries, lang="en", verbose=True)
for metric_label, metric_values in (
    ("BERT Precision:", P),
    ("BERT Recall:", R),
    ("BERT F1 Score:", F1),
):
    print(metric_label, metric_values.mean().item())