# Candidate Informaton:
## Name: Matti Ur Rehman
## Email: matti.mansha@gmail.com
## Linkedin: https://www.linkedin.com/in/mattimansha/
## Phone No.: +92 (312) 6075344

In [116]:
# installation required
! pip install langchain
! pip install unstructured
! pip install pdf2image
! pip install pdfminer
! pip install pdfminer.six
! pip install markdown
! pip install openai
! pip install reportlab





# Source Material:
URL: https://drive.google.com/uc?export=download&id=1WYaEoSozvUmJ_Lt5clAVs8NuSu43h9ps

# Data Loading

In [117]:
from langchain.document_loaders import OnlinePDFLoader
import re

In [118]:
loader = OnlinePDFLoader("https://drive.google.com/uc?export=download&id=1WYaEoSozvUmJ_Lt5clAVs8NuSu43h9ps")

In [119]:
data = loader.load()

In [120]:
data[0].dict().keys()

dict_keys(['page_content', 'metadata'])

In [121]:
data[0].dict()["page_content"][:1000]

'Crime and Punishment\n\nBy Fyodor Dostoevsky\n\nDownload free eBooks of classic literature, books and novels at Planet eBook. Subscribe to our free eBooks blog and email newsletter.\n\nTranslator’s Preface\n\nA few words about Dostoevsky himself may help the Eng-\n\nDostoevsky was the son of a doctor. His parents were very hard- working and deeply religious people, but so poor that they lived with their five children in only two rooms. The father and mother spent their evenings in reading aloud to their children, generally from books of a serious charac- ter.\n\nlish reader to understand his work.\n\nThough always sickly and delicate Dostoevsky came out third in the final examination of the Petersburg school of Engineering. There he had already begun his first work, ‘Poor Folk.’\n\nThis story was published by the poet Nekrassov in his review and was received with acclamations. The shy, un- known youth found himself instantly something of a celebrity. A brilliant and successful career 

In [122]:
print("Total characters: {}".format(len(data[0].dict()["page_content"])))

Total characters: 1162137


# Data Preprocessing

In [123]:
# Function to clean and preprocess text
def preprocess_text(text):
    # Remove pattern 1: "\n1\x18\x18\nCrime and Punishment"
    text = re.sub(r'\n\d+\x18\x18\n', '', text)

    # Remove pattern 2: "\nFree eBooks at Planet eBook.com\n1\x18\x18\n"
    text = re.sub(r'\nFree eBooks at Planet eBook.com\n', '', text)
    
    # Remove pattern 3: "\nCrime and Punishment\n"
    text = text.replace('\nCrime and Punishment\n', '')

    # Remove pattern 4: "\n\x18\x18\x18\n"
    text = text.replace('\n\x18\x18\x18\n', '')
    
    # Remove pattern 5: "\n\x18"
    text = text.replace('\n\x18', '')

    # Remove pattern 6: "\n\x180"
    text = text.replace('\n\x180', '')

    # Remove pattern 7: "\n\x181"
    text = text.replace('\n\x181', '')

    # Remove pattern 8: "\n\x18\x18"
    text = text.replace('\n\x18\x18', '')

    # Remove pattern 9: "\n\x18\x18"
    text = text.replace('\n\x18\x18', '')
    
    # Remove pattern 10: "\n\x18\n"
    text = text.replace('\n\x18\n', ' ')    
    

    # Example: Remove extra line breaks
    text = text.replace('\n\n', '\n')

    return text



# Function to detect part boundaries
def detect_part_boundaries(text):
    # Define a regex pattern to match part headings (e.g., "Part I")
    part_pattern = r'Part [IVXLCDM]+\n'

    # Find all part headings in the text
    part_headings = re.findall(part_pattern, text)

    # Initialize a list to store parts with their positions
    parts = []

    # Add the "Translator’s Preface" section as the first part
    translator_preface_start = 0
    part_start = text.find(part_headings[0])
    if part_start != 0:
        parts.append(("Translator’s Preface", text[translator_preface_start:part_start]))

    for i in range(len(part_headings) - 1):
        part_start = text.find(part_headings[i])
        part_end = text.find(part_headings[i + 1])
        part_text = text[part_start:part_end]
        parts.append((part_headings[i].strip(), part_text))

    # Add the last part
    part_start = text.find(part_headings[-1])
    part_text = text[part_start:]
    parts.append((part_headings[-1].strip(), part_text))

    return parts


# Function to detect chapter boundaries within a part
def detect_chapter_boundaries(part_text):
    # Define a regex pattern to match chapter headings (e.g., "Chapter I")
    chapter_pattern = r'Chapter [IVXLCDM]+\n'

    # Split the part text into chapters using the chapter pattern
    chapters = re.split(chapter_pattern, part_text)

    # Remove any empty strings from the split
    chapters = [chapter.strip() for chapter in chapters if chapter.strip()]

    return chapters

In [124]:
part1 = detect_part_boundaries(preprocess_text(data[0].dict()["page_content"]))[0]

In [125]:
part1

('Translator’s Preface',
 'Crime and Punishment\nBy Fyodor Dostoevsky\nDownload free eBooks of classic literature, books and novels at Planet eBook. Subscribe to our free eBooks blog and email newsletter.\nTranslator’s Preface\nA few words about Dostoevsky himself may help the Eng-\nDostoevsky was the son of a doctor. His parents were very hard- working and deeply religious people, but so poor that they lived with their five children in only two rooms. The father and mother spent their evenings in reading aloud to their children, generally from books of a serious charac- ter.\nlish reader to understand his work.\nThough always sickly and delicate Dostoevsky came out third in the final examination of the Petersburg school of Engineering. There he had already begun his first work, ‘Poor Folk.’\nThis story was published by the poet Nekrassov in his review and was received with acclamations. The shy, un- known youth found himself instantly something of a celebrity. A brilliant and successf

# Formatting and Saving in correct format

In [127]:
# Function to create a Markdown file from parts and chapters
def create_markdown_file(parts):
    for part_num, (part_heading, part_text) in enumerate(parts):
        # Extract the part number from the part heading (e.g., "Part I" => "I")
        part_match = re.search(r'Part ([IVXLCDM]+)\n', part_heading)
        if part_match:
            part_number = part_match.group(1)
        else:
            part_number = str(part_num)

        # Special handling for part 0 (Translator’s Preface)
        if part_number == '0':
            part_filename = f'translator_preface.md'
            # Create a Markdown file for the Translator’s Preface
            with open(part_filename, 'w', encoding='utf-8') as part_file:
                # Write the part heading to the Markdown file
                part_file.write(f"# {part_heading}\n\n")
                # Write the entire Translator’s Preface text
                part_file.write(part_text)
        else:
            # Remove leading newline characters from the part heading and "Part" label
            part_heading = part_heading.lstrip('\n').replace('Part', '').strip()

            # Create a Markdown file for the part
            part_filename = f'part_{part_number}.md'

            with open(part_filename, 'w', encoding='utf-8') as part_file:
                # Write the part heading to the Markdown file
                part_file.write(f"# Part {part_heading}\n\n")  # Include "Part" in the chapter heading

                # Detect chapter boundaries within the part
                chapters = detect_chapter_boundaries(part_text)

                # Skip the first chapter heading that repeats the part heading
                first_chapter = True

                for i, chapter_text in enumerate(chapters):
                    # Skip the first chapter heading
                    if first_chapter:
                        first_chapter = False
                        continue

                    # Create a Markdown heading for each chapter within the part
                    chapter_heading = f"## Chapter {i}\n\n"

                    # Write the chapter heading and chapter text to the Markdown file
                    part_file.write(chapter_heading)
                    part_file.write(chapter_text)
                    part_file.write('\n\n')  # Add an extra newline for separation between chapters


In [128]:
# Preprocess the text
text = preprocess_text(data[0].dict()["page_content"])

# Detect part boundaries
parts = detect_part_boundaries(text)

# Create Markdown files for each part and write the chapters in Markdown format
create_markdown_file(parts)

# Splitting

In [129]:
import os
import glob
import markdown
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
markdown_directory = os.getcwd()

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
# chunk_size = 250
# chunk_overlap = 30
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=chunk_size, chunk_overlap=chunk_overlap
# )

splits = []

for markdown_file in glob.glob(os.path.join(markdown_directory, "*.md")):
    # Read the Markdown file
    print(markdown_file.split('\\')[-1].split('.')[0].replace("_", ' '))
    with open(markdown_file, 'r', encoding='utf-8') as file:
        markdown_text = file.read()
    md_header_splits = markdown_splitter.split_text(markdown_text)
    
    for split in md_header_splits:
        splits.append(split)

part 1
part 2
part 3
part 4
part 5
part 6
translator preface


In [135]:
print("Total Documents: {}".format(len(splits)))

Total Documents: 40


In [136]:
splits[10]

Document(page_content='Zossimov was a tall, fat man with a puffy, colourless, clean-\nshaven face and straight flaxen hair. He wore spectacles, and a big gold ring on his fat finger. He was twenty-seven. He had on a light grey fashionable loose coat, light sum- mer trousers, and everything about him loose, fashionable and spick and span; his linen was irreproachable, his watch- chain was massive. In manner he was slow and, as it were, nonchalant, and at the same time studiously free and easy; he made efforts to conceal his self-importance, but it was apparent at every instant. All his acquaintances found him tedious, but said he was clever at his work.\n‘I’ve been to you twice to-day, brother. You see, he’s come\nto himself,’ cried Razumihin.\n‘I see, I see; and how do we feel now, eh?’ said Zossimov to Raskolnikov, watching him carefully and, sitting down at the foot of the sofa, he settled himself as comfortably as he could.\n‘He is still depressed,’ Razumihin went on. ‘We’ve just\nc

In [137]:
splits[-1]

Document(page_content='Crime and Punishment\nBy Fyodor Dostoevsky\nDownload free eBooks of classic literature, books and novels at Planet eBook. Subscribe to our free eBooks blog and email newsletter.\nTranslator’s Preface\nA few words about Dostoevsky himself may help the Eng-\nDostoevsky was the son of a doctor. His parents were very hard- working and deeply religious people, but so poor that they lived with their five children in only two rooms. The father and mother spent their evenings in reading aloud to their children, generally from books of a serious charac- ter.\nlish reader to understand his work.\nThough always sickly and delicate Dostoevsky came out third in the final examination of the Petersburg school of Engineering. There he had already begun his first work, ‘Poor Folk.’\nThis story was published by the poet Nekrassov in his review and was received with acclamations. The shy, un- known youth found himself instantly something of a celebrity. A brilliant and successful c

In [139]:
len(splits[0].dict()["page_content"])

18263

In [140]:
splits[0].dict()["metadata"]

{'Header 1': 'Part I', 'Header 2': 'Chapter 1'}

# OpenAI API Setup

In [141]:
import configparser
import openai

# Create a ConfigParser object
config = configparser.ConfigParser()

# Read the config.ini file
config.read('config.ini')

my_openai_key = config['openai']['OPENAI_API_KEY']
openai.api_key = my_openai_key
os.environ['OPENAI_API_KEY'] = my_openai_key

# Prompt, LLM, Chain, Callback

In [32]:
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI


# handler = StdOutCallbackHandler()


# Define prompt
prompt_template = """Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

# Define LLM chain
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Define StuffDocumentsChain
stuff_chain = StuffDocumentsChain(
    llm_chain=llm_chain, document_variable_name="text"
)

In [33]:
with get_openai_callback() as cb:
    stuff_chain.run([md_header_splits[0]])
    print(cb)

Tokens Used: 1214
	Prompt Tokens: 1080
	Completion Tokens: 134
Successful Requests: 1
Total Cost (USD): $0.0037760000000000003


In [44]:
summarize_document = {}

def insert_summary(part_no, chapter_no, summary):
    if part_no not in summarize_document:
        summarize_document[part_no] = {}
    summarize_document[part_no][chapter_no] = summary

count = 1
cost = 0

for split_doc in splits:
    with get_openai_callback() as cb:
        print(count)
        print(split_doc.dict()["metadata"]['Header 1'])
        print(split_doc.dict()["metadata"]['Header 2'])
        partNo = split_doc.dict()["metadata"]['Header 1']
        chapterNo = split_doc.dict()["metadata"]['Header 2']
        summary = stuff_chain.run([split_doc])
        print(cb)
        cost += cb.total_cost
    
        insert_summary(partNo, chapterNo, summary)
    
    count += 1
#     break

1
Part I
Chapter 1
Tokens Used: 4501
	Prompt Tokens: 4361
	Completion Tokens: 140
Successful Requests: 1
Total Cost (USD): $0.013642999999999999
2
Part I
Chapter 2
Tokens Used: 9907
	Prompt Tokens: 9773
	Completion Tokens: 134
Successful Requests: 1
Total Cost (USD): $0.029855
3
Part I
Chapter 3
Tokens Used: 7265
	Prompt Tokens: 7170
	Completion Tokens: 95
Successful Requests: 1
Total Cost (USD): $0.02189
4
Part I
Chapter 4
Tokens Used: 6915
	Prompt Tokens: 6808
	Completion Tokens: 107
Successful Requests: 1
Total Cost (USD): $0.020852000000000002
5
Part I
Chapter 5
Tokens Used: 5696
	Prompt Tokens: 5587
	Completion Tokens: 109
Successful Requests: 1
Total Cost (USD): $0.017196999999999997
6
Part I
Chapter 6
Tokens Used: 6665
	Prompt Tokens: 6426
	Completion Tokens: 239
Successful Requests: 1
Total Cost (USD): $0.020234
7
Part I
Chapter 7
Tokens Used: 6373
	Prompt Tokens: 6248
	Completion Tokens: 125
Successful Requests: 1
Total Cost (USD): $0.019244
8
Part II
Chapter 1
Tokens Used: 93

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002016D335250>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002016D337910>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<lo

Tokens Used: 7579
	Prompt Tokens: 7391
	Completion Tokens: 188
Successful Requests: 1
Total Cost (USD): $0.022925
16
Part III
Chapter 2
Tokens Used: 6436
	Prompt Tokens: 6312
	Completion Tokens: 124
Successful Requests: 1
Total Cost (USD): $0.019432
17
Part III
Chapter 3
Tokens Used: 7438
	Prompt Tokens: 7218
	Completion Tokens: 220
Successful Requests: 1
Total Cost (USD): $0.022534
18
Part III
Chapter 4
Tokens Used: 6417
	Prompt Tokens: 6190
	Completion Tokens: 227
Successful Requests: 1
Total Cost (USD): $0.019478000000000002
19
Part III
Chapter 5
Tokens Used: 10032
	Prompt Tokens: 9901
	Completion Tokens: 131
Successful Requests: 1
Total Cost (USD): $0.030227
20
Part III
Chapter 6
Tokens Used: 5870
	Prompt Tokens: 5766
	Completion Tokens: 104
Successful Requests: 1
Total Cost (USD): $0.017714
21
Part IV
Chapter 1
Tokens Used: 7148
	Prompt Tokens: 7001
	Completion Tokens: 147
Successful Requests: 1
Total Cost (USD): $0.021591
22
Part IV
Chapter 2
Tokens Used: 6664
	Prompt Tokens: 656

KeyError: 'Header 2'

In [48]:
with get_openai_callback() as cb:
        print(splits[-1].dict()["metadata"]['Header 1'])
        partNo = splits[-1].dict()["metadata"]['Header 1']
        summary = stuff_chain.run([splits[-1]])
        print(cb)
        cost += cb.total_cost
    
        insert_summary(partNo, " ", summary)

Translator’s Preface
Tokens Used: 1214
	Prompt Tokens: 1080
	Completion Tokens: 134
Successful Requests: 1
Total Cost (USD): $0.0037760000000000003


In [49]:
print(f"Total Cost (USD): ${cost}")

Total Cost (USD): $0.870677


# Summarized Document (Dictionary)

In [50]:
summarize_document

{'Part I': {'Chapter 1': 'On a hot evening in July, a young man leaves his garret and walks towards a bridge, avoiding his landlady. He is in debt to her and afraid of meeting her. The young man is in an irritable and isolated state, and fears interacting with anyone. He is absorbed in himself and has given up on practical matters. He is going to attempt something, but is unsure and frightened. The heat and unpleasant surroundings exacerbate his already strained nerves. He enters a tavern and drinks beer, feeling a temporary relief from his worries. However, he senses that his newfound cheerfulness is not normal. The tavern is mostly empty, with a few drunk men and a suspicious-looking man.',
  'Chapter 2': "Raskolnikov, who usually avoids crowds, suddenly feels a desire to be around people. He goes to a tavern and notices a retired clerk who seems interested in talking to him. The clerk, Marmeladov, tells Raskolnikov about his struggles with poverty and his wife's mistreatment. Marmel

In [55]:
import json
# Specify the file path where you want to save the JSON data
json_file_path = "summarize_document.json"

# Convert the dictionary to JSON and save it to the file
with open(json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(summarize_document, json_file, ensure_ascii=False, indent=4)

# Print a message indicating that the data has been saved
print(f"JSON data saved to {json_file_path}")

JSON data saved to summarize_document.json


In [66]:
# Reorder the dictionary by moving "Translator’s Preface" to the beginning
reordered_data = {
    "Translator’s Preface": summarize_document["Translator’s Preface"],
    **{key: summarize_document[key] for key in summarize_document if key != "Translator’s Preface"}
}

In [67]:
reordered_data

{'Translator’s Preface': {' ': 'Fyodor Dostoevsky was a Russian author known for his novel "Crime and Punishment." He had a difficult upbringing and was arrested in 1849 for his involvement in conversations against censorship. His death sentence was commuted to hard labor, and he spent several years in Siberia. Dostoevsky suffered from epilepsy and experienced intense suffering during his imprisonment, which influenced his writing. He faced financial struggles throughout his life but was beloved by readers in Russia. He died in 1880 and was mourned by many. Dostoevsky\'s insight and wisdom in his writing resonated with readers and made him a highly regarded author.'},
 'Part I': {'Chapter 1': 'On a hot evening in July, a young man leaves his garret and walks towards a bridge, avoiding his landlady. He is in debt to her and afraid of meeting her. The young man is in an irritable and isolated state, and fears interacting with anyone. He is absorbed in himself and has given up on practica

# PDF Generation of Summary

In [113]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet

# Create a PDF document
pdf_file_path = "summarize_document.pdf"
doc = SimpleDocTemplate(pdf_file_path, pagesize=letter)

story = []

# Define padding values
padding_left = 50  # Adjust the left padding
padding_right = 50  # Adjust the right padding
padding_top = 50  # Adjust the top padding
padding_bottom = 50  # Adjust the bottom padding

# Define a style for the title
title_style = getSampleStyleSheet()["Title"]
title_style.fontName = "Helvetica-Bold"
title_style.alignment = 1  # Centered
title_style.fontSize = 42  # Increase the font size for the title
title_style.leading = 56  # Increase the line spacing (leading) for the title

# Define a style for the top-level section titles (Heading1)
styles = getSampleStyleSheet()
heading1_style = styles["Heading1"]
heading1_style.fontSize = 36  # Increase the font size for Heading1
heading1_style.leading = 48  # Increase the line spacing (leading) for the title

heading2_style = styles["Heading2"]
heading2_style.fontSize = 28  # Increase the font size for Heading1
heading2_style.leading = 36  # Increase the line spacing (leading) for the title

# Define a style for the paragraphs
style = styles["Normal"]
style.fontSize = 14  # Increase the font size for the paragraphs
style.leading = 20  # Increase the line spacing (leading) for the title

# Add the title to the PDF
title_text = "Summary: Crime and Punishment"
title = Paragraph(title_text, title_style)
story.append(title)

# Add spacing above the title
story.append(Spacer(1, padding_top))

# Iterate through the JSON data and add the "Translator’s Preface," parts, chapters, and summaries to the PDF
for section, chapters in reordered_data.items():
    # Add the section title to the PDF
    section_title = Paragraph(section, heading1_style)  # Use the Heading1 style for top-level titles
    story.append(section_title)
    
    for chapter, summary in chapters.items():
        # Skip the empty chapter title for the "Translator’s Preface"
        if chapter:
            # Add the chapter title to the PDF
            chapter_title = Paragraph(chapter, heading2_style)
            story.append(chapter_title)
        
        # Add the chapter summary to the PDF
        chapter_summary = Paragraph(summary, style)
        story.append(chapter_summary)
        
        # Add space between chapters
    story.append(Spacer(1, padding_bottom + 20))  # Add spacing below each chapter

# Build the PDF document
doc.build(story)

print(f"PDF created: {pdf_file_path}")


PDF created: summarize_document.pdf
