In [1]:
%pip install nltk docx
%pip install python-docx pyppeteer
%pip install langchain
%pip install --quiet langchain-google-genai
%pip install -U langchain-community
%pip install --upgrade --quiet  docx2txt
%pip install bert_score
%pip install evaluate


Collecting docx
  Downloading docx-0.2.4.tar.gz (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m911.4 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: docx
  Building wheel for docx (setup.py) ... [?25ldone
[?25h  Created wheel for docx: filename=docx-0.2.4-py3-none-any.whl size=53895 sha256=f533b1367348fb1595aa38a2df89de5549b28dc75eccf09236b7e6648d34211f
  Stored in directory: /Users/lokeshrepaka/Library/Caches/pip/wheels/c1/3e/c3/e81c11effd0be5658a035947c66792dd993bcff317eae0e1ed
Successfully built docx
Installing collected packages: docx
Successfully installed docx-0.2.4
Note: you may need to restart the kernel to use updated packages.
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pyppeteer
  Downloading pyppeteer-2.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyee<12.0.0,>=11.0.0 (from p

In [3]:
import os
import re
import docx
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

input_directory = '/Users/lokeshrepaka/Downloads/project4'
output_directory = './preprocessed_texts'

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

def preprocess_text(text: str) -> str:
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove special chars and numbers
    text = re.sub("[^A-Za-z]+", " ", text)
    
    # remove stopwords
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if not w.lower() in stopwords.words("english")]
    text = " ".join(tokens)
    text = text.lower().strip()
    
    return text

def read_docx(file_path):
    doc = docx.Document(file_path)
    text = []
    for paragraph in doc.paragraphs:
        text.append(paragraph.text)
    return "\n".join(text)

def save_to_docx(text, filename):
    doc = docx.Document()
    doc.add_paragraph(text)
    doc.save(filename)

def preprocess_and_save(input_directory, output_directory):
    for filename in os.listdir(input_directory):
        if filename.endswith(".docx"):
            file_path = os.path.join(input_directory, filename)
            text = read_docx(file_path)
            preprocessed_text = preprocess_text(text)
            output_path = os.path.join(output_directory, filename)
            save_to_docx(preprocessed_text, output_path)
            print(f"Preprocessed file saved: {output_path}")

# Run the preprocessing and saving
preprocess_and_save(input_directory, output_directory)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lokeshrepaka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lokeshrepaka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessed file saved: ./preprocessed_texts/Doc_1.docx
Preprocessed file saved: ./preprocessed_texts/Divislaboratories2 1.docx
Preprocessed file saved: ./preprocessed_texts/Sun Pharmaceutical Industries Limited1 1.docx
Preprocessed file saved: ./preprocessed_texts/Doc_7.docx
Preprocessed file saved: ./preprocessed_texts/Doc_6.docx
Preprocessed file saved: ./preprocessed_texts/Samsung Electronics Co2 2.docx
Preprocessed file saved: ./preprocessed_texts/Sun Pharmaceutical Industries Limited 2 1.docx
Preprocessed file saved: ./preprocessed_texts/Doc_9.docx
Preprocessed file saved: ./preprocessed_texts/Doc_5.docx
Preprocessed file saved: ./preprocessed_texts/Samsung Electronics Co 1 2.docx
Preprocessed file saved: ./preprocessed_texts/Doc_4.docx
Preprocessed file saved: ./preprocessed_texts/Doc_8.docx
Preprocessed file saved: ./preprocessed_texts/Divislaboratories1 1.docx
Preprocessed file saved: ./preprocessed_texts/Doc_3.docx
Preprocessed file saved: ./preprocessed_texts/Doc_2.docx


In [4]:
import os
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from docx import Document

def preprocess_text(text):
    # Placeholder for actual preprocessing function
    # Implement the actual text preprocessing logic here
    return text

def read_docx(file_path):
    doc = Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return '\n'.join(full_text)

def write_docx(text, file_path):
    doc = Document()
    for paragraph in text.split('\n'):
        doc.add_paragraph(paragraph)
    doc.save(file_path)

def generate_clusters_and_save_summaries(preprocessed_texts_directory, output_directory, eps=0.25, min_samples=2):
    # Get the list of preprocessed text files
    file_names = [f for f in os.listdir(preprocessed_texts_directory) if f.endswith('.docx')]
    
    # Read and preprocess the documents
    documents = []
    original_documents = {}
    for file_name in file_names:
        file_path = os.path.join(preprocessed_texts_directory, file_name)
        text = read_docx(file_path)
        preprocessed_text = preprocess_text(text)
        documents.append(preprocessed_text)
        original_documents[file_name] = text
    
    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)
 
    # Calculate cosine similarity matrix
    cosine_sim = cosine_similarity(X)
 
    # Initialize DBSCAN clustering algorithm
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
    cluster_labels = dbscan.fit_predict(cosine_sim)
 
    # Initialize clustered_documents dictionary to store clusters
    clustered_documents = defaultdict(list)
 
    # Assign texts to clusters based on cluster labels
    for i, label in enumerate(cluster_labels):
        clustered_documents[label].append(file_names[i])
 
    # Create and save merged documents for each cluster
    for cluster_id, docs in clustered_documents.items():
        merged_text = "\n".join([original_documents[doc] for doc in docs])
        output_file_path = os.path.join(output_directory, f'Cluster_{cluster_id}.docx')
        write_docx(merged_text, output_file_path)
        print(f'Cluster {cluster_id} -> {output_file_path}')

# Example usage
preprocessed_texts_directory = './preprocessed_texts'
output_directory = './output'
os.makedirs(output_directory, exist_ok=True)
generate_clusters_and_save_summaries(preprocessed_texts_directory, output_directory)


Cluster 0 -> ./output/Cluster_0.docx
Cluster 1 -> ./output/Cluster_1.docx
Cluster 2 -> ./output/Cluster_2.docx
Cluster 3 -> ./output/Cluster_3.docx
Cluster -1 -> ./output/Cluster_-1.docx
Cluster 4 -> ./output/Cluster_4.docx


In [5]:
from langchain import PromptTemplate
from langchain.document_loaders import WebBaseLoader
from langchain.schema import StrOutputParser
from langchain.schema.prompt_template import format_document

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [6]:
import os 
import getpass 
os.environ['GOOGLE_API_KEY'] = getpass.getpass('api key:')
#AIzaSyBSyhQnpBcHhl1Jo_ZPoI94A_gm3LbGXvQ

In [7]:
from langchain_google_genai import ChatGoogleGenerativeAI


llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest",
                 temperature=0.7, top_p=0.85)

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
chunks_prompt="""
Please summarize the below text:
text:`{text}'
Summary:
"""
map_prompt_template=PromptTemplate(input_variables=['text'],
                                    template=chunks_prompt)

In [28]:
final_combine_prompt='''
Provide a final summary of the entire speech with these important points and give output in  output format
Objective:
The goal of this task is to create a single, unique summarized document from a collection of documents by eliminating duplicate and similar content.

Deduplication Criteria:
Identify and remove exact duplicate sentences or paragraphs.
Detect and eliminate paraphrased or semantically similar content. Consider content to be similar if it conveys the same information using different wording.
Retain unique and important information only once.
Summarization Guidelines:

The final summary should be concise yet comprehensive, capturing all unique and important information from the input documents.
Ensure that key points and critical information are included.
Maintain the original meaning and context of the information.

Formatting Requirements:

Use clear and logical headings to organize the summary.
Bullet points may be used for lists or key points.
Ensure the final document is well-structured and easy to read.
Please avoid using any symbols such as asterisks or hash signs.

Title Format:
Generate two titles: one for the document and one for the summary.
Document_Title: Used to rename the document.
Summary_Title: A concise, descriptive title for the overall summary of the document.
For example, if the document talks about Apple Inc. as follows:
"Apple Inc., headquartered in Cupertino, California, is a multinational tech giant and the world's largest company by market capitalization. Founded in 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne, Apple began by selling personal computers like the Apple II, which became a bestseller."
Then the document title and summary title should be:
Document Title: Apple
Summary Title: Apple Inc.: From Garage Startup to Tech Giant


Examples:
Input Documents:
"The new AI model improves accuracy in image recognition. It can process images faster than previous versions."
"The AI model can process images at a faster rate and improves accuracy in recognizing images compared to earlier versions."

Output Format:
Document_Title: Image Recognition
Summary_Title: Improved AI Model for Image Recognition

"The new AI model improves accuracy in image recognition and can process images faster than previous versions."":\n `{text}`
'''
final_combine_prompt_template=PromptTemplate(input_variables=['text'],
                                             template=final_combine_prompt)

In [29]:
import os
import re
from langchain_community.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain import PromptTemplate
from docx import Document
from evaluate import load
import time
import pandas as pd

# Function to extract document name from its path
def extract_docx_name(docx_path):
    return docx_path.split('/')[-1]

# Function to load the document using the Docx2txtLoader
def load_docx(docx_path):
    loader = Docx2txtLoader(docx_path)
    return loader.load()

# Function to read text from a .docx file
def read_docx(file_path):
    doc = Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

# Define the document prompt template
doc_prompt = PromptTemplate.from_template("{page_content}")

# Function to format the document for the prompt
def format_document(doc_content, prompt):
    return prompt.format(page_content=doc_content)

# Function to preprocess the text by removing specified symbols
def preprocess_text(text):
    # Remove symbols like *, # etc.
    cleaned_text = re.sub(r'[*#]', '', text)
    return cleaned_text

# Function to extract the title from the text
def extract_title_and_content(text):
    match = re.search(r'Document_Title:\s*(.*?)(\r\n|\n|$)', text)
    if match:
        title = match.group(1).strip()
        # Remove the matched title line from the content
        text = text.replace(match.group(0), '', 1)
        # Remove invalid characters for filenames
        title = re.sub(r'[\/:*?"<>|]', '', title)
        return title, text
    return "Summary", text

# Directory paths
directory_path = "./output"
summary_directory_path = "./summary_output"

# Create the summary directory if it doesn't exist
if not os.path.exists(summary_directory_path):
    os.makedirs(summary_directory_path)

# Collect all .docx files in the directory
docx_files = [f for f in os.listdir(directory_path) if f.endswith('.docx')]

# Prepare the results list for storing evaluation metrics
results_list = []

# Loop over each .docx file
for docx_file in docx_files:
    docx_path = os.path.join(directory_path, docx_file)
    docx_name = extract_docx_name(docx_path)
    time.sleep(15)
    print(f"Processing document: {docx_name}")

    # Load the document
    doc_contents = load_docx(docx_path)  # Assuming load_docx returns a list of Document objects

    # Extract text content from each Document object and process separately
    for doc_content in doc_contents:
        text = doc_content.page_content
        
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=300)
        chunks = text_splitter.create_documents([text])
        #print("Number of chunks:", len(chunks))
        
        summary_chain = load_summarize_chain(
            llm=llm,
            chain_type='map_reduce',
            map_prompt=map_prompt_template,
            combine_prompt=final_combine_prompt_template,
            verbose=False
        )
        output = summary_chain.run(chunks)
        
        # Preprocess the summary output
        cleaned_output = preprocess_text(output)
        title, text = extract_title_and_content(cleaned_output)
        print("Document title:",title)
        print(text)
        
        #print(cleaned_output)
        
        # Create a new document to save the summary
        summary_doc = Document()
        summary_doc.add_heading(f'Summary of {title}', level=1)
        summary_doc.add_paragraph(text)
        
        # Define the output path for the summary
        summary_docx_path = os.path.join(summary_directory_path, f'{title}.docx')
        
        # Save the summarized document
        summary_doc.save(summary_docx_path)
        print(f"Summary saved to: {summary_docx_path}")

        # Perform BERTScore evaluation
        predictions = [read_docx(summary_docx_path)]
        references = [read_docx(docx_path)]
        
        bertscore = load("bertscore")
        results = bertscore.compute(predictions=predictions, references=references, model_type="distilbert-base-uncased")
        print(results)

        precision = results['precision'][0]
        recall = results['recall'][0]
        f1_score = results['f1'][0]
        
        # Append the results to the list
        results_list.append({
            'document_name': title,
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score
        })

# Save the results to an Excel file
results_df = pd.DataFrame(results_list)
results_df.to_excel('summary_evaluation_results.xlsx', index=False)
print("Evaluation results saved to 'summary_evaluation_results.xlsx'")


Processing document: Cluster_-1.docx
Document title: Apple
Summary_Title: Apple: From Garage Startup to Tech Giant

This document summarizes the history of Apple Inc., from its founding in 1976 to its current position as a global technology leader. 

Founding and Early Years:

 Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976.
 The company's early success was driven by the Apple II, a personal computer designed by Wozniak.
 The Macintosh, a groundbreaking computer with a graphical user interface (GUI), was introduced in 1984.
 Apple experienced rapid growth but also faced challenges, including the failure of the Lisa computer.

Challenges and Internal Conflicts:

 The Macintosh's early sales slump led to power struggles between Jobs and CEO John Sculley.
 Jobs was removed from the Macintosh division and later resigned from Apple.
 Wozniak also left the company, expressing frustration with its direction.

Decline and Resurgence:

 Apple's high prices for Macintos

KeyboardInterrupt: 