In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#from cleantext import clean
import chromadb
from sentence_transformers import SentenceTransformer
from pathlib import Path
import numpy as np
from sklearn.cluster import HDBSCAN

from dataclasses import dataclass
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.schema import Document

import umap
import plotly.express as px
import logging

# Setup logger
logger = logging.getLogger(__name__)

import os
os.environ['OPENAI_API_KEY'] = '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'


In [36]:
@dataclass
class DocumentData:
    filename: str
    content: list[Document]
    page_count: int
    cleaned_text: str | None = None
    embedding: np.ndarray | None = None

class FileLoader:
    def __init__(self, files: list[Path]) -> None:
        self.files_loaded =  [Path(f) if isinstance(f, str) else f for f in files]
        self.loaded_documents: list[DocumentData] = []

    def _load_single_file(self, file_path: Path) -> DocumentData | None:
        try:
            if not file_path.exists():
                raise FileNotFoundError(f"File {file_path} not found")

            loader = PyMuPDFLoader(str(file_path))
            data = loader.load()

            return DocumentData(
                filename=file_path.name,
                content=data,
                page_count=len(data)
            )

        except Exception as e:
            print(f"Failed to load {file_path}: {e}")
            return None

    def load_files(self) -> list[DocumentData]:
        self.loaded_documents.clear()

        for file_path in self.files_loaded:
            doc = self._load_single_file(file_path)
            if doc:
                self.loaded_documents.append(doc)

        logger.info(f"Successfully loaded {len(self.loaded_documents)} out of {len(self.files_loaded)} files")
        return self.loaded_documents

class TextExtractorCleaner:
    def __init__(self, texts: list[DocumentData]) -> None:
        self.texts = texts

    def _clean_single_text(self, single_text: str) -> str | None:
        try:
            if not single_text:
                logger.info("An error ocured while cleaning text")
                return None

            cleaned_text = clean(single_text,
                fix_unicode=True,
                to_ascii=False,
                lower=False,
                no_line_breaks=False,
                normalize_whitespace=True,
                no_urls=True,
                no_emails=True,
                no_phone_numbers=True,
                no_numbers=False,
                no_digits=False,
                no_currency_symbols=True,
                no_punct=False
            )

            return cleaned_text

        except Exception as e:
            logger.info(f"Failed to clean {single_text}: {e}")
            return None

    def clean_texts(self) -> list[DocumentData]:

        for doc_data in self.texts:
            all_pages_text = []
            for document in doc_data.content:
                cleaned = self._clean_single_text(document.page_content)
                if cleaned:
                    all_pages_text.append(cleaned)
            doc_data.cleaned_text = " ".join(all_pages_text)

        logger.info(f"{len(self.texts)} texts cleaned")

        return self.texts


class EmbeddingEngine:
    def __init__(self, text_list: list[DocumentData]) -> None:
        self.embedding_model = SentenceTransformer("malteos/scincl")
        self.texts = text_list

    def _embedding(self, text_embedding: str) -> np.ndarray | None:
        try:
            if not text_embedding:
                logger.info("An error ocured while embedding")
                return None

            embedding = self.embedding_model.encode(text_embedding)
            return embedding

        except Exception as e:
            logger.info(f"Failed to embed {text_embedding}: {e}")
            return None

    def create_embeddings(self) -> list[DocumentData]:
        for doc_data in self.texts:
            embedded_doc = self._embedding(doc_data.cleaned_text)
            doc_data.embedding = embedded_doc

        logger.info(f"{len(self.texts)} texts embedded")

        return self.texts


class CreateVectorStore:
    def __init__(self, document_db: list[DocumentData]) -> None:
        self.raw_data = document_db
        self.client = chromadb.Client()
        self.collection = None
        self.database = []

    def _save_to_vector_store(self) -> None:
        successful_saves = 0  # ← TUTAJ na początku metody

        for doc_data in self.raw_data:
            if doc_data.embedding is None:
                logger.warning(f"Skipping {doc_data.filename} - embedding failed")
                continue

            self.collection.add(
                embeddings=doc_data.embedding.tolist(),
                documents=doc_data.cleaned_text or "No content",
                ids=doc_data.filename.replace('.pdf', ''),
                metadatas={
                    "filename": doc_data.filename,
                    "total_pages": doc_data.page_count,
                    "doc_type": "full_document"
                }
            )
            successful_saves += 1

        logger.info(f"Saved {successful_saves}/{len(self.raw_data)} documents")

    def create_vector_store(self) -> chromadb.api.models.Collection.Collection:
        collection_name = "papers"

        # Sprawdź czy kolekcja istnieje i usuń ją
        try:
            existing_collections = self.client.list_collections()
            collection_exists = any(col.name == collection_name for col in existing_collections)

            if collection_exists:
                self.client.delete_collection(collection_name)
                print(f"Usunięto istniejącą kolekcję: {collection_name}")
        except Exception as e:
            print(f"Błąd podczas sprawdzania/usuwania kolekcji: {e}")

        # Utwórz nową kolekcję
        self.collection = self.client.create_collection(collection_name)
        self._save_to_vector_store()
        return self.collection

class DimensionalityReduction:
    def __init__(self, vector_store, metric='cosine', n_neighbors=15, min_dist=0.1) -> None:
        self.vector_store = vector_store
        self.reducer = umap.UMAP(metric=metric, n_neighbors=n_neighbors, min_dist=min_dist, random_state=1)
        self.results = []

    def umap(self) -> pd.DataFrame:
        results = self.vector_store.get(include=['embeddings'])
        embedding = results['embeddings']
        embedding_umap = self.reducer.fit_transform(embedding)
        self.results = pd.DataFrame(embedding_umap, columns=["UMAP1", "UMAP2"])
        logger.info("UMAP dimensionality reduction done.")
        return self.results


class Clustering:
    def __init__(self, reducted_results: pd.DataFrame, min_cluster_size=10) -> None:
        self.reducted_results = reducted_results
        self.clustering_algo = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=3)
        self.clustering_results = None

    def hdbscan(self) -> pd.DataFrame:
        fitted_model = self.clustering_algo.fit(self.reducted_results)
        self.clustering_results = self.reducted_results.copy()
        self.clustering_results["HDBCLUSTER"] = fitted_model.labels_
        logger.info("HDB clustering reduction done.")
        return self.clustering_results

class UMAPVisualization:
    def __init__(self, reducted_results: pd.DataFrame) -> None:
        self.reducted_results = reducted_results

    def visualize_umap(self) -> None:
        sns.set_theme(style="white")
        plt.figure(figsize=(8, 8))
        sns.scatterplot(x="UMAP1", y="UMAP2", hue="HDBCLUSTER", palette="pastel", data=self.reducted_results)
        plt.title("Document Clusters in UMAP Space")
        plt.show()

class OpenAISummarizer:
    def __init__(self,  combined_texts : str) -> None:
        self.combined_texts = combined_texts
    def _open_AI_API(self, ):
        import os
        from openai import OpenAI

        client = OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY"),
        )

        response = client.responses.create(
            model="gpt-4.1-nano",
            instructions="""Analyze the following collection of academic documents that have been grouped together by similarity clustering.

            Return your response as valid JSON in this exact format:
            {
                "title": "descriptive title (5-10 words maximum) of all documents",
                "summary": "3-4 sentence overview of main themes and topics",
            }

            Make sure to return ONLY valid JSON, nothing else.""",
            input=self.combined_texts,
        )

        import json
        return json.loads(response.output_text)


class ClusterSummary:
    def __init__(self, clustered_vectors, vector_store):
        self.clustered_vectors = clustered_vectors
        self.vector_store = vector_store

    def _analyze_single_cluster(self, cluster_id, all_docs, max_chars_per_doc=5000):
        """Ukryta metoda do analizy pojedynczego klastra"""
        all_metadata = self.vector_store.get(include=['metadatas'])

        cluster_mask = self.clustered_vectors['HDBCLUSTER'] == cluster_id
        cluster_indices = self.clustered_vectors[cluster_mask].index.tolist()

        # Pobierz dokumenty z ograniczeniem znaków
        truncated_docs = [all_docs['documents'][i][:max_chars_per_doc] for i in cluster_indices]
        combined_text = " ".join(truncated_docs)

        ai_result = OpenAISummarizer(combined_text)._open_AI_API()

        # Dodaj AI wyniki do danych klastra
        cluster_data = self.clustered_vectors[cluster_mask].copy()
        cluster_data['filename'] = [all_metadata['metadatas'][i]['filename'] for i in cluster_indices]
        cluster_data['cluster_title'] = ai_result['title']
        cluster_data['cluster_summary'] = ai_result['summary']

        return cluster_data

    def get_dataframe(self):
        """Tworzy DataFrame z analizą wszystkich klastrów"""
        all_docs = self.vector_store.get(include=['documents'])
        results = []

        for cluster_id in self.clustered_vectors['HDBCLUSTER'].unique():
            if cluster_id == -1:  # pomiń noise
                continue

            print(f"Przetwarzam klaster {cluster_id}...")
            cluster_data = self._analyze_single_cluster(cluster_id, all_docs)
            results.append(cluster_data)

        return pd.concat(results, ignore_index=True)


In [3]:
import spacy
from spacy_cleaner import Cleaner, processing
logger = logging.getLogger(__name__)

import spacy
from spacy_cleaner import Cleaner, processing


class TextExtractorCleaner:
    def __init__(self, texts: list[DocumentData]) -> None:
        self.texts = texts
        logger.info("Loading spaCy model...")
        self.nlp = spacy.load("en_core_web_sm")

        logger.info("Initializing text cleaner...")
        self.cleaner = Cleaner(
            self.nlp,
            processing.remove_stopword_token,
            processing.remove_punctuation_token,
            processing.mutate_lemma_token,
        )

    def _clean_single_text(self, single_text: str) -> str:
        try:
            cleaned_result = self.cleaner.clean([single_text])

            if isinstance(cleaned_result, list) and len(cleaned_result) > 0:
                cleaned_text = cleaned_result[0]

                if isinstance(cleaned_text, str):
                    return " ".join(cleaned_text.split())  # Usuń nadmiarowe spacje
                elif isinstance(cleaned_text, list):
                    return " ".join(cleaned_text)

            return None

        except Exception as e:
            logger.info(f"Failed to clean {single_text}: {e}")
            return None

    def clean_texts(self) -> list[DocumentData]:
        logger.info(f"Starting text cleaning for {len(self.texts)} documents...")

        for doc_data in self.texts:
            # Batch processing
            page_texts = [doc.page_content for doc in doc_data.content if doc.page_content.strip()]

            if page_texts:
                try:
                    # Przetwórz wszystkie strony naraz (bardziej wydajne niż pojedynczo)
                    cleaned_results = self.cleaner.clean(page_texts)

                    processed_pages = []
                    for cleaned_text in cleaned_results:
                        if isinstance(cleaned_text, str):
                            processed_pages.append(" ".join(cleaned_text.split()))
                        elif isinstance(cleaned_text, list):
                            processed_pages.append(" ".join(cleaned_text))

                    doc_data.cleaned_text = " ".join(filter(None, processed_pages))

                except Exception as e:
                    logger.warning(f"Batch processing failed for {doc_data.filename}: {e}")
                    # Fallback do przetwarzania pojedynczego
                    all_pages_text = []
                    for document in doc_data.content:
                        cleaned = self._clean_single_text(document.page_content)
                        if cleaned:
                            all_pages_text.append(cleaned)
                    doc_data.cleaned_text = " ".join(all_pages_text)
            else:
                doc_data.cleaned_text = ""

        logger.info(f"{len(self.texts)} texts cleaned successfully")
        return self.texts

In [38]:
file_path = Path(r"C:\Users\pawel\OneDrive\Python projekty\AWS projects\PDF_organiser\pdf\test")
#file_path = Path(r"C:\Users\pawel\OneDrive\Soil\Papers")
pdf_files = list(file_path.glob("*.pdf"))

files_loades = FileLoader(pdf_files).load_files()
extracted_texts = TextExtractorCleaner(files_loades).clean_texts()
embedded_texts = EmbeddingEngine(files_loades).create_embeddings()
vector_store = CreateVectorStore(embedded_texts).create_vector_store()

Skipping 1.pdf - embedding failed
Skipping 10.pdf - embedding failed
Skipping 11.pdf - embedding failed
Skipping 12.pdf - embedding failed
Skipping 13.pdf - embedding failed
Skipping 14.pdf - embedding failed
Skipping 15.pdf - embedding failed
Skipping 16.pdf - embedding failed
Skipping 17.pdf - embedding failed
Skipping 18.pdf - embedding failed
Skipping 19.pdf - embedding failed
Skipping 2.pdf - embedding failed
Skipping 20.pdf - embedding failed
Skipping 21.pdf - embedding failed
Skipping 22.pdf - embedding failed
Skipping 23.pdf - embedding failed
Skipping 24.pdf - embedding failed
Skipping 25.pdf - embedding failed
Skipping 26.pdf - embedding failed
Skipping 27.pdf - embedding failed
Skipping 28.pdf - embedding failed
Skipping 29.pdf - embedding failed
Skipping 3.pdf - embedding failed
Skipping 30.pdf - embedding failed
Skipping 31.pdf - embedding failed
Skipping 32.pdf - embedding failed
Skipping 33.pdf - embedding failed
Skipping 34.pdf - embedding failed
Skipping 35.pdf - embed

Usunięto istniejącą kolekcję: papers


In [35]:
results = vector_store.query(
    query_texts=["viruses"], # Chroma will embed this for you
    n_results=2 # how many results to return
)

print(results)

{'ids': [[]], 'embeddings': None, 'documents': [[]], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[]], 'distances': [[]]}


In [11]:
reduced_vectorstore = DimensionalityReduction(vector_store, metric='cosine', n_neighbors=10, min_dist=0.1).umap()
clustered_vectors = Clustering(reduced_vectorstore, min_cluster_size=5).hdbscan()
UMAPVisualization(clustered_vectors).visualize_umap()



ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [14]:
df_summary = ClusterSummary(clustered_vectors, vector_store).get_dataframe()

NameError: name 'clustered_vectors' is not defined

In [18]:
fig = px.scatter(
    df_summary,
    x='UMAP1',
    y='UMAP2',
    color='HDBCLUSTER',
    hover_data=['filename', 'cluster_title'],  # Bez summary
    title='Document Clusters'
)

fig.update_layout(width=800, height=600)
fig.show()

In [8]:
cluster_indices = clustered_vectors[clustered_vectors['HDBCLUSTER'] == 0].index.tolist()

all_docs = vector_store.get(include=['documents'])

cluster_documents = [all_docs['documents'][i] for i in cluster_indices]
combined_text = " ".join(cluster_documents)

wynik = ClusterSummarizer(combined_text)._open_AI_API()

NameError: name 'clustered_vectors' is not defined

In [7]:
results = vector_store.get(include=['embeddings', 'metadatas', 'documents'])
df_meta = pd.DataFrame(results['metadatas'])
results

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'included': ['embeddings', 'metadatas', 'documents'],
 'data': None,
 'metadatas': []}

In [16]:
results = vector_store.get(include=['embeddings', 'metadatas', 'documents'])
embedding = results['embeddings']

reducer = umap.UMAP(metric='cosine')
embedding_umap = reducer.fit_transform(embedding)

embedding_umap_df = pd.DataFrame(embedding_umap)
embedding_umap_df.columns = ["UMAP1", "UMAP2"]

hdb = HDBSCAN(min_cluster_size=10, min_samples=1).fit(embedding_umap_df)
hdb.fit(embedding_umap_df)

embedding_umap_df["HDBCLUSTER"] = hdb.labels_
embedding_umap_df

sns.set_theme(style="whitegrid")

plt.figure(figsize=(10, 10))
sns.scatterplot(x="UMAP1", y="UMAP2", hue="HDBCLUSTER", data=embedding_umap_df)
plt.show()



In [None]:
#Files → TextExtraction → TextCleanup → Embedding → VectorStore → Search/Export

In [67]:
# Encode query tym samym modelem
query_text = "vaterite crystal structure"
query_embedding = model.encode([query_text])

# Search używając query embedding
results = collection.query(
    query_embeddings=query_embedding.tolist(),  # Nie query_texts!
    n_results=5
)

print("Znalezione dokumenty:")
for i, doc in enumerate(results['documents'][0]):
    metadata = results['metadatas'][0][i]
    distance = results['distances'][0][i]

    print(f"📄 {metadata['filename']}")
    print(f"📊 Similarity: {1-distance:.3f}")
    print(f"📝 Fragment: {doc[:200]}...")
    print("---")

Znalezione dokumenty:
📄 1.pdf
📊 Similarity: -98.942
📝 Fragment: ['Subscriber access provided by Eastern Michigan University | Bruce T. Halle Library\nCrystal Growth & Design is published by the American Chemical\nSociety. 1155 Sixteenth Street N.W., Washington, DC...
---
