# Data Preparation


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# later cells can read the Drive folder through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import shutil

# Where the PDFs live on Drive, and where they are staged locally
source_folder = '/content/drive/My Drive/n8n-bot-project'
local_download_path = '/content/n8n-pdfs'

# Ensure the staging directory exists (no error if it is already there)
os.makedirs(local_download_path, exist_ok=True)

# Stage every file whose extension is .pdf (matched case-insensitively)
for entry in os.listdir(source_folder):
    if not entry.lower().endswith('.pdf'):
        continue
    shutil.copy2(
        os.path.join(source_folder, entry),
        os.path.join(local_download_path, entry),
    )
    print(f'Copied: {entry}')


In [None]:
!pip install pymupdf
import fitz  # PyMuPDF
import pandas as pd

# Extract text and metadata from all PDFs
pdf_data = []

for filename in os.listdir(local_download_path):
    if not filename.lower().endswith('.pdf'):
        continue
    file_path = os.path.join(local_download_path, filename)

    # Open as a context manager so the document handle is released even if
    # text extraction raises part-way through a file.
    with fitz.open(file_path) as doc:
        # Join once instead of growing a string page by page
        text = "".join(page.get_text() for page in doc)
        num_pages = len(doc)

    # Store filename, full text, and number of pages
    pdf_data.append({
        "filename": filename,
        "text": text,
        "num_pages": num_pages
    })

# Convert to DataFrame for inspection or processing.
# Explicit columns keep the schema stable even when no PDF was found, so
# later cells that read df_pdf['text'] do not fail with a KeyError.
df_pdf = pd.DataFrame(pdf_data, columns=["filename", "text", "num_pages"])
print(" Extracted text and metadata from all PDFs:")
df_pdf.head()


# Preprocessing

In [8]:
import re
import spacy
from tqdm import tqdm

# Load the spaCy "en_core_web_sm" pipeline once; lemmatize_text() below reuses this object.
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    """
    Turn raw PDF text into one whitespace-normalized string.
    Intended as input for sentence embeddings with all-MiniLM-L6-v2.

    The substitutions below run in order:
      1. header/footer noise: "page N", and everything from "printed in",
         a copyright sign followed by a year, or "DOI:" to the end of the line
      2. e-mail addresses
      3. affiliation text: from "university", "institute", "dept", ... to the
         end of the line (case-insensitive)
      4. words hyphenated across a line break are glued back together
      5. newlines (with surrounding whitespace) become a single space
      6. runs of two or more whitespace characters become a single space

    Args:
        text: Raw text as extracted from a PDF.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    # (pattern, replacement, flags) triples, applied top to bottom
    substitutions = (
        (r'(?i)page\s*\d+|printed in .*|©\s*\d{4}.*|DOI:.*', '', 0),
        (r'\S+@\S+', '', 0),
        (r'\b(university|institute|dept\.?|department|school of|faculty of)[^\n]*', '', re.I),
        (r'(\w+)-\s*\n\s*(\w+)', r'\1\2', 0),
        (r'\s*\n\s*', ' ', 0),
        (r'\s{2,}', ' ', 0),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    return text.strip()

def lemmatize_text(text):
    """
    Return the lemmas of `text` joined by single spaces.

    The text is processed with the shared spaCy pipeline `nlp`; tokens that
    spaCy flags as punctuation are left out of the result.
    """
    lemmas = (token.lemma_ for token in nlp(text) if not token.is_punct)
    return ' '.join(lemmas)

In [None]:

import pandas as pd


pkl_path = "df_pdf.pkl"

# Reuse the pickled frame when it exists. Otherwise clean and lemmatize every
# document, store the result in a `cleaned_text` column, and pickle the frame.
if not os.path.exists(pkl_path):
    print("Processing and cleaning text...")

    df_pdf['cleaned_text'] = [
        lemmatize_text(clean_text(raw_text))
        for raw_text in tqdm(df_pdf['text'])
    ]

    df_pdf.to_pickle(pkl_path)
    print("Saved df_pdf to df_pdf.pkl.")
else:
    print("Loading cached df_pdf from df_pdf.pkl...")
    df_pdf = pd.read_pickle(pkl_path)



In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=512 and chunk_overlap=50
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)

# One record per chunk: the chunk text plus its source file, its position
# within that file's chunk sequence, and the file's page count.
chunks = [
    {
        "filename": row['filename'],
        "chunk_id": i,
        "chunk": chunk,
        "num_pages": row['num_pages'],
    }
    for _, row in df_pdf.iterrows()
    for i, chunk in enumerate(text_splitter.split_text(row['cleaned_text']))
]

# Tabulate the chunk records and preview them
df_chunks = pd.DataFrame(chunks)
print(" Text split into chunks with metadata:")
print(df_chunks.head())

# Feature Engineering

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
!pip install qdrant-client
import qdrant_client
from qdrant_client.models import PointStruct, VectorParams, Distance


# Sentence-embedding model used to vectorize the chunk texts
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every chunk text in a single call, showing a progress bar
print("Generating embeddings for all chunks...")
chunk_texts = df_chunks['chunk'].tolist()
embeddings = model.encode(chunk_texts, show_progress_bar=True)



**Note:** Only run this the first time.

In [None]:
import pickle

# On-disk cache for the chunk embeddings.
embeddings_cache_path = "embeddings.pkl"

cached_embeddings = None
if os.path.exists(embeddings_cache_path):
    print("Loading embeddings from cache...")
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only load a cache that this notebook wrote itself.
    with open(embeddings_cache_path, "rb") as f:
        cached_embeddings = pickle.load(f)

    # A cache written for a different set of chunks would pair vectors with the
    # wrong rows of df_chunks in every later cell, so reject it here.
    if len(cached_embeddings) != len(df_chunks):
        print(
            f"Cached embeddings cover {len(cached_embeddings)} chunks but df_chunks has "
            f"{len(df_chunks)}; ignoring the stale cache."
        )
        cached_embeddings = None

if cached_embeddings is not None:
    embeddings = cached_embeddings
else:
    # Nothing usable on disk: persist the `embeddings` already held in memory.
    # (This branch only saves; it does not compute anything.)
    print("Saving embeddings to embeddings.pkl...")
    with open(embeddings_cache_path, "wb") as f:
        pickle.dump(embeddings, f)


In [None]:
# Sanity check: length of the first embedding vector, and number of rows in df_pdf
print("len(embeddings[0]): ",len(embeddings[0]))
print("len(df_pdf): ", len(df_pdf))

# Perform KMeans clustering on all chunk embeddings

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

# Find best K using silhouette_score
def auto_kmeans(embeddings, min_k=2, max_k=15):
    """
    Pick the number of KMeans clusters with the highest silhouette score.

    Args:
        embeddings: The samples to cluster, one row per sample.
        min_k: Smallest number of clusters to try.
        max_k: Largest number of clusters to try (inclusive).

    Returns:
        The tried k with the best silhouette score. If no k in the requested
        range is valid for the number of samples, `min_k` is returned as is.
    """
    # silhouette_score needs 2 <= n_clusters <= n_samples - 1. The bound must
    # come from the samples actually being clustered; the earlier version used
    # len(df_pdf) (the number of PDFs), which cut the search short whenever
    # there were few source files but many chunks.
    n_samples = len(embeddings)
    lowest_k = max(min_k, 2)
    highest_k = min(max_k, n_samples - 1)

    best_k = min_k
    best_score = -1
    for k in range(lowest_k, highest_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(embeddings)
        score = silhouette_score(embeddings, labels)
        # Strict '>' keeps the smallest k among equally good candidates
        if score > best_score:
            best_k = k
            best_score = score
    return best_k

# Auto finding the best # of clusters
n_clusters = auto_kmeans(embeddings)
print(f"Auto-selected best n_clusters = {n_clusters}")

# Perform KMeans with the selected k (same random_state as used during the search)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)
# Store the labels as a new `cluster` column on the chunk table
df_chunks['cluster'] = cluster_labels


In [None]:
# Preview the chunk table, which now carries the `cluster` column
print(df_chunks.head())

# Pseudo-Labelling Classification and 5-fold cross validation


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


# Candidate classifiers trained on the KMeans pseudo-labels.
# RandomForest is seeded (same seed as KMeans) so CV scores are reproducible.
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "LogisticRegression": LogisticRegression(max_iter=1000)
}

# Mean 5-fold cross-validation score per candidate.
# The loop variable is deliberately not named `model`: that global holds the
# SentenceTransformer, and rebinding it here would break any re-run of the
# embedding cell (`model.encode(...)`).
cv_scores = {}
for name, clf in models.items():
    scores = cross_val_score(clf, embeddings, cluster_labels, cv=5)
    cv_scores[name] = np.mean(scores)
    print(f"{name} average CV score: {cv_scores[name]:.4f}")

# Select best model

In [None]:

# Winner = entry with the highest mean CV score (first one wins on ties)
best_model_name = max(cv_scores.items(), key=lambda item: item[1])[0]

# Refit the winning estimator on all embeddings against the KMeans labels
best_model = models[best_model_name]
best_model.fit(embeddings, cluster_labels)
print(f"Best model selected: {best_model_name}")

# Labels the fitted classifier assigns to the same embeddings
optimized_labels = best_model.predict(embeddings)


# Evaluation

In [None]:
from sklearn.metrics import silhouette_score, confusion_matrix, classification_report, adjusted_rand_score, homogeneity_score, completeness_score, v_measure_score, adjusted_mutual_info_score

# Silhouette of each labelling, measured on the embeddings
original_silhouette = silhouette_score(embeddings, cluster_labels)
optimized_silhouette = silhouette_score(embeddings, optimized_labels)

# Agreement between the KMeans labels and the classifier's labels
ari_score = adjusted_rand_score(cluster_labels, optimized_labels)
print(f"Adjusted Rand Index between original and optimized clusters: {ari_score:.4f}")

print("\n Clustering Evaluation Metrics:")
# The KMeans output serves as pseudo ground truth for the agreement metrics,
# so the "Original" column compares the KMeans labels with themselves.
true_labels = cluster_labels

# (display name, scorer) pairs in report order; each scorer takes (truth, prediction)
agreement_scorers = [
    ("Homogeneity Score", homogeneity_score),
    ("Completeness Score", completeness_score),
    ("V-measure Score", v_measure_score),
    ("Adjusted Rand Score", adjusted_rand_score),
    ("Adjusted Mutual Info Score", adjusted_mutual_info_score),
]

metrics_original = {"Silhouette Score": original_silhouette}
metrics_original.update(
    (metric_name, scorer(true_labels, cluster_labels))
    for metric_name, scorer in agreement_scorers
)

metrics_optimized = {"Silhouette Score": optimized_silhouette}
metrics_optimized.update(
    (metric_name, scorer(true_labels, optimized_labels))
    for metric_name, scorer in agreement_scorers
)

# Side-by-side table: one row per metric
df_eval = pd.DataFrame({
    "Metric": list(metrics_original),
    "Original": list(metrics_original.values()),
    "Optimized": list(metrics_optimized.values()),
})

print("\nClustering Evaluation Comparison:")
print(df_eval.to_markdown(index=False))







# Clustering quality comparison

In [None]:
print(f"Original silhouette score: {original_silhouette:.4f}")
print(f"Optimized silhouette score: {optimized_silhouette:.4f}")

# Signed change: negative means the classifier's labels score worse
improvement = optimized_silhouette - original_silhouette
print(f"Silhouette change {improvement:.4f} using pseudo-supervised learning.")

# Largest silhouette drop that is still accepted when adopting the classifier's labels
silhouette_tolerance = 0.01

# Adopt the optimized labels unless the silhouette fell by the tolerance or more.
# This is looser than "only if improved": equal or marginally lower scores
# are accepted too, and the message below says which case occurred.
if optimized_silhouette > original_silhouette - silhouette_tolerance:
    df_chunks['optimized_cluster'] = optimized_labels
    if improvement > 0:
        print("Optimized labels used (silhouette improved)")
    else:
        print(f"Optimized labels used (silhouette within {silhouette_tolerance} of the original)")
else:
    df_chunks['optimized_cluster'] = cluster_labels
    print("Silhouette did not improve. Retaining original cluster labels")

# Visualization of classification

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Imported here because this is the only cell that needs it
from sklearn.decomposition import PCA

conf_mat = confusion_matrix(cluster_labels, optimized_labels)

report = classification_report(cluster_labels, optimized_labels)
print("Classification Report:")
print(report)

# Plotting columns 0 and 1 of a high-dimensional embedding shows an arbitrary
# slice of the space, so project onto the first two principal components
# whenever the embeddings have more than two dimensions.
embedding_matrix = np.asarray(embeddings)
if embedding_matrix.shape[1] > 2:
    points_2d = PCA(n_components=2, random_state=42).fit_transform(embedding_matrix)
    axis_names = ("PC 1", "PC 2")
else:
    points_2d = embedding_matrix
    axis_names = ("Dim 0", "Dim 1")

# Same 2-D points in both panels; only the colouring (label set) differs
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (cluster_labels, "Original Clusters"),
    (optimized_labels, "Optimized Clusters"),
)
for ax, (panel_labels, panel_title) in zip(axes, panels):
    ax.scatter(points_2d[:, 0], points_2d[:, 1], c=panel_labels, cmap='viridis', s=10)
    ax.set(title=panel_title, xlabel=axis_names[0], ylabel=axis_names[1])
fig.tight_layout()
plt.show()

# Heatmap of confusion matrix (rows: KMeans labels, columns: classifier labels)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(title="Confusion Matrix Heatmap", xlabel="Optimized", ylabel="Original")
plt.show()




In [None]:
# Preview the first rows of the chunk table
print(df_chunks.head())

# Error analysis

In [None]:
# Error analysis — show misclassified examples as DataFrame
def perform_error_analysis(texts, predicted_labels, true_labels, max_chars=300):
    """
    Tabulate the samples whose predicted label differs from the true label.

    Args:
        texts: Sequence of texts, indexable by sample position.
        predicted_labels: Predicted label for each sample.
        true_labels: Reference label for each sample.
        max_chars: Maximum number of characters of each text to show.

    Returns:
        A DataFrame with columns "Index", "True Label", "Predicted" and
        "Chunk", one row per mismatch. The columns are present even when
        there are no mismatches, so callers can rely on the schema.
    """
    columns = ["Index", "True Label", "Predicted", "Chunk"]
    errors = []
    for i, (pred, true) in enumerate(zip(predicted_labels, true_labels)):
        if pred == true:
            continue
        text = texts[i]
        # Append the ellipsis only when something was actually cut off
        preview = text[:max_chars] + "..." if len(text) > max_chars else text
        errors.append({
            "Index": i,
            "True Label": true,
            "Predicted": pred,
            "Chunk": preview
        })
    return pd.DataFrame(errors, columns=columns)

# The classifier's labels are passed as predictions, the KMeans labels as the reference
error_df = perform_error_analysis(df_chunks['chunk'].tolist(), optimized_labels, cluster_labels)

print("\n Error Analysis (Top 5 examples):")
# head() limits the printed table to the first five mismatches
print(error_df.head().to_markdown(index=False))

# Uploading vectors and metadata to Qdrant

In [None]:
# Upload to Qdrant
# Connection settings are read from the QDRANT_URL / QDRANT_API_KEY environment
# variables so the key does not have to be saved inside the notebook. If a
# variable is not set, the "..." placeholder is used; replace it with your
# Qdrant URL / API key in that case.
qdrant = qdrant_client.QdrantClient(
    url=os.environ.get("QDRANT_URL", "..."),
    api_key=os.environ.get("QDRANT_API_KEY", "...")
)

collection_name = "chatbot_chunks"

# Vectors and chunk rows are paired by position, so their counts must agree
if len(embeddings) != len(df_chunks):
    raise ValueError(
        f"embeddings has {len(embeddings)} rows but df_chunks has {len(df_chunks)}; "
        "regenerate the embeddings before uploading."
    )

# Check and create collection if needed
if not qdrant.collection_exists(collection_name):
    qdrant.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=len(embeddings[0]), distance=Distance.COSINE)
    )

# Prepare Qdrant points.
# enumerate() supplies the row position, which is what indexes `embeddings`;
# the DataFrame's index labels are not used, so a non-default index is safe.
points = []
for position, (_, row) in enumerate(df_chunks.iterrows()):
    payload = {
        "content": row["chunk"],  # <-- map to n8n pageContent
        "metadata": {
            "filename": row["filename"],
            # Cast to built-in int so the payload holds plain Python numbers
            "chunk_id": int(row["chunk_id"]),
            "num_pages": int(row["num_pages"]),
            "cluster_label": int(row["optimized_cluster"])
        }
    }
    points.append(
        PointStruct(id=position, vector=embeddings[position].tolist(), payload=payload)
    )

# Upload in batch
qdrant.upload_points(collection_name=collection_name, points=points)
print(" Uploaded chunks with cluster labels and embeddings to Qdrant!")



In [None]:
# Build df_eval: a copy of df_chunks plus a `vector` column holding each
# embedding converted to a plain list
df_eval = df_chunks.assign(vector=[vec.tolist() for vec in embeddings])

# Persist the frame as a pickle
df_eval.to_pickle("df_eval.pkl")

In [None]:
# Reload df_eval from its pickle when the file is present; otherwise df_eval is left untouched
if os.path.exists("df_eval.pkl"):
  df_eval = pd.read_pickle("df_eval.pkl")