In [None]:
import re
import uuid
import time
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from collections import defaultdict

import requests
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, Range, SearchRequest
from fastapi import FastAPI, HTTPException, UploadFile, File
from pydantic import BaseModel
import markdown
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# Constants
COLLECTION_NAME = "readme_sections"
VECTOR_SIZE = 768  # Size of nomic-embed-text embeddings
OLLAMA_API_URL = "http://localhost:11434/api/embeddings"

@dataclass
class ReadmeSection:
    content: str
    heading: str
    level: int
    parent: Optional[str]
    children: List[str]
    metadata: Dict[str, Any]

class READMEProcessor:
    def __init__(self):
        self.qdrant_client = QdrantClient("localhost", port=6333)
        self._setup_collection()
        self.tfidf_vectorizer = TfidfVectorizer()

    def _setup_collection(self):
        if not self.qdrant_client.get_collection(COLLECTION_NAME):
            self.qdrant_client.create_collection(
                collection_name=COLLECTION_NAME,
                vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
            )

    def _get_embedding(self, text: str) -> List[float]:
        response = requests.post(OLLAMA_API_URL, json={
            "model": "nomic-embed-text",
            "prompt": text
        })
        response.raise_for_status()
        return response.json()['embedding']

    def parse_readme(self, content: str) -> List[ReadmeSection]:
        html = markdown.markdown(content)
        soup = BeautifulSoup(html, 'html.parser')
        sections = []
        section_stack = []
        current_section = None

        for elem in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre', 'ul', 'ol']):
            if elem.name.startswith('h'):
                level = int(elem.name[1])
                while section_stack and section_stack[-1].level >= level:
                    section_stack.pop()

                parent = section_stack[-1] if section_stack else None
                current_section = ReadmeSection(
                    content=elem.text,
                    heading=elem.text,
                    level=level,
                    parent=parent.heading if parent else None,
                    children=[],
                    metadata={}
                )
                if parent:
                    parent.children.append(current_section.heading)
                sections.append(current_section)
                section_stack.append(current_section)
            else:
                if current_section:
                    current_section.content += "\n" + elem.text

        return sections

    def process_readme(self, content: str):
        sections = self.parse_readme(content)
        section_graph = self._build_section_graph(sections)
        for section in sections:
            self._add_section_to_qdrant(section, section_graph)

    def _build_section_graph(self, sections: List[ReadmeSection]) -> nx.DiGraph:
        G = nx.DiGraph()
        for section in sections:
            G.add_node(section.heading, level=section.level)
            if section.parent:
                G.add_edge(section.parent, section.heading)
        return G

    def _add_section_to_qdrant(self, section: ReadmeSection, section_graph: nx.DiGraph):
        vector = self._get_embedding(section.content)
        point_id = str(uuid.uuid4())
        timestamp = time.time()

        # Calculate centrality and other graph-based features
        centrality = nx.degree_centrality(section_graph)[section.heading]
        depth = nx.shortest_path_length(section_graph, source=list(section_graph.nodes)[0], target=section.heading)

        payload = {
            "content": section.content,
            "heading": section.heading,
            "level": section.level,
            "parent": section.parent,
            "children": section.children,
            "metadata": {
                **section.metadata,
                "timestamp": timestamp,
                "centrality": centrality,
                "depth": depth,
                "access_count": 0,
                "relevance_score": 1.0
            }
        }

        self.qdrant_client.upsert(
            collection_name=COLLECTION_NAME,
            points=[PointStruct(id=point_id, vector=vector, payload=payload)]
        )

    def search_sections(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        query_vector = self._get_embedding(query)

        # Perform semantic search
        search_result = self.qdrant_client.search(
            collection_name=COLLECTION_NAME,
            query_vector=query_vector,
            limit=top_k * 2  # Retrieve more results for re-ranking
        )

        # Extract contents for TF-IDF re-ranking
        contents = [hit.payload['content'] for hit in search_result]
        tfidf_matrix = self.tfidf_vectorizer.fit_transform([query] + contents)
        
        # Calculate TF-IDF similarities
        tfidf_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
        
        # Combine semantic and TF-IDF scores
        combined_scores = [(hit, 0.7 * hit.score + 0.3 * tfidf_sim) 
                           for hit, tfidf_sim in zip(search_result, tfidf_similarities)]
        
        # Sort by combined score and take top_k
        combined_scores.sort(key=lambda x: x[1], reverse=True)
        top_results = combined_scores[:top_k]

        results = []
        for hit, score in top_results:
            section = hit.payload
            section['score'] = score
            self._update_section_relevance(hit.id, score)
            results.append(section)

        return results

    def _update_section_relevance(self, point_id: str, score: float):
        current_payload = self.qdrant_client.retrieve(COLLECTION_NAME, [point_id])[0].payload
        current_payload['metadata']['access_count'] += 1
        current_payload['metadata']['relevance_score'] = (current_payload['metadata']['relevance_score'] + score) / 2

        self.qdrant_client.upsert(
            collection_name=COLLECTION_NAME,
            points=[PointStruct(id=point_id, payload=current_payload)]
        )

    def get_context(self, section_heading: str, depth: int = 1) -> Dict[str, Any]:
        filter_condition = Filter(
            must=[FieldCondition(key="heading", match={'value': section_heading})]
        )
        results = self.qdrant_client.scroll(
            collection_name=COLLECTION_NAME,
            scroll_filter=filter_condition,
            limit=1
        )
        if not results.points:
            return {}

        section = results.points[0].payload
        context = {
            "current": section,
            "parent": None,
            "children": [],
            "siblings": []
        }

        if section['parent']:
            parent_filter = Filter(
                must=[FieldCondition(key="heading", match={'value': section['parent']})]
            )
            parent_results = self.qdrant_client.scroll(
                collection_name=COLLECTION_NAME,
                scroll_filter=parent_filter,
                limit=1
            )
            if parent_results.points:
                context["parent"] = parent_results.points[0].payload

        if depth > 0:
            for child_heading in section['children']:
                child_context = self.get_context(child_heading, depth - 1)
                if child_context:
                    context["children"].append(child_context["current"])

            if context["parent"]:
                for sibling_heading in context["parent"]["children"]:
                    if sibling_heading != section_heading:
                        sibling_context = self.get_context(sibling_heading, 0)
                        if sibling_context:
                            context["siblings"].append(sibling_context["current"])

        return context

    def prune_sections(self, threshold: float = 0.5, max_age_days: int = 30):
        current_time = time.time()
        max_age_seconds = max_age_days * 24 * 60 * 60

        filter_condition = Filter(
            must=[
                FieldCondition(
                    key="metadata.relevance_score",
                    range=Range(lt=threshold)
                ),
                FieldCondition(
                    key="metadata.timestamp",
                    range=Range(lt=current_time - max_age_seconds)
                )
            ]
        )

        self.qdrant_client.delete(
            collection_name=COLLECTION_NAME,
            points_selector=filter_condition
        )

# FastAPI app
app = FastAPI()
readme_processor = READMEProcessor()

@app.post("/process_readme")
async def process_readme(file: UploadFile = File(...)):
    content = await file.read()
    readme_processor.process_readme(content.decode())
    return {"message": "README processed successfully"}

@app.post("/search")
async def search(query: str, top_k: int = 5):
    results = readme_processor.search_sections(query, top_k)
    return {"results": results}

@app.get("/context/{section_heading}")
async def get_context(section_heading: str, depth: int = 1):
    context = readme_processor.get_context(section_heading, depth)
    return {"context": context}

@app.post("/prune")
async def prune(threshold: float = 0.5, max_age_days: int = 30):
    readme_processor.prune_sections(threshold, max_age_days)
    return {"message": "Pruning completed successfully"}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

In [7]:
import os
import uuid
import time
import logging
from typing import List, Dict, Any, Optional
from dataclasses import dataclass

import requests
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, Range
from fastapi import FastAPI, HTTPException, UploadFile, File
from pydantic import BaseModel
import markdown
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from xgboost import XGBRanker
import networkx as nx

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Qdrant client
qdrant_client = QdrantClient(host="localhost", port=6333)

# Constants
COLLECTION_NAME = "advanced_readme_sections"
VECTOR_SIZE = 768

# Create collection if it doesn't exist
try:
    qdrant_client.get_collection(COLLECTION_NAME)
    logger.info(f"Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    logger.info(f"Creating collection '{COLLECTION_NAME}'.")
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.EUCLID)
    )

# Initialize FastAPI app
app = FastAPI()

@dataclass
class ReadmeSection:
    content: str
    heading: str
    level: int
    parent: Optional[str]
    children: List[str]
    metadata: Dict[str, Any]
    vector: List[float] = None

def get_embedding(text: str) -> np.ndarray:
    OLLAMA_API_URL = "http://localhost:11434/api/embeddings"
    response = requests.post(OLLAMA_API_URL, json={
        "model": "nomic-embed-text",
        "prompt": text
    })
    response.raise_for_status()
    return np.array(response.json()['embedding'])

def parse_readme(content: str) -> List[ReadmeSection]:
    html = markdown.markdown(content)
    soup = BeautifulSoup(html, 'html.parser')
    sections = []
    section_stack = []
    current_section = None

    for elem in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre', 'ul', 'ol']):
        if elem.name.startswith('h'):
            level = int(elem.name[1])
            while section_stack and section_stack[-1].level >= level:
                section_stack.pop()

            parent = section_stack[-1] if section_stack else None
            current_section = ReadmeSection(
                content='',
                heading=elem.text,
                level=level,
                parent=parent.heading if parent else None,
                children=[],
                metadata={}
            )
            if parent:
                parent.children.append(current_section.heading)
            sections.append(current_section)
            section_stack.append(current_section)
        else:
            if current_section:
                current_section.content += "\n" + elem.text

    return sections

def build_section_graph(sections: List[ReadmeSection]) -> nx.DiGraph:
    G = nx.DiGraph()
    for section in sections:
        G.add_node(section.heading, level=section.level)
        if section.parent:
            G.add_edge(section.parent, section.heading)
    return G

def cluster_sections(sections: List[ReadmeSection], n_clusters: int = 10):
    embeddings = np.array([section.vector for section in sections])
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(embeddings)
    for section, label in zip(sections, cluster_labels):
        section.metadata['cluster'] = int(label)

def add_section_to_qdrant(section: ReadmeSection, section_graph: nx.DiGraph):
    try:
        vector = get_embedding(section.content)
    except Exception as e:
        logger.error(f"Failed to get embedding for section '{section.heading}': {e}")
        return
    
    point_id = str(uuid.uuid4())
    timestamp = time.time()

    centrality = nx.degree_centrality(section_graph).get(section.heading, 0)
    try:
        depth = nx.shortest_path_length(section_graph, source=list(section_graph.nodes)[0], target=section.heading)
    except nx.NetworkXNoPath:
        depth = 0

    payload = {
        "content": section.content,
        "heading": section.heading,
        "level": section.level,
        "parent": section.parent,
        "children": section.children,
        "metadata": {
            **section.metadata,
            "timestamp": timestamp,
            "centrality": centrality,
            "depth": depth,
            "access_count": 0,
            "relevance_score": 1.0
        }
    }

    qdrant_client.upsert(
        collection_name=COLLECTION_NAME,
        points=[PointStruct(id=point_id, vector=vector.tolist(), payload=payload)]
    )
    logger.info(f"Section '{section.heading}' added to Qdrant with ID {point_id}.")

knn_model: Optional[NearestNeighbors] = None
point_id_mapping: Dict[int, str] = {}

def build_knn_index():
    global knn_model, point_id_mapping
    logger.info("Building KNN index...")
    all_points = qdrant_client.scroll(collection_name=COLLECTION_NAME, limit=10000)
    
    if not all_points or not all_points[0]:
        logger.warning("No points found in the collection. KNN index not built.")
        knn_model = None
        point_id_mapping = {}
        return
    
    embeddings = np.array([point.vector for point in all_points[0]])
    
    if embeddings.size == 0:
        logger.warning("Embeddings array is empty. KNN index not built.")
        knn_model = None
        point_id_mapping = {}
        return
    
    knn_model = NearestNeighbors(n_neighbors=10, algorithm='auto', metric='euclidean')
    knn_model.fit(embeddings)
    point_id_mapping = {i: point.id for i, point in enumerate(all_points[0])}
    logger.info(f"KNN index built successfully with {len(point_id_mapping)} points.")

tfidf_vectorizer = TfidfVectorizer()

def calculate_tfidf_similarity(query: str, document: str) -> float:
    tfidf_matrix = tfidf_vectorizer.fit_transform([query, document])
    return (tfidf_matrix * tfidf_matrix.T).A[0, 1]

def prepare_training_data(query: str, sections: List[ReadmeSection]):
    features = []
    labels = []
    for section in sections:
        feature_vector = [
            section.metadata.get('tfidf_similarity', 0.0),
            section.metadata.get('semantic_similarity', 0.0),
            section.metadata.get('centrality', 0.0),
            section.level,
            section.metadata.get('cluster', 0)
        ]
        features.append(feature_vector)
        labels.append(section.metadata.get('relevance_label', 1))  # Placeholder
    return np.array(features), np.array(labels)

xgb_ranker = XGBRanker(
    objective='rank:pairwise',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100
)

def search_sections(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
    if knn_model is None:
        logger.warning("KNN model is not built. No search can be performed.")
        return []
    
    query_vector = get_embedding(query).reshape(1, -1)
    distances, indices = knn_model.kneighbors(query_vector)
    nearest_points = [point_id_mapping[idx] for idx in indices[0]]
    
    sections = []
    for point_id in nearest_points:
        point = qdrant_client.retrieve(collection_name=COLLECTION_NAME, ids=[point_id])[0]
        section = point.payload
        section['vector'] = point.vector
        tfidf_sim = calculate_tfidf_similarity(query, section['content'])
        section['metadata']['tfidf_similarity'] = tfidf_sim
        semantic_sim = 1 / (1 + distances[0][indices[0].tolist().index(point_id_mapping.index(point_id))])
        section['metadata']['semantic_similarity'] = semantic_sim
        sections.append(section)
    
    if not sections:
        return []
    
    X_test, _ = prepare_training_data(query, sections)
    relevance_scores = xgb_ranker.predict(X_test)
    
    for section, score in zip(sections, relevance_scores):
        section['score'] = score
    sections.sort(key=lambda x: x['score'], reverse=True)
    
    for section in sections[:top_k]:
        update_section_relevance(section['id'], section['score'])
    return sections[:top_k]

def update_section_relevance(point_id: str, score: float):
    current_payload = qdrant_client.retrieve(
        collection_name=COLLECTION_NAME, ids=[point_id]
    )[0].payload
    current_payload['metadata']['access_count'] += 1
    current_payload['metadata']['relevance_score'] = (
        current_payload['metadata']['relevance_score'] + score
    ) / 2

    qdrant_client.upsert(
        collection_name=COLLECTION_NAME,
        points=[PointStruct(id=point_id, payload=current_payload)]
    )
    logger.info(f"Updated relevance for point ID {point_id}.")

def get_context(section_heading: str, depth: int = 1) -> Dict[str, Any]:
    filter_condition = Filter(
        must=[FieldCondition(key="heading", match={'value': section_heading})]
    )
    results = qdrant_client.scroll(
        collection_name=COLLECTION_NAME,
        scroll_filter=filter_condition,
        limit=1
    )
    if not results.points:
        return {}

    section = results.points[0].payload
    context = {
        "current": section,
        "parent": None,
        "children": [],
        "siblings": []
    }

    if section['parent']:
        parent_filter = Filter(
            must=[FieldCondition(key="heading", match={'value': section['parent']})]
        )
        parent_results = qdrant_client.scroll(
            collection_name=COLLECTION_NAME,
            scroll_filter=parent_filter,
            limit=1
        )
        if parent_results.points:
            context["parent"] = parent_results.points[0].payload

    if depth > 0 and 'children' in section:
        for child_heading in section['children']:
            child_context = get_context(child_heading, depth - 1)
            if child_context:
                context["children"].append(child_context["current"])

    if context["parent"] and 'children' in context["parent"]:
        for sibling_heading in context["parent"]["children"]:
            if sibling_heading != section_heading:
                sibling_context = get_context(sibling_heading, 0)
                if sibling_context:
                    context["siblings"].append(sibling_context["current"])

    return context

def prune_sections(threshold: float = 0.5, max_age_days: int = 30):
    current_time = time.time()
    max_age_seconds = max_age_days * 24 * 60 * 60

    filter_condition = Filter(
        must=[
            FieldCondition(
                key="metadata.relevance_score",
                range=Range(lt=threshold)
            ),
            FieldCondition(
                key="metadata.timestamp",
                range=Range(lt=current_time - max_age_seconds)
            )
        ]
    )

    qdrant_client.delete(
        collection_name=COLLECTION_NAME,
        points_selector=filter_condition
    )
    logger.info("Pruned low-relevance and old sections.")

@app.post("/process_readme")
async def process_readme_api(file: UploadFile = File(...)):
    content = await file.read()
    sections = parse_readme(content.decode())
    section_graph = build_section_graph(sections)
    for section in sections:
        section.vector = get_embedding(section.content).tolist()
    cluster_sections(sections)
    for section in sections:
        add_section_to_qdrant(section, section_graph)
    build_knn_index()
    return {"message": "README processed successfully"}

@app.post("/search")
async def search_api(query: str, top_k: int = 5):
    results = search_sections(query, top_k)
    return {"results": results}

@app.get("/context/{section_heading}")
async def get_context_api(section_heading: str, depth: int = 1):
    context = get_context(section_heading, depth)
    return {"context": context}

@app.post("/prune")
async def prune_api(threshold: float = 0.5, max_age_days: int = 30):
    prune_sections(threshold, max_age_days)
    return {"message": "Pruning completed successfully"}

@app.post("/rebuild_knn_index")
async def rebuild_knn_index():
    build_knn_index()
    return {"message": "KNN index rebuilt successfully"}

if __name__ == "__main__":
    import uvicorn
    build_knn_index()  # This will now handle empty collections gracefully
    uvicorn.run(app, host="0.0.0.0", port=8000)


INFO:httpx:HTTP Request: GET http://localhost:6333/collections/advanced_readme_sections "HTTP/1.1 404 Not Found"
INFO:__main__:Creating collection 'advanced_readme_sections'.
INFO:httpx:HTTP Request: PUT http://localhost:6333/collections/advanced_readme_sections "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/advanced_readme_sections/points/scroll "HTTP/1.1 200 OK"


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [6]:
!pip install xgboost


Collecting xgboost
  Using cached xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.1


