In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import json
import os
from datetime import datetime
from nltk.stem import PorterStemmer
from collections import defaultdict
import time


class CompleteSearchEngine:
    """
    Search engine supporting dynamic content addition
    with automatic indexing and ranked retrieval.

    State lives in three JSON files under ``base_path`` (preprocessed
    documents, lexicon, forward index) plus one inverted-index "barrel" file
    per leading character inside ``base_path/barrels``.

    A posting is a 9-element list: ``[term_frequency, positions, ...]`` where
    the remaining slots are per-field occurrence counts located at the
    indices given by ``self.field_map``.
    """

    # Characters trimmed from both ends of every whitespace-separated word,
    # both when indexing documents and when parsing queries.
    _STRIP_CHARS = ".,!?()[]{}'\""

    def __init__(self, base_path="/content/drive/MyDrive/DSA-Project/data/processed/"):
        """Record the on-disk locations, build the stemmer and load the index.

        Args:
            base_path: Directory holding ``preprocessing.json``,
                ``lexicon.json``, ``forward_index.json`` and ``barrels/``.

        Raises:
            OSError: If any of the three JSON files cannot be opened.
        """
        self.base_path = base_path
        self.preprocessing_file = os.path.join(base_path, "preprocessing.json")
        self.lexicon_file = os.path.join(base_path, "lexicon.json")
        self.forward_index_file = os.path.join(base_path, "forward_index.json")
        self.barrels_folder = os.path.join(base_path, "barrels")

        self.stemmer = PorterStemmer()
        # Field name -> slot inside a posting list that counts its occurrences.
        self.field_map = {
            "title": 2, "authors": 3, "categories": 4,
            "report_no": 5, "journal": 6, "abstract": 7, "update_date": 8
        }

        # Barrel key -> parsed barrel content, filled lazily by _get_postings.
        self.barrel_cache = {}
        self._load_data()

    def _load_data(self):
        """Read the preprocessed documents, lexicon and forward index from disk."""
        with open(self.preprocessing_file, 'r', encoding='utf-8') as f:
            self.preprocessed_docs = json.load(f)

        with open(self.lexicon_file, 'r', encoding='utf-8') as f:
            self.lexicon = json.load(f)

        with open(self.forward_index_file, 'r', encoding='utf-8') as f:
            self.forward_index = json.load(f)

    # ================= INPUT HANDLING =================

    def add(self, data, format_type="auto"):
        """Add one or more documents to the index.

        Args:
            data: A dict, a list of dicts, a JSON string, raw text, or a path
                to a ``.json`` / ``.csv`` / text file.
            format_type: One of ``"auto"``, ``"dict"``, ``"json"``,
                ``"json_file"``, ``"text"`` or ``"csv"``. ``"auto"`` defers to
                ``_detect_format``.

        Returns:
            True if at least one document was added, False otherwise
            (duplicate ids, unsupported format or unsupported payload).
        """
        if format_type == "auto":
            format_type = self._detect_format(data)

        if format_type == "dict":
            return self._add_parsed(data)

        if format_type == "json":
            return self._add_parsed(json.loads(data))

        if format_type == "json_file":
            with open(data, 'r', encoding='utf-8') as f:
                parsed = json.load(f)
            return self._add_parsed(parsed)

        if format_type == "text":
            # A string naming an existing file is read; anything else is
            # treated as the text itself.
            if os.path.exists(data):
                with open(data, 'r', encoding='utf-8') as f:
                    data = f.read()
            return self._add_from_text(data)

        if format_type == "csv":
            import csv
            # newline='' is what the csv module requires so that newlines
            # embedded in quoted fields are parsed correctly.
            with open(data, 'r', encoding='utf-8', newline='') as f:
                rows = list(csv.DictReader(f))
            return self._add_batch(rows)

        return False

    def _add_parsed(self, payload):
        """Index an already-decoded payload: a list of dicts or a single dict.

        Any other payload type (for example a bare JSON number or string) is
        rejected with False instead of being indexed as an empty document.
        """
        if isinstance(payload, list):
            return self._add_batch(payload)
        if isinstance(payload, dict):
            return self._add_document(self._normalize_fields(payload))
        return False

    def _detect_format(self, data):
        """Guess the ``format_type`` for ``add`` from the value of ``data``.

        Returns:
            ``"dict"`` for dicts and lists; for strings ``"json_file"``,
            ``"csv"`` or ``"text"`` when they name an existing path,
            ``"json"`` when they decode to a JSON object or array, otherwise
            ``"text"``; ``"unknown"`` for every other type.
        """
        if isinstance(data, (dict, list)):
            return "dict"
        if not isinstance(data, str):
            return "unknown"

        if os.path.exists(data):
            if data.endswith(".json"):
                return "json_file"
            if data.endswith(".csv"):
                return "csv"
            return "text"

        try:
            parsed = json.loads(data)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return "text"
        # Scalars such as "42" or "true" are valid JSON but not documents.
        return "json" if isinstance(parsed, (dict, list)) else "text"

    def _new_id(self, prefix):
        """Return a timestamp-based id that is not yet in the forward index.

        The timestamp has one-second resolution, so a numeric suffix is
        appended when several ids are requested within the same second.
        """
        base = f"{prefix}_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        candidate = base
        suffix = 0
        while candidate in self.forward_index:
            suffix += 1
            candidate = f"{base}_{suffix}"
        return candidate

    def _normalize_fields(self, doc):
        """Map a loosely-keyed input dict onto the standard document fields.

        For every standard field the first alias present in ``doc`` wins.
        Missing fields default to ``""``, except ``update_date`` which
        defaults to today's date. A missing or empty id is replaced by a
        generated ``AUTO_<timestamp>`` id.
        """
        mappings = {
            'id': ['id', 'doc_id'],
            'title': ['title'],
            'authors': ['authors', 'author'],
            'categories': ['categories', 'category'],
            'abstract': ['abstract', 'description', 'content'],
            'report-no': ['report-no'],
            'journal-ref': ['journal-ref', 'journal'],
            'update_date': ['update_date', 'date']
        }

        normalized = {}
        for std, aliases in mappings.items():
            for alias in aliases:
                if alias in doc:
                    normalized[std] = doc[alias]
                    break
            else:
                if std == "update_date":
                    normalized[std] = datetime.now().strftime("%Y-%m-%d")
                else:
                    normalized[std] = ""

        if not normalized.get("id"):
            normalized["id"] = self._new_id("AUTO")

        return normalized

    def _add_from_text(self, text):
        """Index raw text: first line (max 200 chars) as title, first 500 chars as abstract."""
        return self._add_document({
            "id": self._new_id("TEXT"),
            "title": text.split('\n')[0][:200],
            "authors": "Unknown",
            "categories": "misc",
            "abstract": text[:500],
            "report-no": "",
            "journal-ref": "",
            "update_date": datetime.now().strftime("%Y-%m-%d")
        })

    # ================= CORE INDEXING =================

    def _add_document(self, doc):
        """Index one normalized document and persist every index file.

        Args:
            doc: Dict with at least an ``"id"`` key (see ``_normalize_fields``).

        Returns:
            False if the id is already in the forward index, True once the
            document has been indexed and saved.
        """
        doc_id = str(doc["id"])
        if doc_id in self.forward_index:
            return False

        tokens = self._preprocess_document(doc)
        self.preprocessed_docs.append({
            "id": doc_id,
            "title": doc.get("title", ""),
            "authors": doc.get("authors", ""),
            "categories": doc.get("categories", ""),
            "abstract": doc.get("abstract", ""),
            "tokens": tokens
        })

        # New words get consecutive ids after the current maximum.
        next_id = max(self.lexicon.values(), default=0) + 1
        for t in tokens:
            if t["token"] not in self.lexicon:
                self.lexicon[t["token"]] = next_id
                next_id += 1

        self._update_forward_index(doc_id, tokens)
        self._update_barrels(doc_id, tokens)
        self._save_all()
        # Cached barrels may now be stale.
        self.barrel_cache.clear()
        return True

    def _add_batch(self, docs):
        """Index every dict in ``docs``; return True if any was added.

        The results are collected into a list first: feeding a generator
        straight to ``any`` would stop at the first successful add and skip
        the remaining documents. Non-dict items are ignored.
        """
        results = [
            self._add_document(self._normalize_fields(d))
            for d in docs
            if isinstance(d, dict)
        ]
        return any(results)

    def _preprocess_document(self, doc):
        """Tokenize the text fields of ``doc`` into stemmed tokens.

        Returns:
            List of dicts with keys ``"token"`` (stemmed word),
            ``"global_pos"`` (running position across all fields) and
            ``"field"`` (source field name). Words shorter than two
            characters after punctuation stripping are dropped.
        """
        tokens, pos = [], 0
        fields = [
            ("title", doc.get("title", "")),
            ("authors", doc.get("authors", "")),
            ("categories", doc.get("categories", "")),
            ("report_no", doc.get("report-no", "")),
            ("journal", doc.get("journal-ref", "")),
            ("abstract", doc.get("abstract", "")),
        ]

        for field, raw in fields:
            # CSV rows and JSON input may carry None or non-string values.
            text = "" if raw is None else str(raw)
            for word in text.lower().split():
                word = word.strip(self._STRIP_CHARS)
                if len(word) > 1:
                    tokens.append({
                        "token": self.stemmer.stem(word),
                        "global_pos": pos,
                        "field": field
                    })
                    pos += 1
        return tokens

    def _build_postings(self, tokens):
        """Aggregate ``tokens`` into ``{word_id_str: posting}`` for one document."""
        postings = defaultdict(lambda: [0, [], 0, 0, 0, 0, 0, 0, 0])
        for t in tokens:
            entry = postings[str(self.lexicon[t["token"]])]
            entry[0] += 1
            entry[1].append(t["global_pos"])
            slot = self.field_map.get(t["field"])
            if slot is not None:
                entry[slot] += 1
        return dict(postings)

    def _update_forward_index(self, doc_id, tokens):
        """Store the per-word postings of ``doc_id`` in the forward index."""
        self.forward_index[doc_id] = self._build_postings(tokens)

    @staticmethod
    def _barrel_key(word):
        """Return the barrel name for ``word``: its first char if a-z, else ``"#"``."""
        first = word[:1]
        return first if "a" <= first <= "z" else "#"

    def _read_barrel(self, key):
        """Load barrel ``key`` from disk; a missing file is an empty barrel."""
        path = os.path.join(self.barrels_folder, f"{key}.json")
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            return {}

    def _update_barrels(self, doc_id, tokens):
        """Merge the postings of ``doc_id`` into the barrel files on disk.

        Only barrels that actually receive a posting are read and rewritten.
        Each posting carries the same frequency, positions and field counts
        as the forward-index entry for the word.
        """
        postings = self._build_postings(tokens)

        # barrel key -> {word_id_str: posting} for this document
        touched = defaultdict(dict)
        for t in tokens:
            word = t["token"]
            wid = str(self.lexicon[word])
            touched[self._barrel_key(word)][wid] = postings[wid]

        if not touched:
            return

        os.makedirs(self.barrels_folder, exist_ok=True)
        for key, entries in touched.items():
            barrel = self._read_barrel(key)
            for wid, post in entries.items():
                barrel.setdefault(wid, {})[doc_id] = post
            path = os.path.join(self.barrels_folder, f"{key}.json")
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(barrel, f, separators=(',', ':'))

    def _save_all(self):
        """Write the preprocessed documents, lexicon and forward index to disk."""
        with open(self.preprocessing_file, 'w', encoding='utf-8') as f:
            json.dump(self.preprocessed_docs, f, indent=2)
        with open(self.lexicon_file, 'w', encoding='utf-8') as f:
            json.dump(self.lexicon, f, indent=2)
        with open(self.forward_index_file, 'w', encoding='utf-8') as f:
            json.dump(self.forward_index, f, separators=(',', ':'))

    # ================= SEARCH =================

    def search(self, query, top_k=10):
        """Return the best-matching documents for ``query``.

        Query words are lowercased, stripped of the same surrounding
        punctuation as indexed words, and stemmed. Words without any posting
        are ignored; the remaining words must all occur in a document for it
        to match.

        Args:
            query: Whitespace-separated search words.
            top_k: Maximum number of results.

        Returns:
            List of ``(doc_id, score)`` tuples, highest score first (ties
            ordered by doc id). The score is the summed term frequency of the
            matched words divided by the document's total token count.
        """
        postings = []
        for raw in query.lower().split():
            word = raw.strip(self._STRIP_CHARS)
            if not word:
                continue
            found = self._get_postings(self.stemmer.stem(word))
            if found:
                postings.append(found)

        if not postings:
            return []

        docs = set(postings[0])
        for p in postings[1:]:
            docs &= set(p)

        ranked = []
        for d in docs:
            score = sum(p[d][0] for p in postings)
            length = sum(v[0] for v in self.forward_index.get(d, {}).values())
            ranked.append((d, score / length if length else 0))

        # Secondary key makes the order of equal scores deterministic.
        ranked.sort(key=lambda item: (-item[1], item[0]))
        return ranked[:top_k]

    def _get_postings(self, word):
        """Return ``{doc_id: posting}`` for a stemmed ``word`` (empty if unknown)."""
        wid = self.lexicon.get(word)
        if wid is None:  # a word id of 0 is valid, so do not test truthiness
            return {}
        key = self._barrel_key(word)
        if key not in self.barrel_cache:
            self.barrel_cache[key] = self._read_barrel(key)
        return self.barrel_cache[key].get(str(wid), {})
